feat: add support for audio content in Message model and update README

closes #2054
2025-11-09 18:23:40 +08:00 · 2025-02-06 13:12:35 +00:00
parent 790e1e9f5b
commit 0103e980fc
2 changed files with 61 additions and 9 deletions
--- a/relay/model/message.go
+++ b/relay/model/message.go
@@ -1,11 +1,36 @@
 package model
 import (
 	"context"
 	"github.com/songquanpeng/one-api/common/logger"
 )
 type Message struct {
-	Role       string  `json:"role,omitempty"`
+	Role string `json:"role,omitempty"`
-	Content    any     `json:"content,omitempty"`
+	// Content is a string or a list of objects
-	Name       *string `json:"name,omitempty"`
+	Content    any           `json:"content,omitempty"`
-	ToolCalls  []Tool  `json:"tool_calls,omitempty"`
+	Name       *string       `json:"name,omitempty"`
-	ToolCallId string  `json:"tool_call_id,omitempty"`
+	ToolCalls  []Tool        `json:"tool_calls,omitempty"`
 	ToolCallId string        `json:"tool_call_id,omitempty"`
 	Audio      *messageAudio `json:"audio,omitempty"`
 	// -------------------------------------
 	// Deepseek 专有的一些字段
 	// https://api-docs.deepseek.com/api/create-chat-completion
 	// -------------------------------------
 	// Prefix forces the model to begin its answer with the supplied prefix in the assistant message.
 	// To enable this feature, set base_url to "https://api.deepseek.com/beta".
 	Prefix *bool `json:"prefix,omitempty"` // ReasoningContent is Used for the deepseek-reasoner model in the Chat
 	// Prefix Completion feature as the input for the CoT in the last assistant message.
 	// When using this feature, the prefix parameter must be set to true.
 	ReasoningContent *string `json:"reasoning_content,omitempty"`
 }
 type messageAudio struct {
 	Id         string `json:"id"`
 	Data       string `json:"data,omitempty"`
 	ExpiredAt  int    `json:"expired_at,omitempty"`
 	Transcript string `json:"transcript,omitempty"`
 }
 func (m Message) IsStringContent() bool {
@@ -26,6 +51,7 @@ func (m Message) StringContent() string {
 			if !ok {
 				continue
 			}
 			if contentMap["type"] == ContentTypeText {
 				if subStr, ok := contentMap["text"].(string); ok {
 					contentStr += subStr
@@ -34,6 +60,7 @@ func (m Message) StringContent() string {
 		}
 		return contentStr
 	}
 	return ""
 }
@@ -47,6 +74,7 @@ func (m Message) ParseContent() []MessageContent {
 		})
 		return contentList
 	}
 	anyList, ok := m.Content.([]any)
 	if ok {
 		for _, contentItem := range anyList {
@@ -71,8 +99,21 @@ func (m Message) ParseContent() []MessageContent {
 						},
 					})
 				}
 			case ContentTypeInputAudio:
 				if subObj, ok := contentMap["input_audio"].(map[string]any); ok {
 					contentList = append(contentList, MessageContent{
 						Type: ContentTypeInputAudio,
 						InputAudio: &InputAudio{
 							Data:   subObj["data"].(string),
 							Format: subObj["format"].(string),
 						},
 					})
 				}
 			default:
 				logger.Warnf(context.TODO(), "unknown content type: %s", contentMap["type"])
 			}
 		}
 		return contentList
 	}
 	return nil
@@ -84,7 +125,18 @@ type ImageURL struct {
 }
 type MessageContent struct {
-	Type     string    `json:"type,omitempty"`
+	// Type should be one of the following: text/input_audio
-	Text     string    `json:"text"`
+	Type       string      `json:"type,omitempty"`
-	ImageURL *ImageURL `json:"image_url,omitempty"`
+	Text       string      `json:"text"`
 	ImageURL   *ImageURL   `json:"image_url,omitempty"`
 	InputAudio *InputAudio `json:"input_audio,omitempty"`
 }
 type InputAudio struct {
 	// Data is the base64 encoded audio data
 	Data string `json:"data" binding:"required"`
 	// Format is the audio format, should be one of the
 	// following: mp3/mp4/mpeg/mpga/m4a/wav/webm/pcm16.
 	// When stream=true, format should be pcm16
 	Format string `json:"format"`
 }