feat: support gpt-4o-audio

2025-12-27 18:25:57 +08:00 · 2025-01-14 06:38:07 +00:00
parent b5c55a03cf
commit 9730ac3bb5
14 changed files with 378 additions and 189 deletions
--- a/relay/model/general.go
+++ b/relay/model/general.go
@@ -23,36 +23,37 @@ type StreamOptions struct {

 type GeneralOpenAIRequest struct {
 	// https://platform.openai.com/docs/api-reference/chat/create
-	Messages            []Message       `json:"messages,omitempty"`
-	Model               string          `json:"model,omitempty"`
-	Store               *bool           `json:"store,omitempty"`
-	Metadata            any             `json:"metadata,omitempty"`
-	FrequencyPenalty    *float64        `json:"frequency_penalty,omitempty"`
-	LogitBias           any             `json:"logit_bias,omitempty"`
-	Logprobs            *bool           `json:"logprobs,omitempty"`
-	TopLogprobs         *int            `json:"top_logprobs,omitempty"`
-	MaxTokens           int             `json:"max_tokens,omitempty"`
-	MaxCompletionTokens *int            `json:"max_completion_tokens,omitempty"`
-	N                   int             `json:"n,omitempty"`
-	Modalities          []string        `json:"modalities,omitempty"`
-	Prediction          any             `json:"prediction,omitempty"`
-	Audio               *Audio          `json:"audio,omitempty"`
-	PresencePenalty     *float64        `json:"presence_penalty,omitempty"`
-	ResponseFormat      *ResponseFormat `json:"response_format,omitempty"`
-	Seed                float64         `json:"seed,omitempty"`
-	ServiceTier         *string         `json:"service_tier,omitempty"`
-	Stop                any             `json:"stop,omitempty"`
-	Stream              bool            `json:"stream,omitempty"`
-	StreamOptions       *StreamOptions  `json:"stream_options,omitempty"`
-	Temperature         *float64        `json:"temperature,omitempty"`
-	TopP                *float64        `json:"top_p,omitempty"`
-	TopK                int             `json:"top_k,omitempty"`
-	Tools               []Tool          `json:"tools,omitempty"`
-	ToolChoice          any             `json:"tool_choice,omitempty"`
-	ParallelTooCalls    *bool           `json:"parallel_tool_calls,omitempty"`
-	User                string          `json:"user,omitempty"`
-	FunctionCall        any             `json:"function_call,omitempty"`
-	Functions           any             `json:"functions,omitempty"`
+	Messages            []Message `json:"messages,omitempty"`
+	Model               string    `json:"model,omitempty"`
+	Store               *bool     `json:"store,omitempty"`
+	Metadata            any       `json:"metadata,omitempty"`
+	FrequencyPenalty    *float64  `json:"frequency_penalty,omitempty"`
+	LogitBias           any       `json:"logit_bias,omitempty"`
+	Logprobs            *bool     `json:"logprobs,omitempty"`
+	TopLogprobs         *int      `json:"top_logprobs,omitempty"`
+	MaxTokens           int       `json:"max_tokens,omitempty"`
+	MaxCompletionTokens *int      `json:"max_completion_tokens,omitempty"`
+	N                   int       `json:"n,omitempty"`
+	// Modalities: currently the model only accepts modalities = ["text", "audio"].
+	Modalities       []string        `json:"modalities,omitempty"`
+	Prediction       any             `json:"prediction,omitempty"`
+	Audio            *Audio          `json:"audio,omitempty"`
+	PresencePenalty  *float64        `json:"presence_penalty,omitempty"`
+	ResponseFormat   *ResponseFormat `json:"response_format,omitempty"`
+	Seed             float64         `json:"seed,omitempty"`
+	ServiceTier      *string         `json:"service_tier,omitempty"`
+	Stop             any             `json:"stop,omitempty"`
+	Stream           bool            `json:"stream,omitempty"`
+	StreamOptions    *StreamOptions  `json:"stream_options,omitempty"`
+	Temperature      *float64        `json:"temperature,omitempty"`
+	TopP             *float64        `json:"top_p,omitempty"`
+	TopK             int             `json:"top_k,omitempty"`
+	Tools            []Tool          `json:"tools,omitempty"`
+	ToolChoice       any             `json:"tool_choice,omitempty"`
+	ParallelTooCalls *bool           `json:"parallel_tool_calls,omitempty"`
+	User             string          `json:"user,omitempty"`
+	FunctionCall     any             `json:"function_call,omitempty"`
+	Functions        any             `json:"functions,omitempty"`
 	// https://platform.openai.com/docs/api-reference/embeddings/create
 	Input          any    `json:"input,omitempty"`
 	EncodingFormat string `json:"encoding_format,omitempty"`
--- a/relay/model/message.go
+++ b/relay/model/message.go
@@ -1,11 +1,26 @@
 package model

+import (
+	"context"
+
+	"github.com/songquanpeng/one-api/common/logger"
+)
+
 type Message struct {
-	Role       string  `json:"role,omitempty"`
-	Content    any     `json:"content,omitempty"`
-	Name       *string `json:"name,omitempty"`
-	ToolCalls  []Tool  `json:"tool_calls,omitempty"`
-	ToolCallId string  `json:"tool_call_id,omitempty"`
+	Role string `json:"role,omitempty"`
+	// Content is a string or a list of objects
+	Content    any           `json:"content,omitempty"`
+	Name       *string       `json:"name,omitempty"`
+	ToolCalls  []Tool        `json:"tool_calls,omitempty"`
+	ToolCallId string        `json:"tool_call_id,omitempty"`
+	Audio      *messageAudio `json:"audio,omitempty"`
+}
+
+type messageAudio struct { // audio object carried in Message.Audio
+	Id         string `json:"id"`
+	Data       string `json:"data,omitempty"`
+	ExpiredAt  int    `json:"expires_at,omitempty"` // wire name is "expires_at" in the OpenAI audio object
+	Transcript string `json:"transcript,omitempty"`
 }

 func (m Message) IsStringContent() bool {
@@ -26,6 +41,7 @@ func (m Message) StringContent() string {
 			if !ok {
 				continue
 			}
+
 			if contentMap["type"] == ContentTypeText {
 				if subStr, ok := contentMap["text"].(string); ok {
 					contentStr += subStr
@@ -34,6 +50,7 @@ func (m Message) StringContent() string {
 		}
 		return contentStr
 	}
+
 	return ""
 }

@@ -47,6 +64,7 @@ func (m Message) ParseContent() []MessageContent {
 		})
 		return contentList
 	}
+
 	anyList, ok := m.Content.([]any)
 	if ok {
 		for _, contentItem := range anyList {
@@ -71,8 +89,21 @@ func (m Message) ParseContent() []MessageContent {
 						},
 					})
 				}
+			case ContentTypeInputAudio:
+				if subObj, ok := contentMap["input_audio"].(map[string]any); ok {
+					// Comma-ok assertions: a missing or non-string field yields "" instead of a panic.
+					data, _ := subObj["data"].(string)
+					format, _ := subObj["format"].(string)
+					contentList = append(contentList, MessageContent{
+						Type:       ContentTypeInputAudio,
+						InputAudio: &InputAudio{Data: data, Format: format},
+					})
+				}
+			default:
+				logger.Warnf(context.TODO(), "unknown content type: %s", contentMap["type"])
 			}
 		}
+
 		return contentList
 	}
 	return nil
@@ -84,7 +115,18 @@ type ImageURL struct {
 }

 type MessageContent struct {
-	Type     string    `json:"type,omitempty"`
-	Text     string    `json:"text"`
-	ImageURL *ImageURL `json:"image_url,omitempty"`
+	// Type should be one of the following: text/image_url/input_audio
+	Type       string      `json:"type,omitempty"`
+	Text       string      `json:"text"`
+	ImageURL   *ImageURL   `json:"image_url,omitempty"`
+	InputAudio *InputAudio `json:"input_audio,omitempty"`
+}
+
+type InputAudio struct {
+	// Data is the base64 encoded audio data
+	Data string `json:"data" binding:"required"`
+	// Format is the encoding of Data. Listed here as mp3/mp4/mpeg/mpga/m4a/wav/webm/pcm16,
+	// with pcm16 expected when stream=true. NOTE(review): OpenAI appears to document only wav/mp3
+	// for input_audio, and the pcm16 streaming rule for output audio — confirm this list.
+	Format string `json:"format"`
 }
--- a/relay/model/misc.go
+++ b/relay/model/misc.go
@@ -1,9 +1,13 @@
 package model

 type Usage struct {
-	PromptTokens     int `json:"prompt_tokens"`
-	CompletionTokens int `json:"completion_tokens"`
-	TotalTokens      int `json:"total_tokens"`
+	PromptTokens            int                          `json:"prompt_tokens"`
+	CompletionTokens        int                          `json:"completion_tokens"`
+	TotalTokens             int                          `json:"total_tokens"`
+	PromptTokensDetails     usagePromptTokensDetails     `gorm:"-" json:"prompt_tokens_details,omitempty"` // NOTE: omitempty is a no-op on a struct value; always serialized
+	CompletionTokensDetails usageCompletionTokensDetails `gorm:"-" json:"completion_tokens_details,omitempty"` // NOTE: omitempty is a no-op on a struct value; always serialized
+	ServiceTier             string                       `gorm:"-" json:"service_tier,omitempty"`
+	SystemFingerprint       string                       `gorm:"-" json:"system_fingerprint,omitempty"`
 }

 type Error struct {
@@ -17,3 +21,18 @@ type ErrorWithStatusCode struct {
 	Error
 	StatusCode int `json:"status_code"`
 }
+
+type usagePromptTokensDetails struct {
+	CachedTokens int `json:"cached_tokens"`
+	AudioTokens  int `json:"audio_tokens"`
+	TextTokens   int `json:"text_tokens"`
+	ImageTokens  int `json:"image_tokens"`
+}
+
+type usageCompletionTokensDetails struct {
+	ReasoningTokens          int `json:"reasoning_tokens"`
+	AudioTokens              int `json:"audio_tokens"`
+	AcceptedPredictionTokens int `json:"accepted_prediction_tokens"`
+	RejectedPredictionTokens int `json:"rejected_prediction_tokens"`
+	TextTokens               int `json:"text_tokens"`
+}