// one-api/providers/azureSpeech/speech.go

package azureSpeech
import (
"bytes"
"fmt"
"net/http"
"one-api/common"
"one-api/types"
"strings"
)
// outputFormatMap translates OpenAI-style response_format values into the
// Azure Speech X-Microsoft-OutputFormat header values.
// NOTE(review): "aac" and "flac" currently map to mp3 output formats —
// presumably a deliberate fallback because only mp3/opus encodings are
// wired up here; confirm against the Azure audio-output-format list.
var outputFormatMap = map[string]string{
	"mp3":  "audio-16khz-128kbitrate-mono-mp3",
	"opus": "audio-16khz-128kbitrate-mono-opus",
	"aac":  "audio-24khz-160kbitrate-mono-mp3",
	"flac": "audio-48khz-192kbitrate-mono-mp3",
}
// ssmlEscaper escapes the five XML special characters so user-supplied
// values cannot break out of (or corrupt) the generated SSML document.
var ssmlEscaper = strings.NewReplacer(
	"&", "&amp;",
	"<", "&lt;",
	">", "&gt;",
	"'", "&apos;",
	`"`, "&quot;",
)

// CreateSSML renders an Azure Speech SSML document that speaks text with
// the given Azure voice name. role is optional: when non-empty it is
// emitted as a role='...' attribute on the <voice> element (used by
// multi-role neural voices). All three inputs are XML-escaped before
// insertion so arbitrary user text yields a well-formed document.
func CreateSSML(text string, name string, role string) string {
	ssmlTemplate := `<speak version='1.0' xml:lang='en-US'>
<voice xml:lang='en-US' %s name='%s'>
%s
</voice>
</speak>`

	roleAttribute := ""
	if role != "" {
		roleAttribute = fmt.Sprintf("role='%s'", ssmlEscaper.Replace(role))
	}

	return fmt.Sprintf(ssmlTemplate, roleAttribute, ssmlEscaper.Replace(name), ssmlEscaper.Replace(text))
}
// getAzureVoiceMap maps an OpenAI voice name (alloy, echo, ...) to the
// corresponding Azure neural voice, plus an optional speaking role for
// multi-role voices. Unknown names yield two empty strings.
func getAzureVoiceMap(modelName string) (voice, role string) {
	voiceMap := map[string][]string{
		"alloy":   {"zh-CN-YunxiNeural"},
		"echo":    {"zh-CN-YunyangNeural"},
		"fable":   {"zh-CN-YunxiNeural", "Boy"},
		"onyx":    {"zh-CN-YunyeNeural"},
		"nova":    {"zh-CN-XiaochenNeural"},
		"shimmer": {"zh-CN-XiaohanNeural"},
	}
	// Single lookup instead of repeated indexing into the map.
	if entry, ok := voiceMap[modelName]; ok {
		voice = entry[0]
		if len(entry) > 1 {
			role = entry[1]
		}
	}
	return
}
// GetVoiceMap returns the OpenAI-voice -> Azure-voice table, applying any
// per-channel overrides stored under the "voice" key of the channel plugin
// data. An override value is a string of the form "AzureVoiceName" or
// "AzureVoiceName|Role" (split on '|'); only the known OpenAI voice names
// may be overridden, everything else in the plugin data is ignored.
func (p *AzureSpeechProvider) GetVoiceMap() map[string][]string {
	defaults := map[string][]string{
		"alloy":   {"zh-CN-YunxiNeural"},
		"echo":    {"zh-CN-YunyangNeural"},
		"fable":   {"zh-CN-YunxiNeural", "Boy"},
		"onyx":    {"zh-CN-YunyeNeural"},
		"nova":    {"zh-CN-XiaochenNeural"},
		"shimmer": {"zh-CN-XiaohanNeural"},
	}

	// No plugin configured: serve the built-in table unchanged.
	if p.Channel.Plugin == nil {
		return defaults
	}
	overrides, ok := p.Channel.Plugin.Data()["voice"]
	if !ok {
		return defaults
	}

	for name, raw := range overrides {
		if _, known := defaults[name]; !known {
			continue
		}
		spec, isString := raw.(string)
		if !isString {
			continue
		}
		defaults[name] = strings.Split(spec, "|")
	}
	return defaults
}
// getRequestBody builds the SSML payload for a speech request, resolving
// the OpenAI voice name through the (possibly channel-customized) voice
// map. An unknown voice produces an empty name attribute — presumably the
// voice is validated upstream; TODO confirm against the caller.
func (p *AzureSpeechProvider) getRequestBody(request *types.SpeechAudioRequest) *bytes.Buffer {
	var voice, role string
	// Single lookup instead of three separate map accesses; the len guard
	// also protects against a malformed (empty) customized entry.
	if entry, ok := p.GetVoiceMap()[request.Voice]; ok && len(entry) > 0 {
		voice = entry[0]
		if len(entry) > 1 {
			role = entry[1]
		}
	}
	return bytes.NewBufferString(CreateSSML(request.Input, voice, role))
}
// CreateSpeech performs a text-to-speech request against the Azure Speech
// endpoint and returns the raw streaming HTTP response; the body contains
// the encoded audio and must be closed by the caller.
func (p *AzureSpeechProvider) CreateSpeech(request *types.SpeechAudioRequest) (*http.Response, *types.OpenAIErrorWithStatusCode) {
	url, errWithCode := p.GetSupportedAPIUri(common.RelayModeAudioSpeech)
	if errWithCode != nil {
		return nil, errWithCode
	}
	fullRequestURL := p.GetFullRequestURL(url, request.Model)

	headers := p.GetRequestHeaders()
	// Default to mp3 when the requested response_format is unknown or unset.
	responseFormat, ok := outputFormatMap[request.ResponseFormat]
	if !ok {
		responseFormat = outputFormatMap["mp3"]
	}
	headers["X-Microsoft-OutputFormat"] = responseFormat

	requestBody := p.getRequestBody(request)
	req, err := p.Requester.NewRequest(http.MethodPost, fullRequestURL, p.Requester.WithBody(requestBody), p.Requester.WithHeader(headers))
	if err != nil {
		return nil, common.ErrorWrapper(err, "new_request_failed", http.StatusInternalServerError)
	}
	defer req.Body.Close()

	var resp *http.Response
	resp, errWithCode = p.Requester.SendRequestRaw(req)
	if errWithCode != nil {
		return nil, errWithCode
	}

	// NOTE(review): total tokens are set equal to prompt tokens — the TTS
	// response carries no token usage; confirm this matches billing intent.
	p.Usage.TotalTokens = p.Usage.PromptTokens
	return resp, nil
}