From 9990a8969832f100f2ee39c4168f381eb7633890 Mon Sep 17 00:00:00 2001 From: EvanWu <850123119@qq.com> Date: Wed, 30 Jul 2025 21:30:49 +0800 Subject: [PATCH 01/13] =?UTF-8?q?feat:=20=E6=B7=BB=E5=8A=A0=20TTS=20?= =?UTF-8?q?=E5=BC=95=E6=93=8E=E9=85=8D=E7=BD=AE=EF=BC=8C=E6=9B=B4=E6=96=B0?= =?UTF-8?q?=E9=98=BF=E9=87=8C=E5=B7=B4=E5=B7=B4=E8=AF=AD=E9=9F=B3=E6=8E=A5?= =?UTF-8?q?=E5=8F=A3=EF=BC=8C=E6=94=AF=E6=8C=81=E5=AE=9E=E6=97=B6=E8=AF=AD?= =?UTF-8?q?=E9=9F=B3=E5=90=88=E6=88=90?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit --- .yarnrc.yml | 1 + app/client/platforms/alibaba.ts | 132 +++++++++++++++++- app/components/chat.tsx | 5 +- .../realtime-chat/realtime-config.tsx | 8 +- app/components/tts-config.tsx | 36 +++-- app/constant.ts | 69 +++++++-- app/store/config.ts | 17 ++- app/utils/audio.ts | 12 +- package.json | 6 +- 9 files changed, 241 insertions(+), 45 deletions(-) create mode 100644 .yarnrc.yml diff --git a/.yarnrc.yml b/.yarnrc.yml new file mode 100644 index 000000000..3186f3f07 --- /dev/null +++ b/.yarnrc.yml @@ -0,0 +1 @@ +nodeLinker: node-modules diff --git a/app/client/platforms/alibaba.ts b/app/client/platforms/alibaba.ts index 4875e5c02..19d020ddc 100644 --- a/app/client/platforms/alibaba.ts +++ b/app/client/platforms/alibaba.ts @@ -1,5 +1,5 @@ "use client"; -import { ApiPath, Alibaba, ALIBABA_BASE_URL } from "@/app/constant"; +import { ApiPath, Alibaba, ALIBABA_BASE_URL, REQUEST_TIMEOUT_MS } from "@/app/constant"; import { useAccessStore, useAppConfig, @@ -89,8 +89,66 @@ export class QwenApi implements LLMApi { return res?.output?.choices?.at(0)?.message?.content ?? ""; } - speech(options: SpeechOptions): Promise { - throw new Error("Method not implemented."); + async speech(options: SpeechOptions): Promise { + const requestPayload = { + model: options.model, + input: { + text: options.input, + voice: options.voice, + }, + speed: options.speed, + response_format: options.response_format, + }; + console.log("[Request] alibaba speech payload: ", requestPayload); + const controller = new AbortController(); + options.onController?.(controller); + try { + const speechPath = this.path(Alibaba.SpeechPath); + const speechPayload = { + method: "POST", + body: JSON.stringify(requestPayload), + signal: controller.signal, + headers: { + ...getHeaders(), + "X-DashScope-SSE": "enable", + }, + }; + + // make a fetch request + const requestTimeoutId = setTimeout( + () => controller.abort(), + REQUEST_TIMEOUT_MS, + ); + + const res = await fetch(speechPath, speechPayload); + + const reader = res.body!.getReader(); + const decoder = new TextDecoder(); + let buffer = ""; + let base64 = ""; + while (true) { + const { done, value } = await reader.read(); + if (done) break; + buffer += decoder.decode(value, { stream: true, }); + const lines = buffer.split('\n'); + buffer = lines.pop() || ''; + + for (const line of lines) { + if (line.startsWith('data:')) { + const data = line.slice(5); + const json = JSON.parse(data); + base64 += json.output.audio.data; + } + } + } + const audioBuffer = await this.PCMBase64ToAudioBuffer(base64); + clearTimeout(requestTimeoutId); + reader.releaseLock(); + return audioBuffer; + } catch (e) { + console.log("[Request] failed to make a speech request", e); + throw e; + } } async chat(options: ChatOptions) { @@ -273,5 +331,71 @@ export class QwenApi implements LLMApi { async models(): Promise { return []; } + + // 播放 PCM base64 数据 + private async PCMBase64ToAudioBuffer(base64Data: string) { + try { + // 解码 base64 + const binaryString = atob(base64Data); + const bytes = new Uint8Array(binaryString.length); + for (let i = 0; i < binaryString.length; i++) { + bytes[i] = binaryString.charCodeAt(i); + } + + // 转换为 AudioBuffer + const audioBuffer = await this.convertToAudioBuffer(bytes); + + return audioBuffer; + } catch (error) { + console.error('播放 PCM 数据失败:', error); + throw error; + } + } + + // 将 PCM 字节数据转换为 AudioBuffer + private convertToAudioBuffer(pcmData: Uint8Array) { + const audioContext = new (window.AudioContext || window.webkitAudioContext)(); + const channels = 1; + const sampleRate = 24000; + return new Promise((resolve, reject) => { + try { + let float32Array; + // 16位 PCM 转换为 32位浮点数 + float32Array = this.pcm16ToFloat32(pcmData); + + // 创建 AudioBuffer + const audioBuffer = audioContext.createBuffer( + channels, + float32Array.length / channels, + sampleRate + ); + + // 复制数据到 AudioBuffer + for (let channel = 0; channel < channels; channel++) { + const channelData = audioBuffer.getChannelData(channel); + for (let i = 0; i < channelData.length; i++) { + channelData[i] = float32Array[i * channels + channel]; + } + } + + resolve(audioBuffer); + } catch (error) { + reject(error); + } + }); + } + // 16位 PCM 转 32位浮点数 + private pcm16ToFloat32(pcmData: Uint8Array) { + const length = pcmData.length / 2; + const float32Array = new Float32Array(length); + + for (let i = 0; i < length; i++) { + const int16 = (pcmData[i * 2 + 1] << 8) | pcmData[i * 2]; + const int16Signed = int16 > 32767 ? int16 - 65536 : int16; + float32Array[i] = int16Signed / 32768; + } + + return float32Array; + } } -export { Alibaba }; +export { Alibaba }; \ No newline at end of file diff --git a/app/components/chat.tsx b/app/components/chat.tsx index 6691403e6..97e58da98 100644 --- a/app/components/chat.tsx +++ b/app/components/chat.tsx @@ -1293,14 +1293,15 @@ function _Chat() { setSpeechStatus(false); } else { var api: ClientApi; - api = new ClientApi(ModelProvider.GPT); const config = useAppConfig.getState(); + api = new ClientApi(config.ttsConfig.modelProvider); setSpeechLoading(true); ttsPlayer.init(); let audioBuffer: ArrayBuffer; const { markdownToTxt } = require("markdown-to-txt"); const textContent = markdownToTxt(text); - if (config.ttsConfig.engine !== DEFAULT_TTS_ENGINE) { + console.log("[OpenAI Speech] textContent: ", config, textContent); + if (config.ttsConfig.engine === "Edge") { const edgeVoiceName = accessStore.edgeVoiceName(); const tts = new MsEdgeTTS(); await tts.setMetadata( diff --git a/app/components/realtime-chat/realtime-config.tsx b/app/components/realtime-chat/realtime-config.tsx index 08809afda..18887b378 100644 --- a/app/components/realtime-chat/realtime-config.tsx +++ b/app/components/realtime-chat/realtime-config.tsx @@ -5,13 +5,13 @@ import { ListItem, Select, PasswordInput } from "@/app/components/ui-lib"; import { InputRange } from "@/app/components/input-range"; import { Voice } from "rt-client"; -import { ServiceProvider } from "@/app/constant"; +import { REALTIME_TTS_MODELS, ServiceProvider } from "@/app/constant"; -const providers = [ServiceProvider.OpenAI, ServiceProvider.Azure]; +const providers = Object.keys(REALTIME_TTS_MODELS) as ServiceProvider[]; -const models = ["gpt-4o-realtime-preview-2024-10-01"]; +const models = ["gpt-4o-realtime-preview-2024-10-01", "qwen-tts-realtime"]; -const voice = ["alloy", "shimmer", "echo"]; +const voice = ["alloy", "shimmer", "echo","Chelsie"]; export function RealtimeConfigList(props: { realtimeConfig: RealtimeConfig; diff --git a/app/components/tts-config.tsx b/app/components/tts-config.tsx index 39ae85730..a0ad22fa0 100644 --- a/app/components/tts-config.tsx +++ b/app/components/tts-config.tsx @@ -3,10 +3,9 @@ import { TTSConfig, TTSConfigValidator } from "../store"; import Locale from "../locales"; import { ListItem, Select } from "./ui-lib"; import { - DEFAULT_TTS_ENGINE, - DEFAULT_TTS_ENGINES, - DEFAULT_TTS_MODELS, - DEFAULT_TTS_VOICES, + ServiceProvider, + TTS_CONFIGS, + TTSEngineType } from "../constant"; import { InputRange } from "./input-range"; @@ -48,22 +47,33 @@ export function TTSConfigList(props: { - {props.ttsConfig.engine === DEFAULT_TTS_ENGINE && ( + {(props.ttsConfig.engine === ServiceProvider.OpenAI || + props.ttsConfig.engine === ServiceProvider.Alibaba) && ( <>