diff --git a/app/client/api.ts b/app/client/api.ts
index f60b0e2ad..9b82959a8 100644
--- a/app/client/api.ts
+++ b/app/client/api.ts
@@ -107,7 +107,8 @@ export interface LLMModelProvider {
 
 export abstract class LLMApi {
   abstract chat(options: ChatOptions): Promise<void>;
-  abstract speech(options: SpeechOptions): Promise<ArrayBuffer>;
+  abstract speech(options: SpeechOptions): Promise<ArrayBuffer>;
+  abstract streamSpeech?(options: SpeechOptions): AsyncGenerator<AudioBuffer>;
   abstract usage(): Promise<LLMUsage>;
   abstract models(): Promise<LLMModel[]>;
 }
diff --git a/app/client/platforms/alibaba.ts b/app/client/platforms/alibaba.ts
index 19d020ddc..7427eb818 100644
--- a/app/client/platforms/alibaba.ts
+++ b/app/client/platforms/alibaba.ts
@@ -1,5 +1,10 @@
 "use client";
-import { ApiPath, Alibaba, ALIBABA_BASE_URL, REQUEST_TIMEOUT_MS } from "@/app/constant";
+import {
+  ApiPath,
+  Alibaba,
+  ALIBABA_BASE_URL,
+  REQUEST_TIMEOUT_MS,
+} from "@/app/constant";
 import {
   useAccessStore,
   useAppConfig,
@@ -89,66 +94,71 @@ export class QwenApi implements LLMApi {
     return res?.output?.choices?.at(0)?.message?.content ?? "";
   }
 
-  async speech(options: SpeechOptions): Promise<ArrayBuffer> {
+  async speech(options: SpeechOptions): Promise<ArrayBuffer> {
+    throw new Error("Method not implemented.");
+  }
+
+  async *streamSpeech(options: SpeechOptions): AsyncGenerator<AudioBuffer> {
     const requestPayload = {
-        model: options.model,
-        input: {
-            text: options.input,
-            voice: options.voice,
-        },
-        speed: options.speed,
-        response_format: options.response_format,
+      model: options.model,
+      input: {
+        text: options.input,
+        voice: options.voice,
+      },
+      speed: options.speed,
+      response_format: options.response_format,
     };
     console.log("[Request] alibaba speech payload: ", requestPayload);
 
     const controller = new AbortController();
     options.onController?.(controller);
 
     try {
-        const speechPath = this.path(Alibaba.SpeechPath);
-        const speechPayload = {
-            method: "POST",
-            body: JSON.stringify(requestPayload),
-            signal: controller.signal,
-            headers: {
-                ...getHeaders(),
-                "X-DashScope-SSE": "enable",
-            },
-        };
-
-        // make a fetch request
-        const requestTimeoutId = setTimeout(
-            () => controller.abort(),
-            REQUEST_TIMEOUT_MS,
-        );
-
-        const res = await fetch(speechPath, speechPayload);
+      const speechPath = this.path(Alibaba.SpeechPath);
+      const speechPayload = {
+        method: "POST",
+        body: JSON.stringify(requestPayload),
+        signal: controller.signal,
+        headers: {
+          ...getHeaders(),
+          "X-DashScope-SSE": "enable",
+        },
+      };
 
-        const reader = res.body!.getReader();
-        const decoder = new TextDecoder();
-        let buffer = "";
-        let base64 = "";
-        while (true) {
-            const { done, value } = await reader.read();
-            if (done) break;
-            buffer += decoder.decode(value, { stream: true, });
-            const lines = buffer.split('\n');
-            buffer = lines.pop() || '';
+      // make a fetch request
+      const requestTimeoutId = setTimeout(
+        () => controller.abort(),
+        REQUEST_TIMEOUT_MS,
+      );
 
-            for (const line of lines) {
-                if (line.startsWith('data:')) {
-                    const data = line.slice(5);
-                    const json = JSON.parse(data);
-                    base64 += json.output.audio.data;
-                }
-            }
+      const res = await fetch(speechPath, speechPayload);
+
+      const reader = res.body!.getReader();
+      const decoder = new TextDecoder();
+      let buffer = "";
+      while (true) {
+        const { done, value } = await reader.read();
+        if (done) {
+          break;
+        }
+        buffer += decoder.decode(value, { stream: true });
+        const lines = buffer.split("\n");
+        buffer = lines.pop() || "";
+
+        for (const line of lines) {
+          if (line.startsWith("data:")) {
+            const data = line.slice(5);
+            const json = JSON.parse(data);
+            if (json.output.audio.data) {
+              yield this.PCMBase64ToAudioBuffer(json.output.audio.data);
+            }
+          }
         }
-        const audioBuffer = await this.PCMBase64ToAudioBuffer(base64);
-        clearTimeout(requestTimeoutId);
-        reader.releaseLock();
-        return audioBuffer;
-      } catch (e) {
-        console.log("[Request] failed to make a speech request", e);
-        throw e;
       }
+      clearTimeout(requestTimeoutId);
+      reader.releaseLock();
+    } catch (e) {
+      console.log("[Request] failed to make a speech request", e);
+      throw e;
+    }
   }
 
   async chat(options: ChatOptions) {
@@ -335,67 +345,68 @@ export class QwenApi implements LLMApi {
   // 播放 PCM base64 数据
   private async PCMBase64ToAudioBuffer(base64Data: string) {
     try {
-        // 解码 base64
-        const binaryString = atob(base64Data);
-        const bytes = new Uint8Array(binaryString.length);
-        for (let i = 0; i < binaryString.length; i++) {
-            bytes[i] = binaryString.charCodeAt(i);
-        }
+      // 解码 base64
+      const binaryString = atob(base64Data);
+      const bytes = new Uint8Array(binaryString.length);
+      for (let i = 0; i < binaryString.length; i++) {
+        bytes[i] = binaryString.charCodeAt(i);
+      }
 
-        // 转换为 AudioBuffer
-        const audioBuffer = await this.convertToAudioBuffer(bytes);
-
-        return audioBuffer;
+      // 转换为 AudioBuffer
+      const audioBuffer = await this.convertToAudioBuffer(bytes);
+
+      return audioBuffer;
     } catch (error) {
-        console.error('播放 PCM 数据失败:', error);
-        throw error;
+      console.error("播放 PCM 数据失败:", error);
+      throw error;
     }
   }
-
-    // 将 PCM 字节数据转换为 AudioBuffer
-    private convertToAudioBuffer(pcmData: Uint8Array) {
-        const audioContext = new (window.AudioContext || window.webkitAudioContext)();
+
+  // 将 PCM 字节数据转换为 AudioBuffer
+  private convertToAudioBuffer(pcmData: Uint8Array) {
+    const audioContext = new (window.AudioContext ||
+      window.webkitAudioContext)();
     const channels = 1;
     const sampleRate = 24000;
     return new Promise((resolve, reject) => {
-        try {
-            let float32Array;
-            // 16位 PCM 转换为 32位浮点数
-            float32Array = this.pcm16ToFloat32(pcmData);
+      try {
+        let float32Array;
+        // 16位 PCM 转换为 32位浮点数
+        float32Array = this.pcm16ToFloat32(pcmData);
 
-            // 创建 AudioBuffer
-            const audioBuffer = audioContext.createBuffer(
-                channels,
-                float32Array.length / channels,
-                sampleRate
-            );
+        // 创建 AudioBuffer
+        const audioBuffer = audioContext.createBuffer(
+          channels,
+          float32Array.length / channels,
+          sampleRate,
+        );
 
-            // 复制数据到 AudioBuffer
-            for (let channel = 0; channel < channels; channel++) {
-                const channelData = audioBuffer.getChannelData(channel);
-                for (let i = 0; i < channelData.length; i++) {
-                    channelData[i] = float32Array[i * channels + channel];
-                }
-            }
-
-            resolve(audioBuffer);
-        } catch (error) {
-            reject(error);
+        // 复制数据到 AudioBuffer
+        for (let channel = 0; channel < channels; channel++) {
+          const channelData = audioBuffer.getChannelData(channel);
+          for (let i = 0; i < channelData.length; i++) {
+            channelData[i] = float32Array[i * channels + channel];
+          }
         }
+
+        resolve(audioBuffer);
+      } catch (error) {
+        reject(error);
+      }
     });
   }
 
-    // 16位 PCM 转 32位浮点数
-    private pcm16ToFloat32(pcmData: Uint8Array) {
-        const length = pcmData.length / 2;
-        const float32Array = new Float32Array(length);
-
-        for (let i = 0; i < length; i++) {
-            const int16 = (pcmData[i * 2 + 1] << 8) | pcmData[i * 2];
-            const int16Signed = int16 > 32767 ? int16 - 65536 : int16;
-            float32Array[i] = int16Signed / 32768;
-        }
-
-        return float32Array;
+  // 16位 PCM 转 32位浮点数
+  private pcm16ToFloat32(pcmData: Uint8Array) {
+    const length = pcmData.length / 2;
+    const float32Array = new Float32Array(length);
+
+    for (let i = 0; i < length; i++) {
+      const int16 = (pcmData[i * 2 + 1] << 8) | pcmData[i * 2];
+      const int16Signed = int16 > 32767 ? int16 - 65536 : int16;
+      float32Array[i] = int16Signed / 32768;
     }
+
+    return float32Array;
+  }
 }
 
-export { Alibaba };
\ No newline at end of file
+export { Alibaba };
diff --git a/app/components/chat.tsx b/app/components/chat.tsx
index 97e58da98..16a2a01b5 100644
--- a/app/components/chat.tsx
+++ b/app/components/chat.tsx
@@ -101,8 +101,6 @@ import {
 import { useNavigate } from "react-router-dom";
 import {
   CHAT_PAGE_SIZE,
-  DEFAULT_TTS_ENGINE,
-  ModelProvider,
   Path,
   REQUEST_TIMEOUT_MS,
   ServiceProvider,
@@ -1286,6 +1284,7 @@ function _Chat() {
   const accessStore = useAccessStore();
   const [speechStatus, setSpeechStatus] = useState(false);
   const [speechLoading, setSpeechLoading] = useState(false);
+  const [speechCooldown, setSpeechCooldown] = useState(false);
 
   async function openaiSpeech(text: string) {
     if (speechStatus) {
@@ -1297,10 +1296,10 @@
       api = new ClientApi(config.ttsConfig.modelProvider);
       setSpeechLoading(true);
       ttsPlayer.init();
-      let audioBuffer: ArrayBuffer;
+      let audioBuffer: ArrayBuffer | AudioBuffer;
       const { markdownToTxt } = require("markdown-to-txt");
       const textContent = markdownToTxt(text);
-      console.log("[OpenAI Speech] textContent: ", config, textContent);
+      console.log("[OpenAI Speech] textContent: ", textContent);
       if (config.ttsConfig.engine === "Edge") {
         const edgeVoiceName = accessStore.edgeVoiceName();
         const tts = new MsEdgeTTS();
@@ -1309,28 +1308,61 @@
           OUTPUT_FORMAT.AUDIO_24KHZ_96KBITRATE_MONO_MP3,
         );
         audioBuffer = await tts.toArrayBuffer(textContent);
+        playSpeech(audioBuffer);
       } else {
-        audioBuffer = await api.llm.speech({
-          model: config.ttsConfig.model,
-          input: textContent,
-          voice: config.ttsConfig.voice,
-          speed: config.ttsConfig.speed,
-        });
+        if (api.llm.streamSpeech) {
+          // 使用流式播放,边接收边播放
+          setSpeechStatus(true);
+          ttsPlayer.startStreamPlay(() => {
+            setSpeechStatus(false);
+          });
+
+          try {
+            for await (const chunk of api.llm.streamSpeech({
+              model: config.ttsConfig.model,
+              input: textContent,
+              voice: config.ttsConfig.voice,
+              speed: config.ttsConfig.speed,
+            })) {
+              console.log("[Stream Speech] add to queue", chunk);
+              ttsPlayer.addToQueue(chunk);
+            }
+            ttsPlayer.finishStreamPlay();
+          } catch (e) {
+            console.error("[Stream Speech]", e);
+            showToast(prettyObject(e));
+            setSpeechStatus(false);
+            ttsPlayer.stop();
+          } finally {
+            setSpeechLoading(false);
+          }
+        } else {
+          audioBuffer = await api.llm.speech({
+            model: config.ttsConfig.model,
+            input: textContent,
+            voice: config.ttsConfig.voice,
+            speed: config.ttsConfig.speed,
+          });
+          playSpeech(audioBuffer);
+        }
       }
-      setSpeechStatus(true);
-      ttsPlayer
-        .play(audioBuffer, () => {
-          setSpeechStatus(false);
-        })
-        .catch((e) => {
-          console.error("[OpenAI Speech]", e);
-          showToast(prettyObject(e));
-          setSpeechStatus(false);
-        })
-        .finally(() => setSpeechLoading(false));
     }
   }
 
+  function playSpeech(audioBuffer: ArrayBuffer | AudioBuffer) {
+    setSpeechStatus(true);
+    ttsPlayer
+      .play(audioBuffer, () => {
+        setSpeechStatus(false);
+      })
+      .catch((e) => {
+        console.error("[OpenAI Speech]", e);
+        showToast(prettyObject(e));
+        setSpeechStatus(false);
+      })
+      .finally(() => setSpeechLoading(false));
+  }
+
   const context: RenderMessage[] = useMemo(() => {
     return session.mask.hideContext ? [] : session.mask.context.slice();
   }, [session.mask.context, session.mask.hideContext]);
diff --git a/app/utils/audio.ts b/app/utils/audio.ts
index dd7eadc18..3d93f7bad 100644
--- a/app/utils/audio.ts
+++ b/app/utils/audio.ts
@@ -1,19 +1,38 @@
 type TTSPlayer = {
   init: () => void;
-  play: (audioBuffer: ArrayBuffer, onended: () => void | null) => Promise<void>;
+  play: (
+    audioBuffer: ArrayBuffer | AudioBuffer,
+    onended: () => void | null,
+  ) => Promise<void>;
+  playQueue: (
+    audioBuffers: (ArrayBuffer | AudioBuffer)[],
+    onended: () => void | null,
+  ) => Promise<void>;
+  addToQueue: (audioBuffer: ArrayBuffer | AudioBuffer) => void;
+  startStreamPlay: (onended: () => void | null) => void;
+  finishStreamPlay: () => void;
   stop: () => void;
 };
 
 export function createTTSPlayer(): TTSPlayer {
   let audioContext: AudioContext | null = null;
   let audioBufferSourceNode: AudioBufferSourceNode | null = null;
+  let isPlaying = false;
+  let playQueue: (ArrayBuffer | AudioBuffer)[] = [];
+  let currentOnended: (() => void | null) | null = null;
+  let isStreamMode = false;
+  let streamFinished = false;
 
   const init = () => {
+    console.log("[TTSPlayer] init");
     audioContext = new (window.AudioContext || window.webkitAudioContext)();
     audioContext.suspend();
   };
 
-  const play = async (audioBuffer: ArrayBuffer | AudioBuffer, onended: () => void | null) => {
+  const play = async (
+    audioBuffer: ArrayBuffer | AudioBuffer,
+    onended: () => void | null,
+  ) => {
     if (audioBufferSourceNode) {
       audioBufferSourceNode.stop();
       audioBufferSourceNode.disconnect();
@@ -33,17 +52,109 @@ export function createTTSPlayer(): TTSPlayer {
     audioBufferSourceNode.onended = onended;
   };
 
-  const stop = () => {
+  const playNext = async () => {
+    if (playQueue.length === 0) {
+      // 在流模式下,如果队列为空但流还没结束,等待
+      if (isStreamMode && !streamFinished) {
+        setTimeout(() => playNext(), 100);
+        return;
+      }
+
+      isPlaying = false;
+      isStreamMode = false;
+      streamFinished = false;
+      if (currentOnended) {
+        currentOnended();
+        currentOnended = null;
+      }
+      return;
+    }
+
+    const nextBuffer = playQueue.shift()!;
+    let buffer: AudioBuffer;
+    if (nextBuffer instanceof AudioBuffer) {
+      buffer = nextBuffer;
+    } else {
+      buffer = await audioContext!.decodeAudioData(nextBuffer);
+    }
+
+    if (audioBufferSourceNode) {
+      audioBufferSourceNode.stop();
+      audioBufferSourceNode.disconnect();
+    }
+
+    audioBufferSourceNode = audioContext!.createBufferSource();
+    audioBufferSourceNode.buffer = buffer;
+    audioBufferSourceNode.connect(audioContext!.destination);
+    audioBufferSourceNode.onended = () => {
+      playNext();
+    };
+
+    await audioContext!.resume();
+    audioBufferSourceNode.start();
+  };
+
+  const playQueueMethod = async (
+    audioBuffers: (ArrayBuffer | AudioBuffer)[],
+    onended: () => void | null,
+  ) => {
+    playQueue = [...audioBuffers];
+    currentOnended = onended;
+    if (!isPlaying) {
+      isPlaying = true;
+      await playNext();
+    }
+  };
+
+  const addToQueue = (audioBuffer: ArrayBuffer | AudioBuffer) => {
+    if (streamFinished) {
+      return;
+    }
+    playQueue.push(audioBuffer);
+  };
+
+  const startStreamPlay = (onended: () => void | null) => {
+    isStreamMode = true;
+    streamFinished = false;
+    playQueue = [];
+    currentOnended = onended;
+
+    if (!isPlaying) {
+      isPlaying = true;
+      playNext();
+    }
+  };
+
+  const finishStreamPlay = () => {
+    streamFinished = true;
+  };
+
+  const stop = async () => {
+    console.log("[TTSPlayer] stop");
+    playQueue = [];
+    isPlaying = false;
+    isStreamMode = false;
+    streamFinished = true;
+    currentOnended = null;
+
     if (audioBufferSourceNode) {
       audioBufferSourceNode.stop();
       audioBufferSourceNode.disconnect();
       audioBufferSourceNode = null;
     }
     if (audioContext) {
-      audioContext.close();
+      await audioContext.close();
       audioContext = null;
     }
   };
 
-  return { init, play, stop };
-}
\ No newline at end of file
+  return {
+    init,
+    play,
+    playQueue: playQueueMethod,
+    addToQueue,
+    startStreamPlay,
+    finishStreamPlay,
+    stop,
+  };
+}
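
How the pieces above fit together: `QwenApi.streamSpeech` parses the DashScope SSE stream and yields one decoded `AudioBuffer` per `data:` event, while `createTTSPlayer` drains a FIFO queue of those buffers, polling every 100 ms when the queue is empty until `finishStreamPlay()` is called. For the PCM step, samples are little-endian signed 16-bit: the byte pair (0x00, 0x80) becomes (0x80 << 8) | 0x00 = 32768, which is wrapped to -32768 and scaled by 1/32768 to -1.0. Below is a minimal, self-contained consumer sketch (not part of the diff); the function name is made up and the `qwen-tts`/`Cherry` model and voice ids are placeholders rather than values confirmed by this change.

```ts
import { LLMApi } from "@/app/client/api";
import { createTTSPlayer } from "@/app/utils/audio";

// Illustrative sketch of driving the streaming TTS path added in this diff.
async function speakStreaming(llm: LLMApi, text: string) {
  const player = createTTSPlayer();
  player.init();

  if (!llm.streamSpeech) return; // provider has no streaming TTS; use llm.speech() instead

  // Start draining the queue right away; the callback fires once the queue is
  // empty *and* finishStreamPlay() has been called.
  player.startStreamPlay(() => console.log("[TTS] playback finished"));

  try {
    for await (const chunk of llm.streamSpeech({
      model: "qwen-tts", // placeholder model id
      input: text,
      voice: "Cherry", // placeholder voice
      speed: 1,
    })) {
      // Each chunk is an already-decoded AudioBuffer; the player plays them in order.
      player.addToQueue(chunk);
    }
    player.finishStreamPlay(); // signal that no more chunks will arrive
  } catch (e) {
    player.stop(); // drop the queue and close the AudioContext on error
    throw e;
  }
}
```

This mirrors the chat.tsx flow above: queue while receiving, call `finishStreamPlay()` on success, and `stop()` on error so the pending buffers are discarded.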