From bf999b91a5f5cccc52e678af28e3e7ed984cc2c4 Mon Sep 17 00:00:00 2001 From: EvanWu <850123119@qq.com> Date: Thu, 21 Aug 2025 13:47:58 +0800 Subject: [PATCH] =?UTF-8?q?feat:=20=E5=A2=9E=E5=BC=BA=E9=9F=B3=E9=A2=91?= =?UTF-8?q?=E6=92=AD=E6=94=BE=E7=AE=A1=E7=90=86=EF=BC=8C=E6=96=B0=E5=A2=9E?= =?UTF-8?q?=20TTSPlayManager=20=E7=B1=BB=EF=BC=8C=E4=BC=98=E5=8C=96?= =?UTF-8?q?=E6=B5=81=E5=BC=8F=E8=AF=AD=E9=9F=B3=E5=90=88=E6=88=90=E9=80=BB?= =?UTF-8?q?=E8=BE=91=EF=BC=8C=E6=94=AF=E6=8C=81=20PCM=20=E6=95=B0=E6=8D=AE?= =?UTF-8?q?=E5=92=8C=20base64=20=E8=BD=AC=E6=8D=A2?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit --- app/client/api.ts | 6 +- app/client/platforms/alibaba.ts | 99 +++------- app/components/chat.tsx | 15 +- app/utils/audio.ts | 330 ++++++++++++++++++++++---------- 4 files changed, 266 insertions(+), 184 deletions(-) diff --git a/app/client/api.ts b/app/client/api.ts index 9b82959a8..00348548c 100644 --- a/app/client/api.ts +++ b/app/client/api.ts @@ -25,6 +25,7 @@ import { XAIApi } from "./platforms/xai"; import { ChatGLMApi } from "./platforms/glm"; import { SiliconflowApi } from "./platforms/siliconflow"; import { Ai302Api } from "./platforms/ai302"; +import type { TTSPlayManager } from "../utils/audio"; export const ROLES = ["system", "user", "assistant"] as const; export type MessageRole = (typeof ROLES)[number]; @@ -108,7 +109,10 @@ export interface LLMModelProvider { export abstract class LLMApi { abstract chat(options: ChatOptions): Promise; abstract speech(options: SpeechOptions): Promise; - abstract streamSpeech?(options: SpeechOptions): AsyncGenerator; + abstract streamSpeech?( + options: SpeechOptions, + audioManager?: TTSPlayManager, + ): AsyncGenerator; abstract usage(): Promise; abstract models(): Promise; } diff --git a/app/client/platforms/alibaba.ts b/app/client/platforms/alibaba.ts index fdca6f295..0e7e49e86 100644 --- a/app/client/platforms/alibaba.ts +++ b/app/client/platforms/alibaba.ts @@ -8,6 +8,7 @@ import { usePluginStore, FunctionToolItem, } from "@/app/store"; +import { TTSPlayManager } from "@/app/utils/audio"; import { preProcessImageContentForAlibabaDashScope, streamWithThink, @@ -62,7 +63,6 @@ interface RequestPayload { } export class QwenApi implements LLMApi { - private static audioContext: AudioContext | null = null; path(path: string): string { const accessStore = useAccessStore.getState(); @@ -97,7 +97,10 @@ export class QwenApi implements LLMApi { throw new Error("Method not implemented."); } - async *streamSpeech(options: SpeechOptions): AsyncGenerator { + async *streamSpeech( + options: SpeechOptions, + audioManager?: TTSPlayManager, + ): AsyncGenerator { if (!options.input || !options.model) { throw new Error("Missing required parameters: input and model"); } @@ -112,6 +115,10 @@ export class QwenApi implements LLMApi { }; const controller = new AbortController(); options.onController?.(controller); + + if (audioManager) { + audioManager.setStreamController(controller); + } try { const speechPath = this.path(Alibaba.SpeechPath); const speechPayload = { @@ -151,7 +158,10 @@ export class QwenApi implements LLMApi { if (line.startsWith("data:")) { const json = JSON.parse(data); if (json.output?.audio?.data) { - yield this.PCMBase64ToAudioBuffer(json.output.audio.data); + yield await audioManager!.pcmBase64ToAudioBuffer( + json.output.audio.data, + { channels: 1, sampleRate: 24000, bitDepth: 16 }, + ); } } } catch (parseError) { @@ -165,8 +175,17 @@ export class QwenApi implements LLMApi { } reader.releaseLock(); } catch (e) { + // 如果是用户主动取消(AbortError),则不作为错误处理 + if (e instanceof Error && e.name === "AbortError") { + console.log("[Request] Stream speech was aborted by user"); + return; // 正常退出,不抛出错误 + } console.log("[Request] failed to make a speech request", e); throw e; + } finally { + if (audioManager) { + audioManager.clearStreamController(); + } } } @@ -356,79 +375,5 @@ export class QwenApi implements LLMApi { async models(): Promise { return []; } - - // 播放 PCM base64 数据 - private async PCMBase64ToAudioBuffer(base64Data: string) { - try { - // 解码 base64 - const binaryString = atob(base64Data); - const bytes = new Uint8Array(binaryString.length); - for (let i = 0; i < binaryString.length; i++) { - bytes[i] = binaryString.charCodeAt(i); - } - - // 转换为 AudioBuffer - const audioBuffer = await this.convertToAudioBuffer(bytes); - - return audioBuffer; - } catch (error) { - console.error("播放 PCM 数据失败:", error); - throw error; - } - } - - private static getAudioContext(): AudioContext { - if (!QwenApi.audioContext) { - QwenApi.audioContext = new (window.AudioContext || - window.webkitAudioContext)(); - } - return QwenApi.audioContext; - } - - // 将 PCM 字节数据转换为 AudioBuffer - private convertToAudioBuffer(pcmData: Uint8Array) { - const audioContext = QwenApi.getAudioContext(); - const channels = 1; - const sampleRate = 24000; - return new Promise((resolve, reject) => { - try { - let float32Array; - // 16位 PCM 转换为 32位浮点数 - float32Array = this.pcm16ToFloat32(pcmData); - - // 创建 AudioBuffer - const audioBuffer = audioContext.createBuffer( - channels, - float32Array.length / channels, - sampleRate, - ); - - // 复制数据到 AudioBuffer - for (let channel = 0; channel < channels; channel++) { - const channelData = audioBuffer.getChannelData(channel); - for (let i = 0; i < channelData.length; i++) { - channelData[i] = float32Array[i * channels + channel]; - } - } - - resolve(audioBuffer); - } catch (error) { - reject(error); - } - }); - } - // 16位 PCM 转 32位浮点数 - private pcm16ToFloat32(pcmData: Uint8Array) { - const length = pcmData.length / 2; - const float32Array = new Float32Array(length); - - for (let i = 0; i < length; i++) { - const int16 = (pcmData[i * 2 + 1] << 8) | pcmData[i * 2]; - const int16Signed = int16 > 32767 ? int16 - 65536 : int16; - float32Array[i] = int16Signed / 32768; - } - - return float32Array; - } } export { Alibaba }; diff --git a/app/components/chat.tsx b/app/components/chat.tsx index 5dbaeaa20..a2575124f 100644 --- a/app/components/chat.tsx +++ b/app/components/chat.tsx @@ -1340,12 +1340,15 @@ function _Chat() { }); try { - for await (const chunk of api.llm.streamSpeech({ - model: config.ttsConfig.model, - input: textContent, - voice: config.ttsConfig.voice, - speed: config.ttsConfig.speed, - })) { + for await (const chunk of api.llm.streamSpeech( + { + model: config.ttsConfig.model, + input: textContent, + voice: config.ttsConfig.voice, + speed: config.ttsConfig.speed, + }, + ttsPlayer, + )) { ttsPlayer.addToQueue(chunk); } ttsPlayer.finishStreamPlay(); diff --git a/app/utils/audio.ts b/app/utils/audio.ts index 3d93f7bad..2f5d4ff80 100644 --- a/app/utils/audio.ts +++ b/app/utils/audio.ts @@ -4,157 +4,287 @@ type TTSPlayer = { audioBuffer: ArrayBuffer | AudioBuffer, onended: () => void | null, ) => Promise; - playQueue: ( + playQueueMethod: ( audioBuffers: (ArrayBuffer | AudioBuffer)[], onended: () => void | null, ) => Promise; addToQueue: (audioBuffer: ArrayBuffer | AudioBuffer) => void; startStreamPlay: (onended: () => void | null) => void; finishStreamPlay: () => void; + setStreamController: (controller: AbortController) => void; + clearStreamController: () => void; stop: () => void; + pcmBase64ToAudioBuffer: ( + base64Data: string, + config?: PCMConfig, + ) => Promise; + pcmDataToAudioBuffer: ( + pcmData: Uint8Array, + config?: PCMConfig, + ) => Promise; }; -export function createTTSPlayer(): TTSPlayer { - let audioContext: AudioContext | null = null; - let audioBufferSourceNode: AudioBufferSourceNode | null = null; - let isPlaying = false; - let playQueue: (ArrayBuffer | AudioBuffer)[] = []; - let currentOnended: (() => void | null) | null = null; - let isStreamMode = false; - let streamFinished = false; +// Audio processing utilities +export interface PCMConfig { + channels?: number; + sampleRate?: number; + bitDepth?: 16 | 24 | 32; +} - const init = () => { - console.log("[TTSPlayer] init"); - audioContext = new (window.AudioContext || window.webkitAudioContext)(); - audioContext.suspend(); - }; +export class TTSPlayManager implements TTSPlayer { + private static audioContext: AudioContext | null = null; + private audioBufferSourceNode: AudioBufferSourceNode | null = null; + private isPlaying = false; + private playQueue: (ArrayBuffer | AudioBuffer)[] = []; + private currentOnended: (() => void | null) | null = null; + private isStreamMode = false; + private streamFinished = false; + private streamController: AbortController | null = null; - const play = async ( + get getAudioContext() { + if (!TTSPlayManager.audioContext) { + TTSPlayManager.audioContext = new (window.AudioContext || + window.webkitAudioContext)(); + } + return TTSPlayManager.audioContext; + } + + init() { + console.log("[TTSPlayManager] init"); + if (TTSPlayManager.audioContext) { + return; + } + this.getAudioContext.suspend(); + } + + async play( audioBuffer: ArrayBuffer | AudioBuffer, onended: () => void | null, - ) => { - if (audioBufferSourceNode) { - audioBufferSourceNode.stop(); - audioBufferSourceNode.disconnect(); + ) { + if (this.audioBufferSourceNode) { + this.audioBufferSourceNode.stop(); + this.audioBufferSourceNode.disconnect(); } let buffer: AudioBuffer; if (audioBuffer instanceof AudioBuffer) { buffer = audioBuffer; } else { - buffer = await audioContext!.decodeAudioData(audioBuffer); + buffer = await TTSPlayManager.audioContext!.decodeAudioData(audioBuffer); } - audioBufferSourceNode = audioContext!.createBufferSource(); - audioBufferSourceNode.buffer = buffer; - audioBufferSourceNode.connect(audioContext!.destination); - audioContext!.resume().then(() => { - audioBufferSourceNode!.start(); + this.audioBufferSourceNode = + TTSPlayManager.audioContext!.createBufferSource(); + this.audioBufferSourceNode.buffer = buffer; + this.audioBufferSourceNode.connect( + TTSPlayManager.audioContext!.destination, + ); + this.getAudioContext.resume().then(() => { + this.audioBufferSourceNode!.start(); }); - audioBufferSourceNode.onended = onended; - }; + this.audioBufferSourceNode.onended = onended; + } - const playNext = async () => { - if (playQueue.length === 0) { + async stop() { + console.log("[TTSPlayer] stop"); + + // 首先中断流式请求 + try { + if (this.streamController && !this.streamController.signal.aborted) { + console.log("[TTSPlayer] Aborting stream request"); + this.streamController.abort(); + } + } catch (e) { + // 忽略中断请求时的错误 + console.log("[TTSPlayer] Error while aborting stream:", e); + } + this.clearStreamController(); + + // 清理播放状态 + this.playQueue = []; + this.isPlaying = false; + this.isStreamMode = false; + this.streamFinished = true; + this.currentOnended = null; + + // 停止音频播放 + if (this.audioBufferSourceNode) { + this.audioBufferSourceNode.stop(); + this.audioBufferSourceNode.disconnect(); + this.audioBufferSourceNode = null; + } + + // 关闭音频上下文 + if (TTSPlayManager.audioContext) { + await TTSPlayManager.audioContext.close(); + TTSPlayManager.audioContext = null; + } + } + + async playNext() { + if (this.playQueue.length === 0) { // 在流模式下,如果队列为空但流还没结束,等待 - if (isStreamMode && !streamFinished) { - setTimeout(() => playNext(), 100); + if (this.isStreamMode && !this.streamFinished) { + setTimeout(() => this.playNext(), 100); return; } - isPlaying = false; - isStreamMode = false; - streamFinished = false; - if (currentOnended) { - currentOnended(); - currentOnended = null; + this.isPlaying = false; + this.isStreamMode = false; + this.streamFinished = false; + if (this.currentOnended) { + this.currentOnended(); + this.currentOnended = null; } return; } - const nextBuffer = playQueue.shift()!; + const nextBuffer = this.playQueue.shift()!; let buffer: AudioBuffer; if (nextBuffer instanceof AudioBuffer) { buffer = nextBuffer; } else { - buffer = await audioContext!.decodeAudioData(nextBuffer); + buffer = await this.getAudioContext.decodeAudioData(nextBuffer); } - if (audioBufferSourceNode) { - audioBufferSourceNode.stop(); - audioBufferSourceNode.disconnect(); + if (this.audioBufferSourceNode) { + this.audioBufferSourceNode.stop(); + this.audioBufferSourceNode.disconnect(); } - audioBufferSourceNode = audioContext!.createBufferSource(); - audioBufferSourceNode.buffer = buffer; - audioBufferSourceNode.connect(audioContext!.destination); - audioBufferSourceNode.onended = () => { - playNext(); + this.audioBufferSourceNode = this.getAudioContext.createBufferSource(); + this.audioBufferSourceNode.buffer = buffer; + this.audioBufferSourceNode.connect(this.getAudioContext.destination); + this.audioBufferSourceNode.onended = () => { + this.playNext(); }; - await audioContext!.resume(); - audioBufferSourceNode.start(); - }; + await this.getAudioContext.resume(); + this.audioBufferSourceNode.start(); + } - const playQueueMethod = async ( + async playQueueMethod( audioBuffers: (ArrayBuffer | AudioBuffer)[], onended: () => void | null, - ) => { - playQueue = [...audioBuffers]; - currentOnended = onended; - if (!isPlaying) { - isPlaying = true; - await playNext(); + ) { + this.playQueue = [...audioBuffers]; + this.currentOnended = onended; + if (!this.isPlaying) { + this.isPlaying = true; + await this.playNext(); } - }; + } - const addToQueue = (audioBuffer: ArrayBuffer | AudioBuffer) => { - if (streamFinished) { + addToQueue(audioBuffer: ArrayBuffer | AudioBuffer) { + if (this.streamFinished) { return; } - playQueue.push(audioBuffer); - }; + this.playQueue.push(audioBuffer); + } - const startStreamPlay = (onended: () => void | null) => { - isStreamMode = true; - streamFinished = false; - playQueue = []; - currentOnended = onended; - - if (!isPlaying) { - isPlaying = true; - playNext(); + startStreamPlay(onended: () => void | null) { + this.isStreamMode = true; + this.streamFinished = false; + this.playQueue = []; + this.currentOnended = onended; + if (!this.isPlaying) { + this.isPlaying = true; + this.playNext(); } - }; + } - const finishStreamPlay = () => { - streamFinished = true; - }; + finishStreamPlay() { + this.streamFinished = true; + } - const stop = async () => { - console.log("[TTSPlayer] stop"); - playQueue = []; - isPlaying = false; - isStreamMode = false; - streamFinished = true; - currentOnended = null; + // 设置流式请求控制器,用于在 stop 时中断请求 + setStreamController(controller: AbortController) { + this.streamController = controller; + } - if (audioBufferSourceNode) { - audioBufferSourceNode.stop(); - audioBufferSourceNode.disconnect(); - audioBufferSourceNode = null; + // 清除流式请求控制器 + clearStreamController() { + this.streamController = null; + } + + // 将 base64 PCM 数据转换为 AudioBuffer + async pcmBase64ToAudioBuffer( + base64Data: string, + config: PCMConfig = {}, + ): Promise { + try { + // 解码 base64 + const binaryString = atob(base64Data); + const bytes = new Uint8Array(binaryString.length); + for (let i = 0; i < binaryString.length; i++) { + bytes[i] = binaryString.charCodeAt(i); + } + + // 转换为 AudioBuffer + return await this.pcmDataToAudioBuffer(bytes, config); + } catch (error) { + console.error("Failed to convert PCM base64 to AudioBuffer:", error); + throw error; } - if (audioContext) { - await audioContext.close(); - audioContext = null; - } - }; + } - return { - init, - play, - playQueue: playQueueMethod, - addToQueue, - startStreamPlay, - finishStreamPlay, - stop, - }; + // 将 PCM 字节数据转换为 AudioBuffer + async pcmDataToAudioBuffer( + pcmData: Uint8Array, + config: PCMConfig = {}, + ): Promise { + const { channels = 1, sampleRate = 24000, bitDepth = 16 } = config; + + const audioContext = this.getAudioContext; + + return new Promise((resolve, reject) => { + try { + let float32Array: Float32Array; + + // 根据位深度选择转换方法 + switch (bitDepth) { + case 16: + float32Array = this.pcm16ToFloat32(pcmData); + break; + default: + throw new Error(`Unsupported bit depth: ${bitDepth}`); + } + + // 创建 AudioBuffer + const audioBuffer = audioContext.createBuffer( + channels, + float32Array.length / channels, + sampleRate, + ); + + // 复制数据到 AudioBuffer + for (let channel = 0; channel < channels; channel++) { + const channelData = audioBuffer.getChannelData(channel); + for (let i = 0; i < channelData.length; i++) { + channelData[i] = float32Array[i * channels + channel]; + } + } + + resolve(audioBuffer); + } catch (error) { + reject(error); + } + }); + } + + // 16位 PCM 转 32位浮点数 + pcm16ToFloat32(pcmData: Uint8Array): Float32Array { + const length = pcmData.length / 2; + const float32Array = new Float32Array(length); + + for (let i = 0; i < length; i++) { + const int16 = (pcmData[i * 2 + 1] << 8) | pcmData[i * 2]; + const int16Signed = int16 > 32767 ? int16 - 65536 : int16; + float32Array[i] = int16Signed / 32768; + } + + return float32Array; + } +} + +export function createTTSPlayer(): TTSPlayManager { + return new TTSPlayManager(); }