From bf999b91a5f5cccc52e678af28e3e7ed984cc2c4 Mon Sep 17 00:00:00 2001
From: EvanWu <850123119@qq.com>
Date: Thu, 21 Aug 2025 13:47:58 +0800
Subject: [PATCH] =?UTF-8?q?feat:=20=E5=A2=9E=E5=BC=BA=E9=9F=B3=E9=A2=91?=
 =?UTF-8?q?=E6=92=AD=E6=94=BE=E7=AE=A1=E7=90=86=EF=BC=8C=E6=96=B0=E5=A2=9E?=
 =?UTF-8?q?=20TTSPlayManager=20=E7=B1=BB=EF=BC=8C=E4=BC=98=E5=8C=96?=
 =?UTF-8?q?=E6=B5=81=E5=BC=8F=E8=AF=AD=E9=9F=B3=E5=90=88=E6=88=90=E9=80=BB?=
 =?UTF-8?q?=E8=BE=91=EF=BC=8C=E6=94=AF=E6=8C=81=20PCM=20=E6=95=B0=E6=8D=AE?=
 =?UTF-8?q?=E5=92=8C=20base64=20=E8=BD=AC=E6=8D=A2?=
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

---
 app/client/api.ts               |   6 +-
 app/client/platforms/alibaba.ts |  99 +++-------
 app/components/chat.tsx         |  15 +-
 app/utils/audio.ts              | 330 ++++++++++++++++++++++----------
 4 files changed, 266 insertions(+), 184 deletions(-)
diff --git a/app/client/api.ts b/app/client/api.ts
index 9b82959a8..00348548c 100644
--- a/app/client/api.ts
+++ b/app/client/api.ts
@@ -25,6 +25,7 @@ import { XAIApi } from "./platforms/xai";
 import { ChatGLMApi } from "./platforms/glm";
 import { SiliconflowApi } from "./platforms/siliconflow";
 import { Ai302Api } from "./platforms/ai302";
+import type { TTSPlayManager } from "../utils/audio";
 
 export const ROLES = ["system", "user", "assistant"] as const;
 export type MessageRole = (typeof ROLES)[number];
@@ -108,7 +109,10 @@ export interface LLMModelProvider {
 export abstract class LLMApi {
   abstract chat(options: ChatOptions): Promise<void>;
   abstract speech(options: SpeechOptions): Promise<ArrayBuffer | AudioBuffer>;
-  abstract streamSpeech?(options: SpeechOptions): AsyncGenerator<AudioBuffer>;
+  abstract streamSpeech?(
+    options: SpeechOptions,
+    audioManager?: TTSPlayManager,
+  ): AsyncGenerator<AudioBuffer>;
   abstract usage(): Promise<LLMUsage>;
   abstract models(): Promise<LLMModel[]>;
 }
diff --git a/app/client/platforms/alibaba.ts b/app/client/platforms/alibaba.ts
index fdca6f295..0e7e49e86 100644
--- a/app/client/platforms/alibaba.ts
+++ b/app/client/platforms/alibaba.ts
@@ -8,6 +8,7 @@ import {
   usePluginStore,
   FunctionToolItem,
 } from "@/app/store";
+import { TTSPlayManager } from "@/app/utils/audio";
 import {
   preProcessImageContentForAlibabaDashScope,
   streamWithThink,
@@ -62,7 +63,6 @@ interface RequestPayload {
 }
 
 export class QwenApi implements LLMApi {
-  private static audioContext: AudioContext | null = null;
   path(path: string): string {
     const accessStore = useAccessStore.getState();
 
@@ -97,7 +97,10 @@ export class QwenApi implements LLMApi {
     throw new Error("Method not implemented.");
   }
 
-  async *streamSpeech(options: SpeechOptions): AsyncGenerator<AudioBuffer> {
+  async *streamSpeech(
+    options: SpeechOptions,
+    audioManager?: TTSPlayManager,
+  ): AsyncGenerator<AudioBuffer> {
     if (!options.input || !options.model) {
       throw new Error("Missing required parameters: input and model");
     }
@@ -112,6 +115,10 @@ export class QwenApi implements LLMApi {
     };
     const controller = new AbortController();
     options.onController?.(controller);
+
+    if (audioManager) {
+      audioManager.setStreamController(controller);
+    }
     try {
       const speechPath = this.path(Alibaba.SpeechPath);
       const speechPayload = {
@@ -151,7 +158,10 @@ export class QwenApi implements LLMApi {
             if (line.startsWith("data:")) {
               const json = JSON.parse(data);
               if (json.output?.audio?.data) {
-                yield this.PCMBase64ToAudioBuffer(json.output.audio.data);
+                // audioManager is optional, so fall back to a fresh manager;
+                // the PCM defaults (mono, 24 kHz, 16-bit) match this stream.
+                const tts = audioManager ?? new TTSPlayManager();
+                yield await tts.pcmBase64ToAudioBuffer(json.output.audio.data);
               }
             }
           } catch (parseError) {
@@ -165,8 +175,17 @@ export class QwenApi implements LLMApi {
       }
       reader.releaseLock();
     } catch (e) {
+      // 如果是用户主动取消（AbortError），则不作为错误处理
+      if (e instanceof Error && e.name === "AbortError") {
+        console.log("[Request] Stream speech was aborted by user");
+        return; // 正常退出，不抛出错误
+      }
       console.log("[Request] failed to make a speech request", e);
       throw e;
+    } finally {
+      if (audioManager) {
+        audioManager.clearStreamController();
+      }
     }
   }
 
@@ -356,79 +375,5 @@ export class QwenApi implements LLMApi {
   async models(): Promise<LLMModel[]> {
     return [];
   }
-
-  // 播放 PCM base64 数据
-  private async PCMBase64ToAudioBuffer(base64Data: string) {
-    try {
-      // 解码 base64
-      const binaryString = atob(base64Data);
-      const bytes = new Uint8Array(binaryString.length);
-      for (let i = 0; i < binaryString.length; i++) {
-        bytes[i] = binaryString.charCodeAt(i);
-      }
-
-      // 转换为 AudioBuffer
-      const audioBuffer = await this.convertToAudioBuffer(bytes);
-
-      return audioBuffer;
-    } catch (error) {
-      console.error("播放 PCM 数据失败:", error);
-      throw error;
-    }
-  }
-
-  private static getAudioContext(): AudioContext {
-    if (!QwenApi.audioContext) {
-      QwenApi.audioContext = new (window.AudioContext ||
-        window.webkitAudioContext)();
-    }
-    return QwenApi.audioContext;
-  }
-
-  // 将 PCM 字节数据转换为 AudioBuffer
-  private convertToAudioBuffer(pcmData: Uint8Array) {
-    const audioContext = QwenApi.getAudioContext();
-    const channels = 1;
-    const sampleRate = 24000;
-    return new Promise<AudioBuffer>((resolve, reject) => {
-      try {
-        let float32Array;
-        // 16位 PCM 转换为 32位浮点数
-        float32Array = this.pcm16ToFloat32(pcmData);
-
-        // 创建 AudioBuffer
-        const audioBuffer = audioContext.createBuffer(
-          channels,
-          float32Array.length / channels,
-          sampleRate,
-        );
-
-        // 复制数据到 AudioBuffer
-        for (let channel = 0; channel < channels; channel++) {
-          const channelData = audioBuffer.getChannelData(channel);
-          for (let i = 0; i < channelData.length; i++) {
-            channelData[i] = float32Array[i * channels + channel];
-          }
-        }
-
-        resolve(audioBuffer);
-      } catch (error) {
-        reject(error);
-      }
-    });
-  }
-  // 16位 PCM 转 32位浮点数
-  private pcm16ToFloat32(pcmData: Uint8Array) {
-    const length = pcmData.length / 2;
-    const float32Array = new Float32Array(length);
-
-    for (let i = 0; i < length; i++) {
-      const int16 = (pcmData[i * 2 + 1] << 8) | pcmData[i * 2];
-      const int16Signed = int16 > 32767 ? int16 - 65536 : int16;
-      float32Array[i] = int16Signed / 32768;
-    }
-
-    return float32Array;
-  }
 }
 export { Alibaba };
diff --git a/app/components/chat.tsx b/app/components/chat.tsx
index 5dbaeaa20..a2575124f 100644
--- a/app/components/chat.tsx
+++ b/app/components/chat.tsx
@@ -1340,12 +1340,15 @@ function _Chat() {
           });
 
           try {
-            for await (const chunk of api.llm.streamSpeech({
-              model: config.ttsConfig.model,
-              input: textContent,
-              voice: config.ttsConfig.voice,
-              speed: config.ttsConfig.speed,
-            })) {
+            for await (const chunk of api.llm.streamSpeech(
+              {
+                model: config.ttsConfig.model,
+                input: textContent,
+                voice: config.ttsConfig.voice,
+                speed: config.ttsConfig.speed,
+              },
+              ttsPlayer,
+            )) {
               ttsPlayer.addToQueue(chunk);
             }
             ttsPlayer.finishStreamPlay();
diff --git a/app/utils/audio.ts b/app/utils/audio.ts
index 3d93f7bad..2f5d4ff80 100644
--- a/app/utils/audio.ts
+++ b/app/utils/audio.ts
@@ -4,157 +4,287 @@ type TTSPlayer = {
     audioBuffer: ArrayBuffer | AudioBuffer,
     onended: () => void | null,
   ) => Promise<void>;
-  playQueue: (
+  playQueueMethod: (
     audioBuffers: (ArrayBuffer | AudioBuffer)[],
     onended: () => void | null,
   ) => Promise<void>;
   addToQueue: (audioBuffer: ArrayBuffer | AudioBuffer) => void;
   startStreamPlay: (onended: () => void | null) => void;
   finishStreamPlay: () => void;
+  setStreamController: (controller: AbortController) => void;
+  clearStreamController: () => void;
   stop: () => void;
+  pcmBase64ToAudioBuffer: (
+    base64Data: string,
+    config?: PCMConfig,
+  ) => Promise<AudioBuffer>;
+  pcmDataToAudioBuffer: (
+    pcmData: Uint8Array,
+    config?: PCMConfig,
+  ) => Promise<AudioBuffer>;
 };
 
-export function createTTSPlayer(): TTSPlayer {
-  let audioContext: AudioContext | null = null;
-  let audioBufferSourceNode: AudioBufferSourceNode | null = null;
-  let isPlaying = false;
-  let playQueue: (ArrayBuffer | AudioBuffer)[] = [];
-  let currentOnended: (() => void | null) | null = null;
-  let isStreamMode = false;
-  let streamFinished = false;
+// Audio processing utilities
+export interface PCMConfig {
+  channels?: number;
+  sampleRate?: number;
+  bitDepth?: 16 | 24 | 32;
+}
 
-  const init = () => {
-    console.log("[TTSPlayer] init");
-    audioContext = new (window.AudioContext || window.webkitAudioContext)();
-    audioContext.suspend();
-  };
+export class TTSPlayManager implements TTSPlayer {
+  private static audioContext: AudioContext | null = null;
+  private audioBufferSourceNode: AudioBufferSourceNode | null = null;
+  private isPlaying = false;
+  private playQueue: (ArrayBuffer | AudioBuffer)[] = [];
+  private currentOnended: (() => void | null) | null = null;
+  private isStreamMode = false;
+  private streamFinished = false;
+  private streamController: AbortController | null = null;
 
-  const play = async (
+  get getAudioContext() {
+    if (!TTSPlayManager.audioContext) {
+      TTSPlayManager.audioContext = new (window.AudioContext ||
+        window.webkitAudioContext)();
+    }
+    return TTSPlayManager.audioContext;
+  }
+
+  init() {
+    console.log("[TTSPlayManager] init");
+    if (TTSPlayManager.audioContext) {
+      return;
+    }
+    this.getAudioContext.suspend();
+  }
+
+  /** Plays one buffer immediately, stopping the node that is currently set. */
+  async play(
     audioBuffer: ArrayBuffer | AudioBuffer,
     onended: () => void | null,
-  ) => {
-    if (audioBufferSourceNode) {
-      audioBufferSourceNode.stop();
-      audioBufferSourceNode.disconnect();
+  ) {
+    if (this.audioBufferSourceNode) {
+      this.audioBufferSourceNode.stop();
+      this.audioBufferSourceNode.disconnect();
     }
     let buffer: AudioBuffer;
     if (audioBuffer instanceof AudioBuffer) {
       buffer = audioBuffer;
     } else {
-      buffer = await audioContext!.decodeAudioData(audioBuffer);
+      buffer = await this.getAudioContext.decodeAudioData(audioBuffer);
     }
-    audioBufferSourceNode = audioContext!.createBufferSource();
-    audioBufferSourceNode.buffer = buffer;
-    audioBufferSourceNode.connect(audioContext!.destination);
-    audioContext!.resume().then(() => {
-      audioBufferSourceNode!.start();
+    // Use the lazy getter: the shared static context is null after stop().
+    const node = this.getAudioContext.createBufferSource();
+    this.audioBufferSourceNode = node;
+    node.buffer = buffer;
+    node.connect(this.getAudioContext.destination);
+    this.getAudioContext.resume().then(() => {
+      if (this.audioBufferSourceNode === node) node.start();
     });
-    audioBufferSourceNode.onended = onended;
-  };
+    node.onended = onended;
+  }
 
-  const playNext = async () => {
-    if (playQueue.length === 0) {
+  /** Aborts the stream request, resets state and closes the audio context. */
+  async stop() {
+    console.log("[TTSPlayer] stop");
+
+    // Abort the streaming request first.
+    if (this.streamController && !this.streamController.signal.aborted) {
+      console.log("[TTSPlayer] Aborting stream request");
+      this.streamController.abort();
+    }
+    this.clearStreamController();
+
+    // Reset playback state; streamFinished = true makes addToQueue drop chunks.
+    this.playQueue = [];
+    this.isPlaying = false;
+    this.isStreamMode = false;
+    this.streamFinished = true;
+    this.currentOnended = null;
+
+    // Stop and release the current source node.
+    const node = this.audioBufferSourceNode;
+    this.audioBufferSourceNode = null;
+    try {
+      // stop() throws InvalidStateError if start() was never called.
+      node?.stop();
+    } catch (e) {
+      console.log("[TTSPlayer] Error while stopping source node:", e);
+    }
+    node?.disconnect();
+
+    // Detach the shared context before awaiting close() so code running in
+    // the meantime creates a fresh context instead of reusing a closing one.
+    const closing = TTSPlayManager.audioContext;
+    TTSPlayManager.audioContext = null;
+    await closing?.close();
+  }
+
+  async playNext() {
+    if (this.playQueue.length === 0) {
       // 在流模式下，如果队列为空但流还没结束，等待
-      if (isStreamMode && !streamFinished) {
-        setTimeout(() => playNext(), 100);
+      if (this.isStreamMode && !this.streamFinished) {
+        setTimeout(() => this.playNext(), 100);
         return;
       }
 
-      isPlaying = false;
-      isStreamMode = false;
-      streamFinished = false;
-      if (currentOnended) {
-        currentOnended();
-        currentOnended = null;
+      this.isPlaying = false;
+      this.isStreamMode = false;
+      this.streamFinished = false;
+      if (this.currentOnended) {
+        this.currentOnended();
+        this.currentOnended = null;
       }
       return;
     }
 
-    const nextBuffer = playQueue.shift()!;
+    const nextBuffer = this.playQueue.shift()!;
     let buffer: AudioBuffer;
     if (nextBuffer instanceof AudioBuffer) {
       buffer = nextBuffer;
     } else {
-      buffer = await audioContext!.decodeAudioData(nextBuffer);
+      buffer = await this.getAudioContext.decodeAudioData(nextBuffer);
     }
 
-    if (audioBufferSourceNode) {
-      audioBufferSourceNode.stop();
-      audioBufferSourceNode.disconnect();
+    if (this.audioBufferSourceNode) {
+      this.audioBufferSourceNode.stop();
+      this.audioBufferSourceNode.disconnect();
     }
 
-    audioBufferSourceNode = audioContext!.createBufferSource();
-    audioBufferSourceNode.buffer = buffer;
-    audioBufferSourceNode.connect(audioContext!.destination);
-    audioBufferSourceNode.onended = () => {
-      playNext();
+    this.audioBufferSourceNode = this.getAudioContext.createBufferSource();
+    this.audioBufferSourceNode.buffer = buffer;
+    this.audioBufferSourceNode.connect(this.getAudioContext.destination);
+    this.audioBufferSourceNode.onended = () => {
+      this.playNext();
     };
 
-    await audioContext!.resume();
-    audioBufferSourceNode.start();
-  };
+    await this.getAudioContext.resume();
+    this.audioBufferSourceNode.start();
+  }
 
-  const playQueueMethod = async (
+  async playQueueMethod(
     audioBuffers: (ArrayBuffer | AudioBuffer)[],
     onended: () => void | null,
-  ) => {
-    playQueue = [...audioBuffers];
-    currentOnended = onended;
-    if (!isPlaying) {
-      isPlaying = true;
-      await playNext();
+  ) {
+    this.playQueue = [...audioBuffers];
+    this.currentOnended = onended;
+    if (!this.isPlaying) {
+      this.isPlaying = true;
+      await this.playNext();
     }
-  };
+  }
 
-  const addToQueue = (audioBuffer: ArrayBuffer | AudioBuffer) => {
-    if (streamFinished) {
+  addToQueue(audioBuffer: ArrayBuffer | AudioBuffer) {
+    if (this.streamFinished) {
       return;
     }
-    playQueue.push(audioBuffer);
-  };
+    this.playQueue.push(audioBuffer);
+  }
 
-  const startStreamPlay = (onended: () => void | null) => {
-    isStreamMode = true;
-    streamFinished = false;
-    playQueue = [];
-    currentOnended = onended;
-
-    if (!isPlaying) {
-      isPlaying = true;
-      playNext();
+  startStreamPlay(onended: () => void | null) {
+    this.isStreamMode = true;
+    this.streamFinished = false;
+    this.playQueue = [];
+    this.currentOnended = onended;
+    if (!this.isPlaying) {
+      this.isPlaying = true;
+      this.playNext();
     }
-  };
+  }
 
-  const finishStreamPlay = () => {
-    streamFinished = true;
-  };
+  finishStreamPlay() {
+    this.streamFinished = true;
+  }
 
-  const stop = async () => {
-    console.log("[TTSPlayer] stop");
-    playQueue = [];
-    isPlaying = false;
-    isStreamMode = false;
-    streamFinished = true;
-    currentOnended = null;
+  // Store the stream request's AbortController so stop() can abort the request
+  setStreamController(controller: AbortController) {
+    this.streamController = controller;
+  }
 
-    if (audioBufferSourceNode) {
-      audioBufferSourceNode.stop();
-      audioBufferSourceNode.disconnect();
-      audioBufferSourceNode = null;
+  // Drop the stored stream request controller
+  clearStreamController() {
+    this.streamController = null;
+  }
+
+  // 将 base64 PCM 数据转换为 AudioBuffer
+  async pcmBase64ToAudioBuffer(
+    base64Data: string,
+    config: PCMConfig = {},
+  ): Promise<AudioBuffer> {
+    try {
+      // 解码 base64
+      const binaryString = atob(base64Data);
+      const bytes = new Uint8Array(binaryString.length);
+      for (let i = 0; i < binaryString.length; i++) {
+        bytes[i] = binaryString.charCodeAt(i);
+      }
+
+      // 转换为 AudioBuffer
+      return await this.pcmDataToAudioBuffer(bytes, config);
+    } catch (error) {
+      console.error("Failed to convert PCM base64 to AudioBuffer:", error);
+      throw error;
     }
-    if (audioContext) {
-      await audioContext.close();
-      audioContext = null;
-    }
-  };
+  }
 
-  return {
-    init,
-    play,
-    playQueue: playQueueMethod,
-    addToQueue,
-    startStreamPlay,
-    finishStreamPlay,
-    stop,
-  };
+  /**
+   * Converts interleaved raw PCM bytes into an AudioBuffer.
+   *
+   * @param pcmData Raw PCM bytes; only 16-bit samples are supported.
+   * @param config Optional format; defaults to mono, 24000 Hz, 16-bit.
+   * @returns A promise resolving to the filled AudioBuffer.
+   * @throws Error if `config.bitDepth` is anything other than 16.
+   */
+  async pcmDataToAudioBuffer(
+    pcmData: Uint8Array,
+    config: PCMConfig = {},
+  ): Promise<AudioBuffer> {
+    const { channels = 1, sampleRate = 24000, bitDepth = 16 } = config;
+
+    const context = this.getAudioContext;
+
+    // Pick the converter matching the sample width.
+    let samples: Float32Array;
+    switch (bitDepth) {
+      case 16:
+        samples = this.pcm16ToFloat32(pcmData);
+        break;
+      default:
+        throw new Error(`Unsupported bit depth: ${bitDepth}`);
+    }
+
+    // Allocate one buffer holding every channel at the requested rate.
+    const audioBuffer = context.createBuffer(
+      channels,
+      samples.length / channels,
+      sampleRate,
+    );
+
+    // De-interleave: frame i of channel c is stored at i * channels + c.
+    for (let channel = 0; channel < channels; channel++) {
+      const channelData = audioBuffer.getChannelData(channel);
+      for (let i = 0; i < channelData.length; i++) {
+        channelData[i] = samples[i * channels + channel];
+      }
+    }
+
+    return audioBuffer;
+  }
+
+  /** Converts little-endian signed 16-bit PCM bytes to floats in [-1, 1). */
+  pcm16ToFloat32(pcmData: Uint8Array): Float32Array {
+    // A trailing odd byte cannot form a sample and is ignored.
+    const count = Math.floor(pcmData.length / 2);
+    const samples = new Float32Array(count);
+    const view = new DataView(pcmData.buffer, pcmData.byteOffset, count * 2);
+
+    for (let i = 0; i < count; i++) {
+      samples[i] = view.getInt16(i * 2, true) / 32768;
+    }
+
+    return samples;
+  }
+}
+
+export function createTTSPlayer(): TTSPlayManager {
+  return new TTSPlayManager();
 }