Mirror of https://github.com/ChatGPTNextWeb/ChatGPT-Next-Web.git
feat: Update the speech synthesis API to support streaming playback and multiple audio formats
parent 9990a89698
commit c5e6b1278f
@@ -107,7 +107,8 @@ export interface LLMModelProvider {

 export abstract class LLMApi {
   abstract chat(options: ChatOptions): Promise<void>;
-  abstract speech(options: SpeechOptions): Promise<ArrayBuffer>;
+  abstract speech(options: SpeechOptions): Promise<ArrayBuffer | AudioBuffer>;
+  abstract streamSpeech?(options: SpeechOptions): AsyncGenerator<AudioBuffer>;
   abstract usage(): Promise<LLMUsage>;
   abstract models(): Promise<LLMModel[]>;
 }
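Editorial note: because streamSpeech is declared optional, providers that only implement the buffered speech() call still satisfy LLMApi; callers are expected to feature-detect streaming and fall back, as the chat component does later in this diff. A minimal sketch of that pattern, using the types from the hunk above (speakWithFallback and sink are illustrative names, not part of this commit):

// Sketch only: prefer streaming synthesis when the provider implements the
// optional streamSpeech member, otherwise fall back to the one-shot speech().
// `sink` stands in for any consumer of decoded audio, e.g. the TTS player below.
async function speakWithFallback(
  llm: LLMApi,
  options: SpeechOptions,
  sink: { add(chunk: ArrayBuffer | AudioBuffer): void; finish(): void },
): Promise<void> {
  if (llm.streamSpeech) {
    for await (const chunk of llm.streamSpeech(options)) {
      sink.add(chunk); // play each chunk as soon as it is decoded
    }
  } else {
    sink.add(await llm.speech(options)); // one buffer for the whole utterance
  }
  sink.finish();
}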
@@ -1,5 +1,10 @@
 "use client";
-import { ApiPath, Alibaba, ALIBABA_BASE_URL, REQUEST_TIMEOUT_MS } from "@/app/constant";
+import {
+  ApiPath,
+  Alibaba,
+  ALIBABA_BASE_URL,
+  REQUEST_TIMEOUT_MS,
+} from "@/app/constant";
 import {
   useAccessStore,
   useAppConfig,
@@ -89,7 +94,11 @@ export class QwenApi implements LLMApi {
     return res?.output?.choices?.at(0)?.message?.content ?? "";
   }

-  async speech(options: SpeechOptions): Promise<AudioBuffer> {
+  async speech(options: SpeechOptions): Promise<ArrayBuffer> {
+    throw new Error("Method not implemented.");
+  }
+
+  async *streamSpeech(options: SpeechOptions): AsyncGenerator<AudioBuffer> {
     const requestPayload = {
       model: options.model,
       input: {
@@ -125,26 +134,27 @@ export class QwenApi implements LLMApi {
       const reader = res.body!.getReader();
       const decoder = new TextDecoder();
       let buffer = "";
-      let base64 = "";
       while (true) {
         const { done, value } = await reader.read();
-        if (done) break;
-        buffer += decoder.decode(value, { stream: true, });
-        const lines = buffer.split('\n');
-        buffer = lines.pop() || '';
+        if (done) {
+          break;
+        }
+        buffer += decoder.decode(value, { stream: true });
+        const lines = buffer.split("\n");
+        buffer = lines.pop() || "";

         for (const line of lines) {
-          if (line.startsWith('data:')) {
+          if (line.startsWith("data:")) {
             const data = line.slice(5);
             const json = JSON.parse(data);
-            base64 += json.output.audio.data;
+            if (json.output.audio.data) {
+              yield this.PCMBase64ToAudioBuffer(json.output.audio.data);
+            }
           }
         }
       }
-      const audioBuffer = await this.PCMBase64ToAudioBuffer(base64);
       clearTimeout(requestTimeoutId);
       reader.releaseLock();
-      return audioBuffer;
     } catch (e) {
       console.log("[Request] failed to make a speech request", e);
       throw e;
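Editorial note: the generator above re-frames the SSE byte stream by hand: decoded text accumulates in buffer, is split on newlines, and the last, possibly incomplete line is carried over to the next read. A standalone sketch of just that framing step, with the Qwen-specific JSON handling factored out (readSseLines and onData are illustrative names, not part of this commit):

// Sketch only: incremental "data:" line framing over a streamed response body.
// onData receives the text after the "data:" prefix once a full line is available;
// partial lines are kept in `buffer` until the next chunk completes them.
async function readSseLines(
  body: ReadableStream<Uint8Array>,
  onData: (payload: string) => void,
): Promise<void> {
  const reader = body.getReader();
  const decoder = new TextDecoder();
  let buffer = "";
  while (true) {
    const { done, value } = await reader.read();
    if (done) {
      break;
    }
    buffer += decoder.decode(value, { stream: true });
    const lines = buffer.split("\n");
    buffer = lines.pop() || "";
    for (const line of lines) {
      if (line.startsWith("data:")) {
        onData(line.slice(5));
      }
    }
  }
  reader.releaseLock();
}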
@@ -347,14 +357,15 @@ export class QwenApi implements LLMApi {

       return audioBuffer;
     } catch (error) {
-      console.error('Failed to play PCM data:', error);
+      console.error("Failed to play PCM data:", error);
       throw error;
     }
   }

   // Convert PCM byte data to an AudioBuffer
   private convertToAudioBuffer(pcmData: Uint8Array) {
-    const audioContext = new (window.AudioContext || window.webkitAudioContext)();
+    const audioContext = new (window.AudioContext ||
+      window.webkitAudioContext)();
     const channels = 1;
     const sampleRate = 24000;
     return new Promise<AudioBuffer>((resolve, reject) => {
@@ -367,7 +378,7 @@ export class QwenApi implements LLMApi {
         const audioBuffer = audioContext.createBuffer(
           channels,
           float32Array.length / channels,
-          sampleRate
+          sampleRate,
         );

         // Copy the data into the AudioBuffer
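Editorial note: the two hunks above only touch formatting, but they show the assumptions of the conversion helpers: mono PCM at 24 kHz decoded via createBuffer. A sketch of the full base64-PCM-to-AudioBuffer conversion under the additional assumption of signed 16-bit little-endian samples (the actual bodies of PCMBase64ToAudioBuffer / convertToAudioBuffer are not shown in this diff):

// Sketch only: decode base64 PCM (assumed 16-bit signed LE, mono, 24 kHz)
// into an AudioBuffer that the TTS player can schedule directly.
function pcmBase64ToAudioBufferSketch(
  base64: string,
  audioContext: AudioContext,
  sampleRate = 24000,
): AudioBuffer {
  const bytes = Uint8Array.from(atob(base64), (c) => c.charCodeAt(0));
  const samples = new Int16Array(bytes.buffer, 0, Math.floor(bytes.byteLength / 2));
  const float32 = new Float32Array(samples.length);
  for (let i = 0; i < samples.length; i++) {
    float32[i] = samples[i] / 32768; // map the int16 range onto [-1, 1)
  }
  const buffer = audioContext.createBuffer(1, float32.length, sampleRate);
  buffer.getChannelData(0).set(float32);
  return buffer;
}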
@@ -101,8 +101,6 @@ import {
 import { useNavigate } from "react-router-dom";
 import {
   CHAT_PAGE_SIZE,
-  DEFAULT_TTS_ENGINE,
-  ModelProvider,
   Path,
   REQUEST_TIMEOUT_MS,
   ServiceProvider,
@@ -1286,6 +1284,7 @@ function _Chat() {
   const accessStore = useAccessStore();
   const [speechStatus, setSpeechStatus] = useState(false);
   const [speechLoading, setSpeechLoading] = useState(false);
+  const [speechCooldown, setSpeechCooldown] = useState(false);

   async function openaiSpeech(text: string) {
     if (speechStatus) {
@@ -1297,10 +1296,10 @@ function _Chat() {
     api = new ClientApi(config.ttsConfig.modelProvider);
     setSpeechLoading(true);
     ttsPlayer.init();
-    let audioBuffer: ArrayBuffer;
+    let audioBuffer: ArrayBuffer | AudioBuffer;
     const { markdownToTxt } = require("markdown-to-txt");
     const textContent = markdownToTxt(text);
-    console.log("[OpenAI Speech] textContent: ", config, textContent);
+    console.log("[OpenAI Speech] textContent: ", textContent);
     if (config.ttsConfig.engine === "Edge") {
       const edgeVoiceName = accessStore.edgeVoiceName();
       const tts = new MsEdgeTTS();
@@ -1309,6 +1308,34 @@ function _Chat() {
         OUTPUT_FORMAT.AUDIO_24KHZ_96KBITRATE_MONO_MP3,
       );
       audioBuffer = await tts.toArrayBuffer(textContent);
+      playSpeech(audioBuffer);
     } else {
+      if (api.llm.streamSpeech) {
+        // Use streaming playback: play chunks as they are received
+        setSpeechStatus(true);
+        ttsPlayer.startStreamPlay(() => {
+          setSpeechStatus(false);
+        });
+
+        try {
+          for await (const chunk of api.llm.streamSpeech({
+            model: config.ttsConfig.model,
+            input: textContent,
+            voice: config.ttsConfig.voice,
+            speed: config.ttsConfig.speed,
+          })) {
+            console.log("[Stream Speech] add to queue", chunk);
+            ttsPlayer.addToQueue(chunk);
+          }
+          ttsPlayer.finishStreamPlay();
+        } catch (e) {
+          console.error("[Stream Speech]", e);
+          showToast(prettyObject(e));
+          setSpeechStatus(false);
+          ttsPlayer.stop();
+        } finally {
+          setSpeechLoading(false);
+        }
+      } else {
         audioBuffer = await api.llm.speech({
           model: config.ttsConfig.model,
@@ -1316,7 +1343,13 @@ function _Chat() {
           voice: config.ttsConfig.voice,
           speed: config.ttsConfig.speed,
         });
+        playSpeech(audioBuffer);
+      }
       }
+    }
+  }
+
+  function playSpeech(audioBuffer: ArrayBuffer | AudioBuffer) {
     setSpeechStatus(true);
     ttsPlayer
       .play(audioBuffer, () => {
@@ -1329,7 +1362,6 @@ function _Chat() {
       })
       .finally(() => setSpeechLoading(false));
   }
-  }

   const context: RenderMessage[] = useMemo(() => {
     return session.mask.hideContext ? [] : session.mask.context.slice();
@@ -1,19 +1,38 @@
 type TTSPlayer = {
   init: () => void;
-  play: (audioBuffer: ArrayBuffer, onended: () => void | null) => Promise<void>;
+  play: (
+    audioBuffer: ArrayBuffer | AudioBuffer,
+    onended: () => void | null,
+  ) => Promise<void>;
+  playQueue: (
+    audioBuffers: (ArrayBuffer | AudioBuffer)[],
+    onended: () => void | null,
+  ) => Promise<void>;
+  addToQueue: (audioBuffer: ArrayBuffer | AudioBuffer) => void;
+  startStreamPlay: (onended: () => void | null) => void;
+  finishStreamPlay: () => void;
   stop: () => void;
 };

 export function createTTSPlayer(): TTSPlayer {
   let audioContext: AudioContext | null = null;
   let audioBufferSourceNode: AudioBufferSourceNode | null = null;
+  let isPlaying = false;
+  let playQueue: (ArrayBuffer | AudioBuffer)[] = [];
+  let currentOnended: (() => void | null) | null = null;
+  let isStreamMode = false;
+  let streamFinished = false;

   const init = () => {
+    console.log("[TTSPlayer] init");
     audioContext = new (window.AudioContext || window.webkitAudioContext)();
     audioContext.suspend();
   };

-  const play = async (audioBuffer: ArrayBuffer | AudioBuffer, onended: () => void | null) => {
+  const play = async (
+    audioBuffer: ArrayBuffer | AudioBuffer,
+    onended: () => void | null,
+  ) => {
     if (audioBufferSourceNode) {
       audioBufferSourceNode.stop();
       audioBufferSourceNode.disconnect();
@@ -33,17 +52,109 @@ export function createTTSPlayer(): TTSPlayer {
     audioBufferSourceNode.onended = onended;
   };

-  const stop = () => {
+  const playNext = async () => {
+    if (playQueue.length === 0) {
+      // In stream mode, if the queue is empty but the stream has not finished yet, wait
+      if (isStreamMode && !streamFinished) {
+        setTimeout(() => playNext(), 100);
+        return;
+      }
+
+      isPlaying = false;
+      isStreamMode = false;
+      streamFinished = false;
+      if (currentOnended) {
+        currentOnended();
+        currentOnended = null;
+      }
+      return;
+    }
+
+    const nextBuffer = playQueue.shift()!;
+    let buffer: AudioBuffer;
+    if (nextBuffer instanceof AudioBuffer) {
+      buffer = nextBuffer;
+    } else {
+      buffer = await audioContext!.decodeAudioData(nextBuffer);
+    }
+
+    if (audioBufferSourceNode) {
+      audioBufferSourceNode.stop();
+      audioBufferSourceNode.disconnect();
+    }
+
+    audioBufferSourceNode = audioContext!.createBufferSource();
+    audioBufferSourceNode.buffer = buffer;
+    audioBufferSourceNode.connect(audioContext!.destination);
+    audioBufferSourceNode.onended = () => {
+      playNext();
+    };
+
+    await audioContext!.resume();
+    audioBufferSourceNode.start();
+  };
+
+  const playQueueMethod = async (
+    audioBuffers: (ArrayBuffer | AudioBuffer)[],
+    onended: () => void | null,
+  ) => {
+    playQueue = [...audioBuffers];
+    currentOnended = onended;
+    if (!isPlaying) {
+      isPlaying = true;
+      await playNext();
+    }
+  };
+
+  const addToQueue = (audioBuffer: ArrayBuffer | AudioBuffer) => {
+    if (streamFinished) {
+      return;
+    }
+    playQueue.push(audioBuffer);
+  };
+
+  const startStreamPlay = (onended: () => void | null) => {
+    isStreamMode = true;
+    streamFinished = false;
+    playQueue = [];
+    currentOnended = onended;
+
+    if (!isPlaying) {
+      isPlaying = true;
+      playNext();
+    }
+  };
+
+  const finishStreamPlay = () => {
+    streamFinished = true;
+  };
+
+  const stop = async () => {
     console.log("[TTSPlayer] stop");
+    playQueue = [];
+    isPlaying = false;
+    isStreamMode = false;
+    streamFinished = true;
+    currentOnended = null;
+
     if (audioBufferSourceNode) {
       audioBufferSourceNode.stop();
       audioBufferSourceNode.disconnect();
       audioBufferSourceNode = null;
     }
     if (audioContext) {
-      audioContext.close();
+      await audioContext.close();
       audioContext = null;
     }
   };

-  return { init, play, stop };
+  return {
+    init,
+    play,
+    playQueue: playQueueMethod,
+    addToQueue,
+    startStreamPlay,
+    finishStreamPlay,
+    stop,
+  };
 }
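Editorial note: the streaming path drives the player with three calls: startStreamPlay arms the queue loop, addToQueue appends each decoded chunk, and finishStreamPlay lets playNext drain the queue and fire the completion callback. A minimal usage sketch using createTTSPlayer as defined above (the chunks argument is an illustrative stand-in for QwenApi.streamSpeech or any other AsyncIterable of AudioBuffers):

// Sketch only: feeding the streaming TTS player from any async source of AudioBuffers.
async function playStream(chunks: AsyncIterable<AudioBuffer>): Promise<void> {
  const player = createTTSPlayer();
  player.init();
  player.startStreamPlay(() => console.log("[TTSPlayer] stream finished"));
  for await (const chunk of chunks) {
    player.addToQueue(chunk); // queued chunks are picked up by playNext
  }
  player.finishStreamPlay();
}

Note that playNext polls every 100 ms while the queue is empty in stream mode, so playback of a newly queued chunk can lag its arrival by up to roughly 100 ms.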