feat: update the speech synthesis API to support streaming playback and multiple audio formats

EvanWu committed on 2025-07-30 23:27:49 +08:00
parent 9990a89698
commit c5e6b1278f
4 changed files with 284 additions and 129 deletions

View File

@@ -107,7 +107,8 @@ export interface LLMModelProvider {
 export abstract class LLMApi {
   abstract chat(options: ChatOptions): Promise<void>;
-  abstract speech(options: SpeechOptions): Promise<ArrayBuffer>;
+  abstract speech(options: SpeechOptions): Promise<ArrayBuffer | AudioBuffer>;
+  abstract streamSpeech?(options: SpeechOptions): AsyncGenerator<AudioBuffer>;
   abstract usage(): Promise<LLMUsage>;
   abstract models(): Promise<LLMModel[]>;
 }
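Reviewer note: with the widened speech() return type and the optional streamSpeech() generator, callers are expected to feature-detect streaming and branch on the buffer type before playback. A minimal, self-contained sketch of that calling pattern, where SpeechLike, enqueue, and speak are illustrative names and not part of this commit:

// Illustrative sketch only: SpeechLike mirrors the slice of LLMApi used here,
// and enqueue() stands in for whatever player consumes the audio.
interface SpeechLike {
  speech(options: { input: string }): Promise<ArrayBuffer | AudioBuffer>;
  streamSpeech?(options: { input: string }): AsyncGenerator<AudioBuffer>;
}

declare function enqueue(buffer: AudioBuffer): void; // hypothetical player hook

async function speak(api: SpeechLike, input: string) {
  if (api.streamSpeech) {
    // Streaming path: decoded AudioBuffer chunks arrive incrementally.
    for await (const chunk of api.streamSpeech({ input })) {
      enqueue(chunk);
    }
  } else {
    // One-shot path: the result may be raw bytes or an already decoded buffer.
    const audio = await api.speech({ input });
    if (audio instanceof AudioBuffer) {
      enqueue(audio);
    } else {
      enqueue(await new AudioContext().decodeAudioData(audio));
    }
  }
}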

View File

@@ -1,5 +1,10 @@
 "use client";
-import { ApiPath, Alibaba, ALIBABA_BASE_URL, REQUEST_TIMEOUT_MS } from "@/app/constant";
+import {
+  ApiPath,
+  Alibaba,
+  ALIBABA_BASE_URL,
+  REQUEST_TIMEOUT_MS,
+} from "@/app/constant";
 import {
   useAccessStore,
   useAppConfig,
@@ -89,66 +94,71 @@ export class QwenApi implements LLMApi {
     return res?.output?.choices?.at(0)?.message?.content ?? "";
   }
 
-  async speech(options: SpeechOptions): Promise<AudioBuffer> {
+  async speech(options: SpeechOptions): Promise<ArrayBuffer> {
+    throw new Error("Method not implemented.");
+  }
+
+  async *streamSpeech(options: SpeechOptions): AsyncGenerator<AudioBuffer> {
     const requestPayload = {
       model: options.model,
       input: {
         text: options.input,
         voice: options.voice,
       },
       speed: options.speed,
       response_format: options.response_format,
     };
 
     console.log("[Request] alibaba speech payload: ", requestPayload);
 
     const controller = new AbortController();
     options.onController?.(controller);
 
     try {
       const speechPath = this.path(Alibaba.SpeechPath);
       const speechPayload = {
         method: "POST",
         body: JSON.stringify(requestPayload),
         signal: controller.signal,
         headers: {
           ...getHeaders(),
           "X-DashScope-SSE": "enable",
         },
       };
 
       // make a fetch request
       const requestTimeoutId = setTimeout(
         () => controller.abort(),
         REQUEST_TIMEOUT_MS,
       );
+
       const res = await fetch(speechPath, speechPayload);
+
       const reader = res.body!.getReader();
       const decoder = new TextDecoder();
       let buffer = "";
-      let base64 = "";
       while (true) {
         const { done, value } = await reader.read();
-        if (done) break;
-        buffer += decoder.decode(value, { stream: true, });
-        const lines = buffer.split('\n');
-        buffer = lines.pop() || '';
+        if (done) {
+          break;
+        }
+        buffer += decoder.decode(value, { stream: true });
+        const lines = buffer.split("\n");
+        buffer = lines.pop() || "";
         for (const line of lines) {
-          if (line.startsWith('data:')) {
+          if (line.startsWith("data:")) {
             const data = line.slice(5);
             const json = JSON.parse(data);
-            base64 += json.output.audio.data;
+            if (json.output.audio.data) {
+              yield this.PCMBase64ToAudioBuffer(json.output.audio.data);
+            }
           }
         }
       }
-      const audioBuffer = await this.PCMBase64ToAudioBuffer(base64);
       clearTimeout(requestTimeoutId);
       reader.releaseLock();
-      return audioBuffer;
     } catch (e) {
       console.log("[Request] failed to make a speech request", e);
       throw e;
     }
   }
 
   async chat(options: ChatOptions) {
@@ -335,67 +345,68 @@ export class QwenApi implements LLMApi {
   // Play PCM base64 data
   private async PCMBase64ToAudioBuffer(base64Data: string) {
     try {
       // Decode base64
       const binaryString = atob(base64Data);
       const bytes = new Uint8Array(binaryString.length);
       for (let i = 0; i < binaryString.length; i++) {
         bytes[i] = binaryString.charCodeAt(i);
       }
 
       // Convert to AudioBuffer
       const audioBuffer = await this.convertToAudioBuffer(bytes);
       return audioBuffer;
     } catch (error) {
-      console.error('Failed to play PCM data:', error);
+      console.error("Failed to play PCM data:", error);
       throw error;
     }
   }
 
   // Convert PCM byte data to an AudioBuffer
   private convertToAudioBuffer(pcmData: Uint8Array) {
-    const audioContext = new (window.AudioContext || window.webkitAudioContext)();
+    const audioContext = new (window.AudioContext ||
+      window.webkitAudioContext)();
     const channels = 1;
     const sampleRate = 24000;
 
     return new Promise<AudioBuffer>((resolve, reject) => {
       try {
         let float32Array;
         // Convert 16-bit PCM to 32-bit float
         float32Array = this.pcm16ToFloat32(pcmData);
 
         // Create the AudioBuffer
         const audioBuffer = audioContext.createBuffer(
           channels,
           float32Array.length / channels,
-          sampleRate
+          sampleRate,
         );
 
         // Copy the data into the AudioBuffer
         for (let channel = 0; channel < channels; channel++) {
           const channelData = audioBuffer.getChannelData(channel);
           for (let i = 0; i < channelData.length; i++) {
             channelData[i] = float32Array[i * channels + channel];
           }
         }
 
         resolve(audioBuffer);
       } catch (error) {
         reject(error);
       }
     });
   }
 
   // Convert 16-bit PCM to 32-bit float
   private pcm16ToFloat32(pcmData: Uint8Array) {
     const length = pcmData.length / 2;
     const float32Array = new Float32Array(length);
 
     for (let i = 0; i < length; i++) {
       const int16 = (pcmData[i * 2 + 1] << 8) | pcmData[i * 2];
       const int16Signed = int16 > 32767 ? int16 - 65536 : int16;
       float32Array[i] = int16Signed / 32768;
     }
 
     return float32Array;
   }
 }
 
 export { Alibaba };

View File

@@ -101,8 +101,6 @@ import {
 import { useNavigate } from "react-router-dom";
 import {
   CHAT_PAGE_SIZE,
-  DEFAULT_TTS_ENGINE,
-  ModelProvider,
   Path,
   REQUEST_TIMEOUT_MS,
   ServiceProvider,
@@ -1286,6 +1284,7 @@ function _Chat() {
   const accessStore = useAccessStore();
   const [speechStatus, setSpeechStatus] = useState(false);
   const [speechLoading, setSpeechLoading] = useState(false);
+  const [speechCooldown, setSpeechCooldown] = useState(false);
 
   async function openaiSpeech(text: string) {
     if (speechStatus) {
@@ -1297,10 +1296,10 @@
       api = new ClientApi(config.ttsConfig.modelProvider);
       setSpeechLoading(true);
       ttsPlayer.init();
-      let audioBuffer: ArrayBuffer;
+      let audioBuffer: ArrayBuffer | AudioBuffer;
       const { markdownToTxt } = require("markdown-to-txt");
       const textContent = markdownToTxt(text);
-      console.log("[OpenAI Speech] textContent: ", config, textContent);
+      console.log("[OpenAI Speech] textContent: ", textContent);
       if (config.ttsConfig.engine === "Edge") {
         const edgeVoiceName = accessStore.edgeVoiceName();
         const tts = new MsEdgeTTS();
@@ -1309,28 +1308,61 @@
           OUTPUT_FORMAT.AUDIO_24KHZ_96KBITRATE_MONO_MP3,
         );
         audioBuffer = await tts.toArrayBuffer(textContent);
+        playSpeech(audioBuffer);
       } else {
-        audioBuffer = await api.llm.speech({
-          model: config.ttsConfig.model,
-          input: textContent,
-          voice: config.ttsConfig.voice,
-          speed: config.ttsConfig.speed,
-        });
+        if (api.llm.streamSpeech) {
+          // Use streaming playback: play audio as it is received
+          setSpeechStatus(true);
+          ttsPlayer.startStreamPlay(() => {
+            setSpeechStatus(false);
+          });
+          try {
+            for await (const chunk of api.llm.streamSpeech({
+              model: config.ttsConfig.model,
+              input: textContent,
+              voice: config.ttsConfig.voice,
+              speed: config.ttsConfig.speed,
+            })) {
+              console.log("[Stream Speech] add to queue", chunk);
+              ttsPlayer.addToQueue(chunk);
+            }
+            ttsPlayer.finishStreamPlay();
+          } catch (e) {
+            console.error("[Stream Speech]", e);
+            showToast(prettyObject(e));
+            setSpeechStatus(false);
+            ttsPlayer.stop();
+          } finally {
+            setSpeechLoading(false);
+          }
+        } else {
+          audioBuffer = await api.llm.speech({
+            model: config.ttsConfig.model,
+            input: textContent,
+            voice: config.ttsConfig.voice,
+            speed: config.ttsConfig.speed,
+          });
+          playSpeech(audioBuffer);
+        }
       }
-      setSpeechStatus(true);
-      ttsPlayer
-        .play(audioBuffer, () => {
-          setSpeechStatus(false);
-        })
-        .catch((e) => {
-          console.error("[OpenAI Speech]", e);
-          showToast(prettyObject(e));
-          setSpeechStatus(false);
-        })
-        .finally(() => setSpeechLoading(false));
     }
   }
 
+  function playSpeech(audioBuffer: ArrayBuffer | AudioBuffer) {
+    setSpeechStatus(true);
+    ttsPlayer
+      .play(audioBuffer, () => {
+        setSpeechStatus(false);
+      })
+      .catch((e) => {
+        console.error("[OpenAI Speech]", e);
+        showToast(prettyObject(e));
+        setSpeechStatus(false);
+      })
+      .finally(() => setSpeechLoading(false));
+  }
+
   const context: RenderMessage[] = useMemo(() => {
     return session.mask.hideContext ? [] : session.mask.context.slice();
   }, [session.mask.context, session.mask.hideContext]);

View File

@@ -1,19 +1,38 @@
 type TTSPlayer = {
   init: () => void;
-  play: (audioBuffer: ArrayBuffer, onended: () => void | null) => Promise<void>;
+  play: (
+    audioBuffer: ArrayBuffer | AudioBuffer,
+    onended: () => void | null,
+  ) => Promise<void>;
+  playQueue: (
+    audioBuffers: (ArrayBuffer | AudioBuffer)[],
+    onended: () => void | null,
+  ) => Promise<void>;
+  addToQueue: (audioBuffer: ArrayBuffer | AudioBuffer) => void;
+  startStreamPlay: (onended: () => void | null) => void;
+  finishStreamPlay: () => void;
   stop: () => void;
 };
 
 export function createTTSPlayer(): TTSPlayer {
   let audioContext: AudioContext | null = null;
   let audioBufferSourceNode: AudioBufferSourceNode | null = null;
+  let isPlaying = false;
+  let playQueue: (ArrayBuffer | AudioBuffer)[] = [];
+  let currentOnended: (() => void | null) | null = null;
+  let isStreamMode = false;
+  let streamFinished = false;
 
   const init = () => {
+    console.log("[TTSPlayer] init");
     audioContext = new (window.AudioContext || window.webkitAudioContext)();
     audioContext.suspend();
   };
 
-  const play = async (audioBuffer: ArrayBuffer | AudioBuffer, onended: () => void | null) => {
+  const play = async (
+    audioBuffer: ArrayBuffer | AudioBuffer,
+    onended: () => void | null,
+  ) => {
     if (audioBufferSourceNode) {
       audioBufferSourceNode.stop();
       audioBufferSourceNode.disconnect();
@@ -33,17 +52,109 @@ export function createTTSPlayer(): TTSPlayer {
     audioBufferSourceNode.onended = onended;
   };
 
-  const stop = () => {
+  const playNext = async () => {
+    if (playQueue.length === 0) {
+      // In stream mode, if the queue is empty but the stream has not finished yet, wait
+      if (isStreamMode && !streamFinished) {
+        setTimeout(() => playNext(), 100);
+        return;
+      }
+      isPlaying = false;
+      isStreamMode = false;
+      streamFinished = false;
+      if (currentOnended) {
+        currentOnended();
+        currentOnended = null;
+      }
+      return;
+    }
+
+    const nextBuffer = playQueue.shift()!;
+    let buffer: AudioBuffer;
+    if (nextBuffer instanceof AudioBuffer) {
+      buffer = nextBuffer;
+    } else {
+      buffer = await audioContext!.decodeAudioData(nextBuffer);
+    }
+
+    if (audioBufferSourceNode) {
+      audioBufferSourceNode.stop();
+      audioBufferSourceNode.disconnect();
+    }
+
+    audioBufferSourceNode = audioContext!.createBufferSource();
+    audioBufferSourceNode.buffer = buffer;
+    audioBufferSourceNode.connect(audioContext!.destination);
+    audioBufferSourceNode.onended = () => {
+      playNext();
+    };
+
+    await audioContext!.resume();
+    audioBufferSourceNode.start();
+  };
+
+  const playQueueMethod = async (
+    audioBuffers: (ArrayBuffer | AudioBuffer)[],
+    onended: () => void | null,
+  ) => {
+    playQueue = [...audioBuffers];
+    currentOnended = onended;
+    if (!isPlaying) {
+      isPlaying = true;
+      await playNext();
+    }
+  };
+
+  const addToQueue = (audioBuffer: ArrayBuffer | AudioBuffer) => {
+    if (streamFinished) {
+      return;
+    }
+    playQueue.push(audioBuffer);
+  };
+
+  const startStreamPlay = (onended: () => void | null) => {
+    isStreamMode = true;
+    streamFinished = false;
+    playQueue = [];
+    currentOnended = onended;
+    if (!isPlaying) {
+      isPlaying = true;
+      playNext();
+    }
+  };
+
+  const finishStreamPlay = () => {
+    streamFinished = true;
+  };
+
+  const stop = async () => {
+    console.log("[TTSPlayer] stop");
+    playQueue = [];
+    isPlaying = false;
+    isStreamMode = false;
+    streamFinished = true;
+    currentOnended = null;
     if (audioBufferSourceNode) {
       audioBufferSourceNode.stop();
       audioBufferSourceNode.disconnect();
       audioBufferSourceNode = null;
     }
     if (audioContext) {
-      audioContext.close();
+      await audioContext.close();
       audioContext = null;
     }
   };
 
-  return { init, play, stop };
+  return {
+    init,
+    play,
+    playQueue: playQueueMethod,
+    addToQueue,
+    startStreamPlay,
+    finishStreamPlay,
+    stop,
+  };
 }
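Reviewer note: the streaming side of the player is driven in three steps: startStreamPlay begins the drain loop, addToQueue appends decoded chunks as they arrive, and finishStreamPlay lets the loop exit once the queue empties. A hedged usage sketch, assuming an async iterable of AudioBuffer chunks such as the streamSpeech generator added in this commit; playStream is an illustrative name, not part of the change:

// Usage sketch for the streaming API of createTTSPlayer above.
// `chunks` is any async iterable of AudioBuffer, e.g. api.llm.streamSpeech(...).
async function playStream(chunks: AsyncIterable<AudioBuffer>) {
  const player = createTTSPlayer();
  player.init(); // creates and suspends the AudioContext

  player.startStreamPlay(() => {
    console.log("stream playback finished");
  });

  try {
    for await (const chunk of chunks) {
      player.addToQueue(chunk); // playNext() picks these up as they arrive
    }
  } finally {
    // Tell the drain loop it may stop once the queue is empty.
    player.finishStreamPlay();
  }
}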