feat: 更新语音合成接口，支持流式播放和多种音频格式

2026-04-24 03:54:25 +08:00 · 2025-07-30 23:27:49 +08:00
parent 9990a89698
commit c5e6b1278f
4 changed files with 284 additions and 129 deletions
--- a/app/client/platforms/alibaba.ts
+++ b/app/client/platforms/alibaba.ts
@@ -1,5 +1,10 @@
 "use client";
-import { ApiPath, Alibaba, ALIBABA_BASE_URL, REQUEST_TIMEOUT_MS } from "@/app/constant";
+import {
+  ApiPath,
+  Alibaba,
+  ALIBABA_BASE_URL,
+  REQUEST_TIMEOUT_MS,
+} from "@/app/constant";
 import {
  useAccessStore,
  useAppConfig,
@@ -89,66 +94,71 @@ export class QwenApi implements LLMApi {
    return res?.output?.choices?.at(0)?.message?.content ?? "";
  }

-  async speech(options: SpeechOptions): Promise<AudioBuffer> {
+  async speech(options: SpeechOptions): Promise<ArrayBuffer> {
+    throw new Error("Method not implemented.");
+  }
+
+  async *streamSpeech(options: SpeechOptions): AsyncGenerator<AudioBuffer> {
    const requestPayload = {
-        model: options.model,
-        input: {
-            text: options.input,
-            voice: options.voice,
-        },
-        speed: options.speed,
-        response_format: options.response_format,
+      model: options.model,
+      input: {
+        text: options.input,
+        voice: options.voice,
+      },
+      speed: options.speed,
+      response_format: options.response_format,
    };
    console.log("[Request] alibaba speech payload: ", requestPayload);
    const controller = new AbortController();
    options.onController?.(controller);
    try {
-        const speechPath = this.path(Alibaba.SpeechPath);
-        const speechPayload = {
-          method: "POST",
-          body: JSON.stringify(requestPayload),
-          signal: controller.signal,
-          headers: {
-            ...getHeaders(),
-            "X-DashScope-SSE": "enable",
-          },
-        };
-  
-        // make a fetch request
-        const requestTimeoutId = setTimeout(
-          () => controller.abort(),
-          REQUEST_TIMEOUT_MS,
-        );
-  
-        const res = await fetch(speechPath, speechPayload);
+      const speechPath = this.path(Alibaba.SpeechPath);
+      const speechPayload = {
+        method: "POST",
+        body: JSON.stringify(requestPayload),
+        signal: controller.signal,
+        headers: {
+          ...getHeaders(),
+          "X-DashScope-SSE": "enable",
+        },
+      };

-        const reader = res.body!.getReader();
-        const decoder = new TextDecoder();
-        let buffer = "";
-        let base64 = "";
-        while (true) {
-            const { done, value } = await reader.read();
-            if (done) break;
-            buffer += decoder.decode(value, { stream: true, });
-                const lines = buffer.split('\n');
-                buffer = lines.pop() || '';
+      // make a fetch request
+      const requestTimeoutId = setTimeout(
+        () => controller.abort(),
+        REQUEST_TIMEOUT_MS,
+      );

-                for (const line of lines) {
-                    if (line.startsWith('data:')) {
-                        const data = line.slice(5);
-                        const json = JSON.parse(data);
-                        base64 += json.output.audio.data;
-                    }
-                }
+      const res = await fetch(speechPath, speechPayload);
+
+      const reader = res.body!.getReader();
+      const decoder = new TextDecoder();
+      let buffer = "";
+      while (true) {
+        const { done, value } = await reader.read();
+        if (done) {
+          break;
+        }
+        buffer += decoder.decode(value, { stream: true });
+        const lines = buffer.split("\n");
+        buffer = lines.pop() || "";
+
+        for (const line of lines) {
+          if (line.startsWith("data:")) {
+            const data = line.slice(5);
+            const json = JSON.parse(data);
+            if (json.output.audio.data) {
+              yield this.PCMBase64ToAudioBuffer(json.output.audio.data);
+            }
+          }
        }
-        const audioBuffer = await this.PCMBase64ToAudioBuffer(base64);
-        clearTimeout(requestTimeoutId);
-        reader.releaseLock();
-        return audioBuffer;
-      } catch (e) {
-        console.log("[Request] failed to make a speech request", e);
-        throw e;
      }
+      clearTimeout(requestTimeoutId);
+      reader.releaseLock();
+    } catch (e) {
+      console.log("[Request] failed to make a speech request", e);
+      throw e;
+    }
  }

  async chat(options: ChatOptions) {
@@ -335,67 +345,68 @@ export class QwenApi implements LLMApi {
  // 播放 PCM base64 数据
  private async PCMBase64ToAudioBuffer(base64Data: string) {
    try {
-        // 解码 base64
-        const binaryString = atob(base64Data);
-        const bytes = new Uint8Array(binaryString.length);
-        for (let i = 0; i < binaryString.length; i++) {
-            bytes[i] = binaryString.charCodeAt(i);
-        }
+      // 解码 base64
+      const binaryString = atob(base64Data);
+      const bytes = new Uint8Array(binaryString.length);
+      for (let i = 0; i < binaryString.length; i++) {
+        bytes[i] = binaryString.charCodeAt(i);
+      }

-        // 转换为 AudioBuffer
-        const audioBuffer = await this.convertToAudioBuffer(bytes);
-        
-        return audioBuffer;
+      // 转换为 AudioBuffer
+      const audioBuffer = await this.convertToAudioBuffer(bytes);
+
+      return audioBuffer;
    } catch (error) {
-        console.error('播放 PCM 数据失败:', error);
-        throw error;
+      console.error("播放 PCM 数据失败:", error);
+      throw error;
    }
  }
-  
-   // 将 PCM 字节数据转换为 AudioBuffer
-   private convertToAudioBuffer(pcmData: Uint8Array) {
-    const audioContext = new (window.AudioContext || window.webkitAudioContext)();
+
+  // 将 PCM 字节数据转换为 AudioBuffer
+  private convertToAudioBuffer(pcmData: Uint8Array) {
+    const audioContext = new (window.AudioContext ||
+      window.webkitAudioContext)();
    const channels = 1;
    const sampleRate = 24000;
    return new Promise<AudioBuffer>((resolve, reject) => {
-        try {
-            let float32Array;
-            // 16位 PCM 转换为 32位浮点数
-            float32Array = this.pcm16ToFloat32(pcmData);
+      try {
+        let float32Array;
+        // 16位 PCM 转换为 32位浮点数
+        float32Array = this.pcm16ToFloat32(pcmData);

-            // 创建 AudioBuffer
-            const audioBuffer = audioContext.createBuffer(
-                channels,
-                float32Array.length / channels,
-                sampleRate
-            );
+        // 创建 AudioBuffer
+        const audioBuffer = audioContext.createBuffer(
+          channels,
+          float32Array.length / channels,
+          sampleRate,
+        );

-            // 复制数据到 AudioBuffer
-            for (let channel = 0; channel < channels; channel++) {
-                const channelData = audioBuffer.getChannelData(channel);
-                for (let i = 0; i < channelData.length; i++) {
-                    channelData[i] = float32Array[i * channels + channel];
-                }
-            }
-
-            resolve(audioBuffer);
-        } catch (error) {
-            reject(error);
+        // 复制数据到 AudioBuffer
+        for (let channel = 0; channel < channels; channel++) {
+          const channelData = audioBuffer.getChannelData(channel);
+          for (let i = 0; i < channelData.length; i++) {
+            channelData[i] = float32Array[i * channels + channel];
+          }
        }
+
+        resolve(audioBuffer);
+      } catch (error) {
+        reject(error);
+      }
    });
  }
-    // 16位 PCM 转 32位浮点数
-    private pcm16ToFloat32(pcmData: Uint8Array) {
-        const length = pcmData.length / 2;
-        const float32Array = new Float32Array(length);
-        
-        for (let i = 0; i < length; i++) {
-            const int16 = (pcmData[i * 2 + 1] << 8) | pcmData[i * 2];
-            const int16Signed = int16 > 32767 ? int16 - 65536 : int16;
-            float32Array[i] = int16Signed / 32768;
-        }
-        
-        return float32Array;
+  // 16位 PCM 转 32位浮点数
+  private pcm16ToFloat32(pcmData: Uint8Array) {
+    const length = pcmData.length / 2;
+    const float32Array = new Float32Array(length);
+
+    for (let i = 0; i < length; i++) {
+      const int16 = (pcmData[i * 2 + 1] << 8) | pcmData[i * 2];
+      const int16Signed = int16 > 32767 ? int16 - 65536 : int16;
+      float32Array[i] = int16Signed / 32768;
    }
+
+    return float32Array;
+  }
 }
-export { Alibaba };
+export { Alibaba };