Mirror of https://github.com/ChatGPTNextWeb/ChatGPT-Next-Web.git (synced 2025-09-21 18:56:37 +08:00)
feat: add TTS engine configuration, update the Alibaba speech API, and support realtime speech synthesis
This commit is contained in:
parent 557a2cce35
commit 9990a89698
.yarnrc.yml (new file, +1)

@@ -0,0 +1 @@
+nodeLinker: node-modules
app/client/platforms/alibaba.ts

@@ -1,5 +1,5 @@
 "use client";
-import { ApiPath, Alibaba, ALIBABA_BASE_URL } from "@/app/constant";
+import { ApiPath, Alibaba, ALIBABA_BASE_URL, REQUEST_TIMEOUT_MS } from "@/app/constant";
 import {
   useAccessStore,
   useAppConfig,
@@ -89,8 +89,66 @@ export class QwenApi implements LLMApi {
     return res?.output?.choices?.at(0)?.message?.content ?? "";
   }

-  speech(options: SpeechOptions): Promise<ArrayBuffer> {
-    throw new Error("Method not implemented.");
+  async speech(options: SpeechOptions): Promise<AudioBuffer> {
+    const requestPayload = {
+      model: options.model,
+      input: {
+        text: options.input,
+        voice: options.voice,
+      },
+      speed: options.speed,
+      response_format: options.response_format,
+    };
+    console.log("[Request] alibaba speech payload: ", requestPayload);
+
+    const controller = new AbortController();
+    options.onController?.(controller);
+
+    try {
+      const speechPath = this.path(Alibaba.SpeechPath);
+      const speechPayload = {
+        method: "POST",
+        body: JSON.stringify(requestPayload),
+        signal: controller.signal,
+        headers: {
+          ...getHeaders(),
+          "X-DashScope-SSE": "enable",
+        },
+      };
+
+      // make a fetch request
+      const requestTimeoutId = setTimeout(
+        () => controller.abort(),
+        REQUEST_TIMEOUT_MS,
+      );
+
+      const res = await fetch(speechPath, speechPayload);
+
+      const reader = res.body!.getReader();
+      const decoder = new TextDecoder();
+      let buffer = "";
+      let base64 = "";
+      while (true) {
+        const { done, value } = await reader.read();
+        if (done) break;
+        buffer += decoder.decode(value, { stream: true });
+        const lines = buffer.split("\n");
+        buffer = lines.pop() || "";
+
+        for (const line of lines) {
+          if (line.startsWith("data:")) {
+            const data = line.slice(5);
+            const json = JSON.parse(data);
+            base64 += json.output.audio.data;
+          }
+        }
+      }
+      const audioBuffer = await this.PCMBase64ToAudioBuffer(base64);
+      clearTimeout(requestTimeoutId);
+      reader.releaseLock();
+      return audioBuffer;
+    } catch (e) {
+      console.log("[Request] failed to make a speech request", e);
+      throw e;
+    }
   }

   async chat(options: ChatOptions) {
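Reviewer note: a hypothetical call site for the new method, to show how the pieces fit. `QwenApi` and `SpeechOptions` come from this file; the import path, option values, and `response_format` value are assumptions for illustration, not part of the commit.

```ts
// Hedged sketch: exercising the new Alibaba speech() path (values illustrative).
import { QwenApi } from "@/app/client/platforms/alibaba";

async function demoAlibabaTTS(): Promise<AudioBuffer> {
  const api = new QwenApi();
  return api.speech({
    model: "qwen-tts",      // one of ALIBABA_TTS.Model
    input: "你好,世界",     // text to synthesize
    voice: "Chelsie",       // one of ALIBABA_TTS.Voices
    speed: 1.0,
    response_format: "pcm", // assumption: DashScope streams base64 PCM chunks over SSE
  });
}
// The returned AudioBuffer is already decoded, so it can go straight to
// ttsPlayer.play() (see the audio player hunk below) without decodeAudioData.
```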
@@ -273,5 +331,71 @@ export class QwenApi implements LLMApi {
   async models(): Promise<LLMModel[]> {
     return [];
   }
+
+  // Decode base64-encoded PCM data into an AudioBuffer
+  private async PCMBase64ToAudioBuffer(base64Data: string) {
+    try {
+      // decode the base64 payload into raw bytes
+      const binaryString = atob(base64Data);
+      const bytes = new Uint8Array(binaryString.length);
+      for (let i = 0; i < binaryString.length; i++) {
+        bytes[i] = binaryString.charCodeAt(i);
+      }
+
+      // convert the bytes to an AudioBuffer
+      const audioBuffer = await this.convertToAudioBuffer(bytes);
+
+      return audioBuffer;
+    } catch (error) {
+      console.error("Failed to decode PCM data:", error);
+      throw error;
+    }
+  }
+
+  // Convert raw PCM bytes to an AudioBuffer
+  private convertToAudioBuffer(pcmData: Uint8Array) {
+    const audioContext = new (window.AudioContext ||
+      window.webkitAudioContext)();
+    const channels = 1;
+    const sampleRate = 24000;
+    return new Promise<AudioBuffer>((resolve, reject) => {
+      try {
+        // convert 16-bit PCM samples to 32-bit floats
+        const float32Array = this.pcm16ToFloat32(pcmData);
+
+        // create the AudioBuffer
+        const audioBuffer = audioContext.createBuffer(
+          channels,
+          float32Array.length / channels,
+          sampleRate,
+        );
+
+        // copy the samples into the AudioBuffer
+        for (let channel = 0; channel < channels; channel++) {
+          const channelData = audioBuffer.getChannelData(channel);
+          for (let i = 0; i < channelData.length; i++) {
+            channelData[i] = float32Array[i * channels + channel];
+          }
+        }
+
+        resolve(audioBuffer);
+      } catch (error) {
+        reject(error);
+      }
+    });
+  }
+
+  // 16-bit PCM (little-endian) to 32-bit float
+  private pcm16ToFloat32(pcmData: Uint8Array) {
+    const length = pcmData.length / 2;
+    const float32Array = new Float32Array(length);
+
+    for (let i = 0; i < length; i++) {
+      const int16 = (pcmData[i * 2 + 1] << 8) | pcmData[i * 2];
+      const int16Signed = int16 > 32767 ? int16 - 65536 : int16;
+      float32Array[i] = int16Signed / 32768;
+    }
+
+    return float32Array;
+  }
 }
 export { Alibaba };
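Reviewer note: `pcm16ToFloat32` assumes little-endian samples — the low byte comes first, the high byte is shifted up, and values above 0x7fff wrap to negative via two's complement. A standalone sketch with concrete bytes (the function name here is illustrative):

```ts
// Standalone sketch of the same 16-bit little-endian PCM decode.
function pcm16leToFloat32(bytes: Uint8Array): Float32Array {
  const out = new Float32Array(bytes.length / 2);
  for (let i = 0; i < out.length; i++) {
    const u16 = (bytes[i * 2 + 1] << 8) | bytes[i * 2]; // unsigned 0..65535
    const s16 = u16 > 32767 ? u16 - 65536 : u16;        // reinterpret as signed
    out[i] = s16 / 32768;                               // scale to [-1, 1)
  }
  return out;
}

// [0x00, 0x80] -> 0x8000 -> -32768 -> -1.0
// [0xff, 0x7f] -> 0x7fff ->  32767 ->  0.99997
console.log(pcm16leToFloat32(new Uint8Array([0x00, 0x80, 0xff, 0x7f])));
```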
app/components/chat.tsx

@@ -1293,14 +1293,15 @@ function _Chat() {
       setSpeechStatus(false);
     } else {
       var api: ClientApi;
-      api = new ClientApi(ModelProvider.GPT);
       const config = useAppConfig.getState();
+      api = new ClientApi(config.ttsConfig.modelProvider);
       setSpeechLoading(true);
       ttsPlayer.init();
       let audioBuffer: ArrayBuffer;
       const { markdownToTxt } = require("markdown-to-txt");
       const textContent = markdownToTxt(text);
-      if (config.ttsConfig.engine !== DEFAULT_TTS_ENGINE) {
+      console.log("[OpenAI Speech] textContent: ", config, textContent);
+      if (config.ttsConfig.engine === "Edge") {
         const edgeVoiceName = accessStore.edgeVoiceName();
         const tts = new MsEdgeTTS();
         await tts.setMetadata(
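Reviewer note: the behavioral change in this hunk is that TTS no longer always instantiates the GPT client; the engine's `modelProvider` (kept in sync by the tts-config hunk below) now picks the client. A condensed sketch, with the surrounding component code elided:

```ts
// Condensed sketch of the new routing (ClientApi/useAppConfig as in this file).
const config = useAppConfig.getState();
// ModelProvider.GPT for the OpenAI and Edge engines, ModelProvider.Qwen for Alibaba
api = new ClientApi(config.ttsConfig.modelProvider);
```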
app/components/realtime-chat/realtime-config.tsx

@@ -5,13 +5,13 @@ import { ListItem, Select, PasswordInput } from "@/app/components/ui-lib";

 import { InputRange } from "@/app/components/input-range";
 import { Voice } from "rt-client";
-import { ServiceProvider } from "@/app/constant";
+import { REALTIME_TTS_MODELS, ServiceProvider } from "@/app/constant";

-const providers = [ServiceProvider.OpenAI, ServiceProvider.Azure];
+const providers = Object.keys(REALTIME_TTS_MODELS) as ServiceProvider[];

-const models = ["gpt-4o-realtime-preview-2024-10-01"];
+const models = ["gpt-4o-realtime-preview-2024-10-01", "qwen-tts-realtime"];

-const voice = ["alloy", "shimmer", "echo"];
+const voice = ["alloy", "shimmer", "echo", "Chelsie"];

 export function RealtimeConfigList(props: {
   realtimeConfig: RealtimeConfig;
app/components/tts-config.tsx

@@ -3,10 +3,9 @@ import { TTSConfig, TTSConfigValidator } from "../store";

 import Locale from "../locales";
 import { ListItem, Select } from "./ui-lib";
 import {
-  DEFAULT_TTS_ENGINE,
-  DEFAULT_TTS_ENGINES,
-  DEFAULT_TTS_MODELS,
-  DEFAULT_TTS_VOICES,
+  ServiceProvider,
+  TTS_CONFIGS,
+  TTSEngineType,
 } from "../constant";
 import { InputRange } from "./input-range";

@@ -48,22 +47,33 @@ export function TTSConfigList(props: {
       <Select
         value={props.ttsConfig.engine}
         onChange={(e) => {
+          const newEngine = e.currentTarget.value as TTSEngineType;
           props.updateConfig(
-            (config) =>
-              (config.engine = TTSConfigValidator.engine(
-                e.currentTarget.value,
-              )),
+            (config) => {
+              config.engine = TTSConfigValidator.engine(newEngine);
+              const engineConfig = TTS_CONFIGS[newEngine];
+              config.model = TTSConfigValidator.model(
+                engineConfig.Model[0] || ""
+              );
+              config.voice = TTSConfigValidator.voice(
+                engineConfig.Voices[0] || ""
+              );
+              config.modelProvider = TTSConfigValidator.modelProvider(
+                engineConfig.ModelProvider
+              );
+            }
           );
         }}
       >
-        {DEFAULT_TTS_ENGINES.map((v, i) => (
+        {Object.keys(TTS_CONFIGS).map((v, i) => (
           <option value={v} key={i}>
-            {v}
+            {v}-TTS
           </option>
         ))}
       </Select>
     </ListItem>
-    {props.ttsConfig.engine === DEFAULT_TTS_ENGINE && (
+    {(props.ttsConfig.engine === ServiceProvider.OpenAI ||
+      props.ttsConfig.engine === ServiceProvider.Alibaba) && (
       <>
         <ListItem title={Locale.Settings.TTS.Model}>
           <Select

@@ -77,7 +87,7 @@ export function TTSConfigList(props: {
             );
           }}
         >
-          {DEFAULT_TTS_MODELS.map((v, i) => (
+          {TTS_CONFIGS[props.ttsConfig.engine]!.Model.map((v, i) => (
             <option value={v} key={i}>
               {v}
             </option>

@@ -99,7 +109,7 @@ export function TTSConfigList(props: {
             );
           }}
         >
-          {DEFAULT_TTS_VOICES.map((v, i) => (
+          {TTS_CONFIGS[props.ttsConfig.engine]!.Voices.map((v, i) => (
             <option value={v} key={i}>
               {v}
             </option>
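Reviewer note: switching engines now resets the model, voice, and model provider to the new engine's defaults in a single `updateConfig` call, so a stale OpenAI model can't be sent to the Alibaba endpoint. The reset logic, condensed into a standalone sketch (the `TTSConfig` shape is assumed from `app/store/config.ts`):

```ts
// Condensed sketch of the onChange reset (TTS_CONFIGS / TTSEngineType from app/constant.ts).
function resetForEngine(config: TTSConfig, newEngine: TTSEngineType) {
  const engineConfig = TTS_CONFIGS[newEngine];
  config.engine = TTSConfigValidator.engine(newEngine);
  config.model = TTSConfigValidator.model(engineConfig.Model[0] || "");  // "" for Edge
  config.voice = TTSConfigValidator.voice(engineConfig.Voices[0] || ""); // "" for Edge
  config.modelProvider = TTSConfigValidator.modelProvider(engineConfig.ModelProvider);
}
```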
app/constant.ts

@@ -22,6 +22,7 @@ export const BAIDU_OATUH_URL = `${BAIDU_BASE_URL}/oauth/2.0/token`;

 export const BYTEDANCE_BASE_URL = "https://ark.cn-beijing.volces.com";

 export const ALIBABA_BASE_URL = "https://dashscope.aliyuncs.com/api/";
+export const ALIBABA_RUNTIEM_TTS_URL = "wss://dashscope.aliyuncs.com/api-ws/v1/realtime";

 export const TENCENT_BASE_URL = "https://hunyuan.tencentcloudapi.com";

@@ -232,6 +233,7 @@ export const Alibaba = {
     }
     return `v1/services/aigc/text-generation/generation`;
   },
+  SpeechPath: "v1/services/aigc/multimodal-generation/generation",
 };

 export const Tencent = {

@@ -461,19 +463,53 @@ export const KnowledgeCutOffDate: Record<string, string> = {
   "deepseek-coder": "2024-07",
 };

-export const DEFAULT_TTS_ENGINE = "OpenAI-TTS";
-export const DEFAULT_TTS_ENGINES = ["OpenAI-TTS", "Edge-TTS"];
+export const DEFAULT_TTS_ENGINE = ServiceProvider.OpenAI;
 export const DEFAULT_TTS_MODEL = "tts-1";
 export const DEFAULT_TTS_VOICE = "alloy";
-export const DEFAULT_TTS_MODELS = ["tts-1", "tts-1-hd"];
-export const DEFAULT_TTS_VOICES = [
-  "alloy",
-  "echo",
-  "fable",
-  "onyx",
-  "nova",
-  "shimmer",
-];
+
+export const OPENAI_TTS = {
+  Provider: ServiceProvider.OpenAI,
+  ModelProvider: ModelProvider.GPT,
+  Model: ["tts-1", "tts-1-hd"],
+  Voices: ["alloy", "echo", "fable", "onyx", "nova", "shimmer"],
+} as const;
+
+export const ALIBABA_TTS = {
+  Provider: ServiceProvider.Alibaba,
+  ModelProvider: ModelProvider.Qwen,
+  Model: ["qwen-tts", "qwen-tts-latest"],
+  Voices: ["Chelsie", "Cherry", "Ethan", "Serena", "Dylan", "Jada", "Sunny"],
+} as const;
+
+export const EDGE_TTS = {
+  Provider: "Edge" as const,
+  ModelProvider: ModelProvider.GPT,
+  Model: [] as string[],
+  Voices: [] as string[],
+} as const;
+
+// The supported TTS engine types
+export type TTSEngineType = ServiceProvider.OpenAI | ServiceProvider.Alibaba | "Edge";
+
+// All available engines, models, and voices, derived from the per-provider TTS configs
+export const DEFAULT_TTS_ENGINES = [ServiceProvider.OpenAI, ServiceProvider.Alibaba, "Edge"] as const;
+export const DEFAULT_TTS_MODELS = [...OPENAI_TTS.Model, ...ALIBABA_TTS.Model] as const;
+export const DEFAULT_TTS_VOICES = [...OPENAI_TTS.Voices, ...ALIBABA_TTS.Voices] as const;
+
+// The TTS config interface
+interface TTSConfigItem {
+  Provider: ServiceProvider | "Edge";
+  Model: readonly string[];
+  Voices: readonly string[];
+  ModelProvider: ModelProvider;
+}
+
+// Use a complete Record rather than Partial to keep lookups type-safe
+export const TTS_CONFIGS: Record<TTSEngineType, TTSConfigItem> = {
+  [ServiceProvider.OpenAI]: OPENAI_TTS,
+  [ServiceProvider.Alibaba]: ALIBABA_TTS,
+  Edge: EDGE_TTS,
+} as const;

 export const VISION_MODEL_REGEXES = [
   /vision/,

@@ -497,6 +533,16 @@ export const VISION_MODEL_REGEXES = [

 export const EXCLUDE_VISION_MODEL_REGEXES = [/claude-3-5-haiku-20241022/];

+export const RUNTIME_TTS_OPENAI = {
+  ExampleEndpoint: XAI_BASE_URL,
+}
+
+export const REALTIME_TTS_MODELS = {
+  [ServiceProvider.OpenAI]: ["gpt-4o-realtime-preview-2024-10-01"],
+  [ServiceProvider.Azure]: ["gpt-4o-realtime-preview-2024-10-01"],
+  [ServiceProvider.Alibaba]: ["qwen-omni-turbo-realtime"],
+};
+
 const openaiModels = [
   // As of July 2024, gpt-4o-mini should be used in place of gpt-3.5-turbo,
   // as it is cheaper, more capable, multimodal, and just as fast. gpt-3.5-turbo is still available for use in the API.

@@ -920,3 +966,4 @@ export const DEFAULT_GA_ID = "G-89WN60ZK2E";

 export const SAAS_CHAT_URL = "https://nextchat.club";
 export const SAAS_CHAT_UTM_URL = "https://nextchat.club?utm=github";
+
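Reviewer note: because `TTS_CONFIGS` is a complete `Record<TTSEngineType, TTSConfigItem>` rather than a `Partial`, every lookup by engine is statically known to succeed, which is what lets the tts-config component index it with a plain `!`. A quick sketch:

```ts
// Sketch: per-engine lookups on TTS_CONFIGS (constants from this hunk).
const alibaba = TTS_CONFIGS[ServiceProvider.Alibaba];
console.log(alibaba.Model);     // ["qwen-tts", "qwen-tts-latest"]
console.log(alibaba.Voices[0]); // "Chelsie"
console.log(TTS_CONFIGS.Edge.ModelProvider); // ModelProvider.GPT (Edge reuses the GPT client)
```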
app/store/config.ts

@@ -13,6 +13,8 @@ import {
   DEFAULT_TTS_VOICES,
   StoreKey,
   ServiceProvider,
+  TTSEngineType,
+  ModelProvider,
 } from "../constant";
 import { createPersistStore } from "../utils/store";
 import type { Voice } from "rt-client";

@@ -20,7 +22,6 @@ import type { Voice } from "rt-client";
 export type ModelType = (typeof DEFAULT_MODELS)[number]["name"];
 export type TTSModelType = (typeof DEFAULT_TTS_MODELS)[number];
 export type TTSVoiceType = (typeof DEFAULT_TTS_VOICES)[number];
-export type TTSEngineType = (typeof DEFAULT_TTS_ENGINES)[number];

 export enum SubmitKey {
   Enter = "Enter",

@@ -86,7 +87,8 @@ export const DEFAULT_CONFIG = {
   ttsConfig: {
     enable: false,
     autoplay: false,
-    engine: DEFAULT_TTS_ENGINE,
+    modelProvider: ModelProvider.GPT,
+    engine: DEFAULT_TTS_ENGINE as TTSEngineType,
     model: DEFAULT_TTS_MODEL,
     voice: DEFAULT_TTS_VOICE,
     speed: 1.0,

@@ -126,18 +128,21 @@ export function limitNumber(
 }

 export const TTSConfigValidator = {
-  engine(x: string) {
+  engine(x: string | TTSEngineType): TTSEngineType {
     return x as TTSEngineType;
   },
-  model(x: string) {
+  model(x: string): TTSModelType {
     return x as TTSModelType;
   },
-  voice(x: string) {
+  voice(x: string): TTSVoiceType {
     return x as TTSVoiceType;
   },
-  speed(x: number) {
+  speed(x: number): number {
     return limitNumber(x, 0.25, 4.0, 1.0);
   },
+  modelProvider(x: string): ModelProvider {
+    return x as ModelProvider;
+  },
 };

 export const ModalConfigValidator = {
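Reviewer note: most of these validators are plain casts; only `speed` normalizes its input through `limitNumber`. Assuming `limitNumber(x, min, max, defaultValue)` clamps to the given range and falls back to the default for non-numbers (its existing behavior in this file), the effect is:

```ts
// Sketch: TTSConfigValidator.speed clamps via limitNumber(x, 0.25, 4.0, 1.0).
TTSConfigValidator.speed(10);  // -> 4.0 (clamped to the 0.25-4.0 range)
TTSConfigValidator.speed(0.1); // -> 0.25
TTSConfigValidator.speed(NaN); // -> 1.0 (assumed default fallback)
```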
app/utils/audio.ts

@@ -13,13 +13,17 @@ export function createTTSPlayer(): TTSPlayer {
     audioContext.suspend();
   };

-  const play = async (audioBuffer: ArrayBuffer, onended: () => void | null) => {
+  const play = async (audioBuffer: ArrayBuffer | AudioBuffer, onended: () => void | null) => {
     if (audioBufferSourceNode) {
       audioBufferSourceNode.stop();
       audioBufferSourceNode.disconnect();
     }

-    const buffer = await audioContext!.decodeAudioData(audioBuffer);
+    let buffer: AudioBuffer;
+    if (audioBuffer instanceof AudioBuffer) {
+      buffer = audioBuffer;
+    } else {
+      buffer = await audioContext!.decodeAudioData(audioBuffer);
+    }
     audioBufferSourceNode = audioContext!.createBufferSource();
     audioBufferSourceNode.buffer = buffer;
     audioBufferSourceNode.connect(audioContext!.destination);
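Reviewer note: with the widened signature, both return types flow into the same player — the OpenAI and Edge paths still hand over an encoded `ArrayBuffer` that goes through `decodeAudioData`, while the Alibaba path passes an already-decoded `AudioBuffer`. A hypothetical usage sketch (`openaiBuffer` and `qwenBuffer` are illustrative placeholders):

```ts
// Sketch: the player now accepts either form of audio.
const player = createTTSPlayer();
player.init();
await player.play(openaiBuffer /* ArrayBuffer (e.g. encoded mp3) */, () => console.log("done"));
await player.play(qwenBuffer /* AudioBuffer (decoded PCM) */, () => console.log("done"));
```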
package.json

@@ -93,5 +93,9 @@
   "resolutions": {
     "lint-staged/yaml": "^2.2.2"
   },
-  "packageManager": "yarn@1.22.19"
+  "packageManager": "yarn@1.22.19",
+  "volta": {
+    "node": "20.19.4",
+    "yarn": "1.22.22"
+  }
 }