Merge fe484fd38a into 557a2cce35

2025-10-01 07:36:39 +08:00 · 2025-07-31 03:12:58 +00:00 · 2025-07-31 03:12:58 +00:00 · d6b770cb9b
commit d6b770cb9b
parent 557a2cce35 fe484fd38a
9 changed files with 400 additions and 63 deletions
--- a/.yarnrc.yml
+++ b/.yarnrc.yml
@ -0,0 +1 @@
 nodeLinker: node-modules
--- a/app/client/api.ts
+++ b/app/client/api.ts
@ -107,7 +107,8 @@ export interface LLMModelProvider {
 /**
  * Abstract contract for an LLM provider client.
  *
  * Declares the operations a provider implementation exposes: chat, speech
  * synthesis (one-shot and, optionally, streamed), usage lookup and model
  * listing.
  */
 export abstract class LLMApi {
  abstract chat(options: ChatOptions): Promise<void>;
-  abstract speech(options: SpeechOptions): Promise<ArrayBuffer>;
  // One-shot speech synthesis; the result may be either an ArrayBuffer or an
  // AudioBuffer.
+  abstract speech(options: SpeechOptions): Promise<ArrayBuffer | AudioBuffer>;
  // Optional streaming variant that yields AudioBuffer chunks; declared with
  // `?`, so callers must check for its presence before invoking it.
  abstract streamSpeech?(options: SpeechOptions): AsyncGenerator<AudioBuffer>;
  abstract usage(): Promise<LLMUsage>;
  abstract models(): Promise<LLMModel[]>;
 }
--- a/app/client/platforms/alibaba.ts
+++ b/app/client/platforms/alibaba.ts
@ -1,5 +1,10 @@
 "use client";
-import { ApiPath, Alibaba, ALIBABA_BASE_URL } from "@/app/constant";
+import {
  ApiPath,
  Alibaba,
  ALIBABA_BASE_URL,
  REQUEST_TIMEOUT_MS,
 } from "@/app/constant";
 import {
  useAccessStore,
  useAppConfig,
@ -59,6 +64,7 @@ interface RequestPayload {
 }
 export class QwenApi implements LLMApi {
  private audioContext?: AudioContext;
  path(path: string): string {
    const accessStore = useAccessStore.getState();
@ -89,10 +95,72 @@ export class QwenApi implements LLMApi {
    return res?.output?.choices?.at(0)?.message?.content ?? "";
  }
-  speech(options: SpeechOptions): Promise<ArrayBuffer> {
+  async speech(options: SpeechOptions): Promise<ArrayBuffer> {
    // One-shot speech is not implemented for this client. Because the method
    // is `async`, the error surfaces as a rejected promise rather than a
    // synchronous throw.
    throw new Error("Method not implemented.");
  }
  /**
   * Streams synthesized speech from the Alibaba speech endpoint.
   *
   * Sends a POST request with server-sent events enabled, parses each
   * `data:` line of the response as JSON and yields one AudioBuffer for every
   * event that carries base64 PCM audio in `output.audio.data`.
   *
   * The request is aborted if it has not finished within REQUEST_TIMEOUT_MS.
   * The abort controller is also handed to `options.onController` so the
   * caller can cancel. The timer and the stream reader lock are always
   * released, whether the stream completes, fails, or the consumer stops
   * iterating early.
   *
   * @param options Speech request options (model, input text, voice, speed,
   *   response format, optional controller callback).
   * @returns An async generator of decoded audio chunks.
   * @throws Error when the HTTP response is not OK or has no body; rethrows
   *   any network, abort, JSON or audio conversion error after logging it.
   */
  async *streamSpeech(options: SpeechOptions): AsyncGenerator<AudioBuffer> {
    const requestPayload = {
      model: options.model,
      input: {
        text: options.input,
        voice: options.voice,
      },
      speed: options.speed,
      response_format: options.response_format,
    };

    const controller = new AbortController();
    options.onController?.(controller);

    // Returns the base64 audio payload of one SSE line, or undefined when the
    // line is not a data line or the event carries no audio.
    const extractAudio = (line: string): string | undefined => {
      if (!line.startsWith("data:")) {
        return undefined;
      }
      const data = line.slice(5).trim();
      if (data === "") {
        return undefined;
      }
      const json = JSON.parse(data) as {
        output?: { audio?: { data?: string } };
      };
      // Events without `output.audio` must not crash the stream, so use
      // optional chaining instead of direct property access.
      return json.output?.audio?.data || undefined;
    };

    // The timer covers the whole request, including reading the stream.
    const requestTimeoutId = setTimeout(
      () => controller.abort(),
      REQUEST_TIMEOUT_MS,
    );

    try {
      const speechPath = this.path(Alibaba.SpeechPath);
      const speechPayload = {
        method: "POST",
        body: JSON.stringify(requestPayload),
        signal: controller.signal,
        headers: {
          ...getHeaders(),
          "X-DashScope-SSE": "enable",
        },
      };

      const res = await fetch(speechPath, speechPayload);
      if (!res.ok || !res.body) {
        const detail = await res.text().catch(() => "");
        throw new Error(
          `speech request failed with status ${res.status}: ${detail}`,
        );
      }

      const reader = res.body.getReader();
      try {
        const decoder = new TextDecoder();
        let buffer = "";
        while (true) {
          const { done, value } = await reader.read();
          if (done) {
            break;
          }
          buffer += decoder.decode(value, { stream: true });
          const lines = buffer.split("\n");
          // The last element may be an incomplete line; keep it for the next
          // chunk.
          buffer = lines.pop() ?? "";
          for (const line of lines) {
            const audio = extractAudio(line);
            if (audio) {
              yield this.PCMBase64ToAudioBuffer(audio);
            }
          }
        }
        // Flush the decoder and handle a final line that was not terminated
        // by a newline.
        buffer += decoder.decode();
        const trailingAudio = extractAudio(buffer);
        if (trailingAudio) {
          yield this.PCMBase64ToAudioBuffer(trailingAudio);
        }
      } finally {
        reader.releaseLock();
      }
    } catch (e) {
      console.log("[Request] failed to make a speech request", e);
      throw e;
    } finally {
      clearTimeout(requestTimeoutId);
    }
  }
  async chat(options: ChatOptions) {
    const modelConfig = {
      ...useAppConfig.getState().modelConfig,
@ -273,5 +341,75 @@ export class QwenApi implements LLMApi {
  async models(): Promise<LLMModel[]> {
    return [];
  }
  /**
   * Decodes a base64 string into raw bytes and converts those bytes into an
   * AudioBuffer via convertToAudioBuffer.
   *
   * Any failure is logged and then rethrown to the caller.
   */
  private async PCMBase64ToAudioBuffer(base64Data: string) {
    try {
      // atob yields a binary string: one character per decoded byte.
      const decoded = atob(base64Data);
      const pcmBytes = Uint8Array.from(decoded, (ch) => ch.charCodeAt(0));
      return await this.convertToAudioBuffer(pcmBytes);
    } catch (error) {
      console.error("播放 PCM 数据失败:", error);
      throw error;
    }
  }
  /**
   * Converts raw PCM bytes into an AudioBuffer.
   *
   * The bytes are interpreted through pcm16ToFloat32 and copied into a
   * single-channel buffer created at a 24000 Hz sample rate. The AudioContext
   * is created lazily on first use and cached on the instance.
   *
   * @returns A promise resolved with the buffer, or rejected with whatever
   *   error the conversion raised.
   */
  private convertToAudioBuffer(pcmData: Uint8Array): Promise<AudioBuffer> {
    if (!this.audioContext) {
      const AudioContextCtor = window.AudioContext || window.webkitAudioContext;
      this.audioContext = new AudioContextCtor();
    }
    const ctx = this.audioContext;
    const channels = 1;
    const sampleRate = 24000;

    try {
      // 16-bit PCM samples become 32-bit floats.
      const samples = this.pcm16ToFloat32(pcmData);
      const frameCount = samples.length / channels;
      const audioBuffer = ctx.createBuffer(channels, frameCount, sampleRate);

      // De-interleave the samples into each channel's data array.
      for (let ch = 0; ch < channels; ch++) {
        const target = audioBuffer.getChannelData(ch);
        for (let frame = 0; frame < target.length; frame++) {
          target[frame] = samples[frame * channels + ch];
        }
      }
      return Promise.resolve(audioBuffer);
    } catch (error) {
      return Promise.reject(error);
    }
  }
  /**
   * Converts little-endian signed 16-bit PCM bytes to 32-bit float samples.
   *
   * Each pair of bytes becomes one sample, divided by 32768. A trailing
   * unpaired byte is ignored.
   */
  private pcm16ToFloat32(pcmData: Uint8Array) {
    const sampleCount = Math.floor(pcmData.length / 2);
    const samples = new Float32Array(sampleCount);
    const view = new DataView(
      pcmData.buffer,
      pcmData.byteOffset,
      pcmData.byteLength,
    );
    for (let s = 0; s < sampleCount; s++) {
      // `true` selects little-endian: low byte first, then high byte.
      samples[s] = view.getInt16(s * 2, true) / 32768;
    }
    return samples;
  }
 }
 export { Alibaba };
--- a/app/components/chat.tsx
+++ b/app/components/chat.tsx
@ -101,8 +101,6 @@ import {
 import { useNavigate } from "react-router-dom";
 import {
  CHAT_PAGE_SIZE,
  DEFAULT_TTS_ENGINE,
  ModelProvider,
  Path,
  REQUEST_TIMEOUT_MS,
  ServiceProvider,
@ -1286,6 +1284,7 @@ function _Chat() {
  const accessStore = useAccessStore();
  const [speechStatus, setSpeechStatus] = useState(false);
  const [speechLoading, setSpeechLoading] = useState(false);
  const [speechCooldown, setSpeechCooldown] = useState(false);
  async function openaiSpeech(text: string) {
    if (speechStatus) {
@ -1293,14 +1292,14 @@ function _Chat() {
      setSpeechStatus(false);
    } else {
      var api: ClientApi;
      api = new ClientApi(ModelProvider.GPT);
      const config = useAppConfig.getState();
      api = new ClientApi(config.ttsConfig.modelProvider);
      setSpeechLoading(true);
      ttsPlayer.init();
-      let audioBuffer: ArrayBuffer;
+      let audioBuffer: ArrayBuffer | AudioBuffer;
      const { markdownToTxt } = require("markdown-to-txt");
      const textContent = markdownToTxt(text);
-      if (config.ttsConfig.engine !== DEFAULT_TTS_ENGINE) {
+      if (config.ttsConfig.engine === "Edge") {
        const edgeVoiceName = accessStore.edgeVoiceName();
        const tts = new MsEdgeTTS();
        await tts.setMetadata(
@ -1308,6 +1307,33 @@ function _Chat() {
          OUTPUT_FORMAT.AUDIO_24KHZ_96KBITRATE_MONO_MP3,
        );
        audioBuffer = await tts.toArrayBuffer(textContent);
        playSpeech(audioBuffer);
      } else {
        if (api.llm.streamSpeech) {
          // 使用流式播放，边接收边播放
          setSpeechStatus(true);
          ttsPlayer.startStreamPlay(() => {
            setSpeechStatus(false);
          });
          try {
            for await (const chunk of api.llm.streamSpeech({
              model: config.ttsConfig.model,
              input: textContent,
              voice: config.ttsConfig.voice,
              speed: config.ttsConfig.speed,
            })) {
              ttsPlayer.addToQueue(chunk);
            }
            ttsPlayer.finishStreamPlay();
          } catch (e) {
            console.error("[Stream Speech]", e);
            showToast(prettyObject(e));
            setSpeechStatus(false);
            ttsPlayer.stop();
          } finally {
            setSpeechLoading(false);
          }
        } else {
          audioBuffer = await api.llm.speech({
            model: config.ttsConfig.model,
@ -1315,7 +1341,13 @@ function _Chat() {
            voice: config.ttsConfig.voice,
            speed: config.ttsConfig.speed,
          });
          playSpeech(audioBuffer);
        }
      }
    }
  }
  function playSpeech(audioBuffer: ArrayBuffer | AudioBuffer) {
    setSpeechStatus(true);
    ttsPlayer
      .play(audioBuffer, () => {
@ -1328,7 +1360,6 @@ function _Chat() {
      })
      .finally(() => setSpeechLoading(false));
  }
  }
  const context: RenderMessage[] = useMemo(() => {
    return session.mask.hideContext ? [] : session.mask.context.slice();
--- a/app/components/tts-config.tsx
+++ b/app/components/tts-config.tsx
@ -3,10 +3,9 @@ import { TTSConfig, TTSConfigValidator } from "../store";
 import Locale from "../locales";
 import { ListItem, Select } from "./ui-lib";
 import {
-  DEFAULT_TTS_ENGINE,
+    ServiceProvider,
-  DEFAULT_TTS_ENGINES,
+    TTS_CONFIGS,
-  DEFAULT_TTS_MODELS,
+    TTSEngineType
  DEFAULT_TTS_VOICES,
 } from "../constant";
 import { InputRange } from "./input-range";
@ -48,22 +47,33 @@ export function TTSConfigList(props: {
        <Select
          value={props.ttsConfig.engine}
          onChange={(e) => {
            const newEngine = e.currentTarget.value as TTSEngineType;
            props.updateConfig(
-              (config) =>
+              (config) => {
-                (config.engine = TTSConfigValidator.engine(
+                config.engine = TTSConfigValidator.engine(newEngine);
-                  e.currentTarget.value,
+                const engineConfig = TTS_CONFIGS[newEngine];
-                )),
+                config.model = TTSConfigValidator.model(
                    engineConfig.Model[0] || ""
                );
                config.voice = TTSConfigValidator.voice(
                    engineConfig.Voices[0] || ""
                );
                config.modelProvider = TTSConfigValidator.modelProvider(
                    engineConfig.ModelProvider
                );
              }
            );
          }}
        >
-          {DEFAULT_TTS_ENGINES.map((v, i) => (
+          {Object.keys(TTS_CONFIGS).map((v, i) => (
            <option value={v} key={i}>
-              {v}
+              {v}-TTS
            </option>
          ))}
        </Select>
      </ListItem>
-      {props.ttsConfig.engine === DEFAULT_TTS_ENGINE && (
+      {(props.ttsConfig.engine === ServiceProvider.OpenAI || 
        props.ttsConfig.engine === ServiceProvider.Alibaba) && (
        <>
          <ListItem title={Locale.Settings.TTS.Model}>
            <Select
@ -77,7 +87,7 @@ export function TTSConfigList(props: {
                );
              }}
            >
-              {DEFAULT_TTS_MODELS.map((v, i) => (
+              {TTS_CONFIGS[props.ttsConfig.engine]!.Model.map((v, i) => (
                <option value={v} key={i}>
                  {v}
                </option>
@ -99,7 +109,7 @@ export function TTSConfigList(props: {
                );
              }}
            >
-              {DEFAULT_TTS_VOICES.map((v, i) => (
+              {TTS_CONFIGS[props.ttsConfig.engine]!.Voices.map((v, i) => (
                <option value={v} key={i}>
                  {v}
                </option>
--- a/app/constant.ts
+++ b/app/constant.ts
@ -232,6 +232,7 @@ export const Alibaba = {
    }
    return `v1/services/aigc/text-generation/generation`;
  },
  SpeechPath: "v1/services/aigc/multimodal-generation/generation",
 };
 export const Tencent = {
@ -461,19 +462,49 @@ export const KnowledgeCutOffDate: Record<string, string> = {
  "deepseek-coder": "2024-07",
 };
-export const DEFAULT_TTS_ENGINE = "OpenAI-TTS";
+export const DEFAULT_TTS_ENGINE = ServiceProvider.OpenAI;
 export const DEFAULT_TTS_ENGINES = ["OpenAI-TTS", "Edge-TTS"];
 export const DEFAULT_TTS_MODEL = "tts-1";
 export const DEFAULT_TTS_VOICE = "alloy";
-export const DEFAULT_TTS_MODELS = ["tts-1", "tts-1-hd"];
+
-export const DEFAULT_TTS_VOICES = [
+export const OPENAI_TTS = {
-  "alloy",
+    Provider: ServiceProvider.OpenAI,
-  "echo",
+    ModelProvider: ModelProvider.GPT,
-  "fable",
+    Model: ["tts-1", "tts-1-hd"],
-  "onyx",
+    Voices: ["alloy", "echo", "fable", "onyx", "nova", "shimmer"],
-  "nova",
+} as const;
-  "shimmer",
+
-];
+export const ALIBABA_TTS = {
    Provider: ServiceProvider.Alibaba,
    ModelProvider: ModelProvider.Qwen,
    Model: ["qwen-tts", "qwen-tts-latest"],
    Voices: ["Chelsie", "Cherry", "Ethan", "Serena", "Dylan", "Jada", "Sunny"],
 } as const;
 // Edge engine entry. It lists no models and no voices (both arrays are
 // empty) and its ModelProvider is set to ModelProvider.GPT.
 export const EDGE_TTS = {
    Provider: "Edge" as const,
    ModelProvider: ModelProvider.GPT,
    Model: [] as string[],
    Voices: [] as string[],
 } as const;
 // Identifier of a TTS engine: one of two ServiceProvider members or the
 // literal "Edge".
 export type TTSEngineType = ServiceProvider.OpenAI | ServiceProvider.Alibaba | "Edge";
 export const DEFAULT_TTS_ENGINES = [ServiceProvider.OpenAI, ServiceProvider.Alibaba, "Edge"] as const;
 // Union lists: every model / voice of the OpenAI and Alibaba entries combined.
 export const DEFAULT_TTS_MODELS = [...OPENAI_TTS.Model, ...ALIBABA_TTS.Model] as const;
 export const DEFAULT_TTS_VOICES = [...OPENAI_TTS.Voices, ...ALIBABA_TTS.Voices] as const;
 // Shape shared by every per-engine TTS entry.
 interface TTSConfigItem {
    Provider: ServiceProvider | "Edge";
    Model: readonly string[];
    Voices: readonly string[];
    ModelProvider: ModelProvider;
 }
 // Lookup table from engine identifier to that engine's TTS entry.
 export const TTS_CONFIGS: Record<TTSEngineType, TTSConfigItem> = {
    [ServiceProvider.OpenAI]: OPENAI_TTS,
    [ServiceProvider.Alibaba]: ALIBABA_TTS,
    Edge: EDGE_TTS,
 } as const;
 export const VISION_MODEL_REGEXES = [
  /vision/,
@ -920,3 +951,4 @@ export const DEFAULT_GA_ID = "G-89WN60ZK2E";
 export const SAAS_CHAT_URL = "https://nextchat.club";
 export const SAAS_CHAT_UTM_URL = "https://nextchat.club?utm=github";
--- a/app/store/config.ts
+++ b/app/store/config.ts
@ -13,6 +13,8 @@ import {
  DEFAULT_TTS_VOICES,
  StoreKey,
  ServiceProvider,
  TTSEngineType,
  ModelProvider,
 } from "../constant";
 import { createPersistStore } from "../utils/store";
 import type { Voice } from "rt-client";
@ -20,7 +22,6 @@ import type { Voice } from "rt-client";
 export type ModelType = (typeof DEFAULT_MODELS)[number]["name"];
 export type TTSModelType = (typeof DEFAULT_TTS_MODELS)[number];
 export type TTSVoiceType = (typeof DEFAULT_TTS_VOICES)[number];
 export type TTSEngineType = (typeof DEFAULT_TTS_ENGINES)[number];
 export enum SubmitKey {
  Enter = "Enter",
@ -86,7 +87,8 @@ export const DEFAULT_CONFIG = {
  ttsConfig: {
    enable: false,
    autoplay: false,
-    engine: DEFAULT_TTS_ENGINE,
+    modelProvider: ModelProvider.GPT,
    engine: DEFAULT_TTS_ENGINE as TTSEngineType,
    model: DEFAULT_TTS_MODEL,
    voice: DEFAULT_TTS_VOICE,
    speed: 1.0,
@ -126,18 +128,21 @@ export function limitNumber(
 }
 /**
  * Coercion helpers for the fields of the TTS configuration.
  *
  * `engine`, `model`, `voice` and `modelProvider` are plain type assertions:
  * they perform no runtime check, so an unknown string passes through
  * unchanged. Only `speed` alters its input, by passing it through
  * limitNumber.
  */
 export const TTSConfigValidator = {
-  engine(x: string) {
+  engine(x: string | TTSEngineType): TTSEngineType {
    return x as TTSEngineType;
  },
-  model(x: string) {
+  model(x: string): TTSModelType {
    return x as TTSModelType;
  },
-  voice(x: string) {
+  voice(x: string): TTSVoiceType {
    return x as TTSVoiceType;
  },
-  speed(x: number) {
+  speed(x: number): number {
    // presumably (value, min, max, default) — confirm against limitNumber
    return limitNumber(x, 0.25, 4.0, 1.0);
  },
  modelProvider(x: string): ModelProvider {
    return x as ModelProvider;
  },
 };
 export const ModalConfigValidator = {
--- a/app/utils/audio.ts
+++ b/app/utils/audio.ts
@ -1,25 +1,48 @@
 type TTSPlayer = {
  init: () => void;
-  play: (audioBuffer: ArrayBuffer, onended: () => void | null) => Promise<void>;
+  play: (
    audioBuffer: ArrayBuffer | AudioBuffer,
    onended: () => void | null,
  ) => Promise<void>;
  playQueue: (
    audioBuffers: (ArrayBuffer | AudioBuffer)[],
    onended: () => void | null,
  ) => Promise<void>;
  addToQueue: (audioBuffer: ArrayBuffer | AudioBuffer) => void;
  startStreamPlay: (onended: () => void | null) => void;
  finishStreamPlay: () => void;
  stop: () => void;
 };
 export function createTTSPlayer(): TTSPlayer {
  let audioContext: AudioContext | null = null;
  let audioBufferSourceNode: AudioBufferSourceNode | null = null;
  let isPlaying = false;
  let playQueue: (ArrayBuffer | AudioBuffer)[] = [];
  let currentOnended: (() => void | null) | null = null;
  let isStreamMode = false;
  let streamFinished = false;
  const init = () => {
    console.log("[TTSPlayer] init");
    audioContext = new (window.AudioContext || window.webkitAudioContext)();
    audioContext.suspend();
  };
-  const play = async (audioBuffer: ArrayBuffer, onended: () => void | null) => {
+  const play = async (
    audioBuffer: ArrayBuffer | AudioBuffer,
    onended: () => void | null,
  ) => {
    if (audioBufferSourceNode) {
      audioBufferSourceNode.stop();
      audioBufferSourceNode.disconnect();
    }
-
+    let buffer: AudioBuffer;
-    const buffer = await audioContext!.decodeAudioData(audioBuffer);
+    if (audioBuffer instanceof AudioBuffer) {
      buffer = audioBuffer;
    } else {
      buffer = await audioContext!.decodeAudioData(audioBuffer);
    }
    audioBufferSourceNode = audioContext!.createBufferSource();
    audioBufferSourceNode.buffer = buffer;
    audioBufferSourceNode.connect(audioContext!.destination);
@ -29,17 +52,109 @@ export function createTTSPlayer(): TTSPlayer {
    audioBufferSourceNode.onended = onended;
  };
-  const stop = () => {
+  const playNext = async () => {
    // Plays the next queued buffer and re-invokes itself from the source
    // node's `onended` handler, draining the queue one buffer at a time.
    // When the queue is empty it either polls (stream still open) or resets
    // the player state and fires the completion callback once.
    if (playQueue.length === 0) {
      // In stream mode, if the queue is empty but the stream has not finished
      // yet, wait and check again in 100 ms.
      if (isStreamMode && !streamFinished) {
        setTimeout(() => playNext(), 100);
        return;
      }
      isPlaying = false;
      isStreamMode = false;
      streamFinished = false;
      if (currentOnended) {
        currentOnended();
        // Cleared so the callback cannot fire a second time.
        currentOnended = null;
      }
      return;
    }
    const nextBuffer = playQueue.shift()!;
    let buffer: AudioBuffer;
    // Already-decoded AudioBuffers are used as-is; anything else is decoded.
    if (nextBuffer instanceof AudioBuffer) {
      buffer = nextBuffer;
    } else {
      // NOTE(review): stop() sets audioContext to null; if it runs while this
      // await is pending, the `audioContext!` uses below would throw — confirm
      // whether that interleaving can happen in practice.
      buffer = await audioContext!.decodeAudioData(nextBuffer);
    }
    if (audioBufferSourceNode) {
      audioBufferSourceNode.stop();
      audioBufferSourceNode.disconnect();
    }
    audioBufferSourceNode = audioContext!.createBufferSource();
    audioBufferSourceNode.buffer = buffer;
    audioBufferSourceNode.connect(audioContext!.destination);
    audioBufferSourceNode.onended = () => {
      playNext();
    };
    // init() suspends the context, so it is resumed before starting playback.
    await audioContext!.resume();
    audioBufferSourceNode.start();
  };
  /**
   * Replaces the pending queue with a copy of `audioBuffers`, registers the
   * completion callback and starts playback unless it is already running.
   */
  const playQueueMethod = async (
    audioBuffers: (ArrayBuffer | AudioBuffer)[],
    onended: () => void | null,
  ) => {
    currentOnended = onended;
    // Copy so later mutation of the caller's array does not affect the queue.
    playQueue = Array.from(audioBuffers);
    if (isPlaying) {
      return;
    }
    isPlaying = true;
    await playNext();
  };
  /**
   * Appends a buffer to the playback queue. Buffers are dropped once the
   * stream has been marked finished.
   */
  const addToQueue = (audioBuffer: ArrayBuffer | AudioBuffer) => {
    if (!streamFinished) {
      playQueue.push(audioBuffer);
    }
  };
  /**
   * Switches the player into stream mode with an empty queue and registers
   * the completion callback. Starts the playback loop unless one is already
   * running; the loop is started without being awaited.
   */
  const startStreamPlay = (onended: () => void | null) => {
    playQueue = [];
    currentOnended = onended;
    streamFinished = false;
    isStreamMode = true;
    if (isPlaying) {
      return;
    }
    isPlaying = true;
    playNext();
  };
  /**
   * Marks the stream as finished: no further buffers are accepted and the
   * playback loop completes once the queue drains.
   */
  const finishStreamPlay = (): void => {
    streamFinished = true;
  };
  /**
   * Stops playback and tears the player down: clears the queue, resets all
   * playback flags, drops the completion callback (it is not invoked), stops
   * and disconnects the current source node, and closes the audio context.
   */
  const stop = async () => {
    console.log("[TTSPlayer] stop");
    playQueue = [];
    isPlaying = false;
    isStreamMode = false;
    // Marked finished so addToQueue ignores late chunks and a pending
    // playNext poll does not keep waiting.
    streamFinished = true;
    currentOnended = null;
    if (audioBufferSourceNode) {
      audioBufferSourceNode.stop();
      audioBufferSourceNode.disconnect();
      audioBufferSourceNode = null;
    }
    if (audioContext) {
-      audioContext.close();
+      await audioContext.close();
      audioContext = null;
    }
  };
-  return { init, play, stop };
+  return {
    init,
    play,
    playQueue: playQueueMethod,
    addToQueue,
    startStreamPlay,
    finishStreamPlay,
    stop,
  };
 }
--- a/package.json
+++ b/package.json
@ -93,5 +93,9 @@
  "resolutions": {
    "lint-staged/yaml": "^2.2.2"
  },
-  "packageManager": "yarn@1.22.19"
+  "packageManager": "yarn@1.22.19",
  "volta": {
    "node": "20.19.4",
    "yarn": "1.22.22"
  }
 }