mirror of https://github.com/ChatGPTNextWeb/ChatGPT-Next-Web.git (synced 2025-09-20 10:16:38 +08:00)

Compare commits: 462d68750b...33f8cac264 (8 commits)
Commits (newest first):
  33f8cac264
  bf999b91a5
  995bef73de
  38ac502d80
  0511808900
  42eff644b4
  8ae6883784
  c0f2ab6de3
@@ -25,6 +25,7 @@ import { XAIApi } from "./platforms/xai";
 import { ChatGLMApi } from "./platforms/glm";
 import { SiliconflowApi } from "./platforms/siliconflow";
 import { Ai302Api } from "./platforms/ai302";
+import type { TTSPlayManager } from "../utils/audio";
 
 export const ROLES = ["system", "user", "assistant"] as const;
 export type MessageRole = (typeof ROLES)[number];
@@ -108,7 +109,10 @@ export interface LLMModelProvider {
 export abstract class LLMApi {
   abstract chat(options: ChatOptions): Promise<void>;
   abstract speech(options: SpeechOptions): Promise<ArrayBuffer | AudioBuffer>;
-  abstract streamSpeech?(options: SpeechOptions): AsyncGenerator<AudioBuffer>;
+  abstract streamSpeech?(
+    options: SpeechOptions,
+    audioManager?: TTSPlayManager,
+  ): AsyncGenerator<AudioBuffer>;
   abstract usage(): Promise<LLMUsage>;
   abstract models(): Promise<LLMModel[]>;
 }
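Note that streamSpeech stays optional on LLMApi, so providers without streaming TTS compile unchanged and callers must feature-detect before iterating. A minimal caller-side sketch of that guard (the speak helper and its option values are illustrative, not part of the diff; it assumes SpeechOptions carries the model/input/voice/speed fields seen in the chat component hunk below):

// Hypothetical helper: prefer streaming TTS when the provider implements it,
// otherwise fall back to the one-shot speech() call.
async function speak(llm: LLMApi, player: TTSPlayManager, text: string) {
  const options = { model: "tts-1", input: text, voice: "alloy", speed: 1.0 };
  if (llm.streamSpeech) {
    player.startStreamPlay(() => console.log("[TTS] playback finished"));
    for await (const chunk of llm.streamSpeech(options as SpeechOptions, player)) {
      player.addToQueue(chunk); // chunks arrive as already-decoded AudioBuffers
    }
    player.finishStreamPlay();
  } else {
    const audio = await llm.speech(options as SpeechOptions); // ArrayBuffer | AudioBuffer
    await player.play(audio, () => console.log("[TTS] playback finished"));
  }
}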
@@ -8,6 +8,7 @@ import {
   usePluginStore,
   FunctionToolItem,
 } from "@/app/store";
+import { TTSPlayManager } from "@/app/utils/audio";
 import {
   preProcessImageContentForAlibabaDashScope,
   streamWithThink,
@@ -62,7 +63,6 @@ interface RequestPayload {
 }
 
 export class QwenApi implements LLMApi {
-  private static audioContext: AudioContext | null = null;
   path(path: string): string {
     const accessStore = useAccessStore.getState();
@@ -97,7 +97,10 @@ export class QwenApi implements LLMApi {
     throw new Error("Method not implemented.");
   }
 
-  async *streamSpeech(options: SpeechOptions): AsyncGenerator<AudioBuffer> {
+  async *streamSpeech(
+    options: SpeechOptions,
+    audioManager?: TTSPlayManager,
+  ): AsyncGenerator<AudioBuffer> {
     if (!options.input || !options.model) {
       throw new Error("Missing required parameters: input and model");
     }
@@ -112,6 +115,10 @@ export class QwenApi implements LLMApi {
     };
     const controller = new AbortController();
     options.onController?.(controller);
+
+    if (audioManager) {
+      audioManager.setStreamController(controller);
+    }
     try {
       const speechPath = this.path(Alibaba.SpeechPath);
       const speechPayload = {
@@ -151,7 +158,10 @@ export class QwenApi implements LLMApi {
           if (line.startsWith("data:")) {
             const json = JSON.parse(data);
             if (json.output?.audio?.data) {
-              yield this.PCMBase64ToAudioBuffer(json.output.audio.data);
+              yield await audioManager!.pcmBase64ToAudioBuffer(
+                json.output.audio.data,
+                { channels: 1, sampleRate: 24000, bitDepth: 16 },
+              );
             }
           }
         } catch (parseError) {
@@ -165,8 +175,17 @@ export class QwenApi implements LLMApi {
       }
       reader.releaseLock();
     } catch (e) {
+      // if the user cancelled the request (AbortError), don't treat it as an error
+      if (e instanceof Error && e.name === "AbortError") {
+        console.log("[Request] Stream speech was aborted by user");
+        return; // exit normally without throwing
+      }
       console.log("[Request] failed to make a speech request", e);
       throw e;
+    } finally {
+      if (audioManager) {
+        audioManager.clearStreamController();
+      }
     }
   }
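This hunk is one half of a controller handshake: the provider registers its AbortController with the player, the player's stop() aborts it, and the generator exits cleanly through the AbortError branch while the finally block always detaches the controller. The same pattern in isolation, as a minimal sketch with hypothetical names (StreamOwner and streamChunks are not from the diff):

// Minimal sketch of the abort handshake between a player and a streaming request.
class StreamOwner {
  private controller: AbortController | null = null;
  setStreamController(c: AbortController) { this.controller = c; }
  clearStreamController() { this.controller = null; }
  stop() { this.controller?.abort(); } // user-initiated cancel
}

async function* streamChunks(owner: StreamOwner, url: string) {
  const controller = new AbortController();
  owner.setStreamController(controller);
  try {
    const res = await fetch(url, { signal: controller.signal });
    // ... read res.body and yield decoded chunks ...
  } catch (e) {
    if (e instanceof Error && e.name === "AbortError") return; // clean exit on cancel
    throw e;
  } finally {
    owner.clearStreamController(); // always detach, success or failure
  }
}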
@@ -356,79 +375,5 @@ export class QwenApi implements LLMApi {
   async models(): Promise<LLMModel[]> {
     return [];
   }
-
-  // play PCM base64 data
-  private async PCMBase64ToAudioBuffer(base64Data: string) {
-    try {
-      // decode base64
-      const binaryString = atob(base64Data);
-      const bytes = new Uint8Array(binaryString.length);
-      for (let i = 0; i < binaryString.length; i++) {
-        bytes[i] = binaryString.charCodeAt(i);
-      }
-
-      // convert to an AudioBuffer
-      const audioBuffer = await this.convertToAudioBuffer(bytes);
-
-      return audioBuffer;
-    } catch (error) {
-      console.error("Failed to play PCM data:", error);
-      throw error;
-    }
-  }
-
-  private static getAudioContext(): AudioContext {
-    if (!QwenApi.audioContext) {
-      QwenApi.audioContext = new (window.AudioContext ||
-        window.webkitAudioContext)();
-    }
-    return QwenApi.audioContext;
-  }
-
-  // convert raw PCM bytes to an AudioBuffer
-  private convertToAudioBuffer(pcmData: Uint8Array) {
-    const audioContext = QwenApi.getAudioContext();
-    const channels = 1;
-    const sampleRate = 24000;
-    return new Promise<AudioBuffer>((resolve, reject) => {
-      try {
-        let float32Array;
-        // convert 16-bit PCM to 32-bit float
-        float32Array = this.pcm16ToFloat32(pcmData);
-
-        // create the AudioBuffer
-        const audioBuffer = audioContext.createBuffer(
-          channels,
-          float32Array.length / channels,
-          sampleRate,
-        );
-
-        // copy the samples into the AudioBuffer
-        for (let channel = 0; channel < channels; channel++) {
-          const channelData = audioBuffer.getChannelData(channel);
-          for (let i = 0; i < channelData.length; i++) {
-            channelData[i] = float32Array[i * channels + channel];
-          }
-        }
-
-        resolve(audioBuffer);
-      } catch (error) {
-        reject(error);
-      }
-    });
-  }
-
-  // 16-bit PCM to 32-bit float
-  private pcm16ToFloat32(pcmData: Uint8Array) {
-    const length = pcmData.length / 2;
-    const float32Array = new Float32Array(length);
-
-    for (let i = 0; i < length; i++) {
-      const int16 = (pcmData[i * 2 + 1] << 8) | pcmData[i * 2];
-      const int16Signed = int16 > 32767 ? int16 - 65536 : int16;
-      float32Array[i] = int16Signed / 32768;
-    }
-
-    return float32Array;
-  }
 }
 export { Alibaba };
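These deleted helpers are not lost: judging by the new import path, equivalent PCM conversion logic reappears as shared methods on TTSPlayManager in app/utils/audio (see the final hunk below), so any provider that streams raw PCM can reuse one AudioContext and one decoder instead of keeping its own.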
@@ -200,6 +200,7 @@ export class ChatGPTApi implements LLMApi {
       options.config.model.startsWith("o1") ||
       options.config.model.startsWith("o3") ||
       options.config.model.startsWith("o4-mini");
+    const isGpt5 = options.config.model.startsWith("gpt-5");
     if (isDalle3) {
       const prompt = getMessageTextContent(
         options.messages.slice(-1)?.pop() as any,
@@ -230,7 +231,7 @@ export class ChatGPTApi implements LLMApi {
       messages,
       stream: options.config.stream,
       model: modelConfig.model,
-      temperature: !isO1OrO3 ? modelConfig.temperature : 1,
+      temperature: (!isO1OrO3 && !isGpt5) ? modelConfig.temperature : 1,
       presence_penalty: !isO1OrO3 ? modelConfig.presence_penalty : 0,
       frequency_penalty: !isO1OrO3 ? modelConfig.frequency_penalty : 0,
       top_p: !isO1OrO3 ? modelConfig.top_p : 1,
@@ -238,7 +239,13 @@ export class ChatGPTApi implements LLMApi {
       // Please do not ask me why not send max_tokens, no reason, this param is just shit, I dont want to explain anymore.
     };
 
-    if (isO1OrO3) {
+    if (isGpt5) {
+      // remove max_tokens if present
+      delete requestPayload.max_tokens;
+      // gpt-5 expects max_completion_tokens instead
+      requestPayload["max_completion_tokens"] = modelConfig.max_tokens;
+
+    } else if (isO1OrO3) {
       // by default the o1/o3 models will not attempt to produce output that includes markdown formatting
       // manually add "Formatting re-enabled" developer message to encourage markdown inclusion in model responses
       // (https://learn.microsoft.com/en-us/azure/ai-services/openai/how-to/reasoning?tabs=python-secure#markdown-output)
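Read together with the temperature change above, gpt-5 requests end up shaped like the o-series ones: sampling pinned, token budget sent as max_completion_tokens. An illustrative before/after of the payload under that reading (field values made up):

// Illustrative payloads only; values are not from the diff.
const gpt4oPayload = {
  model: "gpt-4o",
  temperature: 0.7, // user-configured sampling survives
  max_tokens: 4000, // vision-model cap applied in the next hunk
};

const gpt5Payload = {
  model: "gpt-5",
  temperature: 1, // pinned, since (!isO1OrO3 && !isGpt5) is false
  max_completion_tokens: 4000, // replaces max_tokens for gpt-5
};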
@@ -251,8 +258,9 @@ export class ChatGPTApi implements LLMApi {
       requestPayload["max_completion_tokens"] = modelConfig.max_tokens;
     }
 
+
     // add max_tokens to vision model
-    if (visionModel && !isO1OrO3) {
+    if (visionModel && !isO1OrO3 && !isGpt5) {
       requestPayload["max_tokens"] = Math.max(modelConfig.max_tokens, 4000);
     }
   }
@@ -1340,12 +1340,15 @@ function _Chat() {
       });
 
       try {
-        for await (const chunk of api.llm.streamSpeech({
-          model: config.ttsConfig.model,
-          input: textContent,
-          voice: config.ttsConfig.voice,
-          speed: config.ttsConfig.speed,
-        })) {
+        for await (const chunk of api.llm.streamSpeech(
+          {
+            model: config.ttsConfig.model,
+            input: textContent,
+            voice: config.ttsConfig.voice,
+            speed: config.ttsConfig.speed,
+          },
+          ttsPlayer,
+        )) {
           ttsPlayer.addToQueue(chunk);
         }
         ttsPlayer.finishStreamPlay();
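Passing ttsPlayer as the second argument is what closes the loop: the provider registers its AbortController with the player (setStreamController above), so when the user stops playback, TTSPlayManager.stop() can abort the in-flight request instead of letting audio chunks keep arriving.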
@@ -524,6 +524,7 @@ export const VISION_MODEL_REGEXES = [
   /o3/,
   /o4-mini/,
   /grok-4/i,
+  /gpt-5/,
 ];
 
 export const EXCLUDE_VISION_MODEL_REGEXES = [/claude-3-5-haiku-20241022/];
@@ -548,6 +549,11 @@ const openaiModels = [
   "gpt-4.1-nano-2025-04-14",
   "gpt-4.5-preview",
   "gpt-4.5-preview-2025-02-27",
+  "gpt-5-chat",
+  "gpt-5-mini",
+  "gpt-5-nano",
+  "gpt-5",
+  "gpt-5-chat-2025-01-01-preview",
   "gpt-4o",
   "gpt-4o-2024-05-13",
   "gpt-4o-2024-08-06",
@@ -4,157 +4,287 @@ type TTSPlayer = {
     audioBuffer: ArrayBuffer | AudioBuffer,
     onended: () => void | null,
   ) => Promise<void>;
-  playQueue: (
+  playQueueMethod: (
     audioBuffers: (ArrayBuffer | AudioBuffer)[],
     onended: () => void | null,
   ) => Promise<void>;
   addToQueue: (audioBuffer: ArrayBuffer | AudioBuffer) => void;
   startStreamPlay: (onended: () => void | null) => void;
   finishStreamPlay: () => void;
+  setStreamController: (controller: AbortController) => void;
+  clearStreamController: () => void;
   stop: () => void;
+  pcmBase64ToAudioBuffer: (
+    base64Data: string,
+    config?: PCMConfig,
+  ) => Promise<AudioBuffer>;
+  pcmDataToAudioBuffer: (
+    pcmData: Uint8Array,
+    config?: PCMConfig,
+  ) => Promise<AudioBuffer>;
 };
 
-export function createTTSPlayer(): TTSPlayer {
-  let audioContext: AudioContext | null = null;
-  let audioBufferSourceNode: AudioBufferSourceNode | null = null;
-  let isPlaying = false;
-  let playQueue: (ArrayBuffer | AudioBuffer)[] = [];
-  let currentOnended: (() => void | null) | null = null;
-  let isStreamMode = false;
-  let streamFinished = false;
-
-  const init = () => {
-    console.log("[TTSPlayer] init");
-    audioContext = new (window.AudioContext || window.webkitAudioContext)();
-    audioContext.suspend();
-  };
-
-  const play = async (
-    audioBuffer: ArrayBuffer | AudioBuffer,
-    onended: () => void | null,
-  ) => {
-    if (audioBufferSourceNode) {
-      audioBufferSourceNode.stop();
-      audioBufferSourceNode.disconnect();
-    }
-    let buffer: AudioBuffer;
-    if (audioBuffer instanceof AudioBuffer) {
-      buffer = audioBuffer;
-    } else {
-      buffer = await audioContext!.decodeAudioData(audioBuffer);
-    }
-    audioBufferSourceNode = audioContext!.createBufferSource();
-    audioBufferSourceNode.buffer = buffer;
-    audioBufferSourceNode.connect(audioContext!.destination);
-    audioContext!.resume().then(() => {
-      audioBufferSourceNode!.start();
-    });
-    audioBufferSourceNode.onended = onended;
-  };
-
-  const playNext = async () => {
-    if (playQueue.length === 0) {
-      // in stream mode, if the queue is empty but the stream has not finished yet, wait
-      if (isStreamMode && !streamFinished) {
-        setTimeout(() => playNext(), 100);
-        return;
-      }
-      isPlaying = false;
-      isStreamMode = false;
-      streamFinished = false;
-      if (currentOnended) {
-        currentOnended();
-        currentOnended = null;
-      }
-      return;
-    }
-
-    const nextBuffer = playQueue.shift()!;
-    let buffer: AudioBuffer;
-    if (nextBuffer instanceof AudioBuffer) {
-      buffer = nextBuffer;
-    } else {
-      buffer = await audioContext!.decodeAudioData(nextBuffer);
-    }
-
-    if (audioBufferSourceNode) {
-      audioBufferSourceNode.stop();
-      audioBufferSourceNode.disconnect();
-    }
-
-    audioBufferSourceNode = audioContext!.createBufferSource();
-    audioBufferSourceNode.buffer = buffer;
-    audioBufferSourceNode.connect(audioContext!.destination);
-    audioBufferSourceNode.onended = () => {
-      playNext();
-    };
-
-    await audioContext!.resume();
-    audioBufferSourceNode.start();
-  };
-
-  const playQueueMethod = async (
-    audioBuffers: (ArrayBuffer | AudioBuffer)[],
-    onended: () => void | null,
-  ) => {
-    playQueue = [...audioBuffers];
-    currentOnended = onended;
-    if (!isPlaying) {
-      isPlaying = true;
-      await playNext();
-    }
-  };
-
-  const addToQueue = (audioBuffer: ArrayBuffer | AudioBuffer) => {
-    if (streamFinished) {
-      return;
-    }
-    playQueue.push(audioBuffer);
-  };
-
-  const startStreamPlay = (onended: () => void | null) => {
-    isStreamMode = true;
-    streamFinished = false;
-    playQueue = [];
-    currentOnended = onended;
-
-    if (!isPlaying) {
-      isPlaying = true;
-      playNext();
-    }
-  };
-
-  const finishStreamPlay = () => {
-    streamFinished = true;
-  };
-
-  const stop = async () => {
-    console.log("[TTSPlayer] stop");
-    playQueue = [];
-    isPlaying = false;
-    isStreamMode = false;
-    streamFinished = true;
-    currentOnended = null;
-
-    if (audioBufferSourceNode) {
-      audioBufferSourceNode.stop();
-      audioBufferSourceNode.disconnect();
-      audioBufferSourceNode = null;
-    }
-    if (audioContext) {
-      await audioContext.close();
-      audioContext = null;
-    }
-  };
-
-  return {
-    init,
-    play,
-    playQueue: playQueueMethod,
-    addToQueue,
-    startStreamPlay,
-    finishStreamPlay,
-    stop,
-  };
-}
+// Audio processing utilities
+export interface PCMConfig {
+  channels?: number;
+  sampleRate?: number;
+  bitDepth?: 16 | 24 | 32;
+}
+
+export class TTSPlayManager implements TTSPlayer {
+  private static audioContext: AudioContext | null = null;
+  private audioBufferSourceNode: AudioBufferSourceNode | null = null;
+  private isPlaying = false;
+  private playQueue: (ArrayBuffer | AudioBuffer)[] = [];
+  private currentOnended: (() => void | null) | null = null;
+  private isStreamMode = false;
+  private streamFinished = false;
+  private streamController: AbortController | null = null;
+
+  get getAudioContext() {
+    if (!TTSPlayManager.audioContext) {
+      TTSPlayManager.audioContext = new (window.AudioContext ||
+        window.webkitAudioContext)();
+    }
+    return TTSPlayManager.audioContext;
+  }
+
+  init() {
+    console.log("[TTSPlayManager] init");
+    if (TTSPlayManager.audioContext) {
+      return;
+    }
+    this.getAudioContext.suspend();
+  }
+
+  async play(
+    audioBuffer: ArrayBuffer | AudioBuffer,
+    onended: () => void | null,
+  ) {
+    if (this.audioBufferSourceNode) {
+      this.audioBufferSourceNode.stop();
+      this.audioBufferSourceNode.disconnect();
+    }
+    let buffer: AudioBuffer;
+    if (audioBuffer instanceof AudioBuffer) {
+      buffer = audioBuffer;
+    } else {
+      buffer = await TTSPlayManager.audioContext!.decodeAudioData(audioBuffer);
+    }
+    this.audioBufferSourceNode =
+      TTSPlayManager.audioContext!.createBufferSource();
+    this.audioBufferSourceNode.buffer = buffer;
+    this.audioBufferSourceNode.connect(
+      TTSPlayManager.audioContext!.destination,
+    );
+    this.getAudioContext.resume().then(() => {
+      this.audioBufferSourceNode!.start();
+    });
+    this.audioBufferSourceNode.onended = onended;
+  }
+
+  async stop() {
+    console.log("[TTSPlayer] stop");
+
+    // abort the streaming request first
+    try {
+      if (this.streamController && !this.streamController.signal.aborted) {
+        console.log("[TTSPlayer] Aborting stream request");
+        this.streamController.abort();
+      }
+    } catch (e) {
+      // ignore errors raised while aborting the request
+      console.log("[TTSPlayer] Error while aborting stream:", e);
+    }
+    this.clearStreamController();
+
+    // reset playback state
+    this.playQueue = [];
+    this.isPlaying = false;
+    this.isStreamMode = false;
+    this.streamFinished = true;
+    this.currentOnended = null;
+
+    // stop audio playback
+    if (this.audioBufferSourceNode) {
+      this.audioBufferSourceNode.stop();
+      this.audioBufferSourceNode.disconnect();
+      this.audioBufferSourceNode = null;
+    }
+
+    // close the audio context
+    if (TTSPlayManager.audioContext) {
+      await TTSPlayManager.audioContext.close();
+      TTSPlayManager.audioContext = null;
+    }
+  }
+
+  async playNext() {
+    if (this.playQueue.length === 0) {
+      // in stream mode, if the queue is empty but the stream has not finished yet, wait
+      if (this.isStreamMode && !this.streamFinished) {
+        setTimeout(() => this.playNext(), 100);
+        return;
+      }
+      this.isPlaying = false;
+      this.isStreamMode = false;
+      this.streamFinished = false;
+      if (this.currentOnended) {
+        this.currentOnended();
+        this.currentOnended = null;
+      }
+      return;
+    }
+
+    const nextBuffer = this.playQueue.shift()!;
+    let buffer: AudioBuffer;
+    if (nextBuffer instanceof AudioBuffer) {
+      buffer = nextBuffer;
+    } else {
+      buffer = await this.getAudioContext.decodeAudioData(nextBuffer);
+    }
+
+    if (this.audioBufferSourceNode) {
+      this.audioBufferSourceNode.stop();
+      this.audioBufferSourceNode.disconnect();
+    }
+
+    this.audioBufferSourceNode = this.getAudioContext.createBufferSource();
+    this.audioBufferSourceNode.buffer = buffer;
+    this.audioBufferSourceNode.connect(this.getAudioContext.destination);
+    this.audioBufferSourceNode.onended = () => {
+      this.playNext();
+    };
+
+    await this.getAudioContext.resume();
+    this.audioBufferSourceNode.start();
+  }
+
+  async playQueueMethod(
+    audioBuffers: (ArrayBuffer | AudioBuffer)[],
+    onended: () => void | null,
+  ) {
+    this.playQueue = [...audioBuffers];
+    this.currentOnended = onended;
+    if (!this.isPlaying) {
+      this.isPlaying = true;
+      await this.playNext();
+    }
+  }
+
+  addToQueue(audioBuffer: ArrayBuffer | AudioBuffer) {
+    if (this.streamFinished) {
+      return;
+    }
+    this.playQueue.push(audioBuffer);
+  }
+
+  startStreamPlay(onended: () => void | null) {
+    this.isStreamMode = true;
+    this.streamFinished = false;
+    this.playQueue = [];
+    this.currentOnended = onended;
+    if (!this.isPlaying) {
+      this.isPlaying = true;
+      this.playNext();
+    }
+  }
+
+  finishStreamPlay() {
+    this.streamFinished = true;
+  }
+
+  // register the stream request controller so stop() can abort the request
+  setStreamController(controller: AbortController) {
+    this.streamController = controller;
+  }
+
+  // clear the stream request controller
+  clearStreamController() {
+    this.streamController = null;
+  }
+
+  // convert base64-encoded PCM data to an AudioBuffer
+  async pcmBase64ToAudioBuffer(
+    base64Data: string,
+    config: PCMConfig = {},
+  ): Promise<AudioBuffer> {
+    try {
+      // decode base64
+      const binaryString = atob(base64Data);
+      const bytes = new Uint8Array(binaryString.length);
+      for (let i = 0; i < binaryString.length; i++) {
+        bytes[i] = binaryString.charCodeAt(i);
+      }
+      // convert to an AudioBuffer
+      return await this.pcmDataToAudioBuffer(bytes, config);
+    } catch (error) {
+      console.error("Failed to convert PCM base64 to AudioBuffer:", error);
+      throw error;
+    }
+  }
+
+  // convert raw PCM bytes to an AudioBuffer
+  async pcmDataToAudioBuffer(
+    pcmData: Uint8Array,
+    config: PCMConfig = {},
+  ): Promise<AudioBuffer> {
+    const { channels = 1, sampleRate = 24000, bitDepth = 16 } = config;
+
+    const audioContext = this.getAudioContext;
+
+    return new Promise<AudioBuffer>((resolve, reject) => {
+      try {
+        let float32Array: Float32Array;
+
+        // choose the conversion routine by bit depth
+        switch (bitDepth) {
+          case 16:
+            float32Array = this.pcm16ToFloat32(pcmData);
+            break;
+          default:
+            throw new Error(`Unsupported bit depth: ${bitDepth}`);
+        }
+
+        // create the AudioBuffer
+        const audioBuffer = audioContext.createBuffer(
+          channels,
+          float32Array.length / channels,
+          sampleRate,
+        );
+
+        // copy the samples into the AudioBuffer
+        for (let channel = 0; channel < channels; channel++) {
+          const channelData = audioBuffer.getChannelData(channel);
+          for (let i = 0; i < channelData.length; i++) {
+            channelData[i] = float32Array[i * channels + channel];
+          }
+        }
+
+        resolve(audioBuffer);
+      } catch (error) {
+        reject(error);
+      }
+    });
+  }
+
+  // 16-bit PCM to 32-bit float
+  pcm16ToFloat32(pcmData: Uint8Array): Float32Array {
+    const length = pcmData.length / 2;
+    const float32Array = new Float32Array(length);
+
+    for (let i = 0; i < length; i++) {
+      const int16 = (pcmData[i * 2 + 1] << 8) | pcmData[i * 2];
+      const int16Signed = int16 > 32767 ? int16 - 65536 : int16;
+      float32Array[i] = int16Signed / 32768;
+    }
+
+    return float32Array;
+  }
+}
+
+export function createTTSPlayer(): TTSPlayManager {
+  return new TTSPlayManager();
+}
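As a sanity check on pcm16ToFloat32's little-endian decoding, a small worked example (the sample bytes are made up; the format itself matches the { channels: 1, sampleRate: 24000, bitDepth: 16 } config the Alibaba provider passes above):

// Signed 16-bit little-endian mono PCM: each sample is two bytes, low byte first.
const pcm = new Uint8Array([0x00, 0x40, 0xff, 0xbf]); // two samples

// Sample 0: (0x40 << 8) | 0x00 = 16384; <= 32767, so already positive:
//   16384 / 32768 = 0.5
// Sample 1: (0xbf << 8) | 0xff = 49151; > 32767, so 49151 - 65536 = -16385:
//   -16385 / 32768 ≈ -0.50003
// new TTSPlayManager().pcm16ToFloat32(pcm) -> Float32Array [0.5, -0.50003...]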