Merge bf999b91a5 into 995bef73de

feat: 增强音频播放管理，新增 TTSPlayManager 类，优化流式语音合成逻辑，支持 PCM 数据和 base64 转换
Merge pull request #6599 from DreamRivulet/add-support-GPT5
2025-09-17 16:56:37 +08:00 · 2025-08-21 05:48:13 +00:00 · 2025-08-21 13:47:58 +08:00 · 2025-08-10 17:21:12 +08:00 · 2025-08-09 17:03:49 +08:00 · 2025-08-09 17:03:49 +08:00
6 changed files with 283 additions and 187 deletions
--- a/app/client/api.ts
+++ b/app/client/api.ts
@ -25,6 +25,7 @@ import { XAIApi } from "./platforms/xai";
 import { ChatGLMApi } from "./platforms/glm";
 import { SiliconflowApi } from "./platforms/siliconflow";
 import { Ai302Api } from "./platforms/ai302";
+import type { TTSPlayManager } from "../utils/audio";

 export const ROLES = ["system", "user", "assistant"] as const;
 export type MessageRole = (typeof ROLES)[number];
@ -108,7 +109,10 @@ export interface LLMModelProvider {
 export abstract class LLMApi {
  abstract chat(options: ChatOptions): Promise<void>;
  abstract speech(options: SpeechOptions): Promise<ArrayBuffer | AudioBuffer>;
-  abstract streamSpeech?(options: SpeechOptions): AsyncGenerator<AudioBuffer>;
+  abstract streamSpeech?(
+    options: SpeechOptions,
+    audioManager?: TTSPlayManager,
+  ): AsyncGenerator<AudioBuffer>;
  abstract usage(): Promise<LLMUsage>;
  abstract models(): Promise<LLMModel[]>;
 }
--- a/app/client/platforms/alibaba.ts
+++ b/app/client/platforms/alibaba.ts
@ -8,6 +8,7 @@ import {
  usePluginStore,
  FunctionToolItem,
 } from "@/app/store";
+import { TTSPlayManager } from "@/app/utils/audio";
 import {
  preProcessImageContentForAlibabaDashScope,
  streamWithThink,
@ -62,7 +63,6 @@ interface RequestPayload {
 }

 export class QwenApi implements LLMApi {
-  private static audioContext: AudioContext | null = null;
  path(path: string): string {
    const accessStore = useAccessStore.getState();

@ -97,7 +97,10 @@ export class QwenApi implements LLMApi {
    throw new Error("Method not implemented.");
  }

-  async *streamSpeech(options: SpeechOptions): AsyncGenerator<AudioBuffer> {
+  async *streamSpeech(
+    options: SpeechOptions,
+    audioManager?: TTSPlayManager,
+  ): AsyncGenerator<AudioBuffer> {
    if (!options.input || !options.model) {
      throw new Error("Missing required parameters: input and model");
    }
@ -112,6 +115,10 @@ export class QwenApi implements LLMApi {
    };
    const controller = new AbortController();
    options.onController?.(controller);
+
+    if (audioManager) {
+      audioManager.setStreamController(controller);
+    }
    try {
      const speechPath = this.path(Alibaba.SpeechPath);
      const speechPayload = {
@ -151,7 +158,10 @@ export class QwenApi implements LLMApi {
            if (line.startsWith("data:")) {
              const json = JSON.parse(data);
              if (json.output?.audio?.data) {
-                yield this.PCMBase64ToAudioBuffer(json.output.audio.data);
+                yield await audioManager!.pcmBase64ToAudioBuffer(
+                  json.output.audio.data,
+                  { channels: 1, sampleRate: 24000, bitDepth: 16 },
+                );
              }
            }
          } catch (parseError) {
@ -165,8 +175,17 @@ export class QwenApi implements LLMApi {
      }
      reader.releaseLock();
    } catch (e) {
+      // 如果是用户主动取消（AbortError），则不作为错误处理
+      if (e instanceof Error && e.name === "AbortError") {
+        console.log("[Request] Stream speech was aborted by user");
+        return; // 正常退出，不抛出错误
+      }
      console.log("[Request] failed to make a speech request", e);
      throw e;
+    } finally {
+      if (audioManager) {
+        audioManager.clearStreamController();
+      }
    }
  }

@ -356,79 +375,5 @@ export class QwenApi implements LLMApi {
  async models(): Promise<LLMModel[]> {
    return [];
  }
-
-  // 播放 PCM base64 数据
-  private async PCMBase64ToAudioBuffer(base64Data: string) {
-    try {
-      // 解码 base64
-      const binaryString = atob(base64Data);
-      const bytes = new Uint8Array(binaryString.length);
-      for (let i = 0; i < binaryString.length; i++) {
-        bytes[i] = binaryString.charCodeAt(i);
-      }
-
-      // 转换为 AudioBuffer
-      const audioBuffer = await this.convertToAudioBuffer(bytes);
-
-      return audioBuffer;
-    } catch (error) {
-      console.error("播放 PCM 数据失败:", error);
-      throw error;
-    }
-  }
-
-  private static getAudioContext(): AudioContext {
-    if (!QwenApi.audioContext) {
-      QwenApi.audioContext = new (window.AudioContext ||
-        window.webkitAudioContext)();
-    }
-    return QwenApi.audioContext;
-  }
-
-  // 将 PCM 字节数据转换为 AudioBuffer
-  private convertToAudioBuffer(pcmData: Uint8Array) {
-    const audioContext = QwenApi.getAudioContext();
-    const channels = 1;
-    const sampleRate = 24000;
-    return new Promise<AudioBuffer>((resolve, reject) => {
-      try {
-        let float32Array;
-        // 16位 PCM 转换为 32位浮点数
-        float32Array = this.pcm16ToFloat32(pcmData);
-
-        // 创建 AudioBuffer
-        const audioBuffer = audioContext.createBuffer(
-          channels,
-          float32Array.length / channels,
-          sampleRate,
-        );
-
-        // 复制数据到 AudioBuffer
-        for (let channel = 0; channel < channels; channel++) {
-          const channelData = audioBuffer.getChannelData(channel);
-          for (let i = 0; i < channelData.length; i++) {
-            channelData[i] = float32Array[i * channels + channel];
-          }
-        }
-
-        resolve(audioBuffer);
-      } catch (error) {
-        reject(error);
-      }
-    });
-  }
-  // 16位 PCM 转 32位浮点数
-  private pcm16ToFloat32(pcmData: Uint8Array) {
-    const length = pcmData.length / 2;
-    const float32Array = new Float32Array(length);
-
-    for (let i = 0; i < length; i++) {
-      const int16 = (pcmData[i * 2 + 1] << 8) | pcmData[i * 2];
-      const int16Signed = int16 > 32767 ? int16 - 65536 : int16;
-      float32Array[i] = int16Signed / 32768;
-    }
-
-    return float32Array;
-  }
 }
 export { Alibaba };
--- a/app/client/platforms/openai.ts
+++ b/app/client/platforms/openai.ts
@ -200,6 +200,7 @@ export class ChatGPTApi implements LLMApi {
      options.config.model.startsWith("o1") ||
      options.config.model.startsWith("o3") ||
      options.config.model.startsWith("o4-mini");
+    const isGpt5 =  options.config.model.startsWith("gpt-5");
    if (isDalle3) {
      const prompt = getMessageTextContent(
        options.messages.slice(-1)?.pop() as any,
@ -230,7 +231,7 @@ export class ChatGPTApi implements LLMApi {
        messages,
        stream: options.config.stream,
        model: modelConfig.model,
-        temperature: !isO1OrO3 ? modelConfig.temperature : 1,
+        temperature: (!isO1OrO3 && !isGpt5) ? modelConfig.temperature : 1,
        presence_penalty: !isO1OrO3 ? modelConfig.presence_penalty : 0,
        frequency_penalty: !isO1OrO3 ? modelConfig.frequency_penalty : 0,
        top_p: !isO1OrO3 ? modelConfig.top_p : 1,
@ -238,7 +239,13 @@ export class ChatGPTApi implements LLMApi {
        // Please do not ask me why not send max_tokens, no reason, this param is just shit, I dont want to explain anymore.
      };

-      if (isO1OrO3) {
+      if (isGpt5) {
+  	// Remove max_tokens if present
+  	delete requestPayload.max_tokens;
+  	// Send the configured token limit as max_completion_tokens instead
+  	requestPayload["max_completion_tokens"] = modelConfig.max_tokens;
+
+      } else if (isO1OrO3) {
        // by default the o1/o3 models will not attempt to produce output that includes markdown formatting
        // manually add "Formatting re-enabled" developer message to encourage markdown inclusion in model responses
        // (https://learn.microsoft.com/en-us/azure/ai-services/openai/how-to/reasoning?tabs=python-secure#markdown-output)
@ -251,8 +258,9 @@ export class ChatGPTApi implements LLMApi {
        requestPayload["max_completion_tokens"] = modelConfig.max_tokens;
      }

+
      // add max_tokens to vision model
-      if (visionModel && !isO1OrO3) {
+      if (visionModel && !isO1OrO3 && ! isGpt5) {
        requestPayload["max_tokens"] = Math.max(modelConfig.max_tokens, 4000);
      }
    }
--- a/app/components/chat.tsx
+++ b/app/components/chat.tsx
@ -1340,12 +1340,15 @@ function _Chat() {
          });

          try {
-            for await (const chunk of api.llm.streamSpeech({
-              model: config.ttsConfig.model,
-              input: textContent,
-              voice: config.ttsConfig.voice,
-              speed: config.ttsConfig.speed,
-            })) {
+            for await (const chunk of api.llm.streamSpeech(
+              {
+                model: config.ttsConfig.model,
+                input: textContent,
+                voice: config.ttsConfig.voice,
+                speed: config.ttsConfig.speed,
+              },
+              ttsPlayer,
+            )) {
              ttsPlayer.addToQueue(chunk);
            }
            ttsPlayer.finishStreamPlay();
--- a/app/constant.ts
+++ b/app/constant.ts
@ -524,6 +524,7 @@ export const VISION_MODEL_REGEXES = [
  /o3/,
  /o4-mini/,
  /grok-4/i,
+  /gpt-5/
 ];

 export const EXCLUDE_VISION_MODEL_REGEXES = [/claude-3-5-haiku-20241022/];
@ -548,6 +549,11 @@ const openaiModels = [
  "gpt-4.1-nano-2025-04-14",
  "gpt-4.5-preview",
  "gpt-4.5-preview-2025-02-27",
+  "gpt-5-chat",
+  "gpt-5-mini",
+  "gpt-5-nano",
+  "gpt-5",
+  "gpt-5-chat-2025-01-01-preview",
  "gpt-4o",
  "gpt-4o-2024-05-13",
  "gpt-4o-2024-08-06",
--- a/app/utils/audio.ts
+++ b/app/utils/audio.ts
@ -4,157 +4,287 @@ type TTSPlayer = {
    audioBuffer: ArrayBuffer | AudioBuffer,
    onended: () => void | null,
  ) => Promise<void>;
-  playQueue: (
+  playQueueMethod: (
    audioBuffers: (ArrayBuffer | AudioBuffer)[],
    onended: () => void | null,
  ) => Promise<void>;
  addToQueue: (audioBuffer: ArrayBuffer | AudioBuffer) => void;
  startStreamPlay: (onended: () => void | null) => void;
  finishStreamPlay: () => void;
+  setStreamController: (controller: AbortController) => void;
+  clearStreamController: () => void;
  stop: () => void;
+  pcmBase64ToAudioBuffer: (
+    base64Data: string,
+    config?: PCMConfig,
+  ) => Promise<AudioBuffer>;
+  pcmDataToAudioBuffer: (
+    pcmData: Uint8Array,
+    config?: PCMConfig,
+  ) => Promise<AudioBuffer>;
 };

-export function createTTSPlayer(): TTSPlayer {
-  let audioContext: AudioContext | null = null;
-  let audioBufferSourceNode: AudioBufferSourceNode | null = null;
-  let isPlaying = false;
-  let playQueue: (ArrayBuffer | AudioBuffer)[] = [];
-  let currentOnended: (() => void | null) | null = null;
-  let isStreamMode = false;
-  let streamFinished = false;
+// Audio processing utilities: describes raw PCM data; pcmDataToAudioBuffer defaults to 1 channel, 24000 Hz, 16-bit
+export interface PCMConfig {
+  channels?: number;
+  sampleRate?: number;
+  bitDepth?: 16 | 24 | 32; // pcmDataToAudioBuffer only converts 16 and throws on other values
+}

-  const init = () => {
-    console.log("[TTSPlayer] init");
-    audioContext = new (window.AudioContext || window.webkitAudioContext)();
-    audioContext.suspend();
-  };
+export class TTSPlayManager implements TTSPlayer {
+  private static audioContext: AudioContext | null = null;
+  private audioBufferSourceNode: AudioBufferSourceNode | null = null;
+  private isPlaying = false;
+  private playQueue: (ArrayBuffer | AudioBuffer)[] = [];
+  private currentOnended: (() => void | null) | null = null;
+  private isStreamMode = false;
+  private streamFinished = false;
+  private streamController: AbortController | null = null;

-  const play = async (
+  // Return the AudioContext held in the static field (shared by all instances), creating it on first use
+  get getAudioContext() {
+    if (!TTSPlayManager.audioContext) {
+      TTSPlayManager.audioContext = new (window.AudioContext ||
+        window.webkitAudioContext)();
+    }
+    return TTSPlayManager.audioContext;
+  }
+
+  // Create the shared AudioContext and suspend it; does nothing beyond logging if one already exists
+  init() {
+    console.log("[TTSPlayManager] init");
+    if (TTSPlayManager.audioContext) {
+      return;
+    }
+    this.getAudioContext.suspend();
+  }
+
+  async play(
    audioBuffer: ArrayBuffer | AudioBuffer,
    onended: () => void | null,
-  ) => {
-    if (audioBufferSourceNode) {
-      audioBufferSourceNode.stop();
-      audioBufferSourceNode.disconnect();
+  ) {
+    if (this.audioBufferSourceNode) {
+      this.audioBufferSourceNode.stop();
+      this.audioBufferSourceNode.disconnect();
    }
    let buffer: AudioBuffer;
    if (audioBuffer instanceof AudioBuffer) {
      buffer = audioBuffer;
    } else {
-      buffer = await audioContext!.decodeAudioData(audioBuffer);
+      buffer = await TTSPlayManager.audioContext!.decodeAudioData(audioBuffer);
    }
-    audioBufferSourceNode = audioContext!.createBufferSource();
-    audioBufferSourceNode.buffer = buffer;
-    audioBufferSourceNode.connect(audioContext!.destination);
-    audioContext!.resume().then(() => {
-      audioBufferSourceNode!.start();
+    this.audioBufferSourceNode =
+      TTSPlayManager.audioContext!.createBufferSource();
+    this.audioBufferSourceNode.buffer = buffer;
+    this.audioBufferSourceNode.connect(
+      TTSPlayManager.audioContext!.destination,
+    );
+    this.getAudioContext.resume().then(() => {
+      this.audioBufferSourceNode!.start();
    });
-    audioBufferSourceNode.onended = onended;
-  };
+    this.audioBufferSourceNode.onended = onended;
+  }

-  const playNext = async () => {
-    if (playQueue.length === 0) {
+  /**
+   * Stop playback: abort the in-flight streaming request, reset the queue and
+   * stream flags, stop the current source node and close the shared AudioContext.
+   *
+   * @returns a promise that settles once the AudioContext has been closed
+   */
+  async stop() {
+    console.log("[TTSPlayer] stop");
+
+    // Abort the in-flight streaming request first
+    try {
+      if (this.streamController && !this.streamController.signal.aborted) {
+        console.log("[TTSPlayer] Aborting stream request");
+        this.streamController.abort();
+      }
+    } catch (e) {
+      // Best effort: a failure to abort must not prevent the cleanup below
+      console.log("[TTSPlayer] Error while aborting stream:", e);
+    }
+    this.clearStreamController();
+
+    // Reset playback state
+    this.playQueue = [];
+    this.isPlaying = false;
+    this.isStreamMode = false;
+    this.streamFinished = true;
+    this.currentOnended = null;
+
+    // Stop the current source node
+    if (this.audioBufferSourceNode) {
+      try {
+        this.audioBufferSourceNode.stop();
+      } catch (e) {
+        // stop() throws InvalidStateError when the node was created but start()
+        // has not run yet; the rest of the cleanup must still happen
+        console.log("[TTSPlayer] Error while stopping source node:", e);
+      }
+      this.audioBufferSourceNode.disconnect();
+      this.audioBufferSourceNode = null;
+    }
+
+    // Close the shared AudioContext. Detach it from the static field before
+    // awaiting so nothing picks up a context that is being closed and an
+    // overlapping stop() call does not try to close it a second time.
+    const audioContext = TTSPlayManager.audioContext;
+    TTSPlayManager.audioContext = null;
+    if (audioContext && audioContext.state !== "closed") {
+      await audioContext.close();
+    }
+  }
+
+  async playNext() {
+    if (this.playQueue.length === 0) {
      // 在流模式下，如果队列为空但流还没结束，等待
-      if (isStreamMode && !streamFinished) {
-        setTimeout(() => playNext(), 100);
+      if (this.isStreamMode && !this.streamFinished) {
+        setTimeout(() => this.playNext(), 100);
        return;
      }

-      isPlaying = false;
-      isStreamMode = false;
-      streamFinished = false;
-      if (currentOnended) {
-        currentOnended();
-        currentOnended = null;
+      this.isPlaying = false;
+      this.isStreamMode = false;
+      this.streamFinished = false;
+      if (this.currentOnended) {
+        this.currentOnended();
+        this.currentOnended = null;
      }
      return;
    }

-    const nextBuffer = playQueue.shift()!;
+    const nextBuffer = this.playQueue.shift()!;
    let buffer: AudioBuffer;
    if (nextBuffer instanceof AudioBuffer) {
      buffer = nextBuffer;
    } else {
-      buffer = await audioContext!.decodeAudioData(nextBuffer);
+      buffer = await this.getAudioContext.decodeAudioData(nextBuffer);
    }

-    if (audioBufferSourceNode) {
-      audioBufferSourceNode.stop();
-      audioBufferSourceNode.disconnect();
+    if (this.audioBufferSourceNode) {
+      this.audioBufferSourceNode.stop();
+      this.audioBufferSourceNode.disconnect();
    }

-    audioBufferSourceNode = audioContext!.createBufferSource();
-    audioBufferSourceNode.buffer = buffer;
-    audioBufferSourceNode.connect(audioContext!.destination);
-    audioBufferSourceNode.onended = () => {
-      playNext();
+    this.audioBufferSourceNode = this.getAudioContext.createBufferSource();
+    this.audioBufferSourceNode.buffer = buffer;
+    this.audioBufferSourceNode.connect(this.getAudioContext.destination);
+    this.audioBufferSourceNode.onended = () => {
+      this.playNext();
    };

-    await audioContext!.resume();
-    audioBufferSourceNode.start();
-  };
+    await this.getAudioContext.resume();
+    this.audioBufferSourceNode.start();
+  }

-  const playQueueMethod = async (
+  async playQueueMethod(
    audioBuffers: (ArrayBuffer | AudioBuffer)[],
    onended: () => void | null,
-  ) => {
-    playQueue = [...audioBuffers];
-    currentOnended = onended;
-    if (!isPlaying) {
-      isPlaying = true;
-      await playNext();
+  ) {
+    this.playQueue = [...audioBuffers];
+    this.currentOnended = onended;
+    if (!this.isPlaying) {
+      this.isPlaying = true;
+      await this.playNext();
    }
-  };
+  }

-  const addToQueue = (audioBuffer: ArrayBuffer | AudioBuffer) => {
-    if (streamFinished) {
+  addToQueue(audioBuffer: ArrayBuffer | AudioBuffer) {
+    if (this.streamFinished) {
      return;
    }
-    playQueue.push(audioBuffer);
-  };
+    this.playQueue.push(audioBuffer);
+  }

-  const startStreamPlay = (onended: () => void | null) => {
-    isStreamMode = true;
-    streamFinished = false;
-    playQueue = [];
-    currentOnended = onended;
-
-    if (!isPlaying) {
-      isPlaying = true;
-      playNext();
+  startStreamPlay(onended: () => void | null) {
+    this.isStreamMode = true;
+    this.streamFinished = false;
+    this.playQueue = [];
+    this.currentOnended = onended;
+    if (!this.isPlaying) {
+      this.isPlaying = true;
+      this.playNext();
    }
-  };
+  }

-  const finishStreamPlay = () => {
-    streamFinished = true;
-  };
+  finishStreamPlay() {
+    this.streamFinished = true;
+  }

-  const stop = async () => {
-    console.log("[TTSPlayer] stop");
-    playQueue = [];
-    isPlaying = false;
-    isStreamMode = false;
-    streamFinished = true;
-    currentOnended = null;
+  // Remember the streaming request's AbortController so that stop() can abort the request
+  setStreamController(controller: AbortController) {
+    this.streamController = controller;
+  }

-    if (audioBufferSourceNode) {
-      audioBufferSourceNode.stop();
-      audioBufferSourceNode.disconnect();
-      audioBufferSourceNode = null;
+  // Drop the stored streaming request controller
+  clearStreamController() {
+    this.streamController = null;
+  }
+
+  // Convert base64-encoded PCM data into an AudioBuffer; failures are logged and rethrown
+  async pcmBase64ToAudioBuffer(
+    base64Data: string,
+    config: PCMConfig = {},
+  ): Promise<AudioBuffer> {
+    try {
+      // Decode the base64 string into raw bytes
+      const binaryString = atob(base64Data);
+      const bytes = new Uint8Array(binaryString.length);
+      for (let i = 0; i < binaryString.length; i++) {
+        bytes[i] = binaryString.charCodeAt(i);
+      }
+
+      // Convert the bytes into an AudioBuffer
+      return await this.pcmDataToAudioBuffer(bytes, config);
+    } catch (error) {
+      console.error("Failed to convert PCM base64 to AudioBuffer:", error);
+      throw error;
     }
-    if (audioContext) {
-      await audioContext.close();
-      audioContext = null;
-    }
-  };
+  }

-  return {
-    init,
-    play,
-    playQueue: playQueueMethod,
-    addToQueue,
-    startStreamPlay,
-    finishStreamPlay,
-    stop,
-  };
+  // Convert raw PCM bytes into an AudioBuffer (defaults: 1 channel, 24000 Hz, 16-bit)
+  async pcmDataToAudioBuffer(
+    pcmData: Uint8Array,
+    config: PCMConfig = {},
+  ): Promise<AudioBuffer> {
+    const { channels = 1, sampleRate = 24000, bitDepth = 16 } = config;
+
+    const audioContext = this.getAudioContext;
+
+    return new Promise<AudioBuffer>((resolve, reject) => {
+      try {
+        let float32Array: Float32Array;
+
+        // Pick the sample conversion by bit depth; only 16-bit is handled, anything else throws
+        switch (bitDepth) {
+          case 16:
+            float32Array = this.pcm16ToFloat32(pcmData);
+            break;
+          default:
+            throw new Error(`Unsupported bit depth: ${bitDepth}`);
+        }
+
+        // Create the AudioBuffer, sized to the sample count divided by the channel count
+        const audioBuffer = audioContext.createBuffer(
+          channels,
+          float32Array.length / channels,
+          sampleRate,
+        );
+
+        // Copy samples into the AudioBuffer, reading the input as channel-interleaved
+        for (let channel = 0; channel < channels; channel++) {
+          const channelData = audioBuffer.getChannelData(channel);
+          for (let i = 0; i < channelData.length; i++) {
+            channelData[i] = float32Array[i * channels + channel];
+          }
+        }
+
+        resolve(audioBuffer);
+      } catch (error) {
+        reject(error);
+      }
+    });
+  }
+
+  // Convert 16-bit PCM (low byte first) to 32-bit floats by dividing each signed sample by 32768
+  pcm16ToFloat32(pcmData: Uint8Array): Float32Array {
+    const length = pcmData.length / 2;
+    const float32Array = new Float32Array(length);
+
+    for (let i = 0; i < length; i++) {
+      const int16 = (pcmData[i * 2 + 1] << 8) | pcmData[i * 2];
+      const int16Signed = int16 > 32767 ? int16 - 65536 : int16;
+      float32Array[i] = int16Signed / 32768;
+    }
+
+    return float32Array;
+  }
+}
+
+// Create a new TTSPlayManager instance
+export function createTTSPlayer(): TTSPlayManager {
+  return new TTSPlayManager();
 }
Author	SHA1	Message	Date
Evan Wu	33f8cac264	Merge `bf999b91a5` into `995bef73de`	2025-08-21 05:48:13 +00:00
EvanWu	bf999b91a5	feat: 增强音频播放管理，新增 TTSPlayManager 类，优化流式语音合成逻辑，支持 PCM 数据和 base64 转换	2025-08-21 13:47:58 +08:00
RiverRay	995bef73de	Merge pull request #6599 from DreamRivulet/add-support-GPT5 Some checks failed Run Tests / test (push) Has been cancelled Details add: model gpt-5	2025-08-10 17:21:12 +08:00
Sam	38ac502d80	Add support for GPT5	2025-08-09 17:03:49 +08:00
Sam	0511808900	use max_completion_tokens	2025-08-09 17:03:49 +08:00
Sam	42eff644b4	use max_completion_tokens	2025-08-09 17:03:49 +08:00
Sam	8ae6883784	add gpt-5	2025-08-09 17:03:49 +08:00
Sam	c0f2ab6de3	add gpt-5	2025-08-09 17:03:06 +08:00