Compare commits


8 Commits

Author SHA1 Message Date
Evan Wu
33f8cac264
Merge bf999b91a5 into 995bef73de 2025-08-21 05:48:13 +00:00
EvanWu
bf999b91a5 feat: enhance audio playback management, add a TTSPlayManager class, refine the streaming speech synthesis logic, and support PCM data and base64 conversion 2025-08-21 13:47:58 +08:00
RiverRay
995bef73de
Merge pull request #6599 from DreamRivulet/add-support-GPT5
Some checks failed: Run Tests / test (push) has been cancelled
add: model gpt-5
2025-08-10 17:21:12 +08:00
Sam
38ac502d80 Add support for GPT5 2025-08-09 17:03:49 +08:00
Sam
0511808900 use max_completion_tokens 2025-08-09 17:03:49 +08:00
Sam
42eff644b4 use max_completion_tokens 2025-08-09 17:03:49 +08:00
Sam
8ae6883784 add gpt-5 2025-08-09 17:03:49 +08:00
Sam
c0f2ab6de3 add gpt-5 2025-08-09 17:03:06 +08:00
6 changed files with 283 additions and 187 deletions

View File

@@ -25,6 +25,7 @@ import { XAIApi } from "./platforms/xai";
 import { ChatGLMApi } from "./platforms/glm";
 import { SiliconflowApi } from "./platforms/siliconflow";
 import { Ai302Api } from "./platforms/ai302";
+import type { TTSPlayManager } from "../utils/audio";

 export const ROLES = ["system", "user", "assistant"] as const;
 export type MessageRole = (typeof ROLES)[number];
@@ -108,7 +109,10 @@ export interface LLMModelProvider {
 export abstract class LLMApi {
   abstract chat(options: ChatOptions): Promise<void>;
   abstract speech(options: SpeechOptions): Promise<ArrayBuffer | AudioBuffer>;
-  abstract streamSpeech?(options: SpeechOptions): AsyncGenerator<AudioBuffer>;
+  abstract streamSpeech?(
+    options: SpeechOptions,
+    audioManager?: TTSPlayManager,
+  ): AsyncGenerator<AudioBuffer>;
   abstract usage(): Promise<LLMUsage>;
   abstract models(): Promise<LLMModel[]>;
 }
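
For orientation, the sketch below shows how a platform client can satisfy the widened streamSpeech contract. It is a minimal sketch, not code from this PR: the /api/tts/stream endpoint, the local SpeechOptions stand-in, and the response shape are hypothetical, while the signature, the setStreamController/clearStreamController registration, and the delegation to pcmBase64ToAudioBuffer mirror the diffs in this compare. (A real implementation would also buffer partial SSE lines across read() chunks.)

import type { TTSPlayManager } from "@/app/utils/audio";

// Local stand-in for the app's SpeechOptions type
interface SpeechOptions {
  model: string;
  input: string;
  voice?: string;
  speed?: number;
  onController?: (controller: AbortController) => void;
}

async function* streamSpeechSketch(
  options: SpeechOptions,
  audioManager?: TTSPlayManager,
): AsyncGenerator<AudioBuffer> {
  const controller = new AbortController();
  options.onController?.(controller);
  // Registering the controller lets TTSPlayManager.stop() abort the request
  audioManager?.setStreamController(controller);
  try {
    const res = await fetch("/api/tts/stream", {
      method: "POST",
      body: JSON.stringify(options),
      signal: controller.signal,
    });
    const reader = res.body!.getReader();
    const decoder = new TextDecoder();
    while (true) {
      const { done, value } = await reader.read();
      if (done) break;
      for (const line of decoder.decode(value).split("\n")) {
        if (!line.startsWith("data:")) continue;
        const json = JSON.parse(line.slice(5).trim());
        if (json.audio) {
          // PCM decoding is delegated to the shared manager instead of
          // per-platform AudioContext plumbing
          yield await audioManager!.pcmBase64ToAudioBuffer(json.audio, {
            channels: 1,
            sampleRate: 24000,
            bitDepth: 16,
          });
        }
      }
    }
  } finally {
    audioManager?.clearStreamController();
  }
}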

View File

@@ -8,6 +8,7 @@ import {
   usePluginStore,
   FunctionToolItem,
 } from "@/app/store";
+import { TTSPlayManager } from "@/app/utils/audio";
 import {
   preProcessImageContentForAlibabaDashScope,
   streamWithThink,
@@ -62,7 +63,6 @@ interface RequestPayload {
 }

 export class QwenApi implements LLMApi {
-  private static audioContext: AudioContext | null = null;

   path(path: string): string {
     const accessStore = useAccessStore.getState();
@@ -97,7 +97,10 @@ export class QwenApi implements LLMApi {
     throw new Error("Method not implemented.");
   }

-  async *streamSpeech(options: SpeechOptions): AsyncGenerator<AudioBuffer> {
+  async *streamSpeech(
+    options: SpeechOptions,
+    audioManager?: TTSPlayManager,
+  ): AsyncGenerator<AudioBuffer> {
     if (!options.input || !options.model) {
       throw new Error("Missing required parameters: input and model");
     }
@@ -112,6 +115,10 @@ export class QwenApi implements LLMApi {
     };
     const controller = new AbortController();
     options.onController?.(controller);
+    if (audioManager) {
+      audioManager.setStreamController(controller);
+    }
+
     try {
       const speechPath = this.path(Alibaba.SpeechPath);
       const speechPayload = {
@@ -151,7 +158,10 @@ export class QwenApi implements LLMApi {
             if (line.startsWith("data:")) {
               const json = JSON.parse(data);
               if (json.output?.audio?.data) {
-                yield this.PCMBase64ToAudioBuffer(json.output.audio.data);
+                yield await audioManager!.pcmBase64ToAudioBuffer(
+                  json.output.audio.data,
+                  { channels: 1, sampleRate: 24000, bitDepth: 16 },
+                );
               }
             }
           } catch (parseError) {
@@ -165,8 +175,17 @@ export class QwenApi implements LLMApi {
       }

       reader.releaseLock();
     } catch (e) {
+      // If the user aborted the request (AbortError), do not treat it as an error
+      if (e instanceof Error && e.name === "AbortError") {
+        console.log("[Request] Stream speech was aborted by user");
+        return; // exit normally instead of rethrowing
+      }
       console.log("[Request] failed to make a speech request", e);
       throw e;
+    } finally {
+      if (audioManager) {
+        audioManager.clearStreamController();
+      }
     }
   }
@@ -356,79 +375,5 @@ export class QwenApi implements LLMApi {
   async models(): Promise<LLMModel[]> {
     return [];
   }
-
-  // Play base64-encoded PCM data
-  private async PCMBase64ToAudioBuffer(base64Data: string) {
-    try {
-      // Decode base64
-      const binaryString = atob(base64Data);
-      const bytes = new Uint8Array(binaryString.length);
-      for (let i = 0; i < binaryString.length; i++) {
-        bytes[i] = binaryString.charCodeAt(i);
-      }
-      // Convert to AudioBuffer
-      const audioBuffer = await this.convertToAudioBuffer(bytes);
-      return audioBuffer;
-    } catch (error) {
-      console.error("Failed to play PCM data:", error);
-      throw error;
-    }
-  }
-
-  private static getAudioContext(): AudioContext {
-    if (!QwenApi.audioContext) {
-      QwenApi.audioContext = new (window.AudioContext ||
-        window.webkitAudioContext)();
-    }
-    return QwenApi.audioContext;
-  }
-
-  // Convert PCM byte data to an AudioBuffer
-  private convertToAudioBuffer(pcmData: Uint8Array) {
-    const audioContext = QwenApi.getAudioContext();
-    const channels = 1;
-    const sampleRate = 24000;
-    return new Promise<AudioBuffer>((resolve, reject) => {
-      try {
-        let float32Array;
-        // Convert 16-bit PCM to 32-bit float
-        float32Array = this.pcm16ToFloat32(pcmData);
-        // Create the AudioBuffer
-        const audioBuffer = audioContext.createBuffer(
-          channels,
-          float32Array.length / channels,
-          sampleRate,
-        );
-        // Copy data into the AudioBuffer
-        for (let channel = 0; channel < channels; channel++) {
-          const channelData = audioBuffer.getChannelData(channel);
-          for (let i = 0; i < channelData.length; i++) {
-            channelData[i] = float32Array[i * channels + channel];
-          }
-        }
-        resolve(audioBuffer);
-      } catch (error) {
-        reject(error);
-      }
-    });
-  }
-
-  // Convert 16-bit PCM to 32-bit float
-  private pcm16ToFloat32(pcmData: Uint8Array) {
-    const length = pcmData.length / 2;
-    const float32Array = new Float32Array(length);
-    for (let i = 0; i < length; i++) {
-      const int16 = (pcmData[i * 2 + 1] << 8) | pcmData[i * 2];
-      const int16Signed = int16 > 32767 ? int16 - 65536 : int16;
-      float32Array[i] = int16Signed / 32768;
-    }
-    return float32Array;
-  }
 }

 export { Alibaba };
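
A quick sanity check on the { channels: 1, sampleRate: 24000, bitDepth: 16 } config used above: 16-bit mono PCM carries two bytes per sample, so a decoded chunk lasts bytes / 2 / 24000 seconds. The base64 string below is illustrative silence, not real DashScope output.

const base64Chunk = "AAAAAAAAAAA="; // 8 zero bytes -> 4 samples of silence
const byteLength = atob(base64Chunk).length; // 8
const samples = byteLength / 2; // 16-bit mono: 2 bytes per sample
const durationSec = samples / 24000; // ~0.000167 s of audio
console.log({ byteLength, samples, durationSec });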

View File

@@ -200,6 +200,7 @@ export class ChatGPTApi implements LLMApi {
       options.config.model.startsWith("o1") ||
       options.config.model.startsWith("o3") ||
       options.config.model.startsWith("o4-mini");
+    const isGpt5 = options.config.model.startsWith("gpt-5");
     if (isDalle3) {
       const prompt = getMessageTextContent(
         options.messages.slice(-1)?.pop() as any,
@@ -230,7 +231,7 @@ export class ChatGPTApi implements LLMApi {
         messages,
         stream: options.config.stream,
         model: modelConfig.model,
-        temperature: !isO1OrO3 ? modelConfig.temperature : 1,
+        temperature: (!isO1OrO3 && !isGpt5) ? modelConfig.temperature : 1,
         presence_penalty: !isO1OrO3 ? modelConfig.presence_penalty : 0,
         frequency_penalty: !isO1OrO3 ? modelConfig.frequency_penalty : 0,
         top_p: !isO1OrO3 ? modelConfig.top_p : 1,
@@ -238,7 +239,13 @@ export class ChatGPTApi implements LLMApi {
       // Please do not ask me why not send max_tokens, no reason, this param is just shit, I dont want to explain anymore.
     };

-    if (isO1OrO3) {
+    if (isGpt5) {
+      // Remove max_tokens if present
+      delete requestPayload.max_tokens;
+      // Use max_completion_tokens instead
+      requestPayload["max_completion_tokens"] = modelConfig.max_tokens;
+    } else if (isO1OrO3) {
       // by default the o1/o3 models will not attempt to produce output that includes markdown formatting
       // manually add "Formatting re-enabled" developer message to encourage markdown inclusion in model responses
       // (https://learn.microsoft.com/en-us/azure/ai-services/openai/how-to/reasoning?tabs=python-secure#markdown-output)
@@ -251,8 +258,9 @@ export class ChatGPTApi implements LLMApi {
       requestPayload["max_completion_tokens"] = modelConfig.max_tokens;
     }

     // add max_tokens to vision model
-    if (visionModel && !isO1OrO3) {
+    if (visionModel && !isO1OrO3 && !isGpt5) {
       requestPayload["max_tokens"] = Math.max(modelConfig.max_tokens, 4000);
     }
   }
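
Restated in isolation, the token-parameter rules these hunks introduce look like the sketch below. The helper name and standalone shape are illustrative; the branch conditions come from the diff: gpt-5, like o1/o3/o4-mini, gets max_completion_tokens and a fixed temperature of 1, while only non-reasoning vision models keep max_tokens.

// Hypothetical standalone restatement of the payload rules above
function shapeTokenParams(
  model: string,
  maxTokens: number,
  visionModel: boolean,
): Record<string, number> {
  const isO1OrO3 =
    model.startsWith("o1") ||
    model.startsWith("o3") ||
    model.startsWith("o4-mini");
  const isGpt5 = model.startsWith("gpt-5");
  const params: Record<string, number> = {};
  if (isGpt5 || isO1OrO3) {
    // these families reject max_tokens in favor of max_completion_tokens
    params["max_completion_tokens"] = maxTokens;
  } else if (visionModel) {
    // vision models still receive max_tokens, floored at 4000
    params["max_tokens"] = Math.max(maxTokens, 4000);
  }
  return params;
}

console.log(shapeTokenParams("gpt-5-mini", 8192, false)); // { max_completion_tokens: 8192 }
console.log(shapeTokenParams("gpt-4o", 1024, true)); // { max_tokens: 4000 }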

View File

@@ -1340,12 +1340,15 @@ function _Chat() {
       });
       try {
-        for await (const chunk of api.llm.streamSpeech({
-          model: config.ttsConfig.model,
-          input: textContent,
-          voice: config.ttsConfig.voice,
-          speed: config.ttsConfig.speed,
-        })) {
+        for await (const chunk of api.llm.streamSpeech(
+          {
+            model: config.ttsConfig.model,
+            input: textContent,
+            voice: config.ttsConfig.voice,
+            speed: config.ttsConfig.speed,
+          },
+          ttsPlayer,
+        )) {
           ttsPlayer.addToQueue(chunk);
         }
         ttsPlayer.finishStreamPlay();
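
Passing ttsPlayer as the second argument is what makes cancellation end-to-end: the generator registers its AbortController with the player, so a later stop() tears down the network stream as well as playback. A sketch of that stop path, with a hypothetical handler name:

import { createTTSPlayer } from "@/app/utils/audio";

// Hypothetical stop handler illustrating the abort flow enabled above
async function onStopStreamingTTS(player: ReturnType<typeof createTTSPlayer>) {
  await player.stop();
  // stop() aborts the registered controller; the fetch inside streamSpeech
  // rejects with AbortError, the generator returns early, and the
  // for-await loop above exits cleanly.
}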

View File

@@ -524,6 +524,7 @@ export const VISION_MODEL_REGEXES = [
   /o3/,
   /o4-mini/,
   /grok-4/i,
+  /gpt-5/
 ];

 export const EXCLUDE_VISION_MODEL_REGEXES = [/claude-3-5-haiku-20241022/];
@@ -548,6 +549,11 @@ const openaiModels = [
   "gpt-4.1-nano-2025-04-14",
   "gpt-4.5-preview",
   "gpt-4.5-preview-2025-02-27",
+  "gpt-5-chat",
+  "gpt-5-mini",
+  "gpt-5-nano",
+  "gpt-5",
+  "gpt-5-chat-2025-01-01-preview",
   "gpt-4o",
   "gpt-4o-2024-05-13",
   "gpt-4o-2024-08-06",

View File

@@ -4,157 +4,287 @@ type TTSPlayer = {
     audioBuffer: ArrayBuffer | AudioBuffer,
     onended: () => void | null,
   ) => Promise<void>;
-  playQueue: (
+  playQueueMethod: (
     audioBuffers: (ArrayBuffer | AudioBuffer)[],
     onended: () => void | null,
   ) => Promise<void>;
   addToQueue: (audioBuffer: ArrayBuffer | AudioBuffer) => void;
   startStreamPlay: (onended: () => void | null) => void;
   finishStreamPlay: () => void;
+  setStreamController: (controller: AbortController) => void;
+  clearStreamController: () => void;
   stop: () => void;
+  pcmBase64ToAudioBuffer: (
+    base64Data: string,
+    config?: PCMConfig,
+  ) => Promise<AudioBuffer>;
+  pcmDataToAudioBuffer: (
+    pcmData: Uint8Array,
+    config?: PCMConfig,
+  ) => Promise<AudioBuffer>;
 };

-export function createTTSPlayer(): TTSPlayer {
-  let audioContext: AudioContext | null = null;
-  let audioBufferSourceNode: AudioBufferSourceNode | null = null;
-  let isPlaying = false;
-  let playQueue: (ArrayBuffer | AudioBuffer)[] = [];
-  let currentOnended: (() => void | null) | null = null;
-  let isStreamMode = false;
-  let streamFinished = false;
+// Audio processing utilities
+export interface PCMConfig {
+  channels?: number;
+  sampleRate?: number;
+  bitDepth?: 16 | 24 | 32;
+}

-  const init = () => {
-    console.log("[TTSPlayer] init");
-    audioContext = new (window.AudioContext || window.webkitAudioContext)();
-    audioContext.suspend();
-  };
+export class TTSPlayManager implements TTSPlayer {
+  private static audioContext: AudioContext | null = null;
+  private audioBufferSourceNode: AudioBufferSourceNode | null = null;
+  private isPlaying = false;
+  private playQueue: (ArrayBuffer | AudioBuffer)[] = [];
+  private currentOnended: (() => void | null) | null = null;
+  private isStreamMode = false;
+  private streamFinished = false;
+  private streamController: AbortController | null = null;

-  const play = async (
+  get getAudioContext() {
+    if (!TTSPlayManager.audioContext) {
+      TTSPlayManager.audioContext = new (window.AudioContext ||
+        window.webkitAudioContext)();
+    }
+    return TTSPlayManager.audioContext;
+  }
+
+  init() {
+    console.log("[TTSPlayManager] init");
+    if (TTSPlayManager.audioContext) {
+      return;
+    }
+    this.getAudioContext.suspend();
+  }
+
+  async play(
     audioBuffer: ArrayBuffer | AudioBuffer,
     onended: () => void | null,
-  ) => {
-    if (audioBufferSourceNode) {
-      audioBufferSourceNode.stop();
-      audioBufferSourceNode.disconnect();
+  ) {
+    if (this.audioBufferSourceNode) {
+      this.audioBufferSourceNode.stop();
+      this.audioBufferSourceNode.disconnect();
     }
     let buffer: AudioBuffer;
     if (audioBuffer instanceof AudioBuffer) {
       buffer = audioBuffer;
     } else {
-      buffer = await audioContext!.decodeAudioData(audioBuffer);
+      buffer = await TTSPlayManager.audioContext!.decodeAudioData(audioBuffer);
     }
-    audioBufferSourceNode = audioContext!.createBufferSource();
-    audioBufferSourceNode.buffer = buffer;
-    audioBufferSourceNode.connect(audioContext!.destination);
-    audioContext!.resume().then(() => {
-      audioBufferSourceNode!.start();
+    this.audioBufferSourceNode =
+      TTSPlayManager.audioContext!.createBufferSource();
+    this.audioBufferSourceNode.buffer = buffer;
+    this.audioBufferSourceNode.connect(
+      TTSPlayManager.audioContext!.destination,
+    );
+    this.getAudioContext.resume().then(() => {
+      this.audioBufferSourceNode!.start();
     });
-    audioBufferSourceNode.onended = onended;
-  };
+    this.audioBufferSourceNode.onended = onended;
+  }

-  const playNext = async () => {
-    if (playQueue.length === 0) {
+  async stop() {
+    console.log("[TTSPlayer] stop");
+    // First, abort the streaming request
+    try {
+      if (this.streamController && !this.streamController.signal.aborted) {
+        console.log("[TTSPlayer] Aborting stream request");
+        this.streamController.abort();
+      }
+    } catch (e) {
+      // Ignore errors raised while aborting the request
+      console.log("[TTSPlayer] Error while aborting stream:", e);
+    }
+    this.clearStreamController();
+
+    // Reset playback state
+    this.playQueue = [];
+    this.isPlaying = false;
+    this.isStreamMode = false;
+    this.streamFinished = true;
+    this.currentOnended = null;
+
+    // Stop audio playback
+    if (this.audioBufferSourceNode) {
+      this.audioBufferSourceNode.stop();
+      this.audioBufferSourceNode.disconnect();
+      this.audioBufferSourceNode = null;
+    }
+
+    // Close the audio context
+    if (TTSPlayManager.audioContext) {
+      await TTSPlayManager.audioContext.close();
+      TTSPlayManager.audioContext = null;
+    }
+  }
+
+  async playNext() {
+    if (this.playQueue.length === 0) {
       // In stream mode, if the queue is empty but the stream has not finished, wait
-      if (isStreamMode && !streamFinished) {
-        setTimeout(() => playNext(), 100);
+      if (this.isStreamMode && !this.streamFinished) {
+        setTimeout(() => this.playNext(), 100);
         return;
       }
-      isPlaying = false;
-      isStreamMode = false;
-      streamFinished = false;
-      if (currentOnended) {
-        currentOnended();
-        currentOnended = null;
+      this.isPlaying = false;
+      this.isStreamMode = false;
+      this.streamFinished = false;
+      if (this.currentOnended) {
+        this.currentOnended();
+        this.currentOnended = null;
       }
       return;
     }
-    const nextBuffer = playQueue.shift()!;
+    const nextBuffer = this.playQueue.shift()!;
     let buffer: AudioBuffer;
     if (nextBuffer instanceof AudioBuffer) {
       buffer = nextBuffer;
     } else {
-      buffer = await audioContext!.decodeAudioData(nextBuffer);
+      buffer = await this.getAudioContext.decodeAudioData(nextBuffer);
     }
-    if (audioBufferSourceNode) {
-      audioBufferSourceNode.stop();
-      audioBufferSourceNode.disconnect();
+    if (this.audioBufferSourceNode) {
+      this.audioBufferSourceNode.stop();
+      this.audioBufferSourceNode.disconnect();
     }
-    audioBufferSourceNode = audioContext!.createBufferSource();
-    audioBufferSourceNode.buffer = buffer;
-    audioBufferSourceNode.connect(audioContext!.destination);
-    audioBufferSourceNode.onended = () => {
-      playNext();
+    this.audioBufferSourceNode = this.getAudioContext.createBufferSource();
+    this.audioBufferSourceNode.buffer = buffer;
+    this.audioBufferSourceNode.connect(this.getAudioContext.destination);
+    this.audioBufferSourceNode.onended = () => {
+      this.playNext();
     };
-    await audioContext!.resume();
-    audioBufferSourceNode.start();
-  };
+    await this.getAudioContext.resume();
+    this.audioBufferSourceNode.start();
+  }

-  const playQueueMethod = async (
+  async playQueueMethod(
     audioBuffers: (ArrayBuffer | AudioBuffer)[],
     onended: () => void | null,
-  ) => {
-    playQueue = [...audioBuffers];
-    currentOnended = onended;
-    if (!isPlaying) {
-      isPlaying = true;
-      await playNext();
+  ) {
+    this.playQueue = [...audioBuffers];
+    this.currentOnended = onended;
+    if (!this.isPlaying) {
+      this.isPlaying = true;
+      await this.playNext();
     }
-  };
+  }

-  const addToQueue = (audioBuffer: ArrayBuffer | AudioBuffer) => {
-    if (streamFinished) {
+  addToQueue(audioBuffer: ArrayBuffer | AudioBuffer) {
+    if (this.streamFinished) {
       return;
     }
-    playQueue.push(audioBuffer);
-  };
+    this.playQueue.push(audioBuffer);
+  }

-  const startStreamPlay = (onended: () => void | null) => {
-    isStreamMode = true;
-    streamFinished = false;
-    playQueue = [];
-    currentOnended = onended;
-
-    if (!isPlaying) {
-      isPlaying = true;
-      playNext();
+  startStreamPlay(onended: () => void | null) {
+    this.isStreamMode = true;
+    this.streamFinished = false;
+    this.playQueue = [];
+    this.currentOnended = onended;
+    if (!this.isPlaying) {
+      this.isPlaying = true;
+      this.playNext();
     }
-  };
+  }

-  const finishStreamPlay = () => {
-    streamFinished = true;
-  };
+  finishStreamPlay() {
+    this.streamFinished = true;
+  }

-  const stop = async () => {
-    console.log("[TTSPlayer] stop");
-    playQueue = [];
-    isPlaying = false;
-    isStreamMode = false;
-    streamFinished = true;
-    currentOnended = null;
+  // Set the stream request controller so stop() can abort the request
+  setStreamController(controller: AbortController) {
+    this.streamController = controller;
+  }

-    if (audioBufferSourceNode) {
-      audioBufferSourceNode.stop();
-      audioBufferSourceNode.disconnect();
-      audioBufferSourceNode = null;
-    }
-    if (audioContext) {
-      await audioContext.close();
-      audioContext = null;
-    }
-  };
+  // Clear the stream request controller
+  clearStreamController() {
+    this.streamController = null;
+  }

-  return {
-    init,
-    play,
-    playQueue: playQueueMethod,
-    addToQueue,
-    startStreamPlay,
-    finishStreamPlay,
-    stop,
-  };
+  // Convert base64-encoded PCM data to an AudioBuffer
+  async pcmBase64ToAudioBuffer(
+    base64Data: string,
+    config: PCMConfig = {},
+  ): Promise<AudioBuffer> {
+    try {
+      // Decode base64
+      const binaryString = atob(base64Data);
+      const bytes = new Uint8Array(binaryString.length);
+      for (let i = 0; i < binaryString.length; i++) {
+        bytes[i] = binaryString.charCodeAt(i);
+      }
+      // Convert to AudioBuffer
+      return await this.pcmDataToAudioBuffer(bytes, config);
+    } catch (error) {
+      console.error("Failed to convert PCM base64 to AudioBuffer:", error);
+      throw error;
+    }
+  }
+
+  // Convert raw PCM bytes to an AudioBuffer
+  async pcmDataToAudioBuffer(
+    pcmData: Uint8Array,
+    config: PCMConfig = {},
+  ): Promise<AudioBuffer> {
+    const { channels = 1, sampleRate = 24000, bitDepth = 16 } = config;
+    const audioContext = this.getAudioContext;
+    return new Promise<AudioBuffer>((resolve, reject) => {
+      try {
+        let float32Array: Float32Array;
+        // Select the conversion method based on bit depth
+        switch (bitDepth) {
+          case 16:
+            float32Array = this.pcm16ToFloat32(pcmData);
+            break;
+          default:
+            throw new Error(`Unsupported bit depth: ${bitDepth}`);
+        }
+        // Create the AudioBuffer
+        const audioBuffer = audioContext.createBuffer(
+          channels,
+          float32Array.length / channels,
+          sampleRate,
+        );
+        // Copy samples into the AudioBuffer
+        for (let channel = 0; channel < channels; channel++) {
+          const channelData = audioBuffer.getChannelData(channel);
+          for (let i = 0; i < channelData.length; i++) {
+            channelData[i] = float32Array[i * channels + channel];
+          }
+        }
+        resolve(audioBuffer);
+      } catch (error) {
+        reject(error);
+      }
+    });
+  }
+
+  // Convert 16-bit PCM to 32-bit float
+  pcm16ToFloat32(pcmData: Uint8Array): Float32Array {
+    const length = pcmData.length / 2;
+    const float32Array = new Float32Array(length);
+    for (let i = 0; i < length; i++) {
+      const int16 = (pcmData[i * 2 + 1] << 8) | pcmData[i * 2];
+      const int16Signed = int16 > 32767 ? int16 - 65536 : int16;
+      float32Array[i] = int16Signed / 32768;
+    }
+    return float32Array;
+  }
+}
+
+export function createTTSPlayer(): TTSPlayManager {
+  return new TTSPlayManager();
 }
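
To close, a minimal usage sketch of the reworked player outside the chat component. The base64 chunk is illustrative silence; everything else uses the API exactly as defined above.

import { createTTSPlayer } from "@/app/utils/audio";

async function demoStreamingPlayback() {
  const player = createTTSPlayer();
  player.init();
  player.startStreamPlay(() => console.log("[demo] queue drained"));
  // 8 zero bytes of base64 -> a sliver of 24 kHz mono silence
  const buffer = await player.pcmBase64ToAudioBuffer("AAAAAAAAAAA=", {
    channels: 1,
    sampleRate: 24000,
    bitDepth: 16,
  });
  player.addToQueue(buffer);
  player.finishStreamPlay(); // let the queue drain, then the callback fires
  // player.stop() would instead abort any registered stream and close the context
}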