feat: update the speech synthesis API to support streaming playback and multiple audio formats

EvanWu committed on 2025-07-30 23:27:49 +08:00
parent 9990a89698
commit c5e6b1278f
4 changed files with 284 additions and 129 deletions

View File

@@ -107,7 +107,8 @@ export interface LLMModelProvider {
 export abstract class LLMApi {
   abstract chat(options: ChatOptions): Promise<void>;
-  abstract speech(options: SpeechOptions): Promise<ArrayBuffer>;
+  abstract speech(options: SpeechOptions): Promise<ArrayBuffer | AudioBuffer>;
+  abstract streamSpeech?(options: SpeechOptions): AsyncGenerator<AudioBuffer>;
   abstract usage(): Promise<LLMUsage>;
   abstract models(): Promise<LLMModel[]>;
 }
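Reviewer note: with the widened speech() return type and the optional streamSpeech() generator, callers are expected to feature-detect streaming and branch on the buffer type before playback. A minimal, self-contained sketch of that calling pattern, where SpeechLike, enqueue, and speak are illustrative names and not part of this commit:

// Illustrative sketch only: SpeechLike mirrors the slice of LLMApi used here,
// and enqueue() stands in for whatever player consumes the audio.
interface SpeechLike {
  speech(options: { input: string }): Promise<ArrayBuffer | AudioBuffer>;
  streamSpeech?(options: { input: string }): AsyncGenerator<AudioBuffer>;
}

declare function enqueue(buffer: AudioBuffer): void; // hypothetical player hook

async function speak(api: SpeechLike, input: string) {
  if (api.streamSpeech) {
    // Streaming path: decoded AudioBuffer chunks arrive incrementally.
    for await (const chunk of api.streamSpeech({ input })) {
      enqueue(chunk);
    }
  } else {
    // One-shot path: the result may be raw bytes or an already decoded buffer.
    const audio = await api.speech({ input });
    if (audio instanceof AudioBuffer) {
      enqueue(audio);
    } else {
      enqueue(await new AudioContext().decodeAudioData(audio));
    }
  }
}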

View File

@@ -1,5 +1,10 @@
 "use client";
-import { ApiPath, Alibaba, ALIBABA_BASE_URL, REQUEST_TIMEOUT_MS } from "@/app/constant";
+import {
+  ApiPath,
+  Alibaba,
+  ALIBABA_BASE_URL,
+  REQUEST_TIMEOUT_MS,
+} from "@/app/constant";
 import {
   useAccessStore,
   useAppConfig,
@@ -89,66 +94,71 @@ export class QwenApi implements LLMApi {
     return res?.output?.choices?.at(0)?.message?.content ?? "";
   }
 
-  async speech(options: SpeechOptions): Promise<AudioBuffer> {
+  async speech(options: SpeechOptions): Promise<ArrayBuffer> {
+    throw new Error("Method not implemented.");
+  }
+
+  async *streamSpeech(options: SpeechOptions): AsyncGenerator<AudioBuffer> {
     const requestPayload = {
       model: options.model,
       input: {
         text: options.input,
         voice: options.voice,
       },
       speed: options.speed,
       response_format: options.response_format,
     };
 
     console.log("[Request] alibaba speech payload: ", requestPayload);
 
     const controller = new AbortController();
     options.onController?.(controller);
 
     try {
       const speechPath = this.path(Alibaba.SpeechPath);
       const speechPayload = {
         method: "POST",
         body: JSON.stringify(requestPayload),
         signal: controller.signal,
         headers: {
           ...getHeaders(),
           "X-DashScope-SSE": "enable",
         },
       };
 
       // make a fetch request
       const requestTimeoutId = setTimeout(
         () => controller.abort(),
         REQUEST_TIMEOUT_MS,
       );
+
       const res = await fetch(speechPath, speechPayload);
+
       const reader = res.body!.getReader();
       const decoder = new TextDecoder();
       let buffer = "";
-      let base64 = "";
       while (true) {
         const { done, value } = await reader.read();
-        if (done) break;
-        buffer += decoder.decode(value, { stream: true, });
-        const lines = buffer.split('\n');
-        buffer = lines.pop() || '';
+        if (done) {
+          break;
+        }
+        buffer += decoder.decode(value, { stream: true });
+        const lines = buffer.split("\n");
+        buffer = lines.pop() || "";
         for (const line of lines) {
-          if (line.startsWith('data:')) {
+          if (line.startsWith("data:")) {
             const data = line.slice(5);
             const json = JSON.parse(data);
-            base64 += json.output.audio.data;
+            if (json.output.audio.data) {
+              yield this.PCMBase64ToAudioBuffer(json.output.audio.data);
+            }
           }
         }
       }
-      const audioBuffer = await this.PCMBase64ToAudioBuffer(base64);
       clearTimeout(requestTimeoutId);
       reader.releaseLock();
-      return audioBuffer;
     } catch (e) {
       console.log("[Request] failed to make a speech request", e);
       throw e;
     }
   }
 
   async chat(options: ChatOptions) {
@@ -335,67 +345,68 @@ export class QwenApi implements LLMApi {
   // Play PCM base64 data
   private async PCMBase64ToAudioBuffer(base64Data: string) {
     try {
       // Decode base64
       const binaryString = atob(base64Data);
       const bytes = new Uint8Array(binaryString.length);
       for (let i = 0; i < binaryString.length; i++) {
         bytes[i] = binaryString.charCodeAt(i);
       }
 
       // Convert to AudioBuffer
       const audioBuffer = await this.convertToAudioBuffer(bytes);
       return audioBuffer;
     } catch (error) {
-      console.error('Failed to play PCM data:', error);
+      console.error("Failed to play PCM data:", error);
       throw error;
     }
   }
 
   // Convert PCM byte data to an AudioBuffer
   private convertToAudioBuffer(pcmData: Uint8Array) {
-    const audioContext = new (window.AudioContext || window.webkitAudioContext)();
+    const audioContext = new (window.AudioContext ||
+      window.webkitAudioContext)();
     const channels = 1;
     const sampleRate = 24000;
 
     return new Promise<AudioBuffer>((resolve, reject) => {
       try {
         let float32Array;
         // Convert 16-bit PCM to 32-bit float
         float32Array = this.pcm16ToFloat32(pcmData);
 
         // Create the AudioBuffer
         const audioBuffer = audioContext.createBuffer(
           channels,
           float32Array.length / channels,
-          sampleRate
+          sampleRate,
         );
 
         // Copy the data into the AudioBuffer
         for (let channel = 0; channel < channels; channel++) {
           const channelData = audioBuffer.getChannelData(channel);
           for (let i = 0; i < channelData.length; i++) {
             channelData[i] = float32Array[i * channels + channel];
           }
         }
 
         resolve(audioBuffer);
       } catch (error) {
         reject(error);
       }
     });
   }
 
   // Convert 16-bit PCM to 32-bit float
   private pcm16ToFloat32(pcmData: Uint8Array) {
     const length = pcmData.length / 2;
     const float32Array = new Float32Array(length);
 
     for (let i = 0; i < length; i++) {
       const int16 = (pcmData[i * 2 + 1] << 8) | pcmData[i * 2];
       const int16Signed = int16 > 32767 ? int16 - 65536 : int16;
       float32Array[i] = int16Signed / 32768;
     }
 
     return float32Array;
   }
 }
 
 export { Alibaba };

View File

@@ -101,8 +101,6 @@ import {
 import { useNavigate } from "react-router-dom";
 import {
   CHAT_PAGE_SIZE,
-  DEFAULT_TTS_ENGINE,
-  ModelProvider,
   Path,
   REQUEST_TIMEOUT_MS,
   ServiceProvider,
@@ -1286,6 +1284,7 @@ function _Chat() {
   const accessStore = useAccessStore();
   const [speechStatus, setSpeechStatus] = useState(false);
   const [speechLoading, setSpeechLoading] = useState(false);
+  const [speechCooldown, setSpeechCooldown] = useState(false);
 
   async function openaiSpeech(text: string) {
     if (speechStatus) {
@@ -1297,10 +1296,10 @@
       api = new ClientApi(config.ttsConfig.modelProvider);
       setSpeechLoading(true);
       ttsPlayer.init();
-      let audioBuffer: ArrayBuffer;
+      let audioBuffer: ArrayBuffer | AudioBuffer;
       const { markdownToTxt } = require("markdown-to-txt");
       const textContent = markdownToTxt(text);
-      console.log("[OpenAI Speech] textContent: ", config, textContent);
+      console.log("[OpenAI Speech] textContent: ", textContent);
       if (config.ttsConfig.engine === "Edge") {
         const edgeVoiceName = accessStore.edgeVoiceName();
         const tts = new MsEdgeTTS();
@@ -1309,28 +1308,61 @@
           OUTPUT_FORMAT.AUDIO_24KHZ_96KBITRATE_MONO_MP3,
         );
         audioBuffer = await tts.toArrayBuffer(textContent);
+        playSpeech(audioBuffer);
       } else {
-        audioBuffer = await api.llm.speech({
-          model: config.ttsConfig.model,
-          input: textContent,
-          voice: config.ttsConfig.voice,
-          speed: config.ttsConfig.speed,
-        });
+        if (api.llm.streamSpeech) {
+          // Use streaming playback: play audio as it is received
+          setSpeechStatus(true);
+          ttsPlayer.startStreamPlay(() => {
+            setSpeechStatus(false);
+          });
+          try {
+            for await (const chunk of api.llm.streamSpeech({
+              model: config.ttsConfig.model,
+              input: textContent,
+              voice: config.ttsConfig.voice,
+              speed: config.ttsConfig.speed,
+            })) {
+              console.log("[Stream Speech] add to queue", chunk);
+              ttsPlayer.addToQueue(chunk);
+            }
+            ttsPlayer.finishStreamPlay();
+          } catch (e) {
+            console.error("[Stream Speech]", e);
+            showToast(prettyObject(e));
+            setSpeechStatus(false);
+            ttsPlayer.stop();
+          } finally {
+            setSpeechLoading(false);
+          }
+        } else {
+          audioBuffer = await api.llm.speech({
+            model: config.ttsConfig.model,
+            input: textContent,
+            voice: config.ttsConfig.voice,
+            speed: config.ttsConfig.speed,
+          });
+          playSpeech(audioBuffer);
+        }
       }
-      setSpeechStatus(true);
-      ttsPlayer
-        .play(audioBuffer, () => {
-          setSpeechStatus(false);
-        })
-        .catch((e) => {
-          console.error("[OpenAI Speech]", e);
-          showToast(prettyObject(e));
-          setSpeechStatus(false);
-        })
-        .finally(() => setSpeechLoading(false));
     }
   }
 
+  function playSpeech(audioBuffer: ArrayBuffer | AudioBuffer) {
+    setSpeechStatus(true);
+    ttsPlayer
+      .play(audioBuffer, () => {
+        setSpeechStatus(false);
+      })
+      .catch((e) => {
+        console.error("[OpenAI Speech]", e);
+        showToast(prettyObject(e));
+        setSpeechStatus(false);
+      })
+      .finally(() => setSpeechLoading(false));
+  }
+
   const context: RenderMessage[] = useMemo(() => {
     return session.mask.hideContext ? [] : session.mask.context.slice();
   }, [session.mask.context, session.mask.hideContext]);

View File

@@ -1,19 +1,38 @@
 type TTSPlayer = {
   init: () => void;
-  play: (audioBuffer: ArrayBuffer, onended: () => void | null) => Promise<void>;
+  play: (
+    audioBuffer: ArrayBuffer | AudioBuffer,
+    onended: () => void | null,
+  ) => Promise<void>;
+  playQueue: (
+    audioBuffers: (ArrayBuffer | AudioBuffer)[],
+    onended: () => void | null,
+  ) => Promise<void>;
+  addToQueue: (audioBuffer: ArrayBuffer | AudioBuffer) => void;
+  startStreamPlay: (onended: () => void | null) => void;
+  finishStreamPlay: () => void;
   stop: () => void;
 };
 
 export function createTTSPlayer(): TTSPlayer {
   let audioContext: AudioContext | null = null;
   let audioBufferSourceNode: AudioBufferSourceNode | null = null;
+  let isPlaying = false;
+  let playQueue: (ArrayBuffer | AudioBuffer)[] = [];
+  let currentOnended: (() => void | null) | null = null;
+  let isStreamMode = false;
+  let streamFinished = false;
 
   const init = () => {
+    console.log("[TTSPlayer] init");
     audioContext = new (window.AudioContext || window.webkitAudioContext)();
     audioContext.suspend();
   };
 
-  const play = async (audioBuffer: ArrayBuffer | AudioBuffer, onended: () => void | null) => {
+  const play = async (
+    audioBuffer: ArrayBuffer | AudioBuffer,
+    onended: () => void | null,
+  ) => {
     if (audioBufferSourceNode) {
       audioBufferSourceNode.stop();
       audioBufferSourceNode.disconnect();
@@ -33,17 +52,109 @@ export function createTTSPlayer(): TTSPlayer {
     audioBufferSourceNode.onended = onended;
   };
 
-  const stop = () => {
+  const playNext = async () => {
+    if (playQueue.length === 0) {
+      // In stream mode, if the queue is empty but the stream has not finished yet, wait
+      if (isStreamMode && !streamFinished) {
+        setTimeout(() => playNext(), 100);
+        return;
+      }
+      isPlaying = false;
+      isStreamMode = false;
+      streamFinished = false;
+      if (currentOnended) {
+        currentOnended();
+        currentOnended = null;
+      }
+      return;
+    }
+
+    const nextBuffer = playQueue.shift()!;
+    let buffer: AudioBuffer;
+    if (nextBuffer instanceof AudioBuffer) {
+      buffer = nextBuffer;
+    } else {
+      buffer = await audioContext!.decodeAudioData(nextBuffer);
+    }
+
+    if (audioBufferSourceNode) {
+      audioBufferSourceNode.stop();
+      audioBufferSourceNode.disconnect();
+    }
+
+    audioBufferSourceNode = audioContext!.createBufferSource();
+    audioBufferSourceNode.buffer = buffer;
+    audioBufferSourceNode.connect(audioContext!.destination);
+    audioBufferSourceNode.onended = () => {
+      playNext();
+    };
+
+    await audioContext!.resume();
+    audioBufferSourceNode.start();
+  };
+
+  const playQueueMethod = async (
+    audioBuffers: (ArrayBuffer | AudioBuffer)[],
+    onended: () => void | null,
+  ) => {
+    playQueue = [...audioBuffers];
+    currentOnended = onended;
+    if (!isPlaying) {
+      isPlaying = true;
+      await playNext();
+    }
+  };
+
+  const addToQueue = (audioBuffer: ArrayBuffer | AudioBuffer) => {
+    if (streamFinished) {
+      return;
+    }
+    playQueue.push(audioBuffer);
+  };
+
+  const startStreamPlay = (onended: () => void | null) => {
+    isStreamMode = true;
+    streamFinished = false;
+    playQueue = [];
+    currentOnended = onended;
+    if (!isPlaying) {
+      isPlaying = true;
+      playNext();
+    }
+  };
+
+  const finishStreamPlay = () => {
+    streamFinished = true;
+  };
+
+  const stop = async () => {
+    console.log("[TTSPlayer] stop");
+    playQueue = [];
+    isPlaying = false;
+    isStreamMode = false;
+    streamFinished = true;
+    currentOnended = null;
     if (audioBufferSourceNode) {
       audioBufferSourceNode.stop();
       audioBufferSourceNode.disconnect();
       audioBufferSourceNode = null;
     }
     if (audioContext) {
-      audioContext.close();
+      await audioContext.close();
       audioContext = null;
     }
   };
 
-  return { init, play, stop };
+  return {
+    init,
+    play,
+    playQueue: playQueueMethod,
+    addToQueue,
+    startStreamPlay,
+    finishStreamPlay,
+    stop,
+  };
 }
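Reviewer note: the streaming side of the player is driven in three steps: startStreamPlay begins the drain loop, addToQueue appends decoded chunks as they arrive, and finishStreamPlay lets the loop exit once the queue empties. A hedged usage sketch, assuming an async iterable of AudioBuffer chunks such as the streamSpeech generator added in this commit; playStream is an illustrative name, not part of the change:

// Usage sketch for the streaming API of createTTSPlayer above.
// `chunks` is any async iterable of AudioBuffer, e.g. api.llm.streamSpeech(...).
async function playStream(chunks: AsyncIterable<AudioBuffer>) {
  const player = createTTSPlayer();
  player.init(); // creates and suspends the AudioContext

  player.startStreamPlay(() => {
    console.log("stream playback finished");
  });

  try {
    for await (const chunk of chunks) {
      player.addToQueue(chunk); // playNext() picks these up as they arrive
    }
  } finally {
    // Tell the drain loop it may stop once the queue is empty.
    player.finishStreamPlay();
  }
}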