Mirror of https://github.com/ChatGPTNextWeb/ChatGPT-Next-Web.git
feat: Update the speech synthesis API to support streaming playback and multiple audio formats
parent 9990a89698
commit c5e6b1278f
@@ -107,7 +107,8 @@ export interface LLMModelProvider {

 export abstract class LLMApi {
   abstract chat(options: ChatOptions): Promise<void>;
-  abstract speech(options: SpeechOptions): Promise<ArrayBuffer>;
+  abstract speech(options: SpeechOptions): Promise<ArrayBuffer | AudioBuffer>;
+  abstract streamSpeech?(options: SpeechOptions): AsyncGenerator<AudioBuffer>;
   abstract usage(): Promise<LLMUsage>;
   abstract models(): Promise<LLMModel[]>;
 }
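Editorial note: because streamSpeech is declared optional, providers that only implement the buffered speech() call still satisfy LLMApi; callers are expected to feature-detect streaming and fall back, as the chat component does later in this diff. A minimal sketch of that pattern, using the types from the hunk above (speakWithFallback and sink are illustrative names, not part of this commit):

// Sketch only: prefer streaming synthesis when the provider implements the
// optional streamSpeech member, otherwise fall back to the one-shot speech().
// `sink` stands in for any consumer of decoded audio, e.g. the TTS player below.
async function speakWithFallback(
  llm: LLMApi,
  options: SpeechOptions,
  sink: { add(chunk: ArrayBuffer | AudioBuffer): void; finish(): void },
): Promise<void> {
  if (llm.streamSpeech) {
    for await (const chunk of llm.streamSpeech(options)) {
      sink.add(chunk); // play each chunk as soon as it is decoded
    }
  } else {
    sink.add(await llm.speech(options)); // one buffer for the whole utterance
  }
  sink.finish();
}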
@@ -1,5 +1,10 @@
 "use client";
-import { ApiPath, Alibaba, ALIBABA_BASE_URL, REQUEST_TIMEOUT_MS } from "@/app/constant";
+import {
+  ApiPath,
+  Alibaba,
+  ALIBABA_BASE_URL,
+  REQUEST_TIMEOUT_MS,
+} from "@/app/constant";
 import {
   useAccessStore,
   useAppConfig,
@@ -89,7 +94,11 @@ export class QwenApi implements LLMApi {
     return res?.output?.choices?.at(0)?.message?.content ?? "";
   }

-  async speech(options: SpeechOptions): Promise<AudioBuffer> {
+  async speech(options: SpeechOptions): Promise<ArrayBuffer> {
+    throw new Error("Method not implemented.");
+  }
+
+  async *streamSpeech(options: SpeechOptions): AsyncGenerator<AudioBuffer> {
     const requestPayload = {
       model: options.model,
       input: {
@@ -125,26 +134,27 @@ export class QwenApi implements LLMApi {
       const reader = res.body!.getReader();
       const decoder = new TextDecoder();
       let buffer = "";
-      let base64 = "";
       while (true) {
         const { done, value } = await reader.read();
-        if (done) break;
-        buffer += decoder.decode(value, { stream: true, });
-        const lines = buffer.split('\n');
-        buffer = lines.pop() || '';
+        if (done) {
+          break;
+        }
+        buffer += decoder.decode(value, { stream: true });
+        const lines = buffer.split("\n");
+        buffer = lines.pop() || "";

         for (const line of lines) {
-          if (line.startsWith('data:')) {
+          if (line.startsWith("data:")) {
             const data = line.slice(5);
             const json = JSON.parse(data);
-            base64 += json.output.audio.data;
+            if (json.output.audio.data) {
+              yield this.PCMBase64ToAudioBuffer(json.output.audio.data);
+            }
           }
         }
       }
-      const audioBuffer = await this.PCMBase64ToAudioBuffer(base64);
       clearTimeout(requestTimeoutId);
       reader.releaseLock();
-      return audioBuffer;
     } catch (e) {
       console.log("[Request] failed to make a speech request", e);
       throw e;
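Editorial note: the generator above re-frames the SSE byte stream by hand: decoded text accumulates in buffer, is split on newlines, and the last, possibly incomplete line is carried over to the next read. A standalone sketch of just that framing step, with the Qwen-specific JSON handling factored out (readSseLines and onData are illustrative names, not part of this commit):

// Sketch only: incremental "data:" line framing over a streamed response body.
// onData receives the text after the "data:" prefix once a full line is available;
// partial lines are kept in `buffer` until the next chunk completes them.
async function readSseLines(
  body: ReadableStream<Uint8Array>,
  onData: (payload: string) => void,
): Promise<void> {
  const reader = body.getReader();
  const decoder = new TextDecoder();
  let buffer = "";
  while (true) {
    const { done, value } = await reader.read();
    if (done) {
      break;
    }
    buffer += decoder.decode(value, { stream: true });
    const lines = buffer.split("\n");
    buffer = lines.pop() || "";
    for (const line of lines) {
      if (line.startsWith("data:")) {
        onData(line.slice(5));
      }
    }
  }
  reader.releaseLock();
}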
@@ -347,14 +357,15 @@ export class QwenApi implements LLMApi {

       return audioBuffer;
     } catch (error) {
-      console.error('Failed to play PCM data:', error);
+      console.error("Failed to play PCM data:", error);
       throw error;
     }
   }

   // Convert PCM byte data to an AudioBuffer
   private convertToAudioBuffer(pcmData: Uint8Array) {
-    const audioContext = new (window.AudioContext || window.webkitAudioContext)();
+    const audioContext = new (window.AudioContext ||
+      window.webkitAudioContext)();
     const channels = 1;
     const sampleRate = 24000;
     return new Promise<AudioBuffer>((resolve, reject) => {
@@ -367,7 +378,7 @@ export class QwenApi implements LLMApi {
         const audioBuffer = audioContext.createBuffer(
           channels,
           float32Array.length / channels,
-          sampleRate
+          sampleRate,
         );

         // Copy the data into the AudioBuffer
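Editorial note: the two hunks above only touch formatting, but they show the assumptions of the conversion helpers: mono PCM at 24 kHz decoded via createBuffer. A sketch of the full base64-PCM-to-AudioBuffer conversion under the additional assumption of signed 16-bit little-endian samples (the actual bodies of PCMBase64ToAudioBuffer / convertToAudioBuffer are not shown in this diff):

// Sketch only: decode base64 PCM (assumed 16-bit signed LE, mono, 24 kHz)
// into an AudioBuffer that the TTS player can schedule directly.
function pcmBase64ToAudioBufferSketch(
  base64: string,
  audioContext: AudioContext,
  sampleRate = 24000,
): AudioBuffer {
  const bytes = Uint8Array.from(atob(base64), (c) => c.charCodeAt(0));
  const samples = new Int16Array(bytes.buffer, 0, Math.floor(bytes.byteLength / 2));
  const float32 = new Float32Array(samples.length);
  for (let i = 0; i < samples.length; i++) {
    float32[i] = samples[i] / 32768; // map the int16 range onto [-1, 1)
  }
  const buffer = audioContext.createBuffer(1, float32.length, sampleRate);
  buffer.getChannelData(0).set(float32);
  return buffer;
}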
@@ -101,8 +101,6 @@ import {
 import { useNavigate } from "react-router-dom";
 import {
   CHAT_PAGE_SIZE,
-  DEFAULT_TTS_ENGINE,
-  ModelProvider,
   Path,
   REQUEST_TIMEOUT_MS,
   ServiceProvider,
@@ -1286,6 +1284,7 @@ function _Chat() {
   const accessStore = useAccessStore();
   const [speechStatus, setSpeechStatus] = useState(false);
   const [speechLoading, setSpeechLoading] = useState(false);
+  const [speechCooldown, setSpeechCooldown] = useState(false);

   async function openaiSpeech(text: string) {
     if (speechStatus) {
@@ -1297,10 +1296,10 @@ function _Chat() {
     api = new ClientApi(config.ttsConfig.modelProvider);
     setSpeechLoading(true);
     ttsPlayer.init();
-    let audioBuffer: ArrayBuffer;
+    let audioBuffer: ArrayBuffer | AudioBuffer;
     const { markdownToTxt } = require("markdown-to-txt");
     const textContent = markdownToTxt(text);
-    console.log("[OpenAI Speech] textContent: ", config, textContent);
+    console.log("[OpenAI Speech] textContent: ", textContent);
     if (config.ttsConfig.engine === "Edge") {
       const edgeVoiceName = accessStore.edgeVoiceName();
       const tts = new MsEdgeTTS();
@@ -1309,6 +1308,34 @@ function _Chat() {
         OUTPUT_FORMAT.AUDIO_24KHZ_96KBITRATE_MONO_MP3,
       );
       audioBuffer = await tts.toArrayBuffer(textContent);
+      playSpeech(audioBuffer);
     } else {
+      if (api.llm.streamSpeech) {
+        // Use streaming playback: play chunks as they are received
+        setSpeechStatus(true);
+        ttsPlayer.startStreamPlay(() => {
+          setSpeechStatus(false);
+        });
+
+        try {
+          for await (const chunk of api.llm.streamSpeech({
+            model: config.ttsConfig.model,
+            input: textContent,
+            voice: config.ttsConfig.voice,
+            speed: config.ttsConfig.speed,
+          })) {
+            console.log("[Stream Speech] add to queue", chunk);
+            ttsPlayer.addToQueue(chunk);
+          }
+          ttsPlayer.finishStreamPlay();
+        } catch (e) {
+          console.error("[Stream Speech]", e);
+          showToast(prettyObject(e));
+          setSpeechStatus(false);
+          ttsPlayer.stop();
+        } finally {
+          setSpeechLoading(false);
+        }
+      } else {
         audioBuffer = await api.llm.speech({
           model: config.ttsConfig.model,
@@ -1316,7 +1343,13 @@ function _Chat() {
           voice: config.ttsConfig.voice,
           speed: config.ttsConfig.speed,
         });
+        playSpeech(audioBuffer);
+      }
       }
+    }
+  }
+
+  function playSpeech(audioBuffer: ArrayBuffer | AudioBuffer) {
     setSpeechStatus(true);
     ttsPlayer
       .play(audioBuffer, () => {
@@ -1329,7 +1362,6 @@ function _Chat() {
       })
       .finally(() => setSpeechLoading(false));
   }
-  }

   const context: RenderMessage[] = useMemo(() => {
     return session.mask.hideContext ? [] : session.mask.context.slice();
@@ -1,19 +1,38 @@
 type TTSPlayer = {
   init: () => void;
-  play: (audioBuffer: ArrayBuffer, onended: () => void | null) => Promise<void>;
+  play: (
+    audioBuffer: ArrayBuffer | AudioBuffer,
+    onended: () => void | null,
+  ) => Promise<void>;
+  playQueue: (
+    audioBuffers: (ArrayBuffer | AudioBuffer)[],
+    onended: () => void | null,
+  ) => Promise<void>;
+  addToQueue: (audioBuffer: ArrayBuffer | AudioBuffer) => void;
+  startStreamPlay: (onended: () => void | null) => void;
+  finishStreamPlay: () => void;
   stop: () => void;
 };

 export function createTTSPlayer(): TTSPlayer {
   let audioContext: AudioContext | null = null;
   let audioBufferSourceNode: AudioBufferSourceNode | null = null;
+  let isPlaying = false;
+  let playQueue: (ArrayBuffer | AudioBuffer)[] = [];
+  let currentOnended: (() => void | null) | null = null;
+  let isStreamMode = false;
+  let streamFinished = false;

   const init = () => {
+    console.log("[TTSPlayer] init");
     audioContext = new (window.AudioContext || window.webkitAudioContext)();
     audioContext.suspend();
   };

-  const play = async (audioBuffer: ArrayBuffer | AudioBuffer, onended: () => void | null) => {
+  const play = async (
+    audioBuffer: ArrayBuffer | AudioBuffer,
+    onended: () => void | null,
+  ) => {
     if (audioBufferSourceNode) {
       audioBufferSourceNode.stop();
       audioBufferSourceNode.disconnect();
@@ -33,17 +52,109 @@ export function createTTSPlayer(): TTSPlayer {
     audioBufferSourceNode.onended = onended;
   };

-  const stop = () => {
+  const playNext = async () => {
+    if (playQueue.length === 0) {
+      // In stream mode, if the queue is empty but the stream has not finished yet, wait
+      if (isStreamMode && !streamFinished) {
+        setTimeout(() => playNext(), 100);
+        return;
+      }
+
+      isPlaying = false;
+      isStreamMode = false;
+      streamFinished = false;
+      if (currentOnended) {
+        currentOnended();
+        currentOnended = null;
+      }
+      return;
+    }
+
+    const nextBuffer = playQueue.shift()!;
+    let buffer: AudioBuffer;
+    if (nextBuffer instanceof AudioBuffer) {
+      buffer = nextBuffer;
+    } else {
+      buffer = await audioContext!.decodeAudioData(nextBuffer);
+    }
+
+    if (audioBufferSourceNode) {
+      audioBufferSourceNode.stop();
+      audioBufferSourceNode.disconnect();
+    }
+
+    audioBufferSourceNode = audioContext!.createBufferSource();
+    audioBufferSourceNode.buffer = buffer;
+    audioBufferSourceNode.connect(audioContext!.destination);
+    audioBufferSourceNode.onended = () => {
+      playNext();
+    };
+
+    await audioContext!.resume();
+    audioBufferSourceNode.start();
+  };
+
+  const playQueueMethod = async (
+    audioBuffers: (ArrayBuffer | AudioBuffer)[],
+    onended: () => void | null,
+  ) => {
+    playQueue = [...audioBuffers];
+    currentOnended = onended;
+    if (!isPlaying) {
+      isPlaying = true;
+      await playNext();
+    }
+  };
+
+  const addToQueue = (audioBuffer: ArrayBuffer | AudioBuffer) => {
+    if (streamFinished) {
+      return;
+    }
+    playQueue.push(audioBuffer);
+  };
+
+  const startStreamPlay = (onended: () => void | null) => {
+    isStreamMode = true;
+    streamFinished = false;
+    playQueue = [];
+    currentOnended = onended;
+
+    if (!isPlaying) {
+      isPlaying = true;
+      playNext();
+    }
+  };
+
+  const finishStreamPlay = () => {
+    streamFinished = true;
+  };
+
+  const stop = async () => {
     console.log("[TTSPlayer] stop");
+    playQueue = [];
+    isPlaying = false;
+    isStreamMode = false;
+    streamFinished = true;
+    currentOnended = null;
+
     if (audioBufferSourceNode) {
       audioBufferSourceNode.stop();
       audioBufferSourceNode.disconnect();
       audioBufferSourceNode = null;
     }
     if (audioContext) {
-      audioContext.close();
+      await audioContext.close();
       audioContext = null;
     }
   };

-  return { init, play, stop };
+  return {
+    init,
+    play,
+    playQueue: playQueueMethod,
+    addToQueue,
+    startStreamPlay,
+    finishStreamPlay,
+    stop,
+  };
 }
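Editorial note: the streaming path drives the player with three calls: startStreamPlay arms the queue loop, addToQueue appends each decoded chunk, and finishStreamPlay lets playNext drain the queue and fire the completion callback. A minimal usage sketch using createTTSPlayer as defined above (the chunks argument is an illustrative stand-in for QwenApi.streamSpeech or any other AsyncIterable of AudioBuffers):

// Sketch only: feeding the streaming TTS player from any async source of AudioBuffers.
async function playStream(chunks: AsyncIterable<AudioBuffer>): Promise<void> {
  const player = createTTSPlayer();
  player.init();
  player.startStreamPlay(() => console.log("[TTSPlayer] stream finished"));
  for await (const chunk of chunks) {
    player.addToQueue(chunk); // queued chunks are picked up by playNext
  }
  player.finishStreamPlay();
}

Note that playNext polls every 100 ms while the queue is empty in stream mode, so playback of a newly queued chunk can lag its arrival by up to roughly 100 ms.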