Mirror of https://github.com/ChatGPTNextWeb/ChatGPT-Next-Web.git (synced 2025-09-21 18:56:37 +08:00)
feat: add TTS engine configuration, update the Alibaba speech API, and support realtime speech synthesis
This commit is contained in:
parent 557a2cce35
commit 9990a89698
.yarnrc.yml (new file, +1)

@@ -0,0 +1 @@
+nodeLinker: node-modules
app/client/platforms/alibaba.ts

@@ -1,5 +1,5 @@
 "use client";
-import { ApiPath, Alibaba, ALIBABA_BASE_URL } from "@/app/constant";
+import { ApiPath, Alibaba, ALIBABA_BASE_URL, REQUEST_TIMEOUT_MS } from "@/app/constant";
 import {
   useAccessStore,
   useAppConfig,
@@ -89,8 +89,66 @@ export class QwenApi implements LLMApi {
     return res?.output?.choices?.at(0)?.message?.content ?? "";
   }

-  speech(options: SpeechOptions): Promise<ArrayBuffer> {
-    throw new Error("Method not implemented.");
+  async speech(options: SpeechOptions): Promise<AudioBuffer> {
+    const requestPayload = {
+      model: options.model,
+      input: {
+        text: options.input,
+        voice: options.voice,
+      },
+      speed: options.speed,
+      response_format: options.response_format,
+    };
+    console.log("[Request] alibaba speech payload: ", requestPayload);
+
+    const controller = new AbortController();
+    options.onController?.(controller);
+
+    try {
+      const speechPath = this.path(Alibaba.SpeechPath);
+      const speechPayload = {
+        method: "POST",
+        body: JSON.stringify(requestPayload),
+        signal: controller.signal,
+        headers: {
+          ...getHeaders(),
+          "X-DashScope-SSE": "enable",
+        },
+      };
+
+      // make a fetch request
+      const requestTimeoutId = setTimeout(
+        () => controller.abort(),
+        REQUEST_TIMEOUT_MS,
+      );
+
+      const res = await fetch(speechPath, speechPayload);
+
+      const reader = res.body!.getReader();
+      const decoder = new TextDecoder();
+      let buffer = "";
+      let base64 = "";
+      while (true) {
+        const { done, value } = await reader.read();
+        if (done) break;
+        buffer += decoder.decode(value, { stream: true });
+        const lines = buffer.split("\n");
+        buffer = lines.pop() || "";
+
+        for (const line of lines) {
+          if (line.startsWith("data:")) {
+            const data = line.slice(5);
+            const json = JSON.parse(data);
+            base64 += json.output.audio.data;
+          }
+        }
+      }
+      const audioBuffer = await this.PCMBase64ToAudioBuffer(base64);
+      clearTimeout(requestTimeoutId);
+      reader.releaseLock();
+      return audioBuffer;
+    } catch (e) {
+      console.log("[Request] failed to make a speech request", e);
+      throw e;
+    }
   }

   async chat(options: ChatOptions) {
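Reviewer note: a hypothetical call site for the new method, to show how the pieces fit. `QwenApi` and `SpeechOptions` come from this file; the import path, option values, and `response_format` value are assumptions for illustration, not part of the commit.

```ts
// Hedged sketch: exercising the new Alibaba speech() path (values illustrative).
import { QwenApi } from "@/app/client/platforms/alibaba";

async function demoAlibabaTTS(): Promise<AudioBuffer> {
  const api = new QwenApi();
  return api.speech({
    model: "qwen-tts",      // one of ALIBABA_TTS.Model
    input: "你好,世界",     // text to synthesize
    voice: "Chelsie",       // one of ALIBABA_TTS.Voices
    speed: 1.0,
    response_format: "pcm", // assumption: DashScope streams base64 PCM chunks over SSE
  });
}
// The returned AudioBuffer is already decoded, so it can go straight to
// ttsPlayer.play() (see the audio player hunk below) without decodeAudioData.
```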
@@ -273,5 +331,71 @@ export class QwenApi implements LLMApi {
   async models(): Promise<LLMModel[]> {
     return [];
   }
+
+  // Decode base64-encoded PCM data into an AudioBuffer
+  private async PCMBase64ToAudioBuffer(base64Data: string) {
+    try {
+      // decode the base64 payload into raw bytes
+      const binaryString = atob(base64Data);
+      const bytes = new Uint8Array(binaryString.length);
+      for (let i = 0; i < binaryString.length; i++) {
+        bytes[i] = binaryString.charCodeAt(i);
+      }
+
+      // convert the bytes to an AudioBuffer
+      const audioBuffer = await this.convertToAudioBuffer(bytes);
+
+      return audioBuffer;
+    } catch (error) {
+      console.error("Failed to decode PCM data:", error);
+      throw error;
+    }
+  }
+
+  // Convert raw PCM bytes to an AudioBuffer
+  private convertToAudioBuffer(pcmData: Uint8Array) {
+    const audioContext = new (window.AudioContext ||
+      window.webkitAudioContext)();
+    const channels = 1;
+    const sampleRate = 24000;
+    return new Promise<AudioBuffer>((resolve, reject) => {
+      try {
+        // convert 16-bit PCM samples to 32-bit floats
+        const float32Array = this.pcm16ToFloat32(pcmData);
+
+        // create the AudioBuffer
+        const audioBuffer = audioContext.createBuffer(
+          channels,
+          float32Array.length / channels,
+          sampleRate,
+        );
+
+        // copy the samples into the AudioBuffer
+        for (let channel = 0; channel < channels; channel++) {
+          const channelData = audioBuffer.getChannelData(channel);
+          for (let i = 0; i < channelData.length; i++) {
+            channelData[i] = float32Array[i * channels + channel];
+          }
+        }
+
+        resolve(audioBuffer);
+      } catch (error) {
+        reject(error);
+      }
+    });
+  }
+
+  // 16-bit PCM (little-endian) to 32-bit float
+  private pcm16ToFloat32(pcmData: Uint8Array) {
+    const length = pcmData.length / 2;
+    const float32Array = new Float32Array(length);
+
+    for (let i = 0; i < length; i++) {
+      const int16 = (pcmData[i * 2 + 1] << 8) | pcmData[i * 2];
+      const int16Signed = int16 > 32767 ? int16 - 65536 : int16;
+      float32Array[i] = int16Signed / 32768;
+    }
+
+    return float32Array;
+  }
 }
 export { Alibaba };
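Reviewer note: `pcm16ToFloat32` assumes little-endian samples — the low byte comes first, the high byte is shifted up, and values above 0x7fff wrap to negative via two's complement. A standalone sketch with concrete bytes (the function name here is illustrative):

```ts
// Standalone sketch of the same 16-bit little-endian PCM decode.
function pcm16leToFloat32(bytes: Uint8Array): Float32Array {
  const out = new Float32Array(bytes.length / 2);
  for (let i = 0; i < out.length; i++) {
    const u16 = (bytes[i * 2 + 1] << 8) | bytes[i * 2]; // unsigned 0..65535
    const s16 = u16 > 32767 ? u16 - 65536 : u16;        // reinterpret as signed
    out[i] = s16 / 32768;                               // scale to [-1, 1)
  }
  return out;
}

// [0x00, 0x80] -> 0x8000 -> -32768 -> -1.0
// [0xff, 0x7f] -> 0x7fff ->  32767 ->  0.99997
console.log(pcm16leToFloat32(new Uint8Array([0x00, 0x80, 0xff, 0x7f])));
```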
app/components/chat.tsx

@@ -1293,14 +1293,15 @@ function _Chat() {
       setSpeechStatus(false);
     } else {
       var api: ClientApi;
-      api = new ClientApi(ModelProvider.GPT);
       const config = useAppConfig.getState();
+      api = new ClientApi(config.ttsConfig.modelProvider);
       setSpeechLoading(true);
       ttsPlayer.init();
       let audioBuffer: ArrayBuffer;
       const { markdownToTxt } = require("markdown-to-txt");
       const textContent = markdownToTxt(text);
-      if (config.ttsConfig.engine !== DEFAULT_TTS_ENGINE) {
+      console.log("[OpenAI Speech] textContent: ", config, textContent);
+      if (config.ttsConfig.engine === "Edge") {
         const edgeVoiceName = accessStore.edgeVoiceName();
         const tts = new MsEdgeTTS();
         await tts.setMetadata(
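Reviewer note: the behavioral change in this hunk is that TTS no longer always instantiates the GPT client; the engine's `modelProvider` (kept in sync by the tts-config hunk below) now picks the client. A condensed sketch, with the surrounding component code elided:

```ts
// Condensed sketch of the new routing (ClientApi/useAppConfig as in this file).
const config = useAppConfig.getState();
// ModelProvider.GPT for the OpenAI and Edge engines, ModelProvider.Qwen for Alibaba
api = new ClientApi(config.ttsConfig.modelProvider);
```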
app/components/realtime-chat/realtime-config.tsx

@@ -5,13 +5,13 @@ import { ListItem, Select, PasswordInput } from "@/app/components/ui-lib";

 import { InputRange } from "@/app/components/input-range";
 import { Voice } from "rt-client";
-import { ServiceProvider } from "@/app/constant";
+import { REALTIME_TTS_MODELS, ServiceProvider } from "@/app/constant";

-const providers = [ServiceProvider.OpenAI, ServiceProvider.Azure];
+const providers = Object.keys(REALTIME_TTS_MODELS) as ServiceProvider[];

-const models = ["gpt-4o-realtime-preview-2024-10-01"];
+const models = ["gpt-4o-realtime-preview-2024-10-01", "qwen-tts-realtime"];

-const voice = ["alloy", "shimmer", "echo"];
+const voice = ["alloy", "shimmer", "echo", "Chelsie"];

 export function RealtimeConfigList(props: {
   realtimeConfig: RealtimeConfig;
app/components/tts-config.tsx

@@ -3,10 +3,9 @@ import { TTSConfig, TTSConfigValidator } from "../store";

 import Locale from "../locales";
 import { ListItem, Select } from "./ui-lib";
 import {
-  DEFAULT_TTS_ENGINE,
-  DEFAULT_TTS_ENGINES,
-  DEFAULT_TTS_MODELS,
-  DEFAULT_TTS_VOICES,
+  ServiceProvider,
+  TTS_CONFIGS,
+  TTSEngineType,
 } from "../constant";
 import { InputRange } from "./input-range";

@@ -48,22 +47,33 @@ export function TTSConfigList(props: {
       <Select
         value={props.ttsConfig.engine}
         onChange={(e) => {
+          const newEngine = e.currentTarget.value as TTSEngineType;
           props.updateConfig(
-            (config) =>
-              (config.engine = TTSConfigValidator.engine(
-                e.currentTarget.value,
-              )),
+            (config) => {
+              config.engine = TTSConfigValidator.engine(newEngine);
+              const engineConfig = TTS_CONFIGS[newEngine];
+              config.model = TTSConfigValidator.model(
+                engineConfig.Model[0] || ""
+              );
+              config.voice = TTSConfigValidator.voice(
+                engineConfig.Voices[0] || ""
+              );
+              config.modelProvider = TTSConfigValidator.modelProvider(
+                engineConfig.ModelProvider
+              );
+            }
           );
         }}
       >
-        {DEFAULT_TTS_ENGINES.map((v, i) => (
+        {Object.keys(TTS_CONFIGS).map((v, i) => (
           <option value={v} key={i}>
-            {v}
+            {v}-TTS
           </option>
         ))}
       </Select>
     </ListItem>
-    {props.ttsConfig.engine === DEFAULT_TTS_ENGINE && (
+    {(props.ttsConfig.engine === ServiceProvider.OpenAI ||
+      props.ttsConfig.engine === ServiceProvider.Alibaba) && (
       <>
         <ListItem title={Locale.Settings.TTS.Model}>
           <Select

@@ -77,7 +87,7 @@ export function TTSConfigList(props: {
             );
           }}
         >
-          {DEFAULT_TTS_MODELS.map((v, i) => (
+          {TTS_CONFIGS[props.ttsConfig.engine]!.Model.map((v, i) => (
             <option value={v} key={i}>
               {v}
             </option>

@@ -99,7 +109,7 @@ export function TTSConfigList(props: {
             );
           }}
         >
-          {DEFAULT_TTS_VOICES.map((v, i) => (
+          {TTS_CONFIGS[props.ttsConfig.engine]!.Voices.map((v, i) => (
             <option value={v} key={i}>
               {v}
             </option>
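Reviewer note: switching engines now resets the model, voice, and model provider to the new engine's defaults in a single `updateConfig` call, so a stale OpenAI model can't be sent to the Alibaba endpoint. The reset logic, condensed into a standalone sketch (the `TTSConfig` shape is assumed from `app/store/config.ts`):

```ts
// Condensed sketch of the onChange reset (TTS_CONFIGS / TTSEngineType from app/constant.ts).
function resetForEngine(config: TTSConfig, newEngine: TTSEngineType) {
  const engineConfig = TTS_CONFIGS[newEngine];
  config.engine = TTSConfigValidator.engine(newEngine);
  config.model = TTSConfigValidator.model(engineConfig.Model[0] || "");  // "" for Edge
  config.voice = TTSConfigValidator.voice(engineConfig.Voices[0] || ""); // "" for Edge
  config.modelProvider = TTSConfigValidator.modelProvider(engineConfig.ModelProvider);
}
```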
app/constant.ts

@@ -22,6 +22,7 @@ export const BAIDU_OATUH_URL = `${BAIDU_BASE_URL}/oauth/2.0/token`;

 export const BYTEDANCE_BASE_URL = "https://ark.cn-beijing.volces.com";

 export const ALIBABA_BASE_URL = "https://dashscope.aliyuncs.com/api/";
+export const ALIBABA_RUNTIEM_TTS_URL = "wss://dashscope.aliyuncs.com/api-ws/v1/realtime";

 export const TENCENT_BASE_URL = "https://hunyuan.tencentcloudapi.com";

@@ -232,6 +233,7 @@ export const Alibaba = {
     }
     return `v1/services/aigc/text-generation/generation`;
   },
+  SpeechPath: "v1/services/aigc/multimodal-generation/generation",
 };

 export const Tencent = {

@@ -461,19 +463,53 @@ export const KnowledgeCutOffDate: Record<string, string> = {
   "deepseek-coder": "2024-07",
 };

-export const DEFAULT_TTS_ENGINE = "OpenAI-TTS";
-export const DEFAULT_TTS_ENGINES = ["OpenAI-TTS", "Edge-TTS"];
+export const DEFAULT_TTS_ENGINE = ServiceProvider.OpenAI;
 export const DEFAULT_TTS_MODEL = "tts-1";
 export const DEFAULT_TTS_VOICE = "alloy";
-export const DEFAULT_TTS_MODELS = ["tts-1", "tts-1-hd"];
-export const DEFAULT_TTS_VOICES = [
-  "alloy",
-  "echo",
-  "fable",
-  "onyx",
-  "nova",
-  "shimmer",
-];
+
+export const OPENAI_TTS = {
+  Provider: ServiceProvider.OpenAI,
+  ModelProvider: ModelProvider.GPT,
+  Model: ["tts-1", "tts-1-hd"],
+  Voices: ["alloy", "echo", "fable", "onyx", "nova", "shimmer"],
+} as const;
+
+export const ALIBABA_TTS = {
+  Provider: ServiceProvider.Alibaba,
+  ModelProvider: ModelProvider.Qwen,
+  Model: ["qwen-tts", "qwen-tts-latest"],
+  Voices: ["Chelsie", "Cherry", "Ethan", "Serena", "Dylan", "Jada", "Sunny"],
+} as const;
+
+export const EDGE_TTS = {
+  Provider: "Edge" as const,
+  ModelProvider: ModelProvider.GPT,
+  Model: [] as string[],
+  Voices: [] as string[],
+} as const;
+
+// The supported TTS engine types
+export type TTSEngineType = ServiceProvider.OpenAI | ServiceProvider.Alibaba | "Edge";
+
+// All available engines, models, and voices, derived from the per-provider TTS configs
+export const DEFAULT_TTS_ENGINES = [ServiceProvider.OpenAI, ServiceProvider.Alibaba, "Edge"] as const;
+export const DEFAULT_TTS_MODELS = [...OPENAI_TTS.Model, ...ALIBABA_TTS.Model] as const;
+export const DEFAULT_TTS_VOICES = [...OPENAI_TTS.Voices, ...ALIBABA_TTS.Voices] as const;
+
+// The TTS config interface
+interface TTSConfigItem {
+  Provider: ServiceProvider | "Edge";
+  Model: readonly string[];
+  Voices: readonly string[];
+  ModelProvider: ModelProvider;
+}
+
+// Use a complete Record rather than Partial to keep lookups type-safe
+export const TTS_CONFIGS: Record<TTSEngineType, TTSConfigItem> = {
+  [ServiceProvider.OpenAI]: OPENAI_TTS,
+  [ServiceProvider.Alibaba]: ALIBABA_TTS,
+  Edge: EDGE_TTS,
+} as const;

 export const VISION_MODEL_REGEXES = [
   /vision/,

@@ -497,6 +533,16 @@ export const VISION_MODEL_REGEXES = [

 export const EXCLUDE_VISION_MODEL_REGEXES = [/claude-3-5-haiku-20241022/];

+export const RUNTIME_TTS_OPENAI = {
+  ExampleEndpoint: XAI_BASE_URL,
+}
+
+export const REALTIME_TTS_MODELS = {
+  [ServiceProvider.OpenAI]: ["gpt-4o-realtime-preview-2024-10-01"],
+  [ServiceProvider.Azure]: ["gpt-4o-realtime-preview-2024-10-01"],
+  [ServiceProvider.Alibaba]: ["qwen-omni-turbo-realtime"],
+};
+
 const openaiModels = [
   // As of July 2024, gpt-4o-mini should be used in place of gpt-3.5-turbo,
   // as it is cheaper, more capable, multimodal, and just as fast. gpt-3.5-turbo is still available for use in the API.

@@ -920,3 +966,4 @@ export const DEFAULT_GA_ID = "G-89WN60ZK2E";

 export const SAAS_CHAT_URL = "https://nextchat.club";
 export const SAAS_CHAT_UTM_URL = "https://nextchat.club?utm=github";
+
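Reviewer note: because `TTS_CONFIGS` is a complete `Record<TTSEngineType, TTSConfigItem>` rather than a `Partial`, every lookup by engine is statically known to succeed, which is what lets the tts-config component index it with a plain `!`. A quick sketch:

```ts
// Sketch: per-engine lookups on TTS_CONFIGS (constants from this hunk).
const alibaba = TTS_CONFIGS[ServiceProvider.Alibaba];
console.log(alibaba.Model);     // ["qwen-tts", "qwen-tts-latest"]
console.log(alibaba.Voices[0]); // "Chelsie"
console.log(TTS_CONFIGS.Edge.ModelProvider); // ModelProvider.GPT (Edge reuses the GPT client)
```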
app/store/config.ts

@@ -13,6 +13,8 @@ import {
   DEFAULT_TTS_VOICES,
   StoreKey,
   ServiceProvider,
+  TTSEngineType,
+  ModelProvider,
 } from "../constant";
 import { createPersistStore } from "../utils/store";
 import type { Voice } from "rt-client";

@@ -20,7 +22,6 @@ import type { Voice } from "rt-client";
 export type ModelType = (typeof DEFAULT_MODELS)[number]["name"];
 export type TTSModelType = (typeof DEFAULT_TTS_MODELS)[number];
 export type TTSVoiceType = (typeof DEFAULT_TTS_VOICES)[number];
-export type TTSEngineType = (typeof DEFAULT_TTS_ENGINES)[number];

 export enum SubmitKey {
   Enter = "Enter",

@@ -86,7 +87,8 @@ export const DEFAULT_CONFIG = {
   ttsConfig: {
     enable: false,
     autoplay: false,
-    engine: DEFAULT_TTS_ENGINE,
+    modelProvider: ModelProvider.GPT,
+    engine: DEFAULT_TTS_ENGINE as TTSEngineType,
     model: DEFAULT_TTS_MODEL,
     voice: DEFAULT_TTS_VOICE,
     speed: 1.0,

@@ -126,18 +128,21 @@ export function limitNumber(
 }

 export const TTSConfigValidator = {
-  engine(x: string) {
+  engine(x: string | TTSEngineType): TTSEngineType {
     return x as TTSEngineType;
   },
-  model(x: string) {
+  model(x: string): TTSModelType {
     return x as TTSModelType;
   },
-  voice(x: string) {
+  voice(x: string): TTSVoiceType {
     return x as TTSVoiceType;
   },
-  speed(x: number) {
+  speed(x: number): number {
     return limitNumber(x, 0.25, 4.0, 1.0);
   },
+  modelProvider(x: string): ModelProvider {
+    return x as ModelProvider;
+  },
 };

 export const ModalConfigValidator = {
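Reviewer note: most of these validators are plain casts; only `speed` normalizes its input through `limitNumber`. Assuming `limitNumber(x, min, max, defaultValue)` clamps to the given range and falls back to the default for non-numbers (its existing behavior in this file), the effect is:

```ts
// Sketch: TTSConfigValidator.speed clamps via limitNumber(x, 0.25, 4.0, 1.0).
TTSConfigValidator.speed(10);  // -> 4.0 (clamped to the 0.25-4.0 range)
TTSConfigValidator.speed(0.1); // -> 0.25
TTSConfigValidator.speed(NaN); // -> 1.0 (assumed default fallback)
```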
app/utils/audio.ts

@@ -13,13 +13,17 @@ export function createTTSPlayer(): TTSPlayer {
     audioContext.suspend();
   };

-  const play = async (audioBuffer: ArrayBuffer, onended: () => void | null) => {
+  const play = async (audioBuffer: ArrayBuffer | AudioBuffer, onended: () => void | null) => {
     if (audioBufferSourceNode) {
       audioBufferSourceNode.stop();
       audioBufferSourceNode.disconnect();
     }

-    const buffer = await audioContext!.decodeAudioData(audioBuffer);
+    let buffer: AudioBuffer;
+    if (audioBuffer instanceof AudioBuffer) {
+      buffer = audioBuffer;
+    } else {
+      buffer = await audioContext!.decodeAudioData(audioBuffer);
+    }
     audioBufferSourceNode = audioContext!.createBufferSource();
     audioBufferSourceNode.buffer = buffer;
     audioBufferSourceNode.connect(audioContext!.destination);
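Reviewer note: with the widened signature, both return types flow into the same player — the OpenAI and Edge paths still hand over an encoded `ArrayBuffer` that goes through `decodeAudioData`, while the Alibaba path passes an already-decoded `AudioBuffer`. A hypothetical usage sketch (`openaiBuffer` and `qwenBuffer` are illustrative placeholders):

```ts
// Sketch: the player now accepts either form of audio.
const player = createTTSPlayer();
player.init();
await player.play(openaiBuffer /* ArrayBuffer (e.g. encoded mp3) */, () => console.log("done"));
await player.play(qwenBuffer /* AudioBuffer (decoded PCM) */, () => console.log("done"));
```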
package.json

@@ -93,5 +93,9 @@
   "resolutions": {
     "lint-staged/yaml": "^2.2.2"
   },
-  "packageManager": "yarn@1.22.19"
+  "packageManager": "yarn@1.22.19",
+  "volta": {
+    "node": "20.19.4",
+    "yarn": "1.22.22"
+  }
 }