diff --git a/.yarnrc.yml b/.yarnrc.yml
new file mode 100644
index 000000000..3186f3f07
--- /dev/null
+++ b/.yarnrc.yml
@@ -0,0 +1 @@
+nodeLinker: node-modules
diff --git a/app/client/api.ts b/app/client/api.ts
index f60b0e2ad..9b82959a8 100644
--- a/app/client/api.ts
+++ b/app/client/api.ts
@@ -107,7 +107,8 @@ export interface LLMModelProvider {
 export abstract class LLMApi {
   abstract chat(options: ChatOptions): Promise;
-  abstract speech(options: SpeechOptions): Promise;
+  abstract speech(options: SpeechOptions): Promise;
+  abstract streamSpeech?(options: SpeechOptions): AsyncGenerator;
   abstract usage(): Promise;
   abstract models(): Promise;
 }
diff --git a/app/client/platforms/alibaba.ts b/app/client/platforms/alibaba.ts
index 4875e5c02..fdca6f295 100644
--- a/app/client/platforms/alibaba.ts
+++ b/app/client/platforms/alibaba.ts
@@ -6,6 +6,7 @@ import {
   useChatStore,
   ChatMessageTool,
   usePluginStore,
+  FunctionToolItem,
 } from "@/app/store";
 import {
   preProcessImageContentForAlibabaDashScope,
@@ -51,6 +52,8 @@ interface RequestParam {
   repetition_penalty?: number;
   top_p: number;
   max_tokens?: number;
+  tools?: FunctionToolItem[];
+  enable_search?: boolean;
 }
 interface RequestPayload {
   model: string;
@@ -59,6 +62,7 @@ interface RequestPayload {
 }

 export class QwenApi implements LLMApi {
+  private static audioContext: AudioContext | null = null;
   path(path: string): string {
     const accessStore = useAccessStore.getState();
@@ -89,10 +93,83 @@ export class QwenApi implements LLMApi {
     return res?.output?.choices?.at(0)?.message?.content ?? "";
   }

-  speech(options: SpeechOptions): Promise {
+  async speech(options: SpeechOptions): Promise {
     throw new Error("Method not implemented.");
   }

+  async *streamSpeech(options: SpeechOptions): AsyncGenerator {
+    if (!options.input || !options.model) {
+      throw new Error("Missing required parameters: input and model");
+    }
+    const requestPayload = {
+      model: options.model,
+      input: {
+        text: options.input,
+        voice: options.voice,
+      },
+      speed: options.speed,
+      response_format: options.response_format,
+    };
+    const controller = new AbortController();
+    options.onController?.(controller);
+    try {
+      const speechPath = this.path(Alibaba.SpeechPath);
+      const speechPayload = {
+        method: "POST",
+        body: JSON.stringify(requestPayload),
+        signal: controller.signal,
+        headers: {
+          ...getHeaders(),
+          "X-DashScope-SSE": "enable",
+        },
+      };
+
+      // make a fetch request
+      const requestTimeoutId = setTimeout(
+        () => controller.abort(),
+        getTimeoutMSByModel(options.model),
+      );
+
+      const res = await fetch(speechPath, speechPayload);
+      clearTimeout(requestTimeoutId); // Clear timeout on successful connection
+
+      const reader = res.body!.getReader();
+      const decoder = new TextDecoder();
+      let buffer = "";
+      while (true) {
+        const { done, value } = await reader.read();
+        if (done) {
+          break;
+        }
+        buffer += decoder.decode(value, { stream: true });
+        const lines = buffer.split("\n");
+        buffer = lines.pop() || "";
+
+        for (const line of lines) {
+          const data = line.slice(5);
+          try {
+            if (line.startsWith("data:")) {
+              const json = JSON.parse(data);
+              if (json.output?.audio?.data) {
+                yield this.PCMBase64ToAudioBuffer(json.output.audio.data);
+              }
+            }
+          } catch (parseError) {
+            console.warn(
+              "[StreamSpeech] Failed to parse SSE data:",
+              parseError,
+            );
+            continue;
+          }
+        }
+      }
+      reader.releaseLock();
+    } catch (e) {
+      console.log("[Request] failed to make a speech request", e);
+      throw e;
+    }
+  }
+
   async chat(options: ChatOptions) {
     const modelConfig = {
       ...useAppConfig.getState().modelConfig,
@@ -129,6 +206,7 @@ export class QwenApi implements LLMApi {
         temperature: modelConfig.temperature,
         // max_tokens: modelConfig.max_tokens,
         top_p: modelConfig.top_p === 1 ? 0.99 : modelConfig.top_p, // qwen top_p is should be < 1
+        enable_search: modelConfig.enableNetWork,
       },
     };
@@ -161,11 +239,16 @@ export class QwenApi implements LLMApi {
         .getAsTools(
           useChatStore.getState().currentSession().mask?.plugin || [],
         );
+      // console.log("getAsTools", tools, funcs);
+      const _tools = tools as unknown as FunctionToolItem[];
+      if (_tools && _tools.length > 0) {
+        requestPayload.parameters.tools = _tools;
+      }
       return streamWithThink(
         chatPath,
         requestPayload,
         headers,
-        tools as any,
+        [],
         funcs,
         controller,
         // parseSSE
@@ -198,7 +281,7 @@ export class QwenApi implements LLMApi {
             });
           } else {
             // @ts-ignore
-            runTools[index]["function"]["arguments"] += args;
+            runTools[index]["function"]["arguments"] += args || "";
           }
         }
@@ -273,5 +356,79 @@ export class QwenApi implements LLMApi {
   async models(): Promise {
     return [];
   }
+
+  // Decode base64-encoded PCM data into an AudioBuffer
+  private async PCMBase64ToAudioBuffer(base64Data: string) {
+    try {
+      // Decode base64
+      const binaryString = atob(base64Data);
+      const bytes = new Uint8Array(binaryString.length);
+      for (let i = 0; i < binaryString.length; i++) {
+        bytes[i] = binaryString.charCodeAt(i);
+      }
+
+      // Convert to AudioBuffer
+      const audioBuffer = await this.convertToAudioBuffer(bytes);
+
+      return audioBuffer;
+    } catch (error) {
+      console.error("Failed to decode PCM data:", error);
+      throw error;
+    }
+  }
+
+  private static getAudioContext(): AudioContext {
+    if (!QwenApi.audioContext) {
+      QwenApi.audioContext = new (window.AudioContext ||
+        window.webkitAudioContext)();
+    }
+    return QwenApi.audioContext;
+  }
+
+  // Convert raw PCM bytes into an AudioBuffer
+  private convertToAudioBuffer(pcmData: Uint8Array) {
+    const audioContext = QwenApi.getAudioContext();
+    const channels = 1;
+    const sampleRate = 24000;
+    return new Promise((resolve, reject) => {
+      try {
+        let float32Array;
+        // Convert 16-bit PCM to 32-bit float samples
+        float32Array = this.pcm16ToFloat32(pcmData);
+
+        // Create the AudioBuffer
+        const audioBuffer = audioContext.createBuffer(
+          channels,
+          float32Array.length / channels,
+          sampleRate,
+        );
+
+        // Copy the samples into the AudioBuffer
+        for (let channel = 0; channel < channels; channel++) {
+          const channelData = audioBuffer.getChannelData(channel);
+          for (let i = 0; i < channelData.length; i++) {
+            channelData[i] = float32Array[i * channels + channel];
+          }
+        }
+
+        resolve(audioBuffer);
+      } catch (error) {
+        reject(error);
+      }
+    });
+  }
+  // Convert 16-bit little-endian PCM to 32-bit float samples
+  private pcm16ToFloat32(pcmData: Uint8Array) {
+    const length = pcmData.length / 2;
+    const float32Array = new Float32Array(length);
+
+    for (let i = 0; i < length; i++) {
+      const int16 = (pcmData[i * 2 + 1] << 8) | pcmData[i * 2];
+      const int16Signed = int16 > 32767 ? int16 - 65536 : int16;
+      float32Array[i] = int16Signed / 32768;
+    }
+
+    return float32Array;
+  }
 }
 export { Alibaba };
diff --git a/app/components/chat.tsx b/app/components/chat.tsx
index 6691403e6..5dbaeaa20 100644
--- a/app/components/chat.tsx
+++ b/app/components/chat.tsx
@@ -48,6 +48,7 @@ import PluginIcon from "../icons/plugin.svg";
 import ShortcutkeyIcon from "../icons/shortcutkey.svg";
 import McpToolIcon from "../icons/tool.svg";
 import HeadphoneIcon from "../icons/headphone.svg";
+import NetWorkIcon from "../icons/network.svg";
 import {
   BOT_HELLO,
   ChatMessage,
@@ -75,6 +76,7 @@ import {
   useMobileScreen,
   selectOrCopy,
   showPlugins,
+  canUseNetWork,
 } from "../utils";

 import { uploadImage as uploadImageRemote } from "@/app/utils/chat";
@@ -101,8 +103,6 @@ import {
 import { useNavigate } from "react-router-dom";
 import {
   CHAT_PAGE_SIZE,
-  DEFAULT_TTS_ENGINE,
-  ModelProvider,
   Path,
   REQUEST_TIMEOUT_MS,
   ServiceProvider,
@@ -512,6 +512,7 @@ export function ChatActions(props: {

   // switch themes
   const theme = config.theme;
+  const enableNetWork = session.mask.modelConfig.enableNetWork || false;

   function nextTheme() {
     const themes = [Theme.Auto, Theme.Light, Theme.Dark];
@@ -521,6 +522,13 @@ export function ChatActions(props: {
     config.update((config) => (config.theme = nextTheme));
   }

+  function nextNetWork() {
+    chatStore.updateTargetSession(session, (session) => {
+      session.mask.modelConfig.enableNetWork =
+        !session.mask.modelConfig.enableNetWork;
+    });
+  }
+
   // stop all responses
   const couldStop = ChatControllerPool.hasPending();
   const stopAll = () => ChatControllerPool.stopAll();
@@ -699,6 +707,9 @@ export function ChatActions(props: {
               session.mask.modelConfig.providerName =
                 providerName as ServiceProvider;
               session.mask.syncGlobalConfig = false;
+              session.mask.modelConfig.enableNetWork = canUseNetWork(model)
+                ? session.mask.modelConfig.enableNetWork
+                : false;
             });
             if (providerName == "ByteDance") {
               const selectedModel = models.find(
@@ -833,6 +844,16 @@ export function ChatActions(props: {
             />
           )}
           {!isMobileScreen && }
+
+          {canUseNetWork(currentModel) && (
+              }
+            />
+          )}
           {config.realtimeConfig.enable && (
@@ -1286,6 +1307,7 @@ function _Chat() {
   const accessStore = useAccessStore();
   const [speechStatus, setSpeechStatus] = useState(false);
   const [speechLoading, setSpeechLoading] = useState(false);
+  const [speechCooldown, setSpeechCooldown] = useState(false);

   async function openaiSpeech(text: string) {
     if (speechStatus) {
@@ -1293,14 +1315,14 @@ function _Chat() {
       setSpeechStatus(false);
     } else {
       var api: ClientApi;
-      api = new ClientApi(ModelProvider.GPT);
       const config = useAppConfig.getState();
+      api = new ClientApi(config.ttsConfig.modelProvider);
       setSpeechLoading(true);
       ttsPlayer.init();
-      let audioBuffer: ArrayBuffer;
+      let audioBuffer: ArrayBuffer | AudioBuffer;
       const { markdownToTxt } = require("markdown-to-txt");
       const textContent = markdownToTxt(text);
-      if (config.ttsConfig.engine !== DEFAULT_TTS_ENGINE) {
+      if (config.ttsConfig.engine === "Edge") {
         const edgeVoiceName = accessStore.edgeVoiceName();
         const tts = new MsEdgeTTS();
         await tts.setMetadata(
@@ -1308,28 +1330,60 @@ function _Chat() {
           OUTPUT_FORMAT.AUDIO_24KHZ_96KBITRATE_MONO_MP3,
         );
         audioBuffer = await tts.toArrayBuffer(textContent);
+        playSpeech(audioBuffer);
       } else {
-        audioBuffer = await api.llm.speech({
-          model: config.ttsConfig.model,
-          input: textContent,
-          voice: config.ttsConfig.voice,
-          speed: config.ttsConfig.speed,
-        });
+        if (api.llm.streamSpeech) {
+          // Streaming playback: play chunks as they arrive
+          setSpeechStatus(true);
+          ttsPlayer.startStreamPlay(() => {
+            setSpeechStatus(false);
+          });
+
+          try {
+            for await (const chunk of api.llm.streamSpeech({
+              model: config.ttsConfig.model,
+              input: textContent,
+              voice: config.ttsConfig.voice,
+              speed: config.ttsConfig.speed,
+            })) {
+              ttsPlayer.addToQueue(chunk);
+            }
+            ttsPlayer.finishStreamPlay();
+          } catch (e) {
+            console.error("[Stream Speech]", e);
+            showToast(prettyObject(e));
+            setSpeechStatus(false);
+            ttsPlayer.stop();
+          } finally {
+            setSpeechLoading(false);
+          }
+        } else {
+          audioBuffer = await api.llm.speech({
+            model: config.ttsConfig.model,
+            input: textContent,
+            voice: config.ttsConfig.voice,
+            speed: config.ttsConfig.speed,
+          });
+          playSpeech(audioBuffer);
+        }
       }
-      setSpeechStatus(true);
-      ttsPlayer
-        .play(audioBuffer, () => {
-          setSpeechStatus(false);
-        })
-        .catch((e) => {
-          console.error("[OpenAI Speech]", e);
-          showToast(prettyObject(e));
-          setSpeechStatus(false);
-        })
-        .finally(() => setSpeechLoading(false));
     }
   }

+  function playSpeech(audioBuffer: ArrayBuffer | AudioBuffer) {
+    setSpeechStatus(true);
+    ttsPlayer
+      .play(audioBuffer, () => {
+        setSpeechStatus(false);
+      })
+      .catch((e) => {
+        console.error("[OpenAI Speech]", e);
+        showToast(prettyObject(e));
+        setSpeechStatus(false);
+      })
+      .finally(() => setSpeechLoading(false));
+  }
+
   const context: RenderMessage[] = useMemo(() => {
     return session.mask.hideContext ? [] : session.mask.context.slice();
   }, [session.mask.context, session.mask.hideContext]);
diff --git a/app/components/tts-config.tsx b/app/components/tts-config.tsx
index 39ae85730..a0ad22fa0 100644
--- a/app/components/tts-config.tsx
+++ b/app/components/tts-config.tsx
@@ -3,10 +3,9 @@ import { TTSConfig, TTSConfigValidator } from "../store";

 import Locale from "../locales";
 import { ListItem, Select } from "./ui-lib";
 import {
-  DEFAULT_TTS_ENGINE,
-  DEFAULT_TTS_ENGINES,
-  DEFAULT_TTS_MODELS,
-  DEFAULT_TTS_VOICES,
+  ServiceProvider,
+  TTS_CONFIGS,
+  TTSEngineType
 } from "../constant";
 import { InputRange } from "./input-range";

@@ -48,22 +47,33 @@ export function TTSConfigList(props: {
-        {props.ttsConfig.engine === DEFAULT_TTS_ENGINE && (
+        {(props.ttsConfig.engine === ServiceProvider.OpenAI ||
+          props.ttsConfig.engine === ServiceProvider.Alibaba) && (
           <>
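
Note on the DashScope handling in `streamSpeech` above: the parser assumes each SSE `data:` line carries a JSON payload with base64-encoded 16-bit mono PCM at 24 kHz under `output.audio.data`. A rough TypeScript shape of that assumption, inferred from the parsing code rather than from an official DashScope type definition:

```ts
// Assumed shape of one DashScope TTS SSE event, as consumed by streamSpeech.
// This only mirrors the fields the parser reads; the real API may include more.
interface DashScopeSpeechChunk {
  output?: {
    audio?: {
      // base64-encoded raw PCM: 16-bit little-endian, mono, 24 kHz
      data?: string;
    };
  };
}
```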
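The chat.tsx changes call `ttsPlayer.startStreamPlay`, `ttsPlayer.addToQueue`, and `ttsPlayer.finishStreamPlay`, whose implementation is not part of this diff. A minimal sketch of a queue-based player with that surface, assuming the Web Audio API and mono `AudioBuffer` chunks (names and signatures here are illustrative, not the project's actual audio utility):

```ts
// Illustrative sketch only: plays streamed AudioBuffer chunks back to back.
export function createStreamPlayer(context: AudioContext) {
  const queue: AudioBuffer[] = [];
  let playing = false;
  let finished = false;
  let onEnded: (() => void) | undefined;

  const playNext = () => {
    const buffer = queue.shift();
    if (!buffer) {
      playing = false;
      // Producer already signalled completion and nothing is left to play.
      if (finished) onEnded?.();
      return;
    }
    playing = true;
    const source = context.createBufferSource();
    source.buffer = buffer;
    source.connect(context.destination);
    source.onended = playNext; // chain the next queued chunk
    source.start();
  };

  return {
    startStreamPlay(callback?: () => void) {
      finished = false;
      onEnded = callback;
      queue.length = 0;
    },
    addToQueue(chunk: AudioBuffer) {
      queue.push(chunk);
      if (!playing) playNext(); // start as soon as the first chunk arrives
    },
    finishStreamPlay() {
      finished = true;
      if (!playing && queue.length === 0) onEnded?.();
    },
  };
}
```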