diff --git a/app/client/api.ts b/app/client/api.ts index 1da81e964..f7092a987 100644 --- a/app/client/api.ts +++ b/app/client/api.ts @@ -41,6 +41,7 @@ export interface MultimodalContent { export interface RequestMessage { role: MessageRole; content: string | MultimodalContent[]; + audio_url?: string; } export interface LLMConfig { diff --git a/app/components/chat.module.scss b/app/components/chat.module.scss index 73542fc67..32336e95d 100644 --- a/app/components/chat.module.scss +++ b/app/components/chat.module.scss @@ -443,6 +443,10 @@ transition: all ease 0.3s; } +.audio-message { + min-width: 350px; +} + .chat-message-item-image { width: 100%; margin-top: 10px; @@ -471,6 +475,10 @@ border: rgba($color: #888, $alpha: 0.2) 1px solid; } +.chat-message-item-audio { + margin-top: 10px; + width: 100%; +} @media only screen and (max-width: 600px) { $calc-image-width: calc(100vw/3*2/var(--image-count)); @@ -519,7 +527,7 @@ background-color: var(--second); &:hover { - min-width: 0; + //min-width: 350px; } } @@ -693,4 +701,4 @@ .shortcut-key span { font-size: 12px; color: var(--black); -} \ No newline at end of file +} diff --git a/app/components/chat.tsx b/app/components/chat.tsx index cddf0e335..9da496aae 100644 --- a/app/components/chat.tsx +++ b/app/components/chat.tsx @@ -116,7 +116,7 @@ import { useAllModels } from "../utils/hooks"; import { MultimodalContent } from "../client/api"; import { ClientApi } from "../client/api"; -import { createTTSPlayer } from "../utils/audio"; +import { createTTSPlayer, arrayBufferToWav } from "../utils/audio"; import { MsEdgeTTS, OUTPUT_FORMAT } from "../utils/ms_edge_tts"; import { isEmpty } from "lodash-es"; @@ -1132,6 +1132,14 @@ function _Chat() { ); }; + const updateMessageAudio = (msgId?: string, audio_url?: string) => { + chatStore.updateCurrentSession((session) => { + session.messages = session.messages.map((m) => + m.id === msgId ? 
{ ...m, audio_url } : m, + ); + }); + }; + const onDelete = (msgId: string) => { deleteMessage(msgId); }; @@ -1208,7 +1216,7 @@ function _Chat() { const accessStore = useAccessStore(); const [speechStatus, setSpeechStatus] = useState(false); const [speechLoading, setSpeechLoading] = useState(false); - async function openaiSpeech(text: string) { + async function openaiSpeech(text: string): Promise { if (speechStatus) { ttsPlayer.stop(); setSpeechStatus(false); @@ -1238,16 +1246,22 @@ function _Chat() { }); } setSpeechStatus(true); - ttsPlayer - .play(audioBuffer, () => { + try { + const waveFile = arrayBufferToWav(audioBuffer); + const audioFile = new Blob([waveFile], { type: "audio/wav" }); + + const audioUrl: string = await uploadImageRemote(audioFile); + await ttsPlayer.play(audioBuffer, () => { setSpeechStatus(false); - }) - .catch((e) => { - console.error("[OpenAI Speech]", e); - showToast(prettyObject(e)); - setSpeechStatus(false); - }) - .finally(() => setSpeechLoading(false)); + }); + return audioUrl; + } catch (e) { + console.error("[Speech Error]", e); + showToast(prettyObject(e)); + setSpeechStatus(false); + } finally { + setSpeechLoading(false); + } } } @@ -1810,9 +1824,12 @@ function _Chat() { ) } - onClick={() => - openaiSpeech(getMessageTextContent(message)) - } + onClick={async () => { + const url = await openaiSpeech( + getMessageTextContent(message), + ); + updateMessageAudio(message.id, url); + }} /> )} @@ -1847,7 +1864,11 @@ function _Chat() { ))} )} -
+
)} + {message.audio_url && ( + + )}
/**
 * PCM layout written into the WAV header by {@link arrayBufferToWav}.
 * Every field is optional; omitted fields fall back to mono, 24 kHz, 16-bit.
 */
export interface WavFormatOptions {
  /** Number of interleaved channels (default 1). */
  numOfChannels?: number;
  /** Samples per second, per channel (default 24000). */
  sampleRate?: number;
  /** Bits per sample; must be a positive multiple of 8 (default 16). */
  bitsPerSample?: number;
}

/**
 * Prefixes a buffer of PCM sample bytes with a canonical 44-byte RIFF/WAVE
 * header and returns the result as a new buffer.
 *
 * The payload is not inspected or converted: its bytes are copied verbatim
 * after the header, so the caller must pass samples that already match the
 * declared format. A payload whose length is not a multiple of the block
 * alignment is copied as-is (no padding or truncation is applied).
 *
 * @param buffer - Sample bytes to wrap. Not modified.
 * @param options - Format fields for the header; defaults reproduce the
 *   original fixed layout (1 channel, 24000 Hz, 16 bits).
 * @returns A new `ArrayBuffer` of `44 + buffer.byteLength` bytes.
 * @throws RangeError if an option is out of range for its header field, or
 *   if the payload is too large for the 32-bit RIFF size fields.
 */
export function arrayBufferToWav(
  buffer: ArrayBuffer,
  options: WavFormatOptions = {},
): ArrayBuffer {
  const { numOfChannels = 1, sampleRate = 24000, bitsPerSample = 16 } = options;

  const MAX_UINT16 = 0xffff;
  const MAX_UINT32 = 0xffffffff;
  const wavHeaderSize = 44;

  if (
    !Number.isInteger(numOfChannels) ||
    numOfChannels < 1 ||
    numOfChannels > MAX_UINT16
  ) {
    throw new RangeError(`Invalid channel count: ${numOfChannels}`);
  }
  if (
    !Number.isInteger(sampleRate) ||
    sampleRate < 1 ||
    sampleRate > MAX_UINT32
  ) {
    throw new RangeError(`Invalid sample rate: ${sampleRate}`);
  }
  if (
    !Number.isInteger(bitsPerSample) ||
    bitsPerSample < 8 ||
    bitsPerSample % 8 !== 0 ||
    bitsPerSample > MAX_UINT16
  ) {
    throw new RangeError(`Invalid bits per sample: ${bitsPerSample}`);
  }

  const bytesPerSample = bitsPerSample / 8;
  const blockAlign = numOfChannels * bytesPerSample;
  const byteRate = sampleRate * blockAlign;
  if (blockAlign > MAX_UINT16 || byteRate > MAX_UINT32) {
    throw new RangeError("Audio format does not fit in the WAV header fields");
  }

  const dataSize = buffer.byteLength;
  // The RIFF size field stores (total - 8) in 32 bits; setUint32 would wrap
  // silently on overflow and yield a corrupt header, so reject it up front.
  if (dataSize > MAX_UINT32 - (wavHeaderSize - 8)) {
    throw new RangeError("Audio data is too large for a WAV container");
  }
  const totalSize = wavHeaderSize + dataSize;

  const wavBuffer = new ArrayBuffer(totalSize);
  const view = new DataView(wavBuffer);

  // RIFF chunk descriptor
  writeString(view, 0, "RIFF");
  view.setUint32(4, totalSize - 8, true); // File size minus the 8-byte RIFF preamble
  writeString(view, 8, "WAVE");

  // "fmt " sub-chunk (all multi-byte fields are little-endian)
  writeString(view, 12, "fmt ");
  view.setUint32(16, 16, true); // Sub-chunk size (16 for PCM)
  view.setUint16(20, 1, true); // Audio format (1 for PCM)
  view.setUint16(22, numOfChannels, true);
  view.setUint32(24, sampleRate, true);
  view.setUint32(28, byteRate, true);
  view.setUint16(32, blockAlign, true);
  view.setUint16(34, bitsPerSample, true);

  // "data" sub-chunk
  writeString(view, 36, "data");
  view.setUint32(40, dataSize, true);

  // Copy the sample bytes directly behind the header.
  new Uint8Array(wavBuffer).set(new Uint8Array(buffer), wavHeaderSize);

  return wavBuffer;
}

/**
 * Writes `string` into `view` one byte per character, starting at `offset`.
 *
 * @throws Error if the string would run past the end of the view.
 * @throws RangeError if a character is outside the 7-bit ASCII range, since
 *   `setUint8` would otherwise silently truncate it.
 */
function writeString(view: DataView, offset: number, string: string) {
  if (offset + string.length > view.byteLength) {
    throw new Error("String is too long for the available space in DataView");
  }
  for (let i = 0; i < string.length; i++) {
    const code = string.charCodeAt(i);
    if (code > 0x7f) {
      throw new RangeError(`Non-ASCII character at index ${i} in "${string}"`);
    }
    view.setUint8(offset + i, code);
  }
}