From 853212cf135a97ba8023eda23ea8814a8d8363d8 Mon Sep 17 00:00:00 2001
From: Dakai Ou
Date: Sun, 13 Oct 2024 17:46:08 +0800
Subject: [PATCH] feat: add arrayBufferToWav utility for audio processing

---
 app/components/chat.tsx |  8 ++++---
 app/utils/audio.ts      | 51 +++++++++++++++++++++++++++++++++++++++++
 2 files changed, 56 insertions(+), 3 deletions(-)

diff --git a/app/components/chat.tsx b/app/components/chat.tsx
index eb2848f08..81858bc8a 100644
--- a/app/components/chat.tsx
+++ b/app/components/chat.tsx
@@ -117,7 +117,7 @@ import { MultimodalContent } from "../client/api";
 const localStorage = safeLocalStorage();
 
 import { ClientApi } from "../client/api";
-import { createTTSPlayer } from "../utils/audio";
+import { createTTSPlayer, arrayBufferToWav } from "../utils/audio";
 import { MsEdgeTTS, OUTPUT_FORMAT } from "../utils/ms_edge_tts";
 
 const ttsPlayer = createTTSPlayer();
@@ -1228,11 +1228,13 @@ function _Chat() {
       }
       setSpeechStatus(true);
       try {
+        const waveFile = arrayBufferToWav(audioBuffer);
+        const audioFile = new Blob([waveFile], { type: "audio/wav" });
+        const url = uploadAudio(audioFile);
+
         await ttsPlayer.play(audioBuffer, () => {
           setSpeechStatus(false);
         });
-        const audioFile = new Blob([audioBuffer], { type: "audio/wav" });
-        const url = uploadAudio(audioFile);
       } catch (e) {
         console.error("[OpenAI Speech]", e);
         showToast(prettyObject(e));
diff --git a/app/utils/audio.ts b/app/utils/audio.ts
index f6828c7aa..9af258812 100644
--- a/app/utils/audio.ts
+++ b/app/utils/audio.ts
@@ -43,3 +43,54 @@ export function createTTSPlayer(): TTSPlayer {
 
   return { init, play, stop };
 }
+
+export function arrayBufferToWav(buffer: ArrayBuffer): ArrayBuffer {
+  const numOfChannels = 1; // Mono
+  const sampleRate = 24000; // 24kHz
+  const bitsPerSample = 16;
+
+  const bytesPerSample = bitsPerSample / 8;
+  const blockAlign = numOfChannels * bytesPerSample;
+  const byteRate = sampleRate * blockAlign;
+
+  // WAV header size is 44 bytes
+  const wavHeaderSize = 44;
+  const dataSize = buffer.byteLength;
+  const totalSize = wavHeaderSize + dataSize;
+
+  const wavBuffer = new ArrayBuffer(totalSize);
+  const view = new DataView(wavBuffer);
+
+  // RIFF chunk descriptor
+  writeString(view, 0, "RIFF");
+  view.setUint32(4, totalSize - 8, true); // File size minus RIFF header
+  writeString(view, 8, "WAVE");
+
+  // FMT sub-chunk
+  writeString(view, 12, "fmt ");
+  view.setUint32(16, 16, true); // Sub-chunk size (16 for PCM)
+  view.setUint16(20, 1, true); // Audio format (1 for PCM)
+  view.setUint16(22, numOfChannels, true); // Number of channels
+  view.setUint32(24, sampleRate, true); // Sample rate
+  view.setUint32(28, byteRate, true); // Byte rate
+  view.setUint16(32, blockAlign, true); // Block align
+  view.setUint16(34, bitsPerSample, true); // Bits per sample
+
+  // Data sub-chunk
+  writeString(view, 36, "data");
+  view.setUint32(40, dataSize, true); // Data size
+
+  // Write the PCM samples
+  const audioData = new Uint8Array(buffer);
+  const wavData = new Uint8Array(wavBuffer);
+  wavData.set(audioData, wavHeaderSize);
+
+  return wavBuffer;
+}
+
+// Helper function to write a string to the DataView
+function writeString(view: DataView, offset: number, string: string) {
+  for (let i = 0; i < string.length; i++) {
+    view.setUint8(offset + i, string.charCodeAt(i));
+  }
+}
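
Usage sketch (illustrative, not taken from the patch): arrayBufferToWav assumes the
incoming buffer is raw 16-bit mono PCM at 24 kHz, since those values are hard-coded
into the header it writes. The pcmToWavBlob wrapper and its fetchPcm parameter below
are hypothetical names used only to show the call pattern.

import { arrayBufferToWav } from "../utils/audio";

// Wrap headerless PCM bytes in a 44-byte RIFF/WAVE header so the result
// can be uploaded or played back as an ordinary .wav file.
async function pcmToWavBlob(fetchPcm: () => Promise<ArrayBuffer>): Promise<Blob> {
  const pcm = await fetchPcm(); // raw samples, no container
  const wav = arrayBufferToWav(pcm); // same bytes, prefixed with a WAV header
  return new Blob([wav], { type: "audio/wav" });
}

Note that the patch builds the Blob before calling ttsPlayer.play(); if play() decodes
the buffer with decodeAudioData, that call detaches the ArrayBuffer, so the WAV copy
has to be made while the PCM data is still readable.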