From 853212cf135a97ba8023eda23ea8814a8d8363d8 Mon Sep 17 00:00:00 2001
From: Dakai Ou
Date: Sun, 13 Oct 2024 17:46:08 +0800
Subject: [PATCH] feat: add arrayBufferToWav utility for audio processing

---
 app/components/chat.tsx |  8 ++++---
 app/utils/audio.ts      | 51 +++++++++++++++++++++++++++++++++++++++++
 2 files changed, 56 insertions(+), 3 deletions(-)

diff --git a/app/components/chat.tsx b/app/components/chat.tsx
index eb2848f08..81858bc8a 100644
--- a/app/components/chat.tsx
+++ b/app/components/chat.tsx
@@ -117,7 +117,7 @@ import { MultimodalContent } from "../client/api";
 const localStorage = safeLocalStorage();
 
 import { ClientApi } from "../client/api";
-import { createTTSPlayer } from "../utils/audio";
+import { createTTSPlayer, arrayBufferToWav } from "../utils/audio";
 import { MsEdgeTTS, OUTPUT_FORMAT } from "../utils/ms_edge_tts";
 
 const ttsPlayer = createTTSPlayer();
@@ -1228,11 +1228,13 @@ function _Chat() {
       }
       setSpeechStatus(true);
       try {
+        const waveFile = arrayBufferToWav(audioBuffer);
+        const audioFile = new Blob([waveFile], { type: "audio/wav" });
+        const url = uploadAudio(audioFile);
+
         await ttsPlayer.play(audioBuffer, () => {
           setSpeechStatus(false);
         });
-        const audioFile = new Blob([audioBuffer], { type: "audio/wav" });
-        const url = uploadAudio(audioFile);
       } catch (e) {
         console.error("[OpenAI Speech]", e);
         showToast(prettyObject(e));
diff --git a/app/utils/audio.ts b/app/utils/audio.ts
index f6828c7aa..9af258812 100644
--- a/app/utils/audio.ts
+++ b/app/utils/audio.ts
@@ -43,3 +43,54 @@ export function createTTSPlayer(): TTSPlayer {
 
   return { init, play, stop };
 }
+
+export function arrayBufferToWav(buffer: ArrayBuffer): ArrayBuffer {
+  const numOfChannels = 1; // Mono
+  const sampleRate = 24000; // 24kHz
+  const bitsPerSample = 16;
+
+  const bytesPerSample = bitsPerSample / 8;
+  const blockAlign = numOfChannels * bytesPerSample;
+  const byteRate = sampleRate * blockAlign;
+
+  // WAV header size is 44 bytes
+  const wavHeaderSize = 44;
+  const dataSize = buffer.byteLength;
+  const totalSize = wavHeaderSize + dataSize;
+
+  const wavBuffer = new ArrayBuffer(totalSize);
+  const view = new DataView(wavBuffer);
+
+  // RIFF chunk descriptor
+  writeString(view, 0, "RIFF");
+  view.setUint32(4, totalSize - 8, true); // File size minus RIFF header
+  writeString(view, 8, "WAVE");
+
+  // FMT sub-chunk
+  writeString(view, 12, "fmt ");
+  view.setUint32(16, 16, true); // Sub-chunk size (16 for PCM)
+  view.setUint16(20, 1, true); // Audio format (1 for PCM)
+  view.setUint16(22, numOfChannels, true); // Number of channels
+  view.setUint32(24, sampleRate, true); // Sample rate
+  view.setUint32(28, byteRate, true); // Byte rate
+  view.setUint16(32, blockAlign, true); // Block align
+  view.setUint16(34, bitsPerSample, true); // Bits per sample
+
+  // Data sub-chunk
+  writeString(view, 36, "data");
+  view.setUint32(40, dataSize, true); // Data size
+
+  // Write the PCM samples
+  const audioData = new Uint8Array(buffer);
+  const wavData = new Uint8Array(wavBuffer);
+  wavData.set(audioData, wavHeaderSize);
+
+  return wavBuffer;
+}
+
+// Helper function to write a string to the DataView
+function writeString(view: DataView, offset: number, string: string) {
+  for (let i = 0; i < string.length; i++) {
+    view.setUint8(offset + i, string.charCodeAt(i));
+  }
+}
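
Usage sketch (illustrative, not taken from the patch): arrayBufferToWav assumes the
incoming buffer is raw 16-bit mono PCM at 24 kHz, since those values are hard-coded
into the header it writes. The pcmToWavBlob wrapper and its fetchPcm parameter below
are hypothetical names used only to show the call pattern.

import { arrayBufferToWav } from "../utils/audio";

// Wrap headerless PCM bytes in a 44-byte RIFF/WAVE header so the result
// can be uploaded or played back as an ordinary .wav file.
async function pcmToWavBlob(fetchPcm: () => Promise<ArrayBuffer>): Promise<Blob> {
  const pcm = await fetchPcm(); // raw samples, no container
  const wav = arrayBufferToWav(pcm); // same bytes, prefixed with a WAV header
  return new Blob([wav], { type: "audio/wav" });
}

Note that the patch builds the Blob before calling ttsPlayer.play(); if play() decodes
the buffer with decodeAudioData, that call detaches the ArrayBuffer, so the WAV copy
has to be made while the PCM data is still readable.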