rookie 2024-11-06 15:28:32 +08:00 committed by GitHub
commit 4f30cfc8a9
25 changed files with 582 additions and 8 deletions

View File

@ -65,6 +65,16 @@ export interface SpeechOptions {
onController?: (controller: AbortController) => void;
}
export interface TranscriptionOptions {
model?: "whisper-1";
file: Blob;
language?: string;
prompt?: string;
response_format?: "json" | "text" | "srt" | "verbose_json" | "vtt";
temperature?: number;
onController?: (controller: AbortController) => void;
}
export interface ChatOptions {
messages: RequestMessage[];
config: LLMConfig;
@ -100,6 +110,7 @@ export interface LLMModelProvider {
export abstract class LLMApi {
abstract chat(options: ChatOptions): Promise<void>;
abstract speech(options: SpeechOptions): Promise<ArrayBuffer>;
abstract transcription(options: TranscriptionOptions): Promise<string>;
abstract usage(): Promise<LLMUsage>;
abstract models(): Promise<LLMModel[]>;
}
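
Every provider now has to implement transcription alongside chat and speech. A minimal usage sketch (hypothetical helper; `api` is any concrete LLMApi and `audioBlob` is assumed to hold recorded audio):

// Hypothetical caller; in this commit only ChatGPTApi implements transcription,
// the other providers throw "Method not implemented.".
async function transcribe(api: LLMApi, audioBlob: Blob): Promise<string> {
  return api.transcription({
    file: audioBlob,
    model: "whisper-1", // default when omitted
    response_format: "json", // the OpenAI implementation reads json.text
  });
}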

View File

@ -13,6 +13,7 @@ import {
LLMApi,
LLMModel,
SpeechOptions,
TranscriptionOptions,
MultimodalContent,
} from "../api";
import Locale from "../../locales";
@ -89,6 +90,10 @@ export class QwenApi implements LLMApi {
throw new Error("Method not implemented.");
}
transcription(options: TranscriptionOptions): Promise<string> {
throw new Error("Method not implemented.");
}
async chat(options: ChatOptions) {
const messages = options.messages.map((v) => ({
role: v.role,

View File

@ -1,5 +1,11 @@
import { Anthropic, ApiPath } from "@/app/constant";
import { ChatOptions, getHeaders, LLMApi, SpeechOptions } from "../api";
import {
ChatOptions,
getHeaders,
LLMApi,
SpeechOptions,
TranscriptionOptions,
} from "../api";
import {
useAccessStore,
useAppConfig,
@ -78,6 +84,10 @@ export class ClaudeApi implements LLMApi {
throw new Error("Method not implemented.");
}
transcription(options: TranscriptionOptions): Promise<string> {
throw new Error("Method not implemented.");
}
extractMessage(res: any) {
console.log("[Response] claude response: ", res);

View File

@ -15,6 +15,7 @@ import {
LLMModel,
MultimodalContent,
SpeechOptions,
TranscriptionOptions,
} from "../api";
import Locale from "../../locales";
import {
@ -81,6 +82,10 @@ export class ErnieApi implements LLMApi {
throw new Error("Method not implemented.");
}
transcription(options: TranscriptionOptions): Promise<string> {
throw new Error("Method not implemented.");
}
async chat(options: ChatOptions) {
const messages = options.messages.map((v) => ({
// "error_code": 336006, "error_msg": "the role of message with even index in the messages must be user or function",

View File

@ -14,6 +14,7 @@ import {
LLMModel,
MultimodalContent,
SpeechOptions,
TranscriptionOptions,
} from "../api";
import Locale from "../../locales";
import {
@ -83,6 +84,10 @@ export class DoubaoApi implements LLMApi {
throw new Error("Method not implemented.");
}
transcription(options: TranscriptionOptions): Promise<string> {
throw new Error("Method not implemented.");
}
async chat(options: ChatOptions) {
const messages = options.messages.map((v) => ({
role: v.role,

View File

@ -6,6 +6,7 @@ import {
LLMModel,
LLMUsage,
SpeechOptions,
TranscriptionOptions,
} from "../api";
import {
useAccessStore,
@ -68,6 +69,10 @@ export class GeminiProApi implements LLMApi {
throw new Error("Method not implemented.");
}
transcription(options: TranscriptionOptions): Promise<string> {
throw new Error("Method not implemented.");
}
async chat(options: ChatOptions): Promise<void> {
const apiClient = this;
let multimodal = false;

View File

@ -13,6 +13,7 @@ import {
LLMApi,
LLMModel,
SpeechOptions,
TranscriptionOptions,
} from "../api";
import Locale from "../../locales";
import {
@ -64,6 +65,10 @@ export class SparkApi implements LLMApi {
throw new Error("Method not implemented.");
}
transcription(options: TranscriptionOptions): Promise<string> {
throw new Error("Method not implemented.");
}
async chat(options: ChatOptions) {
const messages: ChatOptions["messages"] = [];
for (const v of options.messages) {

View File

@ -20,6 +20,7 @@ import {
LLMApi,
LLMModel,
SpeechOptions,
TranscriptionOptions,
} from "../api";
import { getClientConfig } from "@/app/config/client";
import { getMessageTextContent } from "@/app/utils";
@ -64,6 +65,10 @@ export class MoonshotApi implements LLMApi {
throw new Error("Method not implemented.");
}
transcription(options: TranscriptionOptions): Promise<string> {
throw new Error("Method not implemented.");
}
async chat(options: ChatOptions) {
const messages: ChatOptions["messages"] = [];
for (const v of options.messages) {

View File

@ -34,6 +34,7 @@ import {
LLMUsage,
MultimodalContent,
SpeechOptions,
TranscriptionOptions,
} from "../api";
import Locale from "../../locales";
import { getClientConfig } from "@/app/config/client";
@ -181,6 +182,47 @@ export class ChatGPTApi implements LLMApi {
}
}
async transcription(options: TranscriptionOptions): Promise<string> {
const formData = new FormData();
formData.append("file", options.file, "audio.wav");
formData.append("model", options.model ?? "whisper-1");
if (options.language) formData.append("language", options.language);
if (options.prompt) formData.append("prompt", options.prompt);
if (options.response_format)
formData.append("response_format", options.response_format);
if (options.temperature !== undefined)
formData.append("temperature", options.temperature.toString());
console.log("[Request] openai audio transcriptions payload: ", options);
const controller = new AbortController();
options.onController?.(controller);
try {
const path = this.path(OpenaiPath.TranscriptionPath);
const headers = getHeaders(true);
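// Note: the body is multipart/form-data, so Content-Type must stay unset and
// fetch will add the boundary itself; getHeaders(true) presumably omits
// Content-Type for this reason. Only the default "json" response_format is
// actually handled below: res.json() would fail for "text", "srt", or "vtt".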
const payload = {
method: "POST",
body: formData,
signal: controller.signal,
headers: headers,
};
// make a fetch request
const requestTimeoutId = setTimeout(
() => controller.abort(),
REQUEST_TIMEOUT_MS,
);
const res = await fetch(path, payload);
clearTimeout(requestTimeoutId);
const json = await res.json();
return json.text;
} catch (e) {
console.log("[Request] failed to make a audio transcriptions request", e);
throw e;
}
}
async chat(options: ChatOptions) {
const modelConfig = {
...useAppConfig.getState().modelConfig,

View File

@ -9,6 +9,7 @@ import {
LLMModel,
MultimodalContent,
SpeechOptions,
TranscriptionOptions,
} from "../api";
import Locale from "../../locales";
import {
@ -93,6 +94,10 @@ export class HunyuanApi implements LLMApi {
throw new Error("Method not implemented.");
}
transcription(options: TranscriptionOptions): Promise<string> {
throw new Error("Method not implemented.");
}
async chat(options: ChatOptions) {
const visionModel = isVisionModel(options.config.model);
const messages = options.messages.map((v, index) => ({

View File

@ -75,6 +75,14 @@
pointer-events: none;
}
&.listening {
width: var(--full-width);
.text {
opacity: 1;
transform: translate(0);
}
}
&:hover {
--delay: 0.5s;
width: var(--full-width);

View File

@ -10,6 +10,8 @@ import React, {
} from "react";
import SendWhiteIcon from "../icons/send-white.svg";
import VoiceOpenIcon from "../icons/vioce-open.svg";
import VoiceCloseIcon from "../icons/vioce-close.svg";
import BrainIcon from "../icons/brain.svg";
import RenameIcon from "../icons/rename.svg";
import ExportIcon from "../icons/share.svg";
@ -72,6 +74,7 @@ import {
isDalle3,
showPlugins,
safeLocalStorage,
isFirefox,
} from "../utils";
import { uploadImage as uploadImageRemote } from "@/app/utils/chat";
@ -98,7 +101,9 @@ import {
import { useNavigate } from "react-router-dom";
import {
CHAT_PAGE_SIZE,
DEFAULT_STT_ENGINE,
DEFAULT_TTS_ENGINE,
FIREFOX_DEFAULT_STT_ENGINE,
ModelProvider,
Path,
REQUEST_TIMEOUT_MS,
@ -117,6 +122,7 @@ import { MultimodalContent } from "../client/api";
import { ClientApi } from "../client/api";
import { createTTSPlayer } from "../utils/audio";
import { OpenAITranscriptionApi, WebTranscriptionApi } from "../utils/speech";
import { MsEdgeTTS, OUTPUT_FORMAT } from "../utils/ms_edge_tts";
import { isEmpty } from "lodash-es";
@ -374,6 +380,7 @@ export function ChatAction(props: {
text: string;
icon: JSX.Element;
onClick: () => void;
isListening?: boolean;
}) {
const iconRef = useRef<HTMLDivElement>(null);
const textRef = useRef<HTMLDivElement>(null);
@ -395,7 +402,9 @@ export function ChatAction(props: {
return (
<div
className={`${styles["chat-input-action"]} clickable`}
className={`${styles["chat-input-action"]} clickable ${
props.isListening ? styles["listening"] : ""
}`}
onClick={() => {
props.onClick();
setTimeout(updateWidth, 1);
@ -553,6 +562,61 @@ export function ChatActions(props: {
}
}, [chatStore, currentModel, models, session]);
const [isListening, setIsListening] = useState(false);
const [isTranscription, setIsTranscription] = useState(false);
const [speechApi, setSpeechApi] = useState<any>(null);
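// Web Speech API is unavailable in Firefox, so the effect below forces the
// Whisper engine there before constructing the recognizer. With an empty
// dependency array it runs once on mount, so a later engine change in
// settings only takes effect after the page reloads.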
useEffect(() => {
if (isFirefox()) config.sttConfig.engine = FIREFOX_DEFAULT_STT_ENGINE;
const lang = config.sttConfig.lang;
setSpeechApi(
config.sttConfig.engine !== DEFAULT_STT_ENGINE
? new WebTranscriptionApi(
(transcription) => onRecognitionEnd(transcription),
lang,
)
: new OpenAITranscriptionApi((transcription) =>
onRecognitionEnd(transcription),
),
);
}, []);
function playSound(fileName: string) {
const audio = new Audio(fileName);
audio.play().catch((error) => {
console.error("error:", error);
});
}
const startListening = async () => {
playSound("/Recordingstart.mp3");
showToast(Locale.Chat.StartSpeak);
if (speechApi) {
await speechApi.start();
setIsListening(true);
document.getElementById("chat-input")?.focus();
}
};
const stopListening = async () => {
showToast(Locale.Chat.CloseSpeak);
if (speechApi) {
if (config.sttConfig.engine !== DEFAULT_STT_ENGINE)
setIsTranscription(true);
await speechApi.stop();
setIsListening(false);
}
playSound("/Recordingdone.mp3");
document.getElementById("chat-input")?.focus();
};
const onRecognitionEnd = (finalTranscript: string) => {
console.log(finalTranscript);
if (finalTranscript) {
props.setUserInput((prevInput) => prevInput + finalTranscript);
}
if (config.sttConfig.engine !== DEFAULT_STT_ENGINE)
setIsTranscription(false);
};
return (
<div className={styles["chat-input-actions"]}>
{couldStop && (
@ -787,6 +851,17 @@ export function ChatActions(props: {
icon={<ShortcutkeyIcon />}
/>
)}
{config.sttConfig.enable && (
<ChatAction
onClick={async () =>
isListening ? await stopListening() : await startListening()
}
text={isListening ? Locale.Chat.StopSpeak : Locale.Chat.StartSpeak}
icon={isListening ? <VoiceOpenIcon /> : <VoiceCloseIcon />}
isListening={isListening}
/>
)}
</div>
);
}
@ -1516,7 +1591,7 @@ function _Chat() {
setAttachImages(images);
}
// 快捷键 shortcut keys
// 快捷键
const [showShortcutKeyModal, setShowShortcutKeyModal] = useState(false);
useEffect(() => {

View File

@ -193,7 +193,9 @@ function CustomCode(props: { children: any; className?: string }) {
const renderShowMoreButton = () => {
if (showToggle && enableCodeFold && collapsed) {
return (
<div className={`show-hide-button ${collapsed ? "collapsed" : "expanded"}`}>
<div
className={`show-hide-button ${collapsed ? "collapsed" : "expanded"}`}
>
<button onClick={toggleCollapsed}>{Locale.NewChat.More}</button>
</div>
);

View File

@ -85,6 +85,7 @@ import { nanoid } from "nanoid";
import { useMaskStore } from "../store/mask";
import { ProviderType } from "../utils/cloud";
import { TTSConfigList } from "./tts-config";
import { STTConfigList } from "./stt-config";
function EditPromptModal(props: { id: string; onClose: () => void }) {
const promptStore = usePromptStore();
@ -1811,6 +1812,17 @@ export function Settings() {
/>
</List>
<List>
<STTConfigList
sttConfig={config.sttConfig}
updateConfig={(updater) => {
const sttConfig = { ...config.sttConfig };
updater(sttConfig);
config.update((config) => (config.sttConfig = sttConfig));
}}
/>
</List>
<DangerItems />
</div>
</ErrorBoundary>

View File

@ -0,0 +1,75 @@
import { STTConfig, STTConfigValidator } from "../store";
import Locale from "../locales";
import { ListItem, Select } from "./ui-lib";
import { DEFAULT_STT_ENGINES, DEFAULT_STT_LANGUAGES } from "../constant";
import { isFirefox } from "../utils";
export function STTConfigList(props: {
sttConfig: STTConfig;
updateConfig: (updater: (config: STTConfig) => void) => void;
}) {
return (
<>
<ListItem
title={Locale.Settings.STT.Enable.Title}
subTitle={Locale.Settings.STT.Enable.SubTitle}
>
<input
type="checkbox"
checked={props.sttConfig.enable}
onChange={(e) =>
props.updateConfig(
(config) => (config.enable = e.currentTarget.checked),
)
}
></input>
</ListItem>
<ListItem title={Locale.Settings.STT.Engine.Title}>
<Select
value={props.sttConfig.engine}
onChange={(e) => {
props.updateConfig(
(config) =>
(config.engine = STTConfigValidator.engine(
e.currentTarget.value,
)),
);
}}
>
{isFirefox()
? DEFAULT_STT_ENGINES.filter((v) => v !== "Web Speech API").map(
(v, i) => (
<option value={v} key={i}>
{v}
</option>
),
)
: DEFAULT_STT_ENGINES.map((v, i) => (
<option value={v} key={i}>
{v}
</option>
))}
</Select>
</ListItem>
{props.sttConfig.engine === "Web Speech API" && !isFirefox() && (
<ListItem title="语言选择">
<Select
value={props.sttConfig.lang}
onChange={(e) => {
props.updateConfig(
(config) => (config.lang = e.currentTarget.value),
);
}}
>
{DEFAULT_STT_LANGUAGES.map((v, i) => (
<option value={v} key={i}>
{v}
</option>
))}
</Select>
</ListItem>
)}
</>
);
}

View File

@ -0,0 +1,119 @@
@import "../styles/animation.scss";
.plugin-page {
height: 100%;
display: flex;
flex-direction: column;
.plugin-page-body {
padding: 20px;
overflow-y: auto;
.plugin-filter {
width: 100%;
max-width: 100%;
margin-bottom: 20px;
animation: slide-in ease 0.3s;
height: 40px;
display: flex;
.search-bar {
flex-grow: 1;
max-width: 100%;
min-width: 0;
outline: none;
}
.search-bar:focus {
border: 1px solid var(--primary);
}
.plugin-filter-lang {
height: 100%;
margin-left: 10px;
}
.plugin-create {
height: 100%;
margin-left: 10px;
box-sizing: border-box;
min-width: 80px;
}
}
.plugin-item {
display: flex;
justify-content: space-between;
padding: 20px;
border: var(--border-in-light);
animation: slide-in ease 0.3s;
&:not(:last-child) {
border-bottom: 0;
}
&:first-child {
border-top-left-radius: 10px;
border-top-right-radius: 10px;
}
&:last-child {
border-bottom-left-radius: 10px;
border-bottom-right-radius: 10px;
}
.plugin-header {
display: flex;
align-items: center;
.plugin-icon {
display: flex;
align-items: center;
justify-content: center;
margin-right: 10px;
}
.plugin-title {
.plugin-name {
font-size: 14px;
font-weight: bold;
}
.plugin-info {
font-size: 12px;
}
.plugin-runtime-warning {
font-size: 12px;
color: #f86c6c;
}
}
}
.plugin-actions {
display: flex;
flex-wrap: nowrap;
transition: all ease 0.3s;
justify-content: center;
align-items: center;
}
@media screen and (max-width: 600px) {
display: flex;
flex-direction: column;
padding-bottom: 10px;
border-radius: 10px;
margin-bottom: 20px;
box-shadow: var(--card-shadow);
&:not(:last-child) {
border-bottom: var(--border-in-light);
}
.plugin-actions {
width: 100%;
justify-content: space-between;
padding-top: 10px;
}
}
}
}
}

View File

@ -160,6 +160,7 @@ export const Anthropic = {
export const OpenaiPath = {
ChatPath: "v1/chat/completions",
SpeechPath: "v1/audio/speech",
TranscriptionPath: "v1/audio/transcriptions",
ImagePath: "v1/images/generations",
UsagePath: "dashboard/billing/usage",
SubsPath: "dashboard/billing/subscription",
@ -290,6 +291,24 @@ export const DEFAULT_TTS_VOICES = [
"shimmer",
];
export const DEFAULT_STT_ENGINE = "OpenAI Whisper";
export const DEFAULT_STT_ENGINES = ["OpenAI Whisper", "Web Speech API"];
export const DEFAULT_STT_LANGUAGE = "zh-CN";
export const DEFAULT_STT_LANGUAGES = [
"zh-CN", // 中文(简体)
"en-US", // 英文
"fr-FR", // 法文
"de-DE", // 德文
"es-ES", // 西班牙文
"it-IT", // 意大利文
"ja-JP", // 日文
"ko-KR", // 韩文
"ru-RU", // 俄文
"pt-BR", // 葡萄牙文(巴西)
"ar-SA", // 阿拉伯文
];
export const FIREFOX_DEFAULT_STT_ENGINE = "OpenAI Whisper";
const openaiModels = [
"gpt-3.5-turbo",
"gpt-3.5-turbo-1106",

app/icons/vioce-close.svg Normal file
View File

@ -0,0 +1 @@
<svg xmlns="http://www.w3.org/2000/svg" width="16" height="16" viewBox="0 0 24 24"><g fill="none" stroke="currentColor" stroke-width="1.5"><path d="M7 8a5 5 0 0 1 10 0v3a5 5 0 0 1-10 0z"/><path stroke-linecap="round" d="M11 8h2m-3 3h4m6-1v1a8 8 0 1 1-16 0v-1m8 9v3"/></g></svg>

app/icons/vioce-open.svg Normal file
View File

@ -0,0 +1 @@
<svg xmlns="http://www.w3.org/2000/svg" width="16" height="16" viewBox="0 0 24 24"><g fill="currentColor" fill-rule="evenodd" clip-rule="evenodd"><path d="M4 9a.75.75 0 0 1 .75.75v1a7.25 7.25 0 1 0 14.5 0v-1a.75.75 0 0 1 1.5 0v1a8.75 8.75 0 0 1-8 8.718v2.282a.75.75 0 0 1-1.5 0v-2.282a8.75 8.75 0 0 1-8-8.718v-1A.75.75 0 0 1 4 9"/><path d="M12 2a5.75 5.75 0 0 0-5.75 5.75v3a5.75 5.75 0 0 0 11.5 0v-3A5.75 5.75 0 0 0 12 2m2 9.5a.75.75 0 0 0 0-1.5h-4a.75.75 0 0 0 0 1.5zm-.25-3.75a.75.75 0 0 1-.75.75h-2A.75.75 0 0 1 11 7h2a.75.75 0 0 1 .75.75"/></g></svg>


View File

@ -92,8 +92,9 @@ const cn = {
return inputHints + "/ 触发补全,: 触发命令";
},
Send: "发送",
StartSpeak: "说话",
StopSpeak: "停止",
StartSpeak: "启用语音输入",
CloseSpeak: "关闭语音输入",
StopSpeak: "录音中....点击结束",
Config: {
Reset: "清除记忆",
SaveAs: "存为面具",
@ -562,6 +563,16 @@ const cn = {
SubTitle: "生成语音的速度",
},
},
STT: {
Enable: {
Title: "启用语音转文本",
SubTitle: "启用语音转文本",
},
Engine: {
Title: "转换引擎",
SubTitle: "音频转换引擎",
},
},
},
Store: {
DefaultTopic: "新的聊天",

View File

@ -94,7 +94,8 @@ const en: LocaleType = {
},
Send: "Send",
StartSpeak: "Start Speak",
StopSpeak: "Stop Speak",
CloseSpeak: "Stop Speak",
StopSpeak: "Recording...",
Config: {
Reset: "Reset to Default",
SaveAs: "Save as Mask",
@ -570,6 +571,16 @@ const en: LocaleType = {
},
Engine: "TTS Engine",
},
STT: {
Enable: {
Title: "Enable STT",
SubTitle: "Enable Speech-to-Text",
},
Engine: {
Title: "STT Engine",
SubTitle: "Text-to-Speech Engine",
},
},
},
Store: {
DefaultTopic: "New Conversation",

View File

@ -5,6 +5,9 @@ import {
DEFAULT_INPUT_TEMPLATE,
DEFAULT_MODELS,
DEFAULT_SIDEBAR_WIDTH,
DEFAULT_STT_ENGINE,
DEFAULT_STT_ENGINES,
DEFAULT_STT_LANGUAGE,
DEFAULT_TTS_ENGINE,
DEFAULT_TTS_ENGINES,
DEFAULT_TTS_MODEL,
@ -20,6 +23,7 @@ export type ModelType = (typeof DEFAULT_MODELS)[number]["name"];
export type TTSModelType = (typeof DEFAULT_TTS_MODELS)[number];
export type TTSVoiceType = (typeof DEFAULT_TTS_VOICES)[number];
export type TTSEngineType = (typeof DEFAULT_TTS_ENGINES)[number];
export type STTEngineType = (typeof DEFAULT_STT_ENGINES)[number];
export enum SubmitKey {
Enter = "Enter",
@ -83,19 +87,25 @@ export const DEFAULT_CONFIG = {
},
ttsConfig: {
enable: false,
enable: true,
autoplay: false,
engine: DEFAULT_TTS_ENGINE,
model: DEFAULT_TTS_MODEL,
voice: DEFAULT_TTS_VOICE,
speed: 1.0,
},
sttConfig: {
enable: true,
engine: DEFAULT_STT_ENGINE,
lang: DEFAULT_STT_LANGUAGE,
},
};
export type ChatConfig = typeof DEFAULT_CONFIG;
export type ModelConfig = ChatConfig["modelConfig"];
export type TTSConfig = ChatConfig["ttsConfig"];
export type STTConfig = ChatConfig["sttConfig"];
export function limitNumber(
x: number,
@ -125,6 +135,12 @@ export const TTSConfigValidator = {
},
};
export const STTConfigValidator = {
engine(x: string) {
return x as STTEngineType;
},
};
export const ModalConfigValidator = {
model(x: string) {
return x as ModelType;

app/utils/speech.ts Normal file
View File

@ -0,0 +1,126 @@
import { ChatGPTApi } from "../client/platforms/openai";
import { getSTTLang } from "../locales";
import { isFirefox } from "../utils";
export type TranscriptionCallback = (transcription: string) => void;
export abstract class SpeechApi {
protected onTranscription: TranscriptionCallback = () => {};
abstract isListening(): boolean;
abstract start(): Promise<void>;
abstract stop(): Promise<void>;
onTranscriptionReceived(callback: TranscriptionCallback) {
this.onTranscription = callback;
}
}
export class OpenAITranscriptionApi extends SpeechApi {
private listeningStatus = false;
private mediaRecorder: MediaRecorder | null = null;
private stream: MediaStream | null = null;
private audioChunks: Blob[] = [];
isListening = () => this.listeningStatus;
constructor(transcriptionCallback?: TranscriptionCallback) {
super();
if (transcriptionCallback) {
this.onTranscriptionReceived(transcriptionCallback);
}
}
async start(): Promise<void> {
// @ts-ignore
navigator.getUserMedia =
// @ts-ignore
navigator.getUserMedia ||
// @ts-ignore
navigator.webkitGetUserMedia ||
// @ts-ignore
navigator.mozGetUserMedia ||
// @ts-ignore
navigator.msGetUserMedia;
if (navigator.mediaDevices) {
const stream = await navigator.mediaDevices.getUserMedia({ audio: true });
this.mediaRecorder = new MediaRecorder(stream);
this.mediaRecorder.ondataavailable = (e) => {
if (e.data && e.data.size > 0) {
this.audioChunks.push(e.data);
}
};
this.stream = stream;
} else {
console.warn("Media Decives will work only with SSL");
return;
}
this.audioChunks = [];
this.mediaRecorder.start(1000);
this.listeningStatus = true;
}
async stop(): Promise<void> {
if (!this.mediaRecorder || !this.listeningStatus) {
return;
}
return new Promise((resolve) => {
this.mediaRecorder!.addEventListener("stop", async () => {
// MediaRecorder emits its native container (typically webm/opus);
// "audio/wav" here is only a label on the assembled blob.
const audioBlob = new Blob(this.audioChunks, { type: "audio/wav" });
const llm = new ChatGPTApi();
const transcription = await llm.transcription({ file: audioBlob });
this.onTranscription(transcription);
this.listeningStatus = false;
resolve();
});
this.mediaRecorder!.stop();
});
}
}
export class WebTranscriptionApi extends SpeechApi {
private listeningStatus = false;
private recognitionInstance: any | null = null;
isListening = () => this.listeningStatus;
constructor(transcriptionCallback?: TranscriptionCallback, lang?: string) {
super();
if (isFirefox()) return;
const SpeechRecognition =
(window as any).SpeechRecognition ||
(window as any).webkitSpeechRecognition;
this.recognitionInstance = new SpeechRecognition();
this.recognitionInstance.continuous = true;
this.recognitionInstance.interimResults = true;
this.recognitionInstance.lang = lang ?? getSTTLang();
if (transcriptionCallback) {
this.onTranscriptionReceived(transcriptionCallback);
}
this.recognitionInstance.onresult = (event: any) => {
const result = event.results[event.results.length - 1];
if (result.isFinal) {
this.onTranscription(result[0].transcript);
}
};
}
async start(): Promise<void> {
this.listeningStatus = true;
await this.recognitionInstance.start();
}
async stop(): Promise<void> {
this.listeningStatus = false;
await this.recognitionInstance.stop();
}
}
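
Both engines share the SpeechApi surface, so a caller can swap them behind one variable. A minimal sketch (hypothetical `dictate` helper; import path assumed relative to app/utils; the app itself selects the engine from config.sttConfig.engine in chat.tsx rather than by feature detection):

import { OpenAITranscriptionApi, WebTranscriptionApi } from "./speech";

// Prefer the browser recognizer when present; otherwise upload to Whisper.
async function dictate(onText: (text: string) => void) {
  const hasWebSpeech =
    "SpeechRecognition" in window || "webkitSpeechRecognition" in window;
  const stt = hasWebSpeech
    ? new WebTranscriptionApi(onText, "en-US")
    : new OpenAITranscriptionApi(onText);
  await stt.start(); // begin recording / recognition
  // ... user speaks; a real UI calls stop() from a button handler ...
  await stt.stop(); // the Whisper path resolves after onText has fired
}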

public/Recordingdone.mp3 Normal file (binary, not shown)

public/Recordingstart.mp3 Normal file (binary, not shown)