Mirror of https://github.com/ChatGPTNextWeb/ChatGPT-Next-Web.git, synced 2025-10-01 23:56:39 +08:00

Merge 0ea650d958 into 6ded4e96e7
This commit is contained in: 4f30cfc8a9
@@ -65,6 +65,16 @@ export interface SpeechOptions {
   onController?: (controller: AbortController) => void;
 }
 
+export interface TranscriptionOptions {
+  model?: "whisper-1";
+  file: Blob;
+  language?: string;
+  prompt?: string;
+  response_format?: "json" | "text" | "srt" | "verbose_json" | "vtt";
+  temperature?: number;
+  onController?: (controller: AbortController) => void;
+}
+
 export interface ChatOptions {
   messages: RequestMessage[];
   config: LLMConfig;
@@ -100,6 +110,7 @@ export interface LLMModelProvider {
 export abstract class LLMApi {
   abstract chat(options: ChatOptions): Promise<void>;
   abstract speech(options: SpeechOptions): Promise<ArrayBuffer>;
+  abstract transcription(options: TranscriptionOptions): Promise<string>;
   abstract usage(): Promise<LLMUsage>;
   abstract models(): Promise<LLMModel[]>;
 }
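
Note: a minimal usage sketch of the new abstraction (the helper name and wiring below are illustrative, not part of this commit; only TranscriptionOptions and LLMApi come from the diff):

// Transcribe a recorded audio Blob with any concrete LLMApi client.
async function transcribeBlob(api: LLMApi, audio: Blob): Promise<string> {
  let controller: AbortController | null = null;
  const text = await api.transcription({
    file: audio,
    model: "whisper-1",
    response_format: "json",
    onController: (c) => (controller = c), // keep a handle so the caller can cancel
  });
  return text;
}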
@@ -13,6 +13,7 @@ import {
   LLMApi,
   LLMModel,
   SpeechOptions,
+  TranscriptionOptions,
   MultimodalContent,
 } from "../api";
 import Locale from "../../locales";
@@ -89,6 +90,10 @@ export class QwenApi implements LLMApi {
     throw new Error("Method not implemented.");
   }
 
+  transcription(options: TranscriptionOptions): Promise<string> {
+    throw new Error("Method not implemented.");
+  }
+
   async chat(options: ChatOptions) {
     const messages = options.messages.map((v) => ({
       role: v.role,
@@ -1,5 +1,11 @@
 import { Anthropic, ApiPath } from "@/app/constant";
-import { ChatOptions, getHeaders, LLMApi, SpeechOptions } from "../api";
+import {
+  ChatOptions,
+  getHeaders,
+  LLMApi,
+  SpeechOptions,
+  TranscriptionOptions,
+} from "../api";
 import {
   useAccessStore,
   useAppConfig,
@@ -78,6 +84,10 @@ export class ClaudeApi implements LLMApi {
     throw new Error("Method not implemented.");
   }
 
+  transcription(options: TranscriptionOptions): Promise<string> {
+    throw new Error("Method not implemented.");
+  }
+
   extractMessage(res: any) {
     console.log("[Response] claude response: ", res);
@@ -15,6 +15,7 @@ import {
   LLMModel,
   MultimodalContent,
   SpeechOptions,
+  TranscriptionOptions,
 } from "../api";
 import Locale from "../../locales";
 import {
@@ -81,6 +82,10 @@ export class ErnieApi implements LLMApi {
     throw new Error("Method not implemented.");
   }
 
+  transcription(options: TranscriptionOptions): Promise<string> {
+    throw new Error("Method not implemented.");
+  }
+
   async chat(options: ChatOptions) {
     const messages = options.messages.map((v) => ({
       // "error_code": 336006, "error_msg": "the role of message with even index in the messages must be user or function",
@@ -14,6 +14,7 @@ import {
   LLMModel,
   MultimodalContent,
   SpeechOptions,
+  TranscriptionOptions,
 } from "../api";
 import Locale from "../../locales";
 import {
@@ -83,6 +84,10 @@ export class DoubaoApi implements LLMApi {
     throw new Error("Method not implemented.");
   }
 
+  transcription(options: TranscriptionOptions): Promise<string> {
+    throw new Error("Method not implemented.");
+  }
+
   async chat(options: ChatOptions) {
     const messages = options.messages.map((v) => ({
       role: v.role,
@@ -6,6 +6,7 @@ import {
   LLMModel,
   LLMUsage,
   SpeechOptions,
+  TranscriptionOptions,
 } from "../api";
 import {
   useAccessStore,
@@ -68,6 +69,10 @@ export class GeminiProApi implements LLMApi {
     throw new Error("Method not implemented.");
   }
 
+  transcription(options: TranscriptionOptions): Promise<string> {
+    throw new Error("Method not implemented.");
+  }
+
   async chat(options: ChatOptions): Promise<void> {
     const apiClient = this;
     let multimodal = false;
@@ -13,6 +13,7 @@ import {
   LLMApi,
   LLMModel,
   SpeechOptions,
+  TranscriptionOptions,
 } from "../api";
 import Locale from "../../locales";
 import {
@@ -64,6 +65,10 @@ export class SparkApi implements LLMApi {
     throw new Error("Method not implemented.");
   }
 
+  transcription(options: TranscriptionOptions): Promise<string> {
+    throw new Error("Method not implemented.");
+  }
+
   async chat(options: ChatOptions) {
     const messages: ChatOptions["messages"] = [];
     for (const v of options.messages) {
@@ -20,6 +20,7 @@ import {
   LLMApi,
   LLMModel,
   SpeechOptions,
+  TranscriptionOptions,
 } from "../api";
 import { getClientConfig } from "@/app/config/client";
 import { getMessageTextContent } from "@/app/utils";
@@ -64,6 +65,10 @@ export class MoonshotApi implements LLMApi {
     throw new Error("Method not implemented.");
   }
 
+  transcription(options: TranscriptionOptions): Promise<string> {
+    throw new Error("Method not implemented.");
+  }
+
   async chat(options: ChatOptions) {
     const messages: ChatOptions["messages"] = [];
     for (const v of options.messages) {
@@ -34,6 +34,7 @@ import {
   LLMUsage,
   MultimodalContent,
   SpeechOptions,
+  TranscriptionOptions,
 } from "../api";
 import Locale from "../../locales";
 import { getClientConfig } from "@/app/config/client";
@@ -181,6 +182,47 @@ export class ChatGPTApi implements LLMApi {
     }
   }
 
+  async transcription(options: TranscriptionOptions): Promise<string> {
+    const formData = new FormData();
+    formData.append("file", options.file, "audio.wav");
+    formData.append("model", options.model ?? "whisper-1");
+    if (options.language) formData.append("language", options.language);
+    if (options.prompt) formData.append("prompt", options.prompt);
+    if (options.response_format)
+      formData.append("response_format", options.response_format);
+    if (options.temperature)
+      formData.append("temperature", options.temperature.toString());
+
+    console.log("[Request] openai audio transcriptions payload: ", options);
+
+    const controller = new AbortController();
+    options.onController?.(controller);
+
+    try {
+      const path = this.path(OpenaiPath.TranscriptionPath);
+      const headers = getHeaders(true);
+      const payload = {
+        method: "POST",
+        body: formData,
+        signal: controller.signal,
+        headers: headers,
+      };
+
+      // make a fetch request
+      const requestTimeoutId = setTimeout(
+        () => controller.abort(),
+        REQUEST_TIMEOUT_MS,
+      );
+      const res = await fetch(path, payload);
+      clearTimeout(requestTimeoutId);
+      const json = await res.json();
+      return json.text;
+    } catch (e) {
+      console.log("[Request] failed to make an audio transcription request", e);
+      throw e;
+    }
+  }
+
   async chat(options: ChatOptions) {
     const modelConfig = {
       ...useAppConfig.getState().modelConfig,
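
Note: getHeaders(true) is presumably passed so no JSON Content-Type header is attached, letting the browser set the multipart boundary for the FormData body. For illustration, the equivalent raw request against the OpenAI endpoint (the helper and key handling below are assumptions, not this client's code):

// Raw multipart request to the transcription endpoint, for illustration only.
async function rawTranscriptionRequest(file: Blob, apiKey: string): Promise<string> {
  const form = new FormData();
  form.append("file", file, "audio.wav");
  form.append("model", "whisper-1");
  const res = await fetch("https://api.openai.com/v1/audio/transcriptions", {
    method: "POST",
    headers: { Authorization: `Bearer ${apiKey}` }, // no explicit Content-Type
    body: form,
  });
  const json = await res.json();
  return json.text; // the default json format returns { text: string }
}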
@@ -9,6 +9,7 @@ import {
   LLMModel,
   MultimodalContent,
   SpeechOptions,
+  TranscriptionOptions,
 } from "../api";
 import Locale from "../../locales";
 import {
@@ -93,6 +94,10 @@ export class HunyuanApi implements LLMApi {
     throw new Error("Method not implemented.");
   }
 
+  transcription(options: TranscriptionOptions): Promise<string> {
+    throw new Error("Method not implemented.");
+  }
+
   async chat(options: ChatOptions) {
     const visionModel = isVisionModel(options.config.model);
     const messages = options.messages.map((v, index) => ({
@@ -75,6 +75,14 @@
     pointer-events: none;
   }
 
+  &.listening {
+    width: var(--full-width);
+    .text {
+      opacity: 1;
+      transform: translate(0);
+    }
+  }
+
   &:hover {
     --delay: 0.5s;
     width: var(--full-width);
@@ -10,6 +10,8 @@ import React, {
 } from "react";
 
 import SendWhiteIcon from "../icons/send-white.svg";
+import VoiceOpenIcon from "../icons/vioce-open.svg";
+import VoiceCloseIcon from "../icons/vioce-close.svg";
 import BrainIcon from "../icons/brain.svg";
 import RenameIcon from "../icons/rename.svg";
 import ExportIcon from "../icons/share.svg";
@@ -72,6 +74,7 @@ import {
   isDalle3,
   showPlugins,
   safeLocalStorage,
+  isFirefox,
 } from "../utils";
 
 import { uploadImage as uploadImageRemote } from "@/app/utils/chat";
@@ -98,7 +101,9 @@ import {
 import { useNavigate } from "react-router-dom";
 import {
   CHAT_PAGE_SIZE,
+  DEFAULT_STT_ENGINE,
   DEFAULT_TTS_ENGINE,
+  FIREFOX_DEFAULT_STT_ENGINE,
   ModelProvider,
   Path,
   REQUEST_TIMEOUT_MS,
@@ -117,6 +122,7 @@ import { MultimodalContent } from "../client/api";
 
 import { ClientApi } from "../client/api";
 import { createTTSPlayer } from "../utils/audio";
+import { OpenAITranscriptionApi, WebTranscriptionApi } from "../utils/speech";
 import { MsEdgeTTS, OUTPUT_FORMAT } from "../utils/ms_edge_tts";
 
 import { isEmpty } from "lodash-es";
@@ -374,6 +380,7 @@ export function ChatAction(props: {
   text: string;
   icon: JSX.Element;
   onClick: () => void;
+  isListening?: boolean;
 }) {
   const iconRef = useRef<HTMLDivElement>(null);
   const textRef = useRef<HTMLDivElement>(null);
@@ -395,7 +402,9 @@ export function ChatAction(props: {
 
   return (
     <div
-      className={`${styles["chat-input-action"]} clickable`}
+      className={`${styles["chat-input-action"]} clickable ${
+        props.isListening ? styles["listening"] : ""
+      }`}
       onClick={() => {
         props.onClick();
         setTimeout(updateWidth, 1);
@@ -553,6 +562,61 @@ export function ChatActions(props: {
     }
   }, [chatStore, currentModel, models, session]);
 
+  const [isListening, setIsListening] = useState(false);
+  const [isTranscription, setIsTranscription] = useState(false);
+  const [speechApi, setSpeechApi] = useState<any>(null);
+
+  useEffect(() => {
+    if (isFirefox()) config.sttConfig.engine = FIREFOX_DEFAULT_STT_ENGINE;
+    const lang = config.sttConfig.lang;
+    setSpeechApi(
+      config.sttConfig.engine !== DEFAULT_STT_ENGINE
+        ? new WebTranscriptionApi(
+            (transcription) => onRecognitionEnd(transcription),
+            lang,
+          )
+        : new OpenAITranscriptionApi((transcription) =>
+            onRecognitionEnd(transcription),
+          ),
+    );
+  }, []);
+
+  function playSound(fileName: string) {
+    const audio = new Audio(fileName);
+    audio.play().catch((error) => {
+      console.error("error:", error);
+    });
+  }
+
+  const startListening = async () => {
+    playSound("/Recordingstart.mp3");
+    showToast(Locale.Chat.StartSpeak);
+    if (speechApi) {
+      await speechApi.start();
+      setIsListening(true);
+      document.getElementById("chat-input")?.focus();
+    }
+  };
+  const stopListening = async () => {
+    showToast(Locale.Chat.CloseSpeak);
+    if (speechApi) {
+      if (config.sttConfig.engine !== DEFAULT_STT_ENGINE)
+        setIsTranscription(true);
+      await speechApi.stop();
+      setIsListening(false);
+    }
+    playSound("/Recordingdone.mp3");
+    document.getElementById("chat-input")?.focus();
+  };
+  const onRecognitionEnd = (finalTranscript: string) => {
+    console.log(finalTranscript);
+    if (finalTranscript) {
+      props.setUserInput((prevInput) => prevInput + finalTranscript);
+    }
+    if (config.sttConfig.engine !== DEFAULT_STT_ENGINE)
+      setIsTranscription(false);
+  };
+
   return (
     <div className={styles["chat-input-actions"]}>
       {couldStop && (
@@ -787,6 +851,17 @@ export function ChatActions(props: {
           icon={<ShortcutkeyIcon />}
         />
       )}
+
+      {config.sttConfig.enable && (
+        <ChatAction
+          onClick={async () =>
+            isListening ? await stopListening() : await startListening()
+          }
+          text={isListening ? Locale.Chat.StopSpeak : Locale.Chat.StartSpeak}
+          icon={isListening ? <VoiceOpenIcon /> : <VoiceCloseIcon />}
+          isListening={isListening}
+        />
+      )}
     </div>
   );
 }
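
Note: the action toggles one recognizer between start and stop; a condensed standalone sketch of that flow (React state wiring elided, names mirror the diff):

// Toggle helper equivalent to the onClick above.
async function toggleListening(
  speechApi: { start(): Promise<void>; stop(): Promise<void> },
  isListening: boolean,
  setIsListening: (v: boolean) => void,
) {
  if (isListening) {
    await speechApi.stop(); // the transcription callback fires after stop
    setIsListening(false);
  } else {
    await speechApi.start(); // begins recording or live recognition
    setIsListening(true);
  }
}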
@ -1516,7 +1591,7 @@ function _Chat() {
|
|||||||
setAttachImages(images);
|
setAttachImages(images);
|
||||||
}
|
}
|
||||||
|
|
||||||
// 快捷键 shortcut keys
|
// 快捷键
|
||||||
const [showShortcutKeyModal, setShowShortcutKeyModal] = useState(false);
|
const [showShortcutKeyModal, setShowShortcutKeyModal] = useState(false);
|
||||||
|
|
||||||
useEffect(() => {
|
useEffect(() => {
|
||||||
@@ -193,7 +193,9 @@ function CustomCode(props: { children: any; className?: string }) {
   const renderShowMoreButton = () => {
     if (showToggle && enableCodeFold && collapsed) {
       return (
-        <div className={`show-hide-button ${collapsed ? "collapsed" : "expanded"}`}>
+        <div
+          className={`show-hide-button ${collapsed ? "collapsed" : "expanded"}`}
+        >
           <button onClick={toggleCollapsed}>{Locale.NewChat.More}</button>
         </div>
       );
@ -85,6 +85,7 @@ import { nanoid } from "nanoid";
|
|||||||
import { useMaskStore } from "../store/mask";
|
import { useMaskStore } from "../store/mask";
|
||||||
import { ProviderType } from "../utils/cloud";
|
import { ProviderType } from "../utils/cloud";
|
||||||
import { TTSConfigList } from "./tts-config";
|
import { TTSConfigList } from "./tts-config";
|
||||||
|
import { STTConfigList } from "./stt-config";
|
||||||
|
|
||||||
function EditPromptModal(props: { id: string; onClose: () => void }) {
|
function EditPromptModal(props: { id: string; onClose: () => void }) {
|
||||||
const promptStore = usePromptStore();
|
const promptStore = usePromptStore();
|
||||||
@ -1811,6 +1812,17 @@ export function Settings() {
|
|||||||
/>
|
/>
|
||||||
</List>
|
</List>
|
||||||
|
|
||||||
|
<List>
|
||||||
|
<STTConfigList
|
||||||
|
sttConfig={config.sttConfig}
|
||||||
|
updateConfig={(updater) => {
|
||||||
|
const sttConfig = { ...config.sttConfig };
|
||||||
|
updater(sttConfig);
|
||||||
|
config.update((config) => (config.sttConfig = sttConfig));
|
||||||
|
}}
|
||||||
|
/>
|
||||||
|
</List>
|
||||||
|
|
||||||
<DangerItems />
|
<DangerItems />
|
||||||
</div>
|
</div>
|
||||||
</ErrorBoundary>
|
</ErrorBoundary>
|
||||||
|
app/components/stt-config.tsx (new file, 75 lines)
@@ -0,0 +1,75 @@
+import { STTConfig, STTConfigValidator } from "../store";
+
+import Locale from "../locales";
+import { ListItem, Select } from "./ui-lib";
+import { DEFAULT_STT_ENGINES, DEFAULT_STT_LANGUAGES } from "../constant";
+import { isFirefox } from "../utils";
+
+export function STTConfigList(props: {
+  sttConfig: STTConfig;
+  updateConfig: (updater: (config: STTConfig) => void) => void;
+}) {
+  return (
+    <>
+      <ListItem
+        title={Locale.Settings.STT.Enable.Title}
+        subTitle={Locale.Settings.STT.Enable.SubTitle}
+      >
+        <input
+          type="checkbox"
+          checked={props.sttConfig.enable}
+          onChange={(e) =>
+            props.updateConfig(
+              (config) => (config.enable = e.currentTarget.checked),
+            )
+          }
+        ></input>
+      </ListItem>
+      <ListItem title={Locale.Settings.STT.Engine.Title}>
+        <Select
+          value={props.sttConfig.engine}
+          onChange={(e) => {
+            props.updateConfig(
+              (config) =>
+                (config.engine = STTConfigValidator.engine(
+                  e.currentTarget.value,
+                )),
+            );
+          }}
+        >
+          {isFirefox()
+            ? DEFAULT_STT_ENGINES.filter((v) => v !== "Web Speech API").map(
+                (v, i) => (
+                  <option value={v} key={i}>
+                    {v}
+                  </option>
+                ),
+              )
+            : DEFAULT_STT_ENGINES.map((v, i) => (
+                <option value={v} key={i}>
+                  {v}
+                </option>
+              ))}
+        </Select>
+      </ListItem>
+      {props.sttConfig.engine === "Web Speech API" && !isFirefox() && (
+        <ListItem title="语言选择">
+          <Select
+            value={props.sttConfig.lang}
+            onChange={(e) => {
+              props.updateConfig(
+                (config) => (config.lang = e.currentTarget.value),
+              );
+            }}
+          >
+            {DEFAULT_STT_LANGUAGES.map((v, i) => (
+              <option value={v} key={i}>
+                {v}
+              </option>
+            ))}
+          </Select>
+        </ListItem>
+      )}
+    </>
+  );
+}
app/components/stt.module.scss (new file, 119 lines)
@@ -0,0 +1,119 @@
+@import "../styles/animation.scss";
+.plugin-page {
+  height: 100%;
+  display: flex;
+  flex-direction: column;
+
+  .plugin-page-body {
+    padding: 20px;
+    overflow-y: auto;
+
+    .plugin-filter {
+      width: 100%;
+      max-width: 100%;
+      margin-bottom: 20px;
+      animation: slide-in ease 0.3s;
+      height: 40px;
+
+      display: flex;
+
+      .search-bar {
+        flex-grow: 1;
+        max-width: 100%;
+        min-width: 0;
+        outline: none;
+      }
+
+      .search-bar:focus {
+        border: 1px solid var(--primary);
+      }
+
+      .plugin-filter-lang {
+        height: 100%;
+        margin-left: 10px;
+      }
+
+      .plugin-create {
+        height: 100%;
+        margin-left: 10px;
+        box-sizing: border-box;
+        min-width: 80px;
+      }
+    }
+
+    .plugin-item {
+      display: flex;
+      justify-content: space-between;
+      padding: 20px;
+      border: var(--border-in-light);
+      animation: slide-in ease 0.3s;
+
+      &:not(:last-child) {
+        border-bottom: 0;
+      }
+
+      &:first-child {
+        border-top-left-radius: 10px;
+        border-top-right-radius: 10px;
+      }
+
+      &:last-child {
+        border-bottom-left-radius: 10px;
+        border-bottom-right-radius: 10px;
+      }
+
+      .plugin-header {
+        display: flex;
+        align-items: center;
+
+        .plugin-icon {
+          display: flex;
+          align-items: center;
+          justify-content: center;
+          margin-right: 10px;
+        }
+
+        .plugin-title {
+          .plugin-name {
+            font-size: 14px;
+            font-weight: bold;
+          }
+          .plugin-info {
+            font-size: 12px;
+          }
+          .plugin-runtime-warning {
+            font-size: 12px;
+            color: #f86c6c;
+          }
+        }
+      }
+
+      .plugin-actions {
+        display: flex;
+        flex-wrap: nowrap;
+        transition: all ease 0.3s;
+        justify-content: center;
+        align-items: center;
+      }
+
+      @media screen and (max-width: 600px) {
+        display: flex;
+        flex-direction: column;
+        padding-bottom: 10px;
+        border-radius: 10px;
+        margin-bottom: 20px;
+        box-shadow: var(--card-shadow);
+
+        &:not(:last-child) {
+          border-bottom: var(--border-in-light);
+        }
+
+        .plugin-actions {
+          width: 100%;
+          justify-content: space-between;
+          padding-top: 10px;
+        }
+      }
+    }
+  }
+}
@@ -160,6 +160,7 @@ export const Anthropic = {
 export const OpenaiPath = {
   ChatPath: "v1/chat/completions",
   SpeechPath: "v1/audio/speech",
+  TranscriptionPath: "v1/audio/transcriptions",
   ImagePath: "v1/images/generations",
   UsagePath: "dashboard/billing/usage",
   SubsPath: "dashboard/billing/subscription",
@@ -290,6 +291,24 @@ export const DEFAULT_TTS_VOICES = [
   "shimmer",
 ];
+
+export const DEFAULT_STT_ENGINE = "OpenAI Whisper";
+export const DEFAULT_STT_ENGINES = ["OpenAI Whisper", "Web Speech API"];
+export const DEFAULT_STT_LANGUAGE = "zh-CN";
+export const DEFAULT_STT_LANGUAGES = [
+  "zh-CN", // Chinese (Simplified)
+  "en-US", // English
+  "fr-FR", // French
+  "de-DE", // German
+  "es-ES", // Spanish
+  "it-IT", // Italian
+  "ja-JP", // Japanese
+  "ko-KR", // Korean
+  "ru-RU", // Russian
+  "pt-BR", // Portuguese (Brazil)
+  "ar-SA", // Arabic
+];
+export const FIREFOX_DEFAULT_STT_ENGINE = "OpenAI Whisper";
 
 const openaiModels = [
   "gpt-3.5-turbo",
   "gpt-3.5-turbo-1106",
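
Note: Firefox ships no SpeechRecognition implementation, so the Web Speech API engine cannot be offered there; a sketch of the fallback these constants support (the helper name is illustrative, not part of this commit):

// Pick an STT engine for the current browser.
function resolveSttEngine(configured: string, firefox: boolean): string {
  if (firefox) return FIREFOX_DEFAULT_STT_ENGINE; // "OpenAI Whisper"
  return DEFAULT_STT_ENGINES.includes(configured)
    ? configured
    : DEFAULT_STT_ENGINE;
}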
app/icons/vioce-close.svg (new file, 1 line)
@@ -0,0 +1 @@
+<svg xmlns="http://www.w3.org/2000/svg" width="16" height="16" viewBox="0 0 24 24"><g fill="none" stroke="currentColor" stroke-width="1.5"><path d="M7 8a5 5 0 0 1 10 0v3a5 5 0 0 1-10 0z"/><path stroke-linecap="round" d="M11 8h2m-3 3h4m6-1v1a8 8 0 1 1-16 0v-1m8 9v3"/></g></svg>

app/icons/vioce-open.svg (new file, 1 line)
@@ -0,0 +1 @@
+<svg xmlns="http://www.w3.org/2000/svg" width="16" height="16" viewBox="0 0 24 24"><g fill="currentColor" fill-rule="evenodd" clip-rule="evenodd"><path d="M4 9a.75.75 0 0 1 .75.75v1a7.25 7.25 0 1 0 14.5 0v-1a.75.75 0 0 1 1.5 0v1a8.75 8.75 0 0 1-8 8.718v2.282a.75.75 0 0 1-1.5 0v-2.282a8.75 8.75 0 0 1-8-8.718v-1A.75.75 0 0 1 4 9"/><path d="M12 2a5.75 5.75 0 0 0-5.75 5.75v3a5.75 5.75 0 0 0 11.5 0v-3A5.75 5.75 0 0 0 12 2m2 9.5a.75.75 0 0 0 0-1.5h-4a.75.75 0 0 0 0 1.5zm-.25-3.75a.75.75 0 0 1-.75.75h-2A.75.75 0 0 1 11 7h2a.75.75 0 0 1 .75.75"/></g></svg>
@@ -92,8 +92,9 @@ const cn = {
       return inputHints + ",/ 触发补全,: 触发命令";
     },
     Send: "发送",
-    StartSpeak: "说话",
-    StopSpeak: "停止",
+    StartSpeak: "启用语音输入",
+    CloseSpeak: "关闭语音输入",
+    StopSpeak: "录音中....点击结束",
     Config: {
       Reset: "清除记忆",
       SaveAs: "存为面具",
@ -562,6 +563,16 @@ const cn = {
|
|||||||
SubTitle: "生成语音的速度",
|
SubTitle: "生成语音的速度",
|
||||||
},
|
},
|
||||||
},
|
},
|
||||||
|
STT: {
|
||||||
|
Enable: {
|
||||||
|
Title: "启用语音转文本",
|
||||||
|
SubTitle: "启用语音转文本",
|
||||||
|
},
|
||||||
|
Engine: {
|
||||||
|
Title: "转换引擎",
|
||||||
|
SubTitle: "音频转换引擎",
|
||||||
|
},
|
||||||
|
},
|
||||||
},
|
},
|
||||||
Store: {
|
Store: {
|
||||||
DefaultTopic: "新的聊天",
|
DefaultTopic: "新的聊天",
|
||||||
@@ -94,7 +94,8 @@ const en: LocaleType = {
     },
     Send: "Send",
     StartSpeak: "Start Speak",
-    StopSpeak: "Stop Speak",
+    CloseSpeak: "Stop Speak",
+    StopSpeak: "Recording...",
     Config: {
       Reset: "Reset to Default",
      SaveAs: "Save as Mask",
@ -570,6 +571,16 @@ const en: LocaleType = {
|
|||||||
},
|
},
|
||||||
Engine: "TTS Engine",
|
Engine: "TTS Engine",
|
||||||
},
|
},
|
||||||
|
STT: {
|
||||||
|
Enable: {
|
||||||
|
Title: "Enable STT",
|
||||||
|
SubTitle: "Enable Speech-to-Text",
|
||||||
|
},
|
||||||
|
Engine: {
|
||||||
|
Title: "STT Engine",
|
||||||
|
SubTitle: "Text-to-Speech Engine",
|
||||||
|
},
|
||||||
|
},
|
||||||
},
|
},
|
||||||
Store: {
|
Store: {
|
||||||
DefaultTopic: "New Conversation",
|
DefaultTopic: "New Conversation",
|
||||||
@@ -5,6 +5,9 @@ import {
   DEFAULT_INPUT_TEMPLATE,
   DEFAULT_MODELS,
   DEFAULT_SIDEBAR_WIDTH,
+  DEFAULT_STT_ENGINE,
+  DEFAULT_STT_ENGINES,
+  DEFAULT_STT_LANGUAGE,
   DEFAULT_TTS_ENGINE,
   DEFAULT_TTS_ENGINES,
   DEFAULT_TTS_MODEL,
@@ -20,6 +23,7 @@ export type ModelType = (typeof DEFAULT_MODELS)[number]["name"];
 export type TTSModelType = (typeof DEFAULT_TTS_MODELS)[number];
 export type TTSVoiceType = (typeof DEFAULT_TTS_VOICES)[number];
 export type TTSEngineType = (typeof DEFAULT_TTS_ENGINES)[number];
+export type STTEngineType = (typeof DEFAULT_STT_ENGINES)[number];
 
 export enum SubmitKey {
   Enter = "Enter",
@ -83,19 +87,25 @@ export const DEFAULT_CONFIG = {
|
|||||||
},
|
},
|
||||||
|
|
||||||
ttsConfig: {
|
ttsConfig: {
|
||||||
enable: false,
|
enable: true,
|
||||||
autoplay: false,
|
autoplay: false,
|
||||||
engine: DEFAULT_TTS_ENGINE,
|
engine: DEFAULT_TTS_ENGINE,
|
||||||
model: DEFAULT_TTS_MODEL,
|
model: DEFAULT_TTS_MODEL,
|
||||||
voice: DEFAULT_TTS_VOICE,
|
voice: DEFAULT_TTS_VOICE,
|
||||||
speed: 1.0,
|
speed: 1.0,
|
||||||
},
|
},
|
||||||
|
sttConfig: {
|
||||||
|
enable: true,
|
||||||
|
engine: DEFAULT_STT_ENGINE,
|
||||||
|
lang: DEFAULT_STT_LANGUAGE,
|
||||||
|
},
|
||||||
};
|
};
|
||||||
|
|
||||||
export type ChatConfig = typeof DEFAULT_CONFIG;
|
export type ChatConfig = typeof DEFAULT_CONFIG;
|
||||||
|
|
||||||
export type ModelConfig = ChatConfig["modelConfig"];
|
export type ModelConfig = ChatConfig["modelConfig"];
|
||||||
export type TTSConfig = ChatConfig["ttsConfig"];
|
export type TTSConfig = ChatConfig["ttsConfig"];
|
||||||
|
export type STTConfig = ChatConfig["sttConfig"];
|
||||||
|
|
||||||
export function limitNumber(
|
export function limitNumber(
|
||||||
x: number,
|
x: number,
|
||||||
@ -125,6 +135,12 @@ export const TTSConfigValidator = {
|
|||||||
},
|
},
|
||||||
};
|
};
|
||||||
|
|
||||||
|
export const STTConfigValidator = {
|
||||||
|
engine(x: string) {
|
||||||
|
return x as STTEngineType;
|
||||||
|
},
|
||||||
|
};
|
||||||
|
|
||||||
export const ModalConfigValidator = {
|
export const ModalConfigValidator = {
|
||||||
model(x: string) {
|
model(x: string) {
|
||||||
return x as ModelType;
|
return x as ModelType;
|
||||||
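
Note: STTConfigValidator.engine is a bare cast; a stricter variant (an assumption, not in this commit) would reject unknown values instead:

// Hypothetical stricter validator: fall back to the default on unknown engines.
const StrictSTTConfigValidator = {
  engine(x: string): STTEngineType {
    return DEFAULT_STT_ENGINES.includes(x)
      ? (x as STTEngineType)
      : DEFAULT_STT_ENGINE;
  },
};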
app/utils/speech.ts (new file, 126 lines)
@@ -0,0 +1,126 @@
+import { ChatGPTApi } from "../client/platforms/openai";
+import { getSTTLang } from "../locales";
+import { isFirefox } from "../utils";
+
+export type TranscriptionCallback = (transcription: string) => void;
+
+export abstract class SpeechApi {
+  protected onTranscription: TranscriptionCallback = () => {};
+
+  abstract isListening(): boolean;
+  abstract start(): Promise<void>;
+  abstract stop(): Promise<void>;
+
+  onTranscriptionReceived(callback: TranscriptionCallback) {
+    this.onTranscription = callback;
+  }
+}
+
+export class OpenAITranscriptionApi extends SpeechApi {
+  private listeningStatus = false;
+  private mediaRecorder: MediaRecorder | null = null;
+  private stream: MediaStream | null = null;
+  private audioChunks: Blob[] = [];
+
+  isListening = () => this.listeningStatus;
+
+  constructor(transcriptionCallback?: TranscriptionCallback) {
+    super();
+    if (transcriptionCallback) {
+      this.onTranscriptionReceived(transcriptionCallback);
+    }
+  }
+
+  async start(): Promise<void> {
+    // @ts-ignore
+    navigator.getUserMedia =
+      // @ts-ignore
+      navigator.getUserMedia ||
+      // @ts-ignore
+      navigator.webkitGetUserMedia ||
+      // @ts-ignore
+      navigator.mozGetUserMedia ||
+      // @ts-ignore
+      navigator.msGetUserMedia;
+    if (navigator.mediaDevices) {
+      const stream = await navigator.mediaDevices.getUserMedia({
+        audio: true,
+      });
+      this.mediaRecorder = new MediaRecorder(stream);
+      this.mediaRecorder.ondataavailable = (e) => {
+        if (e.data && e.data.size > 0) {
+          this.audioChunks.push(e.data);
+        }
+      };
+
+      this.stream = stream;
+    } else {
+      console.warn("Media Devices will work only with SSL");
+      return;
+    }
+
+    this.audioChunks = [];
+
+    // this.recorder.addEventListener("dataavailable", (event) => {
+    //   this.audioChunks.push(event.data);
+    // });
+
+    this.mediaRecorder.start(1000);
+    this.listeningStatus = true;
+  }
+
+  async stop(): Promise<void> {
+    if (!this.mediaRecorder || !this.listeningStatus) {
+      return;
+    }
+
+    return new Promise((resolve) => {
+      this.mediaRecorder!.addEventListener("stop", async () => {
+        const audioBlob = new Blob(this.audioChunks, { type: "audio/wav" });
+        const llm = new ChatGPTApi();
+        const transcription = await llm.transcription({ file: audioBlob });
+        this.onTranscription(transcription);
+        this.listeningStatus = false;
+        resolve();
+      });
+
+      this.mediaRecorder!.stop();
+    });
+  }
+}
+
+export class WebTranscriptionApi extends SpeechApi {
+  private listeningStatus = false;
+  private recognitionInstance: any | null = null;
+
+  isListening = () => this.listeningStatus;
+
+  constructor(transcriptionCallback?: TranscriptionCallback, lang?: string) {
+    super();
+    if (isFirefox()) return;
+    const SpeechRecognition =
+      (window as any).SpeechRecognition ||
+      (window as any).webkitSpeechRecognition;
+    this.recognitionInstance = new SpeechRecognition();
+    this.recognitionInstance.continuous = true;
+    this.recognitionInstance.interimResults = true;
+    this.recognitionInstance.lang = lang ?? getSTTLang();
+    if (transcriptionCallback) {
+      this.onTranscriptionReceived(transcriptionCallback);
+    }
+    this.recognitionInstance.onresult = (event: any) => {
+      const result = event.results[event.results.length - 1];
+      if (result.isFinal) {
+        this.onTranscription(result[0].transcript);
+      }
+    };
+  }
+
+  async start(): Promise<void> {
+    this.listeningStatus = true;
+    await this.recognitionInstance.start();
+  }
+
+  async stop(): Promise<void> {
+    this.listeningStatus = false;
+    await this.recognitionInstance.stop();
+  }
+}
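
Note: a minimal standalone usage sketch of the two recognizers (outside the React wiring in chat.tsx; the logging callback and demo wrapper are illustrative):

import { OpenAITranscriptionApi, WebTranscriptionApi } from "./speech";

async function demo() {
  // Whisper path: record with MediaRecorder, transcribe when stopped.
  const whisper = new OpenAITranscriptionApi((text) => console.log("[stt]", text));
  await whisper.start(); // prompts for microphone permission
  // ...later, when the user clicks the action again:
  await whisper.stop(); // uploads the recording and fires the callback

  // Browser path: streaming recognition via the Web Speech API (not on Firefox).
  const web = new WebTranscriptionApi((text) => console.log("[stt]", text), "en-US");
  await web.start();
}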
public/Recordingdone.mp3 (new binary file, not shown)
public/Recordingstart.mp3 (new binary file, not shown)