Add voice input

sijinhui 2024-03-28 00:48:40 +08:00
parent 7ffabb77f9
commit 8620df325d
10 changed files with 217 additions and 110 deletions

View File

@@ -1,16 +0,0 @@
import { Flex } from "antd";
import VoiceInput from "@/app/components/voice-input";
export default async function UsersPage() {
// const users: User[] = await getData();
// console.log("data", data);
return (
<>
<Flex gap="middle" vertical>
<VoiceInput />
</Flex>
</>
);
}

View File

@@ -643,7 +643,7 @@
background-color: var(--white);
color: var(--black);
font-family: inherit;
padding: 10px 90px 10px 14px;
padding: 10px 120px 10px 14px;
resize: none;
outline: none;
box-sizing: border-box;
@@ -661,6 +661,14 @@
bottom: 32px;
}
.chat-input-send-area {
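// Holds the new voice-input button just left of the send button; the
// textarea's right padding grows from 90px to 120px above to make room.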
color: white;
position: absolute;
right: 100px;
bottom: 32px;
}
@media only screen and (max-width: 600px) {
.chat-input {
font-size: 16px;

View File

@@ -98,11 +98,14 @@ import { ChatCommandPrefix, useChatCommand, useCommand } from "../command";
import { prettyObject } from "../utils/format";
import { ExportMessageModal } from "./exporter";
import { getClientConfig } from "../config/client";
import { Button } from "emoji-picker-react/src/components/atoms/Button";
import Image from "next/image";
import { useAllModels } from "../utils/hooks";
import { MultimodalContent } from "../client/api";
import { getTokenLength } from "@/lib/utils";
import VoiceInput from "@/app/components/voice-input";
// const VoiceInput = dynamic(
// () => import('@/app/components/voice-input'), { ssr: false });
const Markdown = dynamic(async () => (await import("./markdown")).Markdown, {
loading: () => <LoadingIcon />,
@@ -1251,6 +1254,21 @@ function _Chat() {
setAttachImages(images);
}
// const [ voiceInputText, setVoiceInputText ] = useState("");
// const [ voiceInputLoading, setVoiceInputLoading ] = useState(false);
// useEffect(() => {
// if (voiceInputLoading) {
// // Voice input is in progress; the input box should show the original text plus the recognized speech.
// setUserInput(userInput + voiceInputText);
// } else {
// // But when voice input ends, the extra characters should be cleaned up.
// console.log('end', userInput, voiceInputText)
// }
//
// // eslint-disable-next-line react-hooks/exhaustive-deps
// }, [voiceInputLoading, voiceInputText]);
return (
<div className={styles.chat} key={session.id}>
<div className="window-header" data-tauri-drag-region>
@@ -1688,12 +1706,16 @@ function _Chat() {
})}
</div>
)}
<IconButton
icon={<SendWhiteIcon />}
text={Locale.Chat.Send}
type="primary"
onClick={() => doSubmit(userInput)}
<div className={styles["chat-input-send-area"]}>
<VoiceInput
// voiceInputText={voiceInputText}
// setVoiceInputText={setVoiceInputText}
// voiceInputLoading={voiceInputLoading}
// setVoiceInputLoading={setVoiceInputLoading}
userInput={userInput}
setUserInput={setUserInput}
/>
</div>
<IconButton
icon={<SendWhiteIcon />}
text={Locale.Chat.Send}
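The hunk cuts off above; the net effect is that a voice button now sits beside the send button. A sketch of the resulting JSX, reconstructed from the removed and added lines (illustrative, not verbatim from the file):

<div className={styles["chat-input-send-area"]}>
  {/* VoiceInput appends recognized speech to the shared input state */}
  <VoiceInput userInput={userInput} setUserInput={setUserInput} />
</div>
<IconButton
  icon={<SendWhiteIcon />}
  text={Locale.Chat.Send}
  type="primary"
  onClick={() => doSubmit(userInput)}
/>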

View File

@@ -1,82 +1,172 @@
"use client";
// "use client";
import { Button, Input, Space } from "antd";
import { useEffect, useMemo, useRef, useState } from "react";
import {
Dispatch,
SetStateAction,
useEffect,
useMemo,
useRef,
useState,
} from "react";
import { AudioOutlined, LoadingOutlined } from "@ant-design/icons";
import * as ms_audio_sdk from "microsoft-cognitiveservices-speech-sdk";
import {
Recognizer,
SpeechRecognitionCanceledEventArgs,
SpeechRecognitionEventArgs,
SpeechRecognitionResult,
} from "microsoft-cognitiveservices-speech-sdk/distrib/lib/src/sdk/Exports";
import { useAccessStore } from "@/app/store";
export default function VoiceInput() {
const [userInput, setUserInput] = useState("");
const [loading, setLoading] = useState(false);
const recognition = useRef(null);
interface VoiceInputInterface {
userInput: string;
setUserInput: Dispatch<SetStateAction<string>>;
}
const lastLength = useRef(0);
// @ts-ignore
export default function VoiceInput({
userInput,
setUserInput,
}: VoiceInputInterface) {
const [voiceInputText, setVoiceInputText] = useState("");
const [voiceInputLoading, setVoiceInputLoading] = useState(false);
// const recognition = useRef(null);
const recognizer = useRef<ms_audio_sdk.SpeechRecognizer | undefined>();
const [tempUserInput, setTempUserInput] = useState("");
const accessStore = useAccessStore();
// const lastLength = useRef(0);
useEffect(() => {
if ("webkitSpeechRecognition" in window) {
if (recognition.current === null) {
recognition.current = new window.webkitSpeechRecognition();
// useEffect(() => {
//
// function onresult(event: any) {
// // This event returns all earlier recognition results as well, so take only the last one
// const length = event.results.length;
// // The event also fires when there is no new result, so skip taking the last result in that case.
// if (lastLength.current === length) {
// return;
// }
//
// lastLength.current = length;
//
// console.log(event.results);
//
// // Get the last recognition result
// const transcript = event.results[length - 1]?.[0]?.transcript;
//
// // Append the last recognition result to the text
// if (transcript) {
// setVoiceInputText((voiceInputText) => voiceInputText + transcript);
// }
// }
//
// }, []);
function onRecognizedResult(result: SpeechRecognitionResult) {
// setVoiceInputText("");
setVoiceInputText(`${result.text}`);
let intentJson = result.properties.getProperty(
ms_audio_sdk.PropertyId.LanguageUnderstandingServiceResponse_JsonResult,
);
if (intentJson) {
setVoiceInputText(voiceInputText + `${intentJson}`);
}
} else {
console.error("此浏览器不支持webkitSpeechRecognition。");
return;
}
if (!recognition.current) return;
// Set the language
recognition.current.lang = "zh";
// Enable continuous recognition
recognition.current.continuous = true;
// Enable interim (real-time) results
recognition.current.interimResults = true;
function onresult(event: any) {
// This event returns all earlier recognition results as well, so take only the last one
const length = event.results.length;
// The event also fires when there is no new result, so skip taking the last result in that case.
if (lastLength.current === length) {
// setTempUserInput("");
console.log("3333", tempUserInput, "2", voiceInputText);
// if (result?.translations) {
// let resultJson = JSON.parse(result.json);
// resultJson['privTranslationPhrase']['Translation']['Translations'].forEach(
// function (translation: { Language: any; Text: any; }) {
// setVoiceInputText(voiceInputText + ` [${translation.Language}] ${translation.Text}\r\n`);
// });
// }
}
function onCanceled(
sender: Recognizer,
event: SpeechRecognitionCanceledEventArgs,
) {
console.log(event);
// Display the cancellation event
// statusDiv.innerHTML += "(cancel) Reason: " + ms_audio_sdk.CancellationReason[event.reason];
// if (event.reason === ms_audio_sdk.CancellationReason.Error) {
// statusDiv.innerHTML += ": " + event.errorDetails;
// }
// statusDiv.innerHTML += "\r\n";
}
function onRecognizing(
sender: Recognizer,
event: SpeechRecognitionEventArgs,
) {
let result = event.result;
setUserInput(
tempUserInput +
voiceInputText.replace(/(.*)(^|[\r\n]+).*\[\.\.\.][\r\n]+/, "$1$2") +
`${result.text} [...]`,
);
setVoiceInputText(
voiceInputText.replace(/(.*)(^|[\r\n]+).*\[\.\.\.][\r\n]+/, "$1$2") +
`${result.text} [...]`,
);
}
const startRecognition = () => {
if (voiceInputLoading) {
recognizer.current?.close();
setVoiceInputLoading(false);
// setVoiceInputText("");
// setUserInput(tempUserInput);
return;
}
lastLength.current = length;
setVoiceInputLoading(true);
setTempUserInput(userInput); // Copy the input at the start so it can be restored
setVoiceInputText("");
console.log(event.results);
// Get the last recognition result
const transcript = event.results[length - 1]?.[0]?.transcript;
// Append the last recognition result to the text
if (transcript) {
setUserInput((userInput) => userInput + transcript);
}
}
// Listen for speech recognition results
recognition.current.addEventListener("result", onresult);
return () => {
if (recognition.current) {
recognition.current.removeEventListener("result", onresult);
}
const speechConfig = ms_audio_sdk.SpeechConfig.fromSubscription(
accessStore.azureVoiceKey,
"eastasia",
);
const audioConfig = ms_audio_sdk.AudioConfig.fromDefaultMicrophoneInput();
speechConfig.speechRecognitionLanguage = "zh-CN";
speechConfig.setProperty(
ms_audio_sdk.PropertyId.SpeechServiceConnection_EndSilenceTimeoutMs,
"2500",
);
recognizer.current = new ms_audio_sdk.SpeechRecognizer(
speechConfig,
audioConfig,
);
recognizer.current.recognizing = onRecognizing; // Custom interim-segment display
recognizer.current.canceled = onCanceled; // Custom cancellation handling
recognizer.current.recognizeOnceAsync(
(result) => {
// onRecognizedResult(result);
setVoiceInputText(`${result.text}`);
console.log("3333", tempUserInput, "2", voiceInputText);
setUserInput(tempUserInput + voiceInputText + `${result.text}`);
// setVoiceInputText(result.text);
console.log("result", result.text);
setVoiceInputLoading(false);
// recognizer.close();
},
(err) => {
console.error("Recognition error: ", err); // 错误处理
setVoiceInputLoading(false);
},
);
};
}, []);
function click() {
if (loading) {
recognition.current.stop();
setLoading(false);
return;
}
setLoading(true);
lastLength.current = 0;
recognition.current.start();
}
const icon = useMemo(() => {
if (loading) {
if (voiceInputLoading) {
return (
<LoadingOutlined
style={{
fontSize: 16,
color: "#ffffff",
color: "rgb(234, 149, 24)",
}}
/>
);
@@ -85,17 +175,17 @@ export default function VoiceInput() {
<AudioOutlined
style={{
fontSize: 16,
color: "#ffffff",
color: "rgb(234, 149, 24)",
}}
/>
);
}, [loading]);
}, [voiceInputLoading]);
return (
<div style={{ textAlign: "center", marginTop: 200 }}>
<Space.Compact style={{ width: 600 }}>
<Input size="large" value={userInput} />
<Button size="large" type="primary" onClick={click} icon={icon} />
<div>
<Space.Compact>
{/*<Input value={voiceInputText} />*/}
<Button type="text" onClick={startRecognition} icon={icon} />
</Space.Compact>
</div>
);
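For context, a minimal sketch of the recognize-once flow the rewritten component wraps, assuming a valid Azure Speech subscription key; the region "eastasia" and the 2.5s end-silence timeout follow the commit, while the recognizeOnce helper name is illustrative:

import * as ms_audio_sdk from "microsoft-cognitiveservices-speech-sdk";

// Capture one utterance from the default microphone, streaming interim
// hypotheses through onInterim and resolving with the final transcript.
function recognizeOnce(
  key: string,
  onInterim: (text: string) => void,
): Promise<string> {
  const speechConfig = ms_audio_sdk.SpeechConfig.fromSubscription(key, "eastasia");
  speechConfig.speechRecognitionLanguage = "zh-CN";
  // Stop listening after 2.5s of trailing silence.
  speechConfig.setProperty(
    ms_audio_sdk.PropertyId.SpeechServiceConnection_EndSilenceTimeoutMs,
    "2500",
  );
  const audioConfig = ms_audio_sdk.AudioConfig.fromDefaultMicrophoneInput();
  const recognizer = new ms_audio_sdk.SpeechRecognizer(speechConfig, audioConfig);
  // "recognizing" fires repeatedly with partial text while the user speaks.
  recognizer.recognizing = (_sender, event) => onInterim(event.result.text);
  return new Promise((resolve, reject) => {
    // recognizeOnceAsync delivers one final result after end-of-speech.
    recognizer.recognizeOnceAsync(
      (result) => {
        recognizer.close();
        resolve(result.text);
      },
      (err) => {
        recognizer.close();
        reject(err);
      },
    );
  });
}

While partials arrive, the component appends them to the input with a trailing "[...]" marker; the regex replace in onRecognizing strips the previous marker before appending the next partial, so only one in-progress segment is shown at a time.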

View File

@@ -27,6 +27,7 @@ declare global {
AZURE_URL?: string; // https://{azure-url}/openai/deployments/{deploy-name}
AZURE_API_KEY?: string;
AZURE_API_VERSION?: string;
AZURE_VOICE_KEY?: string;
// google only
GOOGLE_API_KEY?: string;
@@ -93,6 +94,7 @@ export const getServerSideConfig = () => {
azureUrl: process.env.AZURE_URL ?? "",
azureApiKey: process.env.AZURE_API_KEY ?? "",
azureApiVersion: process.env.AZURE_API_VERSION ?? "",
azureVoiceKey: process.env.AZURE_VOICE_KEY ?? "",
isGoogle,
googleApiKey: process.env.GOOGLE_API_KEY,
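The new key rides the existing Azure config path. A minimal sketch of the client-side consumption, assuming azureVoiceKey is populated from the server config like the other Azure fields; the useSpeechConfig helper is hypothetical, not part of this commit:

import * as ms_audio_sdk from "microsoft-cognitiveservices-speech-sdk";
import { useAccessStore } from "@/app/store";

// Hypothetical hook: build a SpeechConfig from the stored voice key
// (the region "eastasia" is hardcoded in this commit's voice-input.tsx).
export function useSpeechConfig(): ms_audio_sdk.SpeechConfig {
  const accessStore = useAccessStore();
  return ms_audio_sdk.SpeechConfig.fromSubscription(
    accessStore.azureVoiceKey,
    "eastasia",
  );
}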

View File

@@ -107,7 +107,7 @@ Latex inline: $x^2$
Latex block: $$e=mc^2$$
`;
export const SUMMARIZE_MODEL = "gpt-3.5-turbo-1106";
export const SUMMARIZE_MODEL = "gpt-3.5-turbo-0125";
export const GEMINI_SUMMARIZE_MODEL = "gemini-pro";
export const KnowledgeCutOffDate: Record<string, string> = {
@@ -132,8 +132,8 @@ export const DEFAULT_MODELS = [
// available: false,
// },
{
name: "gpt-3.5-turbo-1106",
describe: "GPT-3,最快,,最便宜",
name: "gpt-3.5-turbo-0125",
describe: "GPT-3,最快,效果一般,最便宜",
available: true,
provider: {
id: "openai",

View File

@@ -58,7 +58,7 @@ export const CN_MASKS: BuiltinMask[] = [
},
],
modelConfig: {
model: "gpt-3.5-turbo-1106",
model: "gpt-3.5-turbo-0125",
temperature: 1,
max_tokens: 2000,
presence_penalty: 0,
@@ -84,7 +84,7 @@ export const CN_MASKS: BuiltinMask[] = [
},
],
modelConfig: {
model: "gpt-3.5-turbo-1106",
model: "gpt-3.5-turbo-0125",
temperature: 1,
max_tokens: 2000,
presence_penalty: 0,
@@ -110,7 +110,7 @@ export const CN_MASKS: BuiltinMask[] = [
},
],
modelConfig: {
model: "gpt-3.5-turbo-1106",
model: "gpt-3.5-turbo-0125",
temperature: 1,
max_tokens: 2000,
presence_penalty: 0,
@@ -136,7 +136,7 @@ export const CN_MASKS: BuiltinMask[] = [
},
],
modelConfig: {
model: "gpt-3.5-turbo-1106",
model: "gpt-3.5-turbo-0125",
temperature: 1,
max_tokens: 2000,
presence_penalty: 0,
@@ -162,7 +162,7 @@ export const CN_MASKS: BuiltinMask[] = [
},
],
modelConfig: {
model: "gpt-3.5-turbo-1106",
model: "gpt-3.5-turbo-0125",
temperature: 1,
max_tokens: 2000,
presence_penalty: 0,
@@ -188,7 +188,7 @@ export const CN_MASKS: BuiltinMask[] = [
},
],
modelConfig: {
model: "gpt-3.5-turbo-1106",
model: "gpt-3.5-turbo-0125",
temperature: 1,
max_tokens: 2000,
presence_penalty: 0,
@@ -214,7 +214,7 @@ export const CN_MASKS: BuiltinMask[] = [
},
],
modelConfig: {
model: "gpt-3.5-turbo-1106",
model: "gpt-3.5-turbo-0125",
temperature: 1,
max_tokens: 2000,
presence_penalty: 0,
@@ -240,7 +240,7 @@ export const CN_MASKS: BuiltinMask[] = [
},
],
modelConfig: {
model: "gpt-3.5-turbo-1106",
model: "gpt-3.5-turbo-0125",
temperature: 1,
max_tokens: 2000,
presence_penalty: 0,
@@ -272,7 +272,7 @@ export const CN_MASKS: BuiltinMask[] = [
},
],
modelConfig: {
model: "gpt-3.5-turbo-1106",
model: "gpt-3.5-turbo-0125",
temperature: 0.5,
max_tokens: 2000,
presence_penalty: 0,
@@ -298,7 +298,7 @@ export const CN_MASKS: BuiltinMask[] = [
},
],
modelConfig: {
model: "gpt-3.5-turbo-1106",
model: "gpt-3.5-turbo-0125",
temperature: 1,
max_tokens: 2000,
presence_penalty: 0,
@@ -331,7 +331,7 @@ export const CN_MASKS: BuiltinMask[] = [
},
],
modelConfig: {
model: "gpt-3.5-turbo-1106",
model: "gpt-3.5-turbo-0125",
temperature: 1,
max_tokens: 2000,
presence_penalty: 0,
@@ -364,7 +364,7 @@ export const CN_MASKS: BuiltinMask[] = [
},
],
modelConfig: {
model: "gpt-3.5-turbo-1106",
model: "gpt-3.5-turbo-0125",
temperature: 1,
max_tokens: 2000,
presence_penalty: 0,
@@ -422,7 +422,7 @@ export const CN_MASKS: BuiltinMask[] = [
},
],
modelConfig: {
model: "gpt-3.5-turbo-1106",
model: "gpt-3.5-turbo-0125",
temperature: 1,
max_tokens: 2000,
presence_penalty: 0,

View File

@@ -86,7 +86,7 @@ export const EN_MASKS: BuiltinMask[] = [
},
],
modelConfig: {
model: "gpt-3.5-turbo-1106",
model: "gpt-3.5-turbo-0125",
temperature: 0.5,
max_tokens: 2000,
presence_penalty: 0,

View File

@@ -30,6 +30,7 @@ const DEFAULT_ACCESS_STATE = {
azureUrl: "",
azureApiKey: "",
azureApiVersion: "2023-05-15",
azureVoiceKey: "",
// google ai studio
googleUrl: "",

View File

@@ -51,7 +51,7 @@ export const DEFAULT_CONFIG = {
dontUseModel: DISABLE_MODELS,
modelConfig: {
model: "gpt-3.5-turbo-1106" as ModelType,
model: "gpt-3.5-turbo-0125" as ModelType,
temperature: 0.8,
top_p: 1,
max_tokens: 2000,
@@ -137,7 +137,7 @@ export const useAppConfig = createPersistStore(
}),
{
name: StoreKey.Config,
version: 3.8991,
version: 3.8992,
migrate(persistedState, version) {
const state = persistedState as ChatConfig;
@@ -168,7 +168,7 @@ export const useAppConfig = createPersistStore(
if (version < 3.8) {
state.lastUpdate = Date.now();
}
if (version < 3.8991) {
if (version < 3.8992) {
state.lastUpdate = Date.now();
return { ...DEFAULT_CONFIG };
}
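Because persisted configs carry a version, bumping 3.8991 to 3.8992 re-arms the reset branch, so existing clients are migrated onto the new defaults (including the gpt-3.5-turbo-0125 model). A simplified sketch of that branch, with the rest of the store machinery elided:

// Sketch: any state persisted before this version is replaced wholesale
// with DEFAULT_CONFIG, which now defaults to gpt-3.5-turbo-0125.
function migrate(persistedState: unknown, version: number) {
  const state = persistedState as ChatConfig;
  if (version < 3.8992) {
    state.lastUpdate = Date.now();
    return { ...DEFAULT_CONFIG };
  }
  return state;
}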