feat: openai realtime merge

This commit is contained in:
Hk-Gosuto
2024-12-23 15:48:21 +08:00
parent c6156a8d8a
commit 21bf685d12
30 changed files with 2418 additions and 833 deletions

View File

@@ -0,0 +1 @@
export * from "./realtime-chat";

View File

@@ -0,0 +1,74 @@
.realtime-chat {
width: 100%;
justify-content: center;
align-items: center;
position: relative;
display: flex;
flex-direction: column;
height: 100%;
padding: 20px;
box-sizing: border-box;
.circle-mic {
width: 150px;
height: 150px;
border-radius: 50%;
background: linear-gradient(to bottom right, #a0d8ef, #f0f8ff);
display: flex;
justify-content: center;
align-items: center;
}
.icon-center {
font-size: 24px;
}
.bottom-icons {
display: flex;
justify-content: space-between;
align-items: center;
width: 100%;
position: absolute;
bottom: 20px;
box-sizing: border-box;
padding: 0 20px;
}
.icon-left,
.icon-right {
width: 46px;
height: 46px;
font-size: 36px;
background: var(--second);
border-radius: 50%;
padding: 2px;
display: flex;
justify-content: center;
align-items: center;
cursor: pointer;
&:hover {
opacity: 0.8;
}
}
&.mobile {
display: none;
}
}
.pulse {
animation: pulse 1.5s infinite;
}
@keyframes pulse {
0% {
transform: scale(1);
opacity: 0.7;
}
50% {
transform: scale(1.1);
opacity: 1;
}
100% {
transform: scale(1);
opacity: 0.7;
}
}

View File

@@ -0,0 +1,359 @@
import VoiceIcon from "@/app/icons/voice.svg";
import VoiceOffIcon from "@/app/icons/voice-off.svg";
import PowerIcon from "@/app/icons/power.svg";
import styles from "./realtime-chat.module.scss";
import clsx from "clsx";
import { useState, useRef, useEffect } from "react";
import { useChatStore, createMessage, useAppConfig } from "@/app/store";
import { IconButton } from "@/app/components/button";
import {
Modality,
RTClient,
RTInputAudioItem,
RTResponse,
TurnDetection,
} from "rt-client";
import { AudioHandler } from "@/app/lib/audio";
import { uploadImage } from "@/app/utils/chat";
import { VoicePrint } from "@/app/components/voice-print";
interface RealtimeChatProps {
onClose?: () => void;
onStartVoice?: () => void;
onPausedVoice?: () => void;
}
export function RealtimeChat({
onClose,
onStartVoice,
onPausedVoice,
}: RealtimeChatProps) {
const chatStore = useChatStore();
const session = chatStore.currentSession();
const config = useAppConfig();
const [status, setStatus] = useState("");
const [isRecording, setIsRecording] = useState(false);
const [isConnected, setIsConnected] = useState(false);
const [isConnecting, setIsConnecting] = useState(false);
const [modality, setModality] = useState("audio");
const [useVAD, setUseVAD] = useState(true);
const [frequencies, setFrequencies] = useState<Uint8Array | undefined>();
const clientRef = useRef<RTClient | null>(null);
const audioHandlerRef = useRef<AudioHandler | null>(null);
const initRef = useRef(false);
const temperature = config.realtimeConfig.temperature;
const apiKey = config.realtimeConfig.apiKey;
const model = config.realtimeConfig.model;
const azure = config.realtimeConfig.provider === "Azure";
const azureEndpoint = config.realtimeConfig.azure.endpoint;
const azureDeployment = config.realtimeConfig.azure.deployment;
const voice = config.realtimeConfig.voice;
const handleConnect = async () => {
if (isConnecting) return;
if (!isConnected) {
try {
setIsConnecting(true);
clientRef.current = azure
? new RTClient(
new URL(azureEndpoint),
{ key: apiKey },
{ deployment: azureDeployment },
)
: new RTClient({ key: apiKey }, { model });
const modalities: Modality[] =
modality === "audio" ? ["text", "audio"] : ["text"];
const turnDetection: TurnDetection = useVAD
? { type: "server_vad" }
: null;
await clientRef.current.configure({
instructions: "",
voice,
input_audio_transcription: { model: "whisper-1" },
turn_detection: turnDetection,
tools: [],
temperature,
modalities,
});
startResponseListener();
setIsConnected(true);
// TODO
// try {
// const recentMessages = chatStore.getMessagesWithMemory();
// for (const message of recentMessages) {
// const { role, content } = message;
// if (typeof content === "string") {
// await clientRef.current.sendItem({
// type: "message",
// role: role as any,
// content: [
// {
// type: (role === "assistant" ? "text" : "input_text") as any,
// text: content as string,
// },
// ],
// });
// }
// }
// // await clientRef.current.generateResponse();
// } catch (error) {
// console.error("Set message failed:", error);
// }
} catch (error) {
console.error("Connection failed:", error);
setStatus("Connection failed");
} finally {
setIsConnecting(false);
}
} else {
await disconnect();
}
};
const disconnect = async () => {
if (clientRef.current) {
try {
await clientRef.current.close();
clientRef.current = null;
setIsConnected(false);
} catch (error) {
console.error("Disconnect failed:", error);
}
}
};
const startResponseListener = async () => {
if (!clientRef.current) return;
try {
for await (const serverEvent of clientRef.current.events()) {
if (serverEvent.type === "response") {
await handleResponse(serverEvent);
} else if (serverEvent.type === "input_audio") {
await handleInputAudio(serverEvent);
}
}
} catch (error) {
if (clientRef.current) {
console.error("Response iteration error:", error);
}
}
};
const handleResponse = async (response: RTResponse) => {
for await (const item of response) {
if (item.type === "message" && item.role === "assistant") {
const botMessage = createMessage({
role: item.role,
content: "",
});
// add bot message first
chatStore.updateTargetSession(session, (session) => {
session.messages = session.messages.concat([botMessage]);
});
let hasAudio = false;
for await (const content of item) {
if (content.type === "text") {
for await (const text of content.textChunks()) {
botMessage.content += text;
}
} else if (content.type === "audio") {
const textTask = async () => {
for await (const text of content.transcriptChunks()) {
botMessage.content += text;
}
};
const audioTask = async () => {
audioHandlerRef.current?.startStreamingPlayback();
for await (const audio of content.audioChunks()) {
hasAudio = true;
audioHandlerRef.current?.playChunk(audio);
}
};
await Promise.all([textTask(), audioTask()]);
}
// update message.content
chatStore.updateTargetSession(session, (session) => {
session.messages = session.messages.concat();
});
}
if (hasAudio) {
// upload audio get audio_url
const blob = audioHandlerRef.current?.savePlayFile();
uploadImage(blob!).then((audio_url) => {
botMessage.audio_url = audio_url;
// update text and audio_url
chatStore.updateTargetSession(session, (session) => {
session.messages = session.messages.concat();
});
});
}
}
}
};
const handleInputAudio = async (item: RTInputAudioItem) => {
await item.waitForCompletion();
if (item.transcription) {
const userMessage = createMessage({
role: "user",
content: item.transcription,
});
chatStore.updateTargetSession(session, (session) => {
session.messages = session.messages.concat([userMessage]);
});
// save input audio_url, and update session
const { audioStartMillis, audioEndMillis } = item;
// upload audio get audio_url
const blob = audioHandlerRef.current?.saveRecordFile(
audioStartMillis,
audioEndMillis,
);
uploadImage(blob!).then((audio_url) => {
userMessage.audio_url = audio_url;
chatStore.updateTargetSession(session, (session) => {
session.messages = session.messages.concat();
});
});
}
// stop streaming play after get input audio.
audioHandlerRef.current?.stopStreamingPlayback();
};
const toggleRecording = async () => {
if (!isRecording && clientRef.current) {
try {
if (!audioHandlerRef.current) {
audioHandlerRef.current = new AudioHandler();
await audioHandlerRef.current.initialize();
}
await audioHandlerRef.current.startRecording(async (chunk) => {
await clientRef.current?.sendAudio(chunk);
});
setIsRecording(true);
} catch (error) {
console.error("Failed to start recording:", error);
}
} else if (audioHandlerRef.current) {
try {
audioHandlerRef.current.stopRecording();
if (!useVAD) {
const inputAudio = await clientRef.current?.commitAudio();
await handleInputAudio(inputAudio!);
await clientRef.current?.generateResponse();
}
setIsRecording(false);
} catch (error) {
console.error("Failed to stop recording:", error);
}
}
};
useEffect(() => {
// 防止重复初始化
if (initRef.current) return;
initRef.current = true;
const initAudioHandler = async () => {
const handler = new AudioHandler();
await handler.initialize();
audioHandlerRef.current = handler;
await handleConnect();
await toggleRecording();
};
initAudioHandler().catch((error) => {
setStatus(error);
console.error(error);
});
return () => {
if (isRecording) {
toggleRecording();
}
audioHandlerRef.current?.close().catch(console.error);
disconnect();
};
}, []);
useEffect(() => {
let animationFrameId: number;
if (isConnected && isRecording) {
const animationFrame = () => {
if (audioHandlerRef.current) {
const freqData = audioHandlerRef.current.getByteFrequencyData();
setFrequencies(freqData);
}
animationFrameId = requestAnimationFrame(animationFrame);
};
animationFrameId = requestAnimationFrame(animationFrame);
} else {
setFrequencies(undefined);
}
return () => {
if (animationFrameId) {
cancelAnimationFrame(animationFrameId);
}
};
}, [isConnected, isRecording]);
// update session params
useEffect(() => {
clientRef.current?.configure({ voice });
}, [voice]);
useEffect(() => {
clientRef.current?.configure({ temperature });
}, [temperature]);
const handleClose = async () => {
onClose?.();
if (isRecording) {
await toggleRecording();
}
disconnect().catch(console.error);
};
return (
<div className={styles["realtime-chat"]}>
<div
className={clsx(styles["circle-mic"], {
[styles["pulse"]]: isRecording,
})}
>
<VoicePrint frequencies={frequencies} isActive={isRecording} />
</div>
<div className={styles["bottom-icons"]}>
<div>
<IconButton
icon={isRecording ? <VoiceIcon /> : <VoiceOffIcon />}
onClick={toggleRecording}
disabled={!isConnected}
shadow
bordered
/>
</div>
<div className={styles["icon-center"]}>{status}</div>
<div>
<IconButton
icon={<PowerIcon />}
onClick={handleClose}
shadow
bordered
/>
</div>
</div>
</div>
);
}

View File

@@ -0,0 +1,173 @@
import { RealtimeConfig } from "@/app/store";
import Locale from "@/app/locales";
import { ListItem, Select, PasswordInput } from "@/app/components/ui-lib";
import { InputRange } from "@/app/components/input-range";
import { Voice } from "rt-client";
import { ServiceProvider } from "@/app/constant";
const providers = [ServiceProvider.OpenAI, ServiceProvider.Azure];
const models = ["gpt-4o-realtime-preview-2024-10-01"];
const voice = ["alloy", "shimmer", "echo"];
export function RealtimeConfigList(props: {
realtimeConfig: RealtimeConfig;
updateConfig: (updater: (config: RealtimeConfig) => void) => void;
}) {
const azureConfigComponent = props.realtimeConfig.provider ===
ServiceProvider.Azure && (
<>
<ListItem
title={Locale.Settings.Realtime.Azure.Endpoint.Title}
subTitle={Locale.Settings.Realtime.Azure.Endpoint.SubTitle}
>
<input
value={props.realtimeConfig?.azure?.endpoint}
type="text"
placeholder={Locale.Settings.Realtime.Azure.Endpoint.Title}
onChange={(e) => {
props.updateConfig(
(config) => (config.azure.endpoint = e.currentTarget.value),
);
}}
/>
</ListItem>
<ListItem
title={Locale.Settings.Realtime.Azure.Deployment.Title}
subTitle={Locale.Settings.Realtime.Azure.Deployment.SubTitle}
>
<input
value={props.realtimeConfig?.azure?.deployment}
type="text"
placeholder={Locale.Settings.Realtime.Azure.Deployment.Title}
onChange={(e) => {
props.updateConfig(
(config) => (config.azure.deployment = e.currentTarget.value),
);
}}
/>
</ListItem>
</>
);
return (
<>
<ListItem
title={Locale.Settings.Realtime.Enable.Title}
subTitle={Locale.Settings.Realtime.Enable.SubTitle}
>
<input
type="checkbox"
checked={props.realtimeConfig.enable}
onChange={(e) =>
props.updateConfig(
(config) => (config.enable = e.currentTarget.checked),
)
}
></input>
</ListItem>
{props.realtimeConfig.enable && (
<>
<ListItem
title={Locale.Settings.Realtime.Provider.Title}
subTitle={Locale.Settings.Realtime.Provider.SubTitle}
>
<Select
aria-label={Locale.Settings.Realtime.Provider.Title}
value={props.realtimeConfig.provider}
onChange={(e) => {
props.updateConfig(
(config) =>
(config.provider = e.target.value as ServiceProvider),
);
}}
>
{providers.map((v, i) => (
<option value={v} key={i}>
{v}
</option>
))}
</Select>
</ListItem>
<ListItem
title={Locale.Settings.Realtime.Model.Title}
subTitle={Locale.Settings.Realtime.Model.SubTitle}
>
<Select
aria-label={Locale.Settings.Realtime.Model.Title}
value={props.realtimeConfig.model}
onChange={(e) => {
props.updateConfig((config) => (config.model = e.target.value));
}}
>
{models.map((v, i) => (
<option value={v} key={i}>
{v}
</option>
))}
</Select>
</ListItem>
<ListItem
title={Locale.Settings.Realtime.ApiKey.Title}
subTitle={Locale.Settings.Realtime.ApiKey.SubTitle}
>
<PasswordInput
aria={Locale.Settings.ShowPassword}
aria-label={Locale.Settings.Realtime.ApiKey.Title}
value={props.realtimeConfig.apiKey}
type="text"
placeholder={Locale.Settings.Realtime.ApiKey.Placeholder}
onChange={(e) => {
props.updateConfig(
(config) => (config.apiKey = e.currentTarget.value),
);
}}
/>
</ListItem>
{azureConfigComponent}
<ListItem
title={Locale.Settings.TTS.Voice.Title}
subTitle={Locale.Settings.TTS.Voice.SubTitle}
>
<Select
value={props.realtimeConfig.voice}
onChange={(e) => {
props.updateConfig(
(config) => (config.voice = e.currentTarget.value as Voice),
);
}}
>
{voice.map((v, i) => (
<option value={v} key={i}>
{v}
</option>
))}
</Select>
</ListItem>
<ListItem
title={Locale.Settings.Realtime.Temperature.Title}
subTitle={Locale.Settings.Realtime.Temperature.SubTitle}
>
<InputRange
aria={Locale.Settings.Temperature.Title}
value={props.realtimeConfig?.temperature?.toFixed(1)}
min="0.6"
max="1"
step="0.1"
onChange={(e) => {
props.updateConfig(
(config) =>
(config.temperature = e.currentTarget.valueAsNumber),
);
}}
></InputRange>
</ListItem>
</>
)}
</>
);
}