working
This commit is contained in:
parent
852a5e0ebf
commit
1de375492c
19 changed files with 594 additions and 626 deletions
23
README.md
23
README.md
|
|
@ -2,7 +2,7 @@
|
|||
|
||||
# 🚀 ElatoAI: Realtime Speech AI Agents for ESP32
|
||||
|
||||
Realtime AI Speech powered by OpenAI Realtime API, ESP32, Secure WebSockets, and Deno Edge Functions for >15-minute uninterrupted global conversations
|
||||
Realtime AI Speech powered by **OpenAI Realtime API** and **Gemini Live API**, ESP32, Secure WebSockets, and Deno Edge Functions for >15-minute uninterrupted global conversations
|
||||
|
||||
|
||||
<div align="center">
|
||||
|
|
@ -29,7 +29,17 @@ Realtime AI Speech powered by OpenAI Realtime API, ESP32, Secure WebSockets, and
|
|||
|
||||
</div>
|
||||
|
||||
## ⚡️ DIY Hardware Design
|
||||
## ⚡️ Realtime AI Speech Models on an ESP32
|
||||
|
||||
<div align="center" class="flex flex-row gap-4">
|
||||
|
||||
<img src="assets/openai.png" alt="OpenAI Realtime API" width="45%">
|
||||
|
||||
<img src="assets/gemini.png" alt="Gemini Live API" width="45%">
|
||||
|
||||
</div>
|
||||
|
||||
## 👷‍♀️ DIY Hardware Design
|
||||
|
||||
<img src="assets/pcb-design.png" alt="Hardware Setup" width="100%">
|
||||
|
||||
|
|
@ -102,6 +112,7 @@ cp .env.example .env
|
|||
# In .env, set your environment variables
|
||||
# SUPABASE_KEY=<your-supabase-anon-key>
|
||||
# OPENAI_API_KEY=<your-openai-api-key>
|
||||
# GEMINI_API_KEY=<your-gemini-api-key>
|
||||
|
||||
# Run the server at port 8000
|
||||
deno run -A --env-file=.env main.ts
|
||||
|
|
@ -143,13 +154,13 @@ Once your Wifi credentials are configured, turn the device off and on again and
|
|||
ElatoAI consists of three main components:
|
||||
|
||||
1. **Frontend Client** (`Next.js` hosted on Vercel) - to create and talk to your AI agents and 'send' them to your ESP32 device
|
||||
2. **Edge Server Functions** (`Deno` running on Deno/Supabase Edge) - to handle the websocket connections from the ESP32 device and the OpenAI API calls
|
||||
3. **ESP32 IoT Client** (`PlatformIO/Arduino`) - to receive the websocket connections from the Edge Server Functions and send audio to the OpenAI API via the Deno edge server.
|
||||
2. **Edge Server Functions** (`Deno` running on Deno/Supabase Edge) - to handle the websocket connections from the ESP32 device and the OpenAI and Gemini API calls
|
||||
3. **ESP32 IoT Client** (`PlatformIO/Arduino`) - to receive the websocket connections from the Edge Server Functions and send audio to the OpenAI and Gemini APIs via the Deno edge server.
|
||||
|
||||
|
||||
## 🌟 Features
|
||||
|
||||
1. **Realtime Speech-to-Speech**: Instant speech conversion powered by OpenAI's Realtime APIs.
|
||||
1. **Realtime Speech-to-Speech**: Instant speech conversion powered by OpenAI's Realtime API and Gemini's Live API.
|
||||
2. **Create Custom AI Agents**: Create custom agents with different personalities and voices.
|
||||
3. **Customizable Voices**: Choose from a variety of voices and personalities.
|
||||
4. **Secure WebSockets**: Reliable, encrypted WebSocket communication.
|
||||
|
|
@ -200,7 +211,9 @@ flowchart TD
|
|||
UserInput --> ESP32
|
||||
ESP32[ESP32 Device] -->|WebSocket| Edge[Deno Edge Function]
|
||||
Edge -->|OpenAI API| OpenAI[OpenAI Realtime API]
|
||||
Edge -->|Gemini API| Gemini[Gemini Live API]
|
||||
OpenAI --> Edge
|
||||
Gemini --> Edge
|
||||
Edge -->|WebSocket| ESP32
|
||||
ESP32 --> UserOutput
|
||||
```
|
||||
|
|
|
|||
BIN
assets/gemini.png
Normal file
BIN
assets/gemini.png
Normal file
Binary file not shown.
|
After Width: | Height: | Size: 112 KiB |
BIN
assets/openai.png
Normal file
BIN
assets/openai.png
Normal file
Binary file not shown.
|
After Width: | Height: | Size: 24 KiB |
Binary file not shown.
|
Before Width: | Height: | Size: 849 KiB After Width: | Height: | Size: 861 KiB |
|
|
@ -167,6 +167,7 @@ export async function GET(request: NextRequest) {
|
|||
}),
|
||||
},
|
||||
);
|
||||
console.log(response);
|
||||
const data = await response.json();
|
||||
return NextResponse.json(data);
|
||||
} catch (error) {
|
||||
|
|
|
|||
|
|
@ -14,7 +14,7 @@ import { v4 as uuidv4 } from 'uuid';
|
|||
import { toast } from "@/components/ui/use-toast";
|
||||
import { useRouter } from "next/navigation";
|
||||
import { z } from "zod";
|
||||
import { emotionOptions, r2UrlAudio, voices } from "@/lib/data";
|
||||
import { emotionOptions, geminiVoices, openaiVoices, r2UrlAudio, VoiceType } from "@/lib/data";
|
||||
import EmojiComponent from "./EmojiComponent";
|
||||
import { PitchFactors } from "@/lib/utils";
|
||||
import { Slider } from "@/components/ui/slider";
|
||||
|
|
@ -166,7 +166,7 @@ const SettingsDashboard: React.FC<SettingsDashboardProps> = ({
|
|||
|
||||
try {
|
||||
const personality = await createPersonality(supabase, selectedUser.user_id, {
|
||||
provider: formData.provider,
|
||||
provider: formData.provider as ModelProvider,
|
||||
title: formData.title,
|
||||
subtitle: "",
|
||||
character_prompt: formData.prompt,
|
||||
|
|
@ -206,15 +206,18 @@ const SettingsDashboard: React.FC<SettingsDashboardProps> = ({
|
|||
const [audioElement, setAudioElement] = useState<HTMLAudioElement | null>(null);
|
||||
|
||||
|
||||
const previewVoice = (voiceId: string) => {
|
||||
const previewVoice = (voice: VoiceType) => {
|
||||
const { id, provider } = voice;
|
||||
|
||||
if (provider === 'openai') {
|
||||
// Stop any currently playing preview
|
||||
if (audioElement) {
|
||||
audioElement.pause();
|
||||
audioElement.currentTime = 0;
|
||||
}
|
||||
|
||||
const audioSampleUrl = `${r2UrlAudio}/${voiceId}.wav`;
|
||||
setPreviewingVoice(voiceId);
|
||||
const audioSampleUrl = `${r2UrlAudio}/${id}.wav`;
|
||||
setPreviewingVoice(id);
|
||||
|
||||
// Create and play audio element
|
||||
const audio = new Audio(audioSampleUrl);
|
||||
|
|
@ -233,11 +236,12 @@ const SettingsDashboard: React.FC<SettingsDashboardProps> = ({
|
|||
|
||||
// Fallback in case audio doesn't trigger onended
|
||||
setTimeout(() => {
|
||||
if (previewingVoice === voiceId) {
|
||||
if (previewingVoice === id) {
|
||||
setPreviewingVoice(null);
|
||||
}
|
||||
}, 10000); // 10 second fallback
|
||||
};
|
||||
}
|
||||
}
|
||||
|
||||
const Heading = () => {
|
||||
return (
|
||||
|
|
@ -260,51 +264,70 @@ const SettingsDashboard: React.FC<SettingsDashboardProps> = ({
|
|||
{currentStep === 'personality' ?
|
||||
<div className="space-y-4">
|
||||
{/* Voice Picker */}
|
||||
<div className="space-y-2">
|
||||
<Label htmlFor="voice">Pick a voice</Label>
|
||||
<p className="text-sm text-gray-500">
|
||||
Click a voice to preview how it sounds.
|
||||
</p>
|
||||
|
||||
<div className="grid grid-cols-3 gap-3">
|
||||
{voices.map((voice) => (
|
||||
<div
|
||||
key={voice.id}
|
||||
className={`
|
||||
relative rounded-lg border p-3 transition-all
|
||||
${formData.voice === voice.id
|
||||
? 'border-2 border-blue-500 shadow-sm ' + voice.color
|
||||
: 'border-gray-200 hover:border-gray-300 cursor-pointer'
|
||||
}
|
||||
`}
|
||||
onClick={() => {
|
||||
handleInputChange('voice', voice.id);
|
||||
previewVoice(voice.id);
|
||||
}}
|
||||
>
|
||||
<div className="flex flex-col">
|
||||
<div className="flex flex-col sm:flex-row items-center sm:items-start gap-3">
|
||||
<div className="text-2xl mt-0.5">
|
||||
<EmojiComponent emoji={voice.emoji} />
|
||||
</div>
|
||||
<div className="flex flex-col text-center sm:text-left">
|
||||
<span className="font-medium">{voice.name}</span>
|
||||
<span className="text-xs text-gray-600">{voice.description}</span>
|
||||
</div>
|
||||
</div>
|
||||
|
||||
{previewingVoice === voice.id && (
|
||||
<div className="absolute top-2 right-2">
|
||||
<div className="animate-pulse text-blue-500">
|
||||
<Volume2 size={20} />
|
||||
</div>
|
||||
</div>
|
||||
)}
|
||||
</div>
|
||||
</div>
|
||||
))}
|
||||
</div>
|
||||
</div>
|
||||
<div className="space-y-4">
|
||||
<Label htmlFor="voice">Pick a voice</Label>
|
||||
<p className="text-sm text-gray-500">
|
||||
Click a voice to preview how it sounds.
|
||||
</p>
|
||||
|
||||
<div className="grid grid-cols-3 gap-3 px-2">
|
||||
{[...openaiVoices, ...geminiVoices].map((voice: VoiceType) => (
|
||||
<div
|
||||
key={voice.id}
|
||||
className={`
|
||||
relative rounded-xl border-2 p-4 transition-all cursor-pointer hover:scale-[1.02] hover:shadow-lg
|
||||
${formData.voice === voice.id
|
||||
? `border-blue-500 shadow-lg ${voice.color} ring-2 ring-blue-200`
|
||||
: `border-gray-200 hover:border-gray-300 ${voice.color} hover:shadow-md`
|
||||
}
|
||||
`}
|
||||
onClick={() => {
|
||||
setFormData(prev => ({
|
||||
...prev,
|
||||
provider: voice.provider as ModelProvider,
|
||||
voice: voice.id
|
||||
}));
|
||||
previewVoice(voice);
|
||||
}}
|
||||
>
|
||||
<div className="flex flex-col">
|
||||
<div className="flex flex-col items-center gap-3">
|
||||
<div className="text-3xl">
|
||||
<EmojiComponent emoji={voice.emoji} />
|
||||
</div>
|
||||
<div className="flex flex-col text-center">
|
||||
<span className="font-semibold text-gray-900">{voice.name}</span>
|
||||
<span className="text-xs text-gray-600 mt-1">{voice.description}</span>
|
||||
<div className={`inline-flex items-center justify-center px-2 py-1 rounded-full text-xs font-medium mt-2 ${
|
||||
voice.provider === 'openai'
|
||||
? 'bg-emerald-500 text-white'
|
||||
: 'bg-purple-500 text-white'
|
||||
}`}>
|
||||
{voice.provider === 'openai' ? 'OpenAI' : 'Gemini'}
|
||||
</div>
|
||||
</div>
|
||||
</div>
|
||||
|
||||
{previewingVoice === voice.id && (
|
||||
<div className="absolute top-3 right-3">
|
||||
<div className="animate-pulse text-blue-600 bg-white rounded-full p-2 shadow-lg">
|
||||
<Volume2 size={16} />
|
||||
</div>
|
||||
</div>
|
||||
)}
|
||||
|
||||
{formData.voice === voice.id && (
|
||||
<div className="absolute -top-2 -right-2">
|
||||
<div className="bg-blue-500 text-white rounded-full p-1.5 shadow-lg">
|
||||
<Check size={12} />
|
||||
</div>
|
||||
</div>
|
||||
)}
|
||||
</div>
|
||||
</div>
|
||||
))}
|
||||
</div>
|
||||
</div>
|
||||
|
||||
<div className="space-y-2">
|
||||
<Label htmlFor="title">Title</Label>
|
||||
|
|
|
|||
|
|
@ -2,16 +2,16 @@
|
|||
|
||||
import { Volume2 } from "lucide-react";
|
||||
import { Label } from "@/components/ui/label";
|
||||
import { emotionOptions, r2UrlAudio, voices } from "@/lib/data";
|
||||
import { emotionOptions, r2UrlAudio, openaiVoices } from "@/lib/data";
|
||||
import EmojiComponent from "../CreateCharacter/EmojiComponent";
|
||||
import { useState } from "react";
|
||||
import { Input } from "@/components/ui/input";
|
||||
|
||||
export const VoiceSettings = () => {
|
||||
const [audioElement, setAudioElement] = useState<HTMLAudioElement | null>(null);
|
||||
const [previewingVoice, setPreviewingVoice] = useState<string | null>(null);
|
||||
const [previewingVoice, setPreviewingVoice] = useState<OaiVoice | null>(null);
|
||||
|
||||
const previewVoice = (voiceId: string) => {
|
||||
const previewVoice = (voiceId: OaiVoice) => {
|
||||
// If the same voice is clicked again while playing, pause it
|
||||
if (previewingVoice === voiceId && audioElement) {
|
||||
audioElement.pause();
|
||||
|
|
@ -55,18 +55,18 @@ export const VoiceSettings = () => {
|
|||
<div className="space-y-2">
|
||||
<Label htmlFor="voice">Pick a voice</Label>
|
||||
<div className="grid grid-cols-2 gap-3">
|
||||
{voices.map((voice) => (
|
||||
{openaiVoices.map((voice) => (
|
||||
<div
|
||||
key={voice.id}
|
||||
className={`
|
||||
rounded-lg border p-3 transition-all relative
|
||||
${"" === voice.id
|
||||
${voice.id === previewingVoice
|
||||
? 'border-2 border-blue-500 shadow-sm ' + voice.color
|
||||
: 'border-gray-200 hover:border-gray-300 cursor-pointer'
|
||||
}
|
||||
`}
|
||||
onClick={() => {
|
||||
previewVoice(voice.id);
|
||||
previewVoice(voice.id as OaiVoice);
|
||||
}}
|
||||
>
|
||||
<div className="flex flex-col">
|
||||
|
|
|
|||
|
|
@ -12,6 +12,7 @@ import { useState } from "react";
|
|||
import { Drawer, DrawerContent, DrawerTrigger } from "@/components/ui/drawer";
|
||||
import { getPersonalityImageSrc } from "@/lib/utils";
|
||||
import { EmojiComponent } from "./EmojiImage";
|
||||
import { Badge } from "@/components/ui/badge";
|
||||
|
||||
interface ModifyCharacterSheetProps {
|
||||
openPersonality: IPersonality;
|
||||
|
|
@ -111,7 +112,12 @@ const ModifyCharacterSheet: React.FC<ModifyCharacterSheetProps> = ({
|
|||
/>
|
||||
</div>
|
||||
)}
|
||||
<div className="space-y-2 text-left w-full">
|
||||
<div className="space-y-2 text-left w-full relative">
|
||||
<div className="absolute top-0 right-0">
|
||||
<Badge variant="outline">
|
||||
{openPersonality.provider}
|
||||
</Badge>
|
||||
</div>
|
||||
<div className="flex flex-row items-center gap-2">
|
||||
<h3 className="text-xl font-semibold">
|
||||
{openPersonality.title}
|
||||
|
|
|
|||
|
|
@ -380,6 +380,7 @@ function App({ personalityIdState, isDoctor, userId }: AppProps) {
|
|||
sessionStatus={sessionStatus}
|
||||
onToggleConnection={onToggleConnection}
|
||||
isDoctor={isDoctor}
|
||||
personality={personality}
|
||||
/>
|
||||
</div>
|
||||
<SheetContent
|
||||
|
|
|
|||
|
|
@ -8,12 +8,14 @@ interface BottomToolbarProps {
|
|||
sessionStatus: SessionStatus;
|
||||
onToggleConnection: () => void;
|
||||
isDoctor: boolean;
|
||||
personality: IPersonality;
|
||||
}
|
||||
|
||||
function BottomToolbar({
|
||||
sessionStatus,
|
||||
onToggleConnection,
|
||||
isDoctor,
|
||||
personality,
|
||||
}: BottomToolbarProps) {
|
||||
const isConnected = sessionStatus === "CONNECTED";
|
||||
const isConnecting = sessionStatus === "CONNECTING";
|
||||
|
|
@ -36,7 +38,7 @@ function BottomToolbar({
|
|||
return "Doctor chat";
|
||||
}
|
||||
|
||||
const isDisabled = isConnecting;
|
||||
const isDisabled = isConnecting || personality.provider === "gemini";
|
||||
|
||||
function getConnectionButtonClasses() {
|
||||
const baseClasses = "text-white text-base p-2 w-fit rounded-full shadow-lg flex flex-row items-center justify-center gap-2 px-4";
|
||||
|
|
@ -72,7 +74,7 @@ function BottomToolbar({
|
|||
</TooltipTrigger>
|
||||
{isDisabled && (
|
||||
<TooltipContent>
|
||||
<p>Add an API key in Settings to chat with your AI character.</p>
|
||||
{personality.provider === "gemini" ? <p>Talk to Gemini on your Elato device.</p> : <p>Add an API key in Settings to chat with your AI character.</p>}
|
||||
</TooltipContent>
|
||||
)}
|
||||
</Tooltip>
|
||||
|
|
|
|||
|
|
@ -38,13 +38,32 @@ export const DEVICE_COST = 55;
|
|||
export const ORIGINAL_COST = 111;
|
||||
export const SUBSCRIPTION_COST = 10;
|
||||
|
||||
export const voices = [
|
||||
export type VoiceType =
|
||||
| {
|
||||
provider: "openai";
|
||||
id: OaiVoice;
|
||||
name: string;
|
||||
description: string;
|
||||
color: string;
|
||||
emoji?: string;
|
||||
}
|
||||
| {
|
||||
provider: "gemini";
|
||||
id: GeminiVoice;
|
||||
name: string;
|
||||
description: string;
|
||||
color: string;
|
||||
emoji?: string;
|
||||
};
|
||||
|
||||
export const openaiVoices: VoiceType[] = [
|
||||
{
|
||||
id: "alloy",
|
||||
name: "Alloy",
|
||||
description: "Neutral and balanced",
|
||||
color: "bg-blue-100",
|
||||
emoji: "🧑",
|
||||
provider: "openai",
|
||||
},
|
||||
{
|
||||
id: "echo",
|
||||
|
|
@ -52,6 +71,7 @@ export const voices = [
|
|||
description: "Warm and melodic",
|
||||
color: "bg-purple-100",
|
||||
emoji: "👩🎤",
|
||||
provider: "openai",
|
||||
},
|
||||
{
|
||||
id: "shimmer",
|
||||
|
|
@ -59,6 +79,7 @@ export const voices = [
|
|||
description: "Clear and bright",
|
||||
color: "bg-cyan-100",
|
||||
emoji: "👱♀️",
|
||||
provider: "openai",
|
||||
},
|
||||
{
|
||||
id: "ash",
|
||||
|
|
@ -66,6 +87,7 @@ export const voices = [
|
|||
description: "Soft and thoughtful",
|
||||
color: "bg-gray-100",
|
||||
emoji: "🧔",
|
||||
provider: "openai",
|
||||
},
|
||||
{
|
||||
id: "ballad",
|
||||
|
|
@ -73,6 +95,7 @@ export const voices = [
|
|||
description: "Melodic and emotive",
|
||||
color: "bg-indigo-100",
|
||||
emoji: "🎭",
|
||||
provider: "openai",
|
||||
},
|
||||
{
|
||||
id: "coral",
|
||||
|
|
@ -80,6 +103,7 @@ export const voices = [
|
|||
description: "Warm and friendly",
|
||||
color: "bg-orange-100",
|
||||
emoji: "👩",
|
||||
provider: "openai",
|
||||
},
|
||||
{
|
||||
id: "sage",
|
||||
|
|
@ -87,6 +111,7 @@ export const voices = [
|
|||
description: "Wise and measured",
|
||||
color: "bg-green-100",
|
||||
emoji: "🧓",
|
||||
provider: "openai",
|
||||
},
|
||||
{
|
||||
id: "verse",
|
||||
|
|
@ -94,6 +119,220 @@ export const voices = [
|
|||
description: "Poetic and expressive",
|
||||
color: "bg-rose-100",
|
||||
emoji: "👨🎨",
|
||||
provider: "openai",
|
||||
},
|
||||
];
|
||||
|
||||
export const geminiVoices: VoiceType[] = [
|
||||
{
|
||||
id: "Zephyr",
|
||||
name: "Zephyr",
|
||||
description: "Bright",
|
||||
color: "bg-yellow-100",
|
||||
provider: "gemini",
|
||||
},
|
||||
{
|
||||
id: "Puck",
|
||||
name: "Puck",
|
||||
description: "Upbeat",
|
||||
color: "bg-orange-100",
|
||||
provider: "gemini",
|
||||
},
|
||||
{
|
||||
id: "Charon",
|
||||
name: "Charon",
|
||||
description: "Informative",
|
||||
color: "bg-blue-100",
|
||||
provider: "gemini",
|
||||
},
|
||||
{
|
||||
id: "Kore",
|
||||
name: "Kore",
|
||||
description: "Firm",
|
||||
color: "bg-gray-100",
|
||||
provider: "gemini",
|
||||
},
|
||||
{
|
||||
id: "Fenrir",
|
||||
name: "Fenrir",
|
||||
description: "Excitable",
|
||||
color: "bg-red-100",
|
||||
provider: "gemini",
|
||||
},
|
||||
{
|
||||
id: "Leda",
|
||||
name: "Leda",
|
||||
description: "Youthful",
|
||||
color: "bg-pink-100",
|
||||
provider: "gemini",
|
||||
},
|
||||
{
|
||||
id: "Orus",
|
||||
name: "Orus",
|
||||
description: "Firm",
|
||||
color: "bg-slate-100",
|
||||
provider: "gemini",
|
||||
},
|
||||
{
|
||||
id: "Aoede",
|
||||
name: "Aoede",
|
||||
description: "Breezy",
|
||||
color: "bg-sky-100",
|
||||
provider: "gemini",
|
||||
},
|
||||
{
|
||||
id: "Callirrhoe",
|
||||
name: "Callirrhoe",
|
||||
description: "Easy-going",
|
||||
color: "bg-green-100",
|
||||
provider: "gemini",
|
||||
},
|
||||
{
|
||||
id: "Autonoe",
|
||||
name: "Autonoe",
|
||||
description: "Bright",
|
||||
color: "bg-amber-100",
|
||||
provider: "gemini",
|
||||
},
|
||||
{
|
||||
id: "Enceladus",
|
||||
name: "Enceladus",
|
||||
description: "Breathy",
|
||||
color: "bg-cyan-100",
|
||||
provider: "gemini",
|
||||
},
|
||||
{
|
||||
id: "Iapetus",
|
||||
name: "Iapetus",
|
||||
description: "Clear",
|
||||
color: "bg-white",
|
||||
provider: "gemini",
|
||||
},
|
||||
{
|
||||
id: "Umbriel",
|
||||
name: "Umbriel",
|
||||
description: "Easy-going",
|
||||
color: "bg-emerald-100",
|
||||
provider: "gemini",
|
||||
},
|
||||
{
|
||||
id: "Algieba",
|
||||
name: "Algieba",
|
||||
description: "Smooth",
|
||||
color: "bg-violet-100",
|
||||
provider: "gemini",
|
||||
},
|
||||
{
|
||||
id: "Despina",
|
||||
name: "Despina",
|
||||
description: "Smooth",
|
||||
color: "bg-purple-100",
|
||||
provider: "gemini",
|
||||
},
|
||||
{
|
||||
id: "Erinome",
|
||||
name: "Erinome",
|
||||
description: "Clear",
|
||||
color: "bg-neutral-100",
|
||||
provider: "gemini",
|
||||
},
|
||||
{
|
||||
id: "Algenib",
|
||||
name: "Algenib",
|
||||
description: "Gravelly",
|
||||
color: "bg-stone-100",
|
||||
provider: "gemini",
|
||||
},
|
||||
{
|
||||
id: "Rasalgethi",
|
||||
name: "Rasalgethi",
|
||||
description: "Informative",
|
||||
color: "bg-indigo-100",
|
||||
provider: "gemini",
|
||||
},
|
||||
{
|
||||
id: "Laomedeia",
|
||||
name: "Laomedeia",
|
||||
description: "Upbeat",
|
||||
color: "bg-lime-100",
|
||||
provider: "gemini",
|
||||
},
|
||||
{
|
||||
id: "Achernar",
|
||||
name: "Achernar",
|
||||
description: "Soft",
|
||||
color: "bg-rose-100",
|
||||
provider: "gemini",
|
||||
},
|
||||
{
|
||||
id: "Alnilam",
|
||||
name: "Alnilam",
|
||||
description: "Firm",
|
||||
color: "bg-zinc-100",
|
||||
provider: "gemini",
|
||||
},
|
||||
{
|
||||
id: "Schedar",
|
||||
name: "Schedar",
|
||||
description: "Even",
|
||||
color: "bg-teal-100",
|
||||
provider: "gemini",
|
||||
},
|
||||
{
|
||||
id: "Gacrux",
|
||||
name: "Gacrux",
|
||||
description: "Mature",
|
||||
color: "bg-brown-100",
|
||||
provider: "gemini",
|
||||
},
|
||||
{
|
||||
id: "Pulcherrima",
|
||||
name: "Pulcherrima",
|
||||
description: "Forward",
|
||||
color: "bg-fuchsia-100",
|
||||
provider: "gemini",
|
||||
},
|
||||
{
|
||||
id: "Achird",
|
||||
name: "Achird",
|
||||
description: "Friendly",
|
||||
color: "bg-yellow-100",
|
||||
provider: "gemini",
|
||||
},
|
||||
{
|
||||
id: "Zubenelgenubi",
|
||||
name: "Zubenelgenubi",
|
||||
description: "Casual",
|
||||
color: "bg-orange-100",
|
||||
provider: "gemini",
|
||||
},
|
||||
{
|
||||
id: "Vindemiatrix",
|
||||
name: "Vindemiatrix",
|
||||
description: "Gentle",
|
||||
color: "bg-green-100",
|
||||
provider: "gemini",
|
||||
},
|
||||
{
|
||||
id: "Sadachbia",
|
||||
name: "Sadachbia",
|
||||
description: "Lively",
|
||||
color: "bg-red-100",
|
||||
provider: "gemini",
|
||||
},
|
||||
{
|
||||
id: "Sadaltager",
|
||||
name: "Sadaltager",
|
||||
description: "Knowledgeable",
|
||||
color: "bg-blue-100",
|
||||
provider: "gemini",
|
||||
},
|
||||
{
|
||||
id: "Sulafat",
|
||||
name: "Sulafat",
|
||||
description: "Warm",
|
||||
color: "bg-orange-100",
|
||||
provider: "gemini",
|
||||
},
|
||||
];
|
||||
|
||||
|
|
|
|||
|
|
@ -10,6 +10,33 @@ const config = {
|
|||
"./src/**/*.{ts,tsx}",
|
||||
],
|
||||
prefix: "",
|
||||
safelist: [
|
||||
// Voice background colors
|
||||
"bg-blue-100",
|
||||
"bg-purple-100",
|
||||
"bg-cyan-100",
|
||||
"bg-gray-100",
|
||||
"bg-indigo-100",
|
||||
"bg-orange-100",
|
||||
"bg-green-100",
|
||||
"bg-rose-100",
|
||||
"bg-yellow-100",
|
||||
"bg-red-100",
|
||||
"bg-pink-100",
|
||||
"bg-slate-100",
|
||||
"bg-sky-100",
|
||||
"bg-amber-100",
|
||||
"bg-white",
|
||||
"bg-emerald-100",
|
||||
"bg-violet-100",
|
||||
"bg-neutral-100",
|
||||
"bg-stone-100",
|
||||
"bg-lime-100",
|
||||
"bg-zinc-100",
|
||||
"bg-teal-100",
|
||||
"bg-brown-100",
|
||||
"bg-fuchsia-100",
|
||||
],
|
||||
theme: {
|
||||
container: {
|
||||
center: true,
|
||||
|
|
@ -98,8 +125,10 @@ const config = {
|
|||
"infinite-scroll-inverse 60s linear infinite",
|
||||
},
|
||||
boxShadow: {
|
||||
cool: "0 4px 6px rgba(135, 206, 235, 0.2), 0 8px 24px rgba(70, 130, 180, 0.5)",
|
||||
tron: "0 4px 6px rgba(255, 215, 0, 0.2), 0 8px 24px rgba(218, 165, 32, 0.5)",
|
||||
cool:
|
||||
"0 4px 6px rgba(135, 206, 235, 0.2), 0 8px 24px rgba(70, 130, 180, 0.5)",
|
||||
tron:
|
||||
"0 4px 6px rgba(255, 215, 0, 0.2), 0 8px 24px rgba(218, 165, 32, 0.5)",
|
||||
custom_focus: "0 0 20px rgba(0, 0, 0, 0.25)", // Custom shadow
|
||||
custom_unfocus: "0 0 8px rgba(0, 0, 0, 0.07)", // Custom shadow
|
||||
},
|
||||
|
|
|
|||
|
|
@ -1,388 +0,0 @@
|
|||
import { Buffer } from "node:buffer";
|
||||
import { createServer } from "node:http";
|
||||
import { WebSocketServer } from "npm:ws";
|
||||
import type {
|
||||
RawData,
|
||||
WebSocket as WSWebSocket,
|
||||
WebSocketServer as _WebSocketServer,
|
||||
} from "npm:@types/ws";
|
||||
import {
|
||||
GoogleGenAI,
|
||||
LiveServerMessage,
|
||||
Modality,
|
||||
Session,
|
||||
} from "npm:@google/genai";
|
||||
import { authenticateUser } from "./utils.ts";
|
||||
import {
|
||||
getChatHistory,
|
||||
getSupabaseClient,
|
||||
updateUserSessionTime,
|
||||
} from "./supabase.ts";
|
||||
import { SupabaseClient } from "@supabase/supabase-js";
|
||||
import { Encoder } from "@evan/opus";
|
||||
|
||||
const isDev = Deno.env.get("DEV_MODE") === "True";
|
||||
|
||||
// Define your audio parameters
|
||||
const SAMPLE_RATE = 24000; // For example, 24000 Hz
|
||||
const CHANNELS = 1; // Mono (set to 2 if you have stereo)
|
||||
const FRAME_DURATION = 120; // Frame length in ms
|
||||
|
||||
const BYTES_PER_SAMPLE = 2; // 16-bit PCM: 2 bytes per sample
|
||||
const FRAME_SIZE = (SAMPLE_RATE * FRAME_DURATION / 1000) * CHANNELS *
|
||||
BYTES_PER_SAMPLE; // 960 bytes for 24000 Hz mono 16-bit
|
||||
|
||||
// Evan's library doesn’t require you to specify frame size here;
|
||||
// it will automatically handle the frame size based on your PCM input.
|
||||
// Create a global encoder instance (reuse this for every audio delta)
|
||||
const encoder = new Encoder({
|
||||
channels: CHANNELS,
|
||||
sample_rate: SAMPLE_RATE,
|
||||
application: "voip",
|
||||
});
|
||||
|
||||
encoder.expert_frame_duration = FRAME_DURATION;
|
||||
encoder.bitrate = 12000;
|
||||
|
||||
const server = createServer();
|
||||
|
||||
const wss: _WebSocketServer = new WebSocketServer({ noServer: true });
|
||||
|
||||
const supabaseUrl = Deno.env.get("SUPABASE_URL");
|
||||
const supabaseKey = Deno.env.get("SUPABASE_KEY");
|
||||
const geminiApiKey = Deno.env.get("GEMINI_API_KEY");
|
||||
|
||||
if (!supabaseUrl || !supabaseKey) {
|
||||
throw new Error("SUPABASE_URL or SUPABASE_KEY is not set");
|
||||
}
|
||||
|
||||
wss.on("connection", async (ws: WSWebSocket, payload: IPayload) => {
|
||||
const { user, supabase } = payload;
|
||||
|
||||
let connectionPcmFile: Deno.FsFile | null = null;
|
||||
if (isDev) {
|
||||
const filename = `debug_audio_${Date.now()}.pcm`;
|
||||
connectionPcmFile = await Deno.open(filename, {
|
||||
create: true,
|
||||
write: true,
|
||||
append: true,
|
||||
});
|
||||
}
|
||||
|
||||
// Send user details to client
|
||||
ws.send(
|
||||
JSON.stringify({
|
||||
type: "auth",
|
||||
volume_control: user.device?.volume ?? 100,
|
||||
is_ota: user.device?.is_ota ?? false,
|
||||
is_reset: user.device?.is_reset ?? false,
|
||||
}),
|
||||
);
|
||||
|
||||
const isDoctor = user.user_info.user_type === "doctor";
|
||||
const chatHistory = await getChatHistory(
|
||||
supabase,
|
||||
user.user_id,
|
||||
user.personality?.key ?? null,
|
||||
isDoctor,
|
||||
);
|
||||
// const firstMessage = createFirstMessage(chatHistory, payload);
|
||||
// const systemPrompt = createSystemPrompt(chatHistory, payload);
|
||||
let sessionStartTime: number;
|
||||
|
||||
console.log(`Connecting with Gemini key "${geminiApiKey.slice(0, 3)}..."`);
|
||||
|
||||
// Initialize Google GenAI
|
||||
const ai = new GoogleGenAI({ apiKey: geminiApiKey });
|
||||
const model = "gemini-2.5-flash-preview-native-audio-dialog";
|
||||
const config = {
|
||||
responseModalities: [Modality.AUDIO],
|
||||
systemInstruction: "You are a surfer bro talking to Kai Lenny",
|
||||
};
|
||||
|
||||
// Response queue for handling Google's callback-based responses
|
||||
const responseQueue: LiveServerMessage[] = [];
|
||||
let geminiSession: Session | null = null;
|
||||
|
||||
async function waitMessage() {
|
||||
let done = false;
|
||||
let message: LiveServerMessage | undefined = undefined;
|
||||
while (!done) {
|
||||
message = responseQueue.shift();
|
||||
if (message) {
|
||||
done = true;
|
||||
} else {
|
||||
await new Promise((resolve) => setTimeout(resolve, 10));
|
||||
}
|
||||
}
|
||||
return message;
|
||||
}
|
||||
|
||||
async function handleTurn() {
|
||||
const turns: any[] = [];
|
||||
let done = false;
|
||||
while (!done) {
|
||||
const message = await waitMessage();
|
||||
turns.push(message);
|
||||
// if (
|
||||
// message.serverContent &&
|
||||
// message.serverContent.generationComplete
|
||||
// ) {
|
||||
|
||||
// }
|
||||
if (
|
||||
message.serverContent &&
|
||||
message.serverContent.generationComplete
|
||||
) {
|
||||
ws.send(JSON.stringify({
|
||||
type: "server",
|
||||
msg: "RESPONSE.CREATED",
|
||||
}));
|
||||
done = true;
|
||||
}
|
||||
}
|
||||
return turns;
|
||||
}
|
||||
|
||||
async function processGeminiTurns() {
|
||||
try {
|
||||
console.log("Processing Gemini turns");
|
||||
while (geminiSession) {
|
||||
const turns = await handleTurn();
|
||||
|
||||
console.log("Turns:", turns);
|
||||
|
||||
// Combine all audio data from this turn
|
||||
const combinedAudio = turns.reduce(
|
||||
(acc: number[], turn: any) => {
|
||||
if (turn.data) {
|
||||
const buffer = Buffer.from(turn.data, "base64");
|
||||
const intArray = new Int16Array(
|
||||
buffer.buffer,
|
||||
buffer.byteOffset,
|
||||
buffer.byteLength /
|
||||
Int16Array.BYTES_PER_ELEMENT,
|
||||
);
|
||||
return acc.concat(Array.from(intArray));
|
||||
}
|
||||
return acc;
|
||||
},
|
||||
[],
|
||||
);
|
||||
|
||||
if (combinedAudio.length > 0) {
|
||||
console.log(
|
||||
"Received complete audio turn, length:",
|
||||
combinedAudio.length,
|
||||
);
|
||||
|
||||
// Convert back to buffer and send to client
|
||||
const audioBuffer = new Int16Array(combinedAudio);
|
||||
const buffer = Buffer.from(audioBuffer.buffer);
|
||||
|
||||
// PREVIEW AUDIO
|
||||
// const wf = new WaveFile();
|
||||
// wf.fromScratch(1, SAMPLE_RATE, "16", audioBuffer);
|
||||
|
||||
// const filename = `gemini_response_${Date.now()}.wav`;
|
||||
// await Deno.writeFile(filename, wf.toBuffer());
|
||||
// console.log(`Audio saved as ${filename}`);
|
||||
|
||||
// Send audio in chunks to client
|
||||
for (
|
||||
let offset = 0;
|
||||
offset < buffer.length;
|
||||
offset += FRAME_SIZE
|
||||
) {
|
||||
const frame = buffer.subarray(
|
||||
offset,
|
||||
offset + FRAME_SIZE,
|
||||
);
|
||||
try {
|
||||
const encodedPacket = encoder.encode(frame);
|
||||
ws.send(encodedPacket);
|
||||
} catch (_e) {
|
||||
// Skip this frame but continue with others
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
// // Handle text responses if any
|
||||
// for (const turn of turns) {
|
||||
// if (turn.text) {
|
||||
// console.log("Received text:", turn.text);
|
||||
// addConversation(supabase, "assistant", turn.text, user);
|
||||
// }
|
||||
// }
|
||||
|
||||
// Send completion signal
|
||||
ws.send(JSON.stringify({
|
||||
type: "server",
|
||||
msg: "RESPONSE.COMPLETE",
|
||||
}));
|
||||
}
|
||||
} catch (error) {
|
||||
console.error("Error processing Gemini turns:", error);
|
||||
}
|
||||
}
|
||||
|
||||
// Connect to Google Gemini Live
|
||||
try {
|
||||
geminiSession = await ai.live.connect({
|
||||
model: model,
|
||||
callbacks: {
|
||||
onopen: function () {
|
||||
console.log("Gemini session opened");
|
||||
sessionStartTime = Date.now();
|
||||
},
|
||||
onmessage: function (message: LiveServerMessage) {
|
||||
console.log("Received message:", message);
|
||||
responseQueue.push(message);
|
||||
},
|
||||
onerror: function (e: any) {
|
||||
console.error("Gemini error:", e.message);
|
||||
ws.send(
|
||||
JSON.stringify({
|
||||
type: "server",
|
||||
msg: "RESPONSE.ERROR",
|
||||
}),
|
||||
);
|
||||
},
|
||||
onclose: function (e: any) {
|
||||
console.log("Gemini session closed:", e.reason);
|
||||
},
|
||||
},
|
||||
config: config,
|
||||
});
|
||||
|
||||
console.log("Connected to Gemini successfully!");
|
||||
// Send first message if available
|
||||
const inputTurns = [{
|
||||
role: "user",
|
||||
parts: [{ text: "Hello how are you?" }],
|
||||
}];
|
||||
geminiSession?.sendClientContent({ turns: inputTurns });
|
||||
processGeminiTurns();
|
||||
} catch (e: unknown) {
|
||||
console.log(`Error connecting to Gemini: ${e}`);
|
||||
ws.close();
|
||||
return;
|
||||
}
|
||||
|
||||
// Route every frame coming from the device: binary frames are microphone
// audio that is forwarded to Gemini, text frames are JSON control messages.
ws.on("message", (data: any, isBinary: boolean) => {
  try {
    if (!isBinary) {
      // Text frame: parse the JSON control message.
      const instruction = JSON.parse(data.toString("utf-8"));
      const isInstruction = instruction.type === "instruction";

      if (isInstruction && instruction.msg === "end_of_speech") {
        console.log("end_of_speech detected");
        // Nothing is forwarded to Gemini here; the device only gets an
        // acknowledgement that its audio was accepted.
        const ack = JSON.stringify({
          type: "server",
          msg: "AUDIO.COMMITTED",
        });
        ws.send(ack);
      }

      if (isInstruction && instruction.msg === "INTERRUPT") {
        console.log("interrupt detected");
        // Not implemented: the interrupt is only logged, no signal is sent
        // to the Gemini session.
      }
      return;
    }

    // Binary frame: raw audio from the ESP32, base64-encoded for the API.
    const encodedAudio = data.toString("base64");

    // In dev mode the raw bytes are also appended to the debug capture file.
    if (isDev && connectionPcmFile) {
      connectionPcmFile.write(data);
    }

    geminiSession?.sendRealtimeInput({
      audio: {
        data: encodedAudio,
        mimeType: "audio/pcm;rate=24000", // declared as 24 kHz PCM
      },
    });
  } catch (e: unknown) {
    console.error("Error handling message:", (e as Error).message);
  }
});
|
||||
|
||||
// A failing device socket makes the upstream session pointless: log the
// failure and close the Gemini session if one exists.
ws.on("error", (socketError: any) => {
  console.error("WebSocket error:", socketError);
  geminiSession?.close();
});
|
||||
|
||||
// When the device disconnects: record how long the Gemini session lasted,
// then release the Gemini session and (in dev mode) the debug audio file.
ws.on("close", async (code: number, reason: string) => {
  console.log(`WebSocket closed with code ${code}, reason: ${reason}`);
  try {
    // sessionStartTime is only set once the Gemini session has opened, so a
    // connection that never reached Gemini records no usage.
    if (sessionStartTime) {
      const elapsedMs = Date.now() - sessionStartTime;
      const sessionDuration = Math.floor(elapsedMs / 1000);
      await updateUserSessionTime(supabase, user, sessionDuration);
    }
  } catch (e: unknown) {
    // A failed usage update must not escape this async event handler as an
    // unhandled rejection, nor prevent the cleanup below.
    console.error("Failed to update user session time:", e);
  } finally {
    geminiSession?.close();
    if (isDev && connectionPcmFile) {
      connectionPcmFile.close();
      console.log("Closed debug audio file.");
    }
  }
});
|
||||
});
|
||||
|
||||
/**
 * Authenticate an HTTP upgrade request before promoting it to a WebSocket.
 *
 * The token from the `Authorization` header is used to build a Supabase
 * client and resolve the user. A missing token, or any error thrown while
 * authenticating, is answered with a bare `401 Unauthorized` and the socket
 * is destroyed. On success the "connection" event is emitted with the user,
 * the Supabase client and an ISO timestamp as its payload.
 */
server.on("upgrade", async (req, socket, head) => {
  console.log("upgrade");

  // Single place for the rejection path used by both failure branches.
  const rejectUnauthorized = () => {
    socket.write("HTTP/1.1 401 Unauthorized\r\n\r\n");
    socket.destroy();
  };

  let user: IUser;
  let supabase: SupabaseClient;
  try {
    const { authorization: authHeader, "x-wifi-rssi": rssi } = req.headers;
    const authToken = authHeader?.replace("Bearer ", "") ?? "";

    // The RSSI header is optional and may arrive as a repeated header
    // (string[]), so normalise it instead of asserting it is a string.
    const rssiHeader = Array.isArray(rssi) ? rssi[0] : rssi;
    const wifiStrength = Number.parseInt(rssiHeader ?? "", 10);
    console.log(
      "WiFi RSSI:",
      Number.isNaN(wifiStrength) ? "unavailable" : wifiStrength,
    );

    if (!authToken) {
      rejectUnauthorized();
      return;
    }

    supabase = getSupabaseClient(authToken);
    user = await authenticateUser(supabase, authToken);
    console.log(user.email);
  } catch {
    // Any failure while resolving the user is treated as "not authorised".
    rejectUnauthorized();
    return;
  }

  wss.handleUpgrade(req, socket, head, (ws) => {
    wss.emit("connection", ws, {
      user,
      supabase,
      timestamp: new Date().toISOString(),
    });
  });
});
|
||||
|
||||
// Start listening. Outside dev mode the port is fixed at 8080; in dev mode
// (deno run -A --env-file=.env main.ts) HOST and PORT come from the
// environment, falling back to 0.0.0.0:8000 when unset or empty.
if (!isDev) {
  server.listen(8080);
} else {
  const host = Deno.env.get("HOST") || "0.0.0.0";
  const port = Deno.env.get("PORT") || "8000";
  const announce = () => {
    console.log(`Audio capture server running on ws://${host}:${port}`);
  };
  server.listen(Number(port), host, announce);
}
|
||||
|
|
@ -5,7 +5,12 @@ import type {
|
|||
WebSocketServer as _WebSocketServer,
|
||||
} from "npm:@types/ws";
|
||||
import { authenticateUser } from "./utils.ts";
|
||||
import { getSupabaseClient } from "./supabase.ts";
|
||||
import {
|
||||
createFirstMessage,
|
||||
createSystemPrompt,
|
||||
getChatHistory,
|
||||
getSupabaseClient,
|
||||
} from "./supabase.ts";
|
||||
import { SupabaseClient } from "@supabase/supabase-js";
|
||||
import { isDev } from "./utils.ts";
|
||||
import { connectToOpenAI } from "./models/openai.ts";
|
||||
|
|
@ -16,8 +21,50 @@ const server = createServer();
|
|||
const wss: _WebSocketServer = new WebSocketServer({ noServer: true });
|
||||
|
||||
wss.on("connection", async (ws: WSWebSocket, payload: IPayload) => {
|
||||
// await connectToOpenAI(ws, payload);
|
||||
await connectToGemini(ws, payload);
|
||||
const { user, supabase } = payload;
|
||||
|
||||
let connectionPcmFile: Deno.FsFile | null = null;
|
||||
if (isDev) {
|
||||
const filename = `debug_audio_${Date.now()}.pcm`;
|
||||
connectionPcmFile = await Deno.open(filename, {
|
||||
create: true,
|
||||
write: true,
|
||||
append: true,
|
||||
});
|
||||
}
|
||||
|
||||
const chatHistory = await getChatHistory(
|
||||
supabase,
|
||||
user.user_id,
|
||||
user.personality?.key ?? null,
|
||||
false,
|
||||
);
|
||||
const firstMessage = createFirstMessage(payload);
|
||||
const systemPrompt = createSystemPrompt(chatHistory, payload);
|
||||
|
||||
const provider = user.personality?.provider;
|
||||
switch (provider) {
|
||||
case "openai":
|
||||
await connectToOpenAI(
|
||||
ws,
|
||||
payload,
|
||||
connectionPcmFile,
|
||||
firstMessage,
|
||||
systemPrompt,
|
||||
);
|
||||
break;
|
||||
case "gemini":
|
||||
await connectToGemini(
|
||||
ws,
|
||||
payload,
|
||||
connectionPcmFile,
|
||||
firstMessage,
|
||||
systemPrompt,
|
||||
);
|
||||
break;
|
||||
default:
|
||||
throw new Error(`Unknown provider: ${provider}`);
|
||||
}
|
||||
});
|
||||
|
||||
server.on("upgrade", async (req, socket, head) => {
|
||||
|
|
|
|||
|
|
@ -1,73 +1,71 @@
|
|||
import { Buffer } from "node:buffer";
|
||||
import type { WebSocketServer as _WebSocketServer } from "npm:@types/ws";
|
||||
import {
|
||||
EndSensitivity,
|
||||
GoogleGenAI,
|
||||
LiveConnectConfig,
|
||||
LiveServerMessage,
|
||||
Modality,
|
||||
Session,
|
||||
} from "npm:@google/genai";
|
||||
import { getChatHistory, updateUserSessionTime } from "../supabase.ts";
|
||||
import {
|
||||
encoder,
|
||||
FRAME_SIZE,
|
||||
geminiApiKey,
|
||||
isDev,
|
||||
SAMPLE_RATE,
|
||||
} from "../utils.ts";
|
||||
import pkg from "npm:wavefile";
|
||||
const { WaveFile } = pkg;
|
||||
import { encoder, FRAME_SIZE, geminiApiKey, isDev } from "../utils.ts";
|
||||
import { addConversation } from "../supabase.ts";
|
||||
|
||||
export const connectToGemini = async (ws: WebSocket, payload: IPayload) => {
|
||||
export const connectToGemini = async (
|
||||
ws: WebSocket,
|
||||
payload: IPayload,
|
||||
connectionPcmFile: Deno.FsFile | null,
|
||||
firstMessage: string,
|
||||
systemPrompt: string,
|
||||
) => {
|
||||
const { user, supabase } = payload;
|
||||
const { oai_voice, pitch_factor } = user.personality ?? {
|
||||
oai_voice: "Sadachbia",
|
||||
provider: "gemini",
|
||||
pitch_factor: 1,
|
||||
};
|
||||
|
||||
let connectionPcmFile: Deno.FsFile | null = null;
|
||||
if (isDev) {
|
||||
const filename = `debug_audio_${Date.now()}.pcm`;
|
||||
connectionPcmFile = await Deno.open(filename, {
|
||||
create: true,
|
||||
write: true,
|
||||
append: true,
|
||||
});
|
||||
}
|
||||
const { is_ota, is_reset, volume } = user.device ?? {
|
||||
is_ota: false,
|
||||
is_reset: false,
|
||||
volume: 10,
|
||||
};
|
||||
|
||||
// Send user details to client
|
||||
ws.send(
|
||||
JSON.stringify({
|
||||
type: "auth",
|
||||
volume_control: user.device?.volume ?? 100,
|
||||
is_ota: user.device?.is_ota ?? false,
|
||||
is_reset: user.device?.is_reset ?? false,
|
||||
volume_control: volume,
|
||||
is_ota: is_ota,
|
||||
is_reset: is_reset,
|
||||
pitch_factor: pitch_factor,
|
||||
}),
|
||||
);
|
||||
|
||||
const chatHistory = await getChatHistory(
|
||||
supabase,
|
||||
user.user_id,
|
||||
user.personality?.key ?? null,
|
||||
false,
|
||||
);
|
||||
// const firstMessage = createFirstMessage(chatHistory, payload);
|
||||
// const systemPrompt = createSystemPrompt(chatHistory, payload);
|
||||
let sessionStartTime: number;
|
||||
|
||||
console.log(`Connecting with Gemini key "${geminiApiKey.slice(0, 3)}..."`);
|
||||
|
||||
// Initialize Google GenAI
|
||||
const ai = new GoogleGenAI({ apiKey: geminiApiKey });
|
||||
const model = "gemini-2.0-flash-live-001";
|
||||
const model = "gemini-2.5-flash-preview-native-audio-dialog";
|
||||
const config: LiveConnectConfig = {
|
||||
responseModalities: [Modality.AUDIO],
|
||||
systemInstruction: "You are a surfer bro talking to Kai Lenny",
|
||||
// generationConfig: {
|
||||
// speechConfig: {
|
||||
// voiceConfig: {
|
||||
// prebuiltVoiceConfig: {
|
||||
// voiceName: "Zephyr",
|
||||
// },
|
||||
// },
|
||||
// },
|
||||
// },
|
||||
systemInstruction: systemPrompt,
|
||||
speechConfig: {
|
||||
voiceConfig: {
|
||||
prebuiltVoiceConfig: {
|
||||
voiceName: oai_voice,
|
||||
},
|
||||
},
|
||||
},
|
||||
realtimeInputConfig: {
|
||||
automaticActivityDetection: {
|
||||
disabled: false, // Keep VAD enabled
|
||||
endOfSpeechSensitivity: EndSensitivity.END_SENSITIVITY_HIGH, // How sensitive to detect speech ending
|
||||
silenceDurationMs: 100, // How much silence before considering speech ended
|
||||
},
|
||||
},
|
||||
outputAudioTranscription: {},
|
||||
inputAudioTranscription: {},
|
||||
};
|
||||
|
||||
// Response queue for handling Google's callback-based responses
|
||||
|
|
@ -94,21 +92,26 @@ export const connectToGemini = async (ws: WebSocket, payload: IPayload) => {
|
|||
while (!done) {
|
||||
const message = await waitMessage();
|
||||
turns.push(message);
|
||||
// if (
|
||||
// message.serverContent &&
|
||||
// message.serverContent.generationComplete
|
||||
// ) {
|
||||
|
||||
// }
|
||||
if (
|
||||
message.serverContent &&
|
||||
message.serverContent.generationComplete
|
||||
message.serverContent
|
||||
) {
|
||||
ws.send(JSON.stringify({
|
||||
type: "server",
|
||||
msg: "RESPONSE.CREATED",
|
||||
}));
|
||||
done = true;
|
||||
if (message.serverContent.generationComplete) {
|
||||
ws.send(JSON.stringify({
|
||||
type: "server",
|
||||
msg: "RESPONSE.CREATED",
|
||||
}));
|
||||
done = true;
|
||||
}
|
||||
|
||||
if (message.serverContent.turnComplete) {
|
||||
ws.send(
|
||||
JSON.stringify({
|
||||
type: "server",
|
||||
msg: "AUDIO.COMMITTED",
|
||||
}),
|
||||
);
|
||||
}
|
||||
}
|
||||
}
|
||||
return turns;
|
||||
|
|
@ -120,8 +123,6 @@ export const connectToGemini = async (ws: WebSocket, payload: IPayload) => {
|
|||
while (geminiSession) {
|
||||
const turns = await handleTurn();
|
||||
|
||||
console.log("Turns:", turns);
|
||||
|
||||
// Combine all audio data from this turn
|
||||
const combinedAudio = turns.reduce(
|
||||
(acc: number[], turn: any) => {
|
||||
|
|
@ -141,11 +142,6 @@ export const connectToGemini = async (ws: WebSocket, payload: IPayload) => {
|
|||
);
|
||||
|
||||
if (combinedAudio.length > 0) {
|
||||
console.log(
|
||||
"Received complete audio turn, length:",
|
||||
combinedAudio.length,
|
||||
);
|
||||
|
||||
// Convert back to buffer and send to client
|
||||
const audioBuffer = new Int16Array(combinedAudio);
|
||||
const buffer = Buffer.from(audioBuffer.buffer);
|
||||
|
|
@ -179,18 +175,47 @@ export const connectToGemini = async (ws: WebSocket, payload: IPayload) => {
|
|||
}
|
||||
|
||||
// // Handle text responses if any
|
||||
// for (const turn of turns) {
|
||||
// if (turn.text) {
|
||||
// console.log("Received text:", turn.text);
|
||||
// addConversation(supabase, "assistant", turn.text, user);
|
||||
// }
|
||||
// }
|
||||
let outputTranscriptionText = "";
|
||||
let inputTranscriptionText = "";
|
||||
for (const turn of turns as LiveServerMessage[]) {
|
||||
if (
|
||||
turn.serverContent &&
|
||||
turn.serverContent.outputTranscription
|
||||
) {
|
||||
outputTranscriptionText +=
|
||||
turn.serverContent.outputTranscription.text;
|
||||
}
|
||||
|
||||
if (
|
||||
turn.serverContent &&
|
||||
turn.serverContent.inputTranscription
|
||||
) {
|
||||
inputTranscriptionText +=
|
||||
turn.serverContent.inputTranscription.text;
|
||||
}
|
||||
}
|
||||
|
||||
// Send completion signal
|
||||
ws.send(JSON.stringify({
|
||||
type: "server",
|
||||
msg: "RESPONSE.COMPLETE",
|
||||
}));
|
||||
|
||||
// Add user transcription to supabase
|
||||
await addConversation(
|
||||
supabase,
|
||||
"user",
|
||||
inputTranscriptionText,
|
||||
user,
|
||||
);
|
||||
|
||||
// Add assistant transcription to supabase
|
||||
await addConversation(
|
||||
supabase,
|
||||
"assistant",
|
||||
outputTranscriptionText,
|
||||
user,
|
||||
);
|
||||
}
|
||||
} catch (error) {
|
||||
console.error("Error processing Gemini turns:", error);
|
||||
|
|
@ -204,10 +229,8 @@ export const connectToGemini = async (ws: WebSocket, payload: IPayload) => {
|
|||
callbacks: {
|
||||
onopen: function () {
|
||||
console.log("Gemini session opened");
|
||||
sessionStartTime = Date.now();
|
||||
},
|
||||
onmessage: function (message: LiveServerMessage) {
|
||||
console.log("Received message:", message);
|
||||
responseQueue.push(message);
|
||||
},
|
||||
onerror: function (e: any) {
|
||||
|
|
@ -230,7 +253,7 @@ export const connectToGemini = async (ws: WebSocket, payload: IPayload) => {
|
|||
// Send first message if available
|
||||
const inputTurns = [{
|
||||
role: "user",
|
||||
parts: [{ text: "Hello how are you?" }],
|
||||
parts: [{ text: firstMessage }],
|
||||
}];
|
||||
geminiSession?.sendClientContent({ turns: inputTurns });
|
||||
processGeminiTurns();
|
||||
|
|
@ -257,32 +280,6 @@ export const connectToGemini = async (ws: WebSocket, payload: IPayload) => {
|
|||
mimeType: "audio/pcm;rate=24000", // Gemini expects 16kHz but 24kHz is fine
|
||||
},
|
||||
});
|
||||
} else {
|
||||
// Handle text/JSON messages
|
||||
const message = JSON.parse(data.toString("utf-8"));
|
||||
|
||||
if (
|
||||
message.type === "instruction" &&
|
||||
message.msg === "end_of_speech"
|
||||
) {
|
||||
console.log("end_of_speech detected");
|
||||
// Gemini handles turn detection automatically, but we can send a signal
|
||||
ws.send(
|
||||
JSON.stringify({
|
||||
type: "server",
|
||||
msg: "AUDIO.COMMITTED",
|
||||
}),
|
||||
);
|
||||
}
|
||||
|
||||
if (
|
||||
message.type === "instruction" &&
|
||||
message.msg === "INTERRUPT"
|
||||
) {
|
||||
console.log("interrupt detected");
|
||||
// For Gemini, we might need to close and reopen the session or handle differently
|
||||
// This depends on Gemini's interrupt capabilities
|
||||
}
|
||||
}
|
||||
} catch (e: unknown) {
|
||||
console.error("Error handling message:", (e as Error).message);
|
||||
|
|
@ -296,12 +293,6 @@ export const connectToGemini = async (ws: WebSocket, payload: IPayload) => {
|
|||
|
||||
ws.on("close", async (code: number, reason: string) => {
|
||||
console.log(`WebSocket closed with code ${code}, reason: ${reason}`);
|
||||
if (sessionStartTime) {
|
||||
const sessionDuration = Math.floor(
|
||||
(Date.now() - sessionStartTime) / 1000,
|
||||
);
|
||||
await updateUserSessionTime(supabase, user, sessionDuration);
|
||||
}
|
||||
geminiSession?.close();
|
||||
if (isDev && connectionPcmFile) {
|
||||
connectionPcmFile.close();
|
||||
|
|
|
|||
|
|
@ -7,14 +7,7 @@ import type {
|
|||
|
||||
import { RealtimeClient } from "../realtime/client.js";
|
||||
import { RealtimeUtils } from "../realtime/utils.js";
|
||||
import {
|
||||
addConversation,
|
||||
createFirstMessage,
|
||||
createSystemPrompt,
|
||||
getChatHistory,
|
||||
getDeviceInfo,
|
||||
updateUserSessionTime,
|
||||
} from "../supabase.ts";
|
||||
import { addConversation, getDeviceInfo } from "../supabase.ts";
|
||||
import { encoder, FRAME_SIZE, isDev, openaiApiKey } from "../utils.ts";
|
||||
|
||||
const sendFirstMessage = (client: RealtimeClient, firstMessage: string) => {
|
||||
|
|
@ -39,18 +32,15 @@ const sendFirstMessage = (client: RealtimeClient, firstMessage: string) => {
|
|||
});
|
||||
};
|
||||
|
||||
export const connectToOpenAI = async (ws: WebSocket, payload: IPayload) => {
|
||||
export const connectToOpenAI = async (
|
||||
ws: WebSocket,
|
||||
payload: IPayload,
|
||||
connectionPcmFile: Deno.FsFile | null,
|
||||
firstMessage: string,
|
||||
systemPrompt: string,
|
||||
) => {
|
||||
const { user, supabase } = payload;
|
||||
|
||||
let connectionPcmFile: Deno.FsFile | null = null;
|
||||
if (isDev) {
|
||||
const filename = `debug_audio_${Date.now()}.pcm`;
|
||||
connectionPcmFile = await Deno.open(filename, {
|
||||
create: true,
|
||||
write: true,
|
||||
append: true,
|
||||
});
|
||||
}
|
||||
// send user details to client
|
||||
// when DEV_MODE is true, we send the default values 100, false, false
|
||||
ws.send(
|
||||
|
|
@ -63,18 +53,6 @@ export const connectToOpenAI = async (ws: WebSocket, payload: IPayload) => {
|
|||
}),
|
||||
);
|
||||
|
||||
const isDoctor = user.user_info.user_type === "doctor";
|
||||
|
||||
const chatHistory = await getChatHistory(
|
||||
supabase,
|
||||
user.user_id,
|
||||
user.personality?.key ?? null,
|
||||
isDoctor,
|
||||
);
|
||||
const firstMessage = createFirstMessage(chatHistory, payload);
|
||||
console.log("firstMessage", firstMessage);
|
||||
const systemPrompt = createSystemPrompt(chatHistory, payload);
|
||||
let sessionStartTime: number;
|
||||
let currentItemId: string | null = null;
|
||||
let currentCallId: string | null = null;
|
||||
|
||||
|
|
@ -135,7 +113,6 @@ export const connectToOpenAI = async (ws: WebSocket, payload: IPayload) => {
|
|||
// Check if the event is session.created
|
||||
if (event.type === "session.created") {
|
||||
console.log("session created", event);
|
||||
sessionStartTime = Date.now();
|
||||
sendFirstMessage(client, firstMessage);
|
||||
} else if (event.type === "session.updated") {
|
||||
console.log("session updated", event);
|
||||
|
|
@ -361,12 +338,6 @@ export const connectToOpenAI = async (ws: WebSocket, payload: IPayload) => {
|
|||
// Add more detailed close handling
|
||||
ws.on("close", async (code: number, reason: string) => {
|
||||
console.log(`WebSocket closed with code ${code}, reason: ${reason}`);
|
||||
if (sessionStartTime) {
|
||||
const sessionDuration = Math.floor(
|
||||
(Date.now() - sessionStartTime) / 1000,
|
||||
);
|
||||
await updateUserSessionTime(supabase, user, sessionDuration);
|
||||
}
|
||||
client.disconnect();
|
||||
if (isDev) {
|
||||
if (connectionPcmFile) {
|
||||
|
|
|
|||
|
|
@ -157,7 +157,6 @@ ${chatHistory}
|
|||
`;
|
||||
|
||||
export const createFirstMessage = (
|
||||
chatHistory: IConversation[],
|
||||
payload: IPayload,
|
||||
): string => {
|
||||
const { timestamp, user } = payload;
|
||||
|
|
@ -218,21 +217,6 @@ export const addConversation = async (
|
|||
}
|
||||
};
|
||||
|
||||
/**
 * Add `sessionTime` to the user's stored `session_time` in the `users` table.
 *
 * The new total is computed from the `session_time` carried on the in-memory
 * `user` object, not re-read from the database.
 * NOTE(review): if the same user can have overlapping sessions this looks
 * like it could overwrite a concurrent update — confirm.
 *
 * @param supabase - Supabase client used to run the update
 * @param user - User whose row (matched on `user_id`) is updated
 * @param sessionTime - Amount added to the stored total
 * @throws The error object returned by Supabase when the update fails
 */
export const updateUserSessionTime = async (
  supabase: SupabaseClient,
  user: IUser,
  sessionTime: number,
): Promise<void> => {
  const updatedTotal = user.session_time + sessionTime;
  const result = await supabase
    .from("users")
    .update({ session_time: updatedTotal })
    .eq("user_id", user.user_id);

  if (result.error) {
    throw result.error;
  }
};
|
||||
|
||||
/**
|
||||
* Get the OpenAI API Key for the user
|
||||
* @param supabase - The Supabase client
|
||||
|
|
|
|||
55
server-deno/types.d.ts
vendored
55
server-deno/types.d.ts
vendored
|
|
@ -26,21 +26,58 @@ declare global {
|
|||
user_code: string;
|
||||
}
|
||||
|
||||
type ModelProvider = "openai" | "gemini";
|
||||
|
||||
type GeminiVoice =
|
||||
| "Zephyr"
|
||||
| "Puck"
|
||||
| "Charon"
|
||||
| "Kore"
|
||||
| "Fenrir"
|
||||
| "Leda"
|
||||
| "Orus"
|
||||
| "Aoede"
|
||||
| "Callirrhoe"
|
||||
| "Autonoe"
|
||||
| "Enceladus"
|
||||
| "Iapetus"
|
||||
| "Umbriel"
|
||||
| "Algieba"
|
||||
| "Despina"
|
||||
| "Erinome"
|
||||
| "Algenib"
|
||||
| "Rasalgethi"
|
||||
| "Laomedeia"
|
||||
| "Achernar"
|
||||
| "Alnilam"
|
||||
| "Schedar"
|
||||
| "Gacrux"
|
||||
| "Pulcherrima"
|
||||
| "Achird"
|
||||
| "Zubenelgenubi"
|
||||
| "Vindemiatrix"
|
||||
| "Sadachbia"
|
||||
| "Sadaltager"
|
||||
| "Sulafat";
|
||||
|
||||
type OaiVoice =
|
||||
| "ash"
|
||||
| "alloy"
|
||||
| "echo"
|
||||
| "shimmer"
|
||||
| "ballad"
|
||||
| "coral"
|
||||
| "sage"
|
||||
| "verse";
|
||||
|
||||
interface IPersonality {
|
||||
personality_id: string;
|
||||
is_doctor: boolean;
|
||||
is_child_voice: boolean;
|
||||
is_story: boolean;
|
||||
key: string;
|
||||
oai_voice:
|
||||
| "ash"
|
||||
| "alloy"
|
||||
| "echo"
|
||||
| "shimmer"
|
||||
| "ballad"
|
||||
| "coral"
|
||||
| "sage"
|
||||
| "verse";
|
||||
oai_voice: OaiVoice | GeminiVoice;
|
||||
provider: ModelProvider;
|
||||
voice_description: string;
|
||||
title: string;
|
||||
subtitle: string;
|
||||
|
|
|
|||
12
supabase/migrations/20250611011151_add_provider.sql
Normal file
12
supabase/migrations/20250611011151_add_provider.sql
Normal file
|
|
@ -0,0 +1,12 @@
|
|||
-- Add a required `provider` column to the personalities table.
-- Adding the column with a constant DEFAULT already gives every existing row
-- the value 'openai', so no follow-up UPDATE or separate SET NOT NULL step
-- is needed; the CHECK restricts the column to the supported providers.
ALTER TABLE personalities
  ADD COLUMN provider TEXT NOT NULL DEFAULT 'openai'
  CHECK (provider IN ('openai', 'gemini'));
|
||||
Loading…
Reference in a new issue