diff --git a/README.md b/README.md index 3ce9f54..1dc77a9 100644 --- a/README.md +++ b/README.md @@ -2,7 +2,7 @@ # 🚀 ElatoAI: Realtime Speech AI Agents for ESP32 -Realtime AI Speech powered by OpenAI Realtime API, ESP32, Secure WebSockets, and Deno Edge Functions for >15-minute uninterrupted global conversations +Realtime AI Speech powered by **OpenAI Realtime API** and **Gemini Live API**, ESP32, Secure WebSockets, and Deno Edge Functions for >15-minute uninterrupted global conversations
@@ -29,7 +29,17 @@ Realtime AI Speech powered by OpenAI Realtime API, ESP32, Secure WebSockets, and
-## ⚡️ DIY Hardware Design +## ⚡️ Realtime AI Speech Models on an ESP32 + +
+ +OpenAI Realtime API + +Gemini Live API + +
+ +## 👷‍♀️ DIY Hardware Design Hardware Setup @@ -102,6 +112,7 @@ cp .env.example .env # In .env, set your environment variables # SUPABASE_KEY= # OPENAI_API_KEY= +# GEMINI_API_KEY= # Run the server at port 8000 deno run -A --env-file=.env main.ts @@ -143,13 +154,13 @@ Once your Wifi credentials are configured, turn the device off and on again and ElatoAI consists of three main components: 1. **Frontend Client** (`Next.js` hosted on Vercel) - to create and talk to your AI agents and 'send' it to your ESP32 device -2. **Edge Server Functions** (`Deno` running on Deno/Supabase Edge) - to handle the websocket connections from the ESP32 device and the OpenAI API calls -3. **ESP32 IoT Client** (`PlatformIO/Arduino`) - to receive the websocket connections from the Edge Server Functions and send audio to the OpenAI API via the Deno edge server. +2. **Edge Server Functions** (`Deno` running on Deno/Supabase Edge) - to handle the websocket connections from the ESP32 device and the OpenAI and Gemini API calls +3. **ESP32 IoT Client** (`PlatformIO/Arduino`) - to receive the websocket connections from the Edge Server Functions and send audio to the OpenAI and Gemini API via the Deno edge server. ## 🌟 Features -1. **Realtime Speech-to-Speech**: Instant speech conversion powered by OpenAI's Realtime APIs. +1. **Realtime Speech-to-Speech**: Instant speech conversion powered by OpenAI's Realtime API and Gemini's Live API. 2. **Create Custom AI Agents**: Create custom agents with different personalities and voices. 3. **Customizable Voices**: Choose from a variety of voices and personalities. 4. **Secure WebSockets**: Reliable, encrypted WebSocket communication. 
@@ -200,7 +211,9 @@ flowchart TD UserInput --> ESP32 ESP32[ESP32 Device] -->|WebSocket| Edge[Deno Edge Function] Edge -->|OpenAI API| OpenAI[OpenAI Realtime API] + Edge -->|Gemini API| Gemini[Gemini Live API] OpenAI --> Edge + Gemini --> Edge Edge -->|WebSocket| ESP32 ESP32 --> UserOutput ``` diff --git a/assets/gemini.png b/assets/gemini.png new file mode 100644 index 0000000..d8d7a9a Binary files /dev/null and b/assets/gemini.png differ diff --git a/assets/openai.png b/assets/openai.png new file mode 100644 index 0000000..b6eed74 Binary files /dev/null and b/assets/openai.png differ diff --git a/assets/pcb-design.png b/assets/pcb-design.png index d9a6db8..c196579 100644 Binary files a/assets/pcb-design.png and b/assets/pcb-design.png differ diff --git a/frontend-nextjs/app/api/session/route.ts b/frontend-nextjs/app/api/session/route.ts index f38c8e8..a6ddae5 100644 --- a/frontend-nextjs/app/api/session/route.ts +++ b/frontend-nextjs/app/api/session/route.ts @@ -167,6 +167,7 @@ export async function GET(request: NextRequest) { }), }, ); + console.log(response); const data = await response.json(); return NextResponse.json(data); } catch (error) { diff --git a/frontend-nextjs/app/components/CreateCharacter/BuildDashboard.tsx b/frontend-nextjs/app/components/CreateCharacter/BuildDashboard.tsx index 7bf102f..9c34ddf 100644 --- a/frontend-nextjs/app/components/CreateCharacter/BuildDashboard.tsx +++ b/frontend-nextjs/app/components/CreateCharacter/BuildDashboard.tsx @@ -14,7 +14,7 @@ import { v4 as uuidv4 } from 'uuid'; import { toast } from "@/components/ui/use-toast"; import { useRouter } from "next/navigation"; import { z } from "zod"; -import { emotionOptions, r2UrlAudio, voices } from "@/lib/data"; +import { emotionOptions, geminiVoices, openaiVoices, r2UrlAudio, VoiceType } from "@/lib/data"; import EmojiComponent from "./EmojiComponent"; import { PitchFactors } from "@/lib/utils"; import { Slider } from "@/components/ui/slider"; @@ -166,7 +166,7 @@ const 
SettingsDashboard: React.FC = ({ try { const personality = await createPersonality(supabase, selectedUser.user_id, { - provider: formData.provider, + provider: formData.provider as ModelProvider, title: formData.title, subtitle: "", character_prompt: formData.prompt, @@ -206,15 +206,18 @@ const SettingsDashboard: React.FC = ({ const [audioElement, setAudioElement] = useState(null); - const previewVoice = (voiceId: string) => { + const previewVoice = (voice: VoiceType) => { + const { id, provider } = voice; + + if (provider === 'openai') { // Stop any currently playing preview if (audioElement) { audioElement.pause(); audioElement.currentTime = 0; } - const audioSampleUrl = `${r2UrlAudio}/${voiceId}.wav`; - setPreviewingVoice(voiceId); + const audioSampleUrl = `${r2UrlAudio}/${id}.wav`; + setPreviewingVoice(id); // Create and play audio element const audio = new Audio(audioSampleUrl); @@ -233,11 +236,12 @@ const SettingsDashboard: React.FC = ({ // Fallback in case audio doesn't trigger onended setTimeout(() => { - if (previewingVoice === voiceId) { + if (previewingVoice === id) { setPreviewingVoice(null); } }, 10000); // 10 second fallback - }; + } + } const Heading = () => { return ( @@ -260,51 +264,70 @@ const SettingsDashboard: React.FC = ({ {currentStep === 'personality' ?
{/* Voice Picker */} -
- -

- Click a voice to preview how it sounds. -

- -
- {voices.map((voice) => ( -
{ - handleInputChange('voice', voice.id); - previewVoice(voice.id); - }} - > -
-
-
- -
-
- {voice.name} - {voice.description} -
-
- - {previewingVoice === voice.id && ( -
-
- -
-
- )} -
-
- ))} -
-
+
+ +

+Click a voice to preview how it sounds. +

+ +
+{[...openaiVoices, ...geminiVoices].map((voice: VoiceType) => ( +
{ + setFormData(prev => ({ + ...prev, + provider: voice.provider as ModelProvider, + voice: voice.id + })); + previewVoice(voice); +}} +> +
+
+
+ +
+
+ {voice.name} + {voice.description} +
+ {voice.provider === 'openai' ? 'OpenAI' : 'Gemini'} +
+
+
+ + {previewingVoice === voice.id && ( +
+
+ +
+
+ )} + + {formData.voice === voice.id && ( +
+
+ +
+
+ )} +
+
+))} +
+
diff --git a/frontend-nextjs/app/components/LandingPage/VoiceSettings.tsx b/frontend-nextjs/app/components/LandingPage/VoiceSettings.tsx index 286500c..c57358b 100644 --- a/frontend-nextjs/app/components/LandingPage/VoiceSettings.tsx +++ b/frontend-nextjs/app/components/LandingPage/VoiceSettings.tsx @@ -2,16 +2,16 @@ import { Volume2 } from "lucide-react"; import { Label } from "@/components/ui/label"; -import { emotionOptions, r2UrlAudio, voices } from "@/lib/data"; +import { emotionOptions, r2UrlAudio, openaiVoices } from "@/lib/data"; import EmojiComponent from "../CreateCharacter/EmojiComponent"; import { useState } from "react"; import { Input } from "@/components/ui/input"; export const VoiceSettings = () => { const [audioElement, setAudioElement] = useState(null); - const [previewingVoice, setPreviewingVoice] = useState(null); + const [previewingVoice, setPreviewingVoice] = useState(null); - const previewVoice = (voiceId: string) => { + const previewVoice = (voiceId: OaiVoice) => { // If the same voice is clicked again while playing, pause it if (previewingVoice === voiceId && audioElement) { audioElement.pause(); @@ -55,18 +55,18 @@ export const VoiceSettings = () => {
- {voices.map((voice) => ( + {openaiVoices.map((voice) => (
{ - previewVoice(voice.id); + previewVoice(voice.id as OaiVoice); }} >
diff --git a/frontend-nextjs/app/components/Playground/ModifyCharacterSheet.tsx b/frontend-nextjs/app/components/Playground/ModifyCharacterSheet.tsx index 0bf3ed7..d5a3545 100644 --- a/frontend-nextjs/app/components/Playground/ModifyCharacterSheet.tsx +++ b/frontend-nextjs/app/components/Playground/ModifyCharacterSheet.tsx @@ -12,6 +12,7 @@ import { useState } from "react"; import { Drawer, DrawerContent, DrawerTrigger } from "@/components/ui/drawer"; import { getPersonalityImageSrc } from "@/lib/utils"; import { EmojiComponent } from "./EmojiImage"; +import { Badge } from "@/components/ui/badge"; interface ModifyCharacterSheetProps { openPersonality: IPersonality; @@ -111,7 +112,12 @@ const ModifyCharacterSheet: React.FC = ({ />
)} -
+
+
+ + {openPersonality.provider} + +

{openPersonality.title} diff --git a/frontend-nextjs/app/components/Realtime/App.tsx b/frontend-nextjs/app/components/Realtime/App.tsx index 02df44b..1cfdf4a 100644 --- a/frontend-nextjs/app/components/Realtime/App.tsx +++ b/frontend-nextjs/app/components/Realtime/App.tsx @@ -380,6 +380,7 @@ function App({ personalityIdState, isDoctor, userId }: AppProps) { sessionStatus={sessionStatus} onToggleConnection={onToggleConnection} isDoctor={isDoctor} + personality={personality} />

void; isDoctor: boolean; + personality: IPersonality; } function BottomToolbar({ sessionStatus, onToggleConnection, isDoctor, + personality, }: BottomToolbarProps) { const isConnected = sessionStatus === "CONNECTED"; const isConnecting = sessionStatus === "CONNECTING"; @@ -36,7 +38,7 @@ function BottomToolbar({ return "Doctor chat"; } - const isDisabled = isConnecting; + const isDisabled = isConnecting || personality.provider === "gemini"; function getConnectionButtonClasses() { const baseClasses = "text-white text-base p-2 w-fit rounded-full shadow-lg flex flex-row items-center justify-center gap-2 px-4"; @@ -72,7 +74,7 @@ function BottomToolbar({ {isDisabled && ( -

Add an API key in Settings to chat with your AI character.

+ {personality.provider === "gemini" ?

Talk to Gemini on your Elato device.

:

Add an API key in Settings to chat with your AI character.

}
)} diff --git a/frontend-nextjs/lib/data.ts b/frontend-nextjs/lib/data.ts index 665fbf0..f8f5931 100644 --- a/frontend-nextjs/lib/data.ts +++ b/frontend-nextjs/lib/data.ts @@ -38,13 +38,32 @@ export const DEVICE_COST = 55; export const ORIGINAL_COST = 111; export const SUBSCRIPTION_COST = 10; -export const voices = [ +export type VoiceType = + | { + provider: "openai"; + id: OaiVoice; + name: string; + description: string; + color: string; + emoji?: string; + } + | { + provider: "gemini"; + id: GeminiVoice; + name: string; + description: string; + color: string; + emoji?: string; + }; + +export const openaiVoices: VoiceType[] = [ { id: "alloy", name: "Alloy", description: "Neutral and balanced", color: "bg-blue-100", emoji: "πŸ§‘", + provider: "openai", }, { id: "echo", @@ -52,6 +71,7 @@ export const voices = [ description: "Warm and melodic", color: "bg-purple-100", emoji: "πŸ‘©β€πŸŽ€", + provider: "openai", }, { id: "shimmer", @@ -59,6 +79,7 @@ export const voices = [ description: "Clear and bright", color: "bg-cyan-100", emoji: "πŸ‘±β€β™€οΈ", + provider: "openai", }, { id: "ash", @@ -66,6 +87,7 @@ export const voices = [ description: "Soft and thoughtful", color: "bg-gray-100", emoji: "πŸ§”", + provider: "openai", }, { id: "ballad", @@ -73,6 +95,7 @@ export const voices = [ description: "Melodic and emotive", color: "bg-indigo-100", emoji: "🎭", + provider: "openai", }, { id: "coral", @@ -80,6 +103,7 @@ export const voices = [ description: "Warm and friendly", color: "bg-orange-100", emoji: "πŸ‘©", + provider: "openai", }, { id: "sage", @@ -87,6 +111,7 @@ export const voices = [ description: "Wise and measured", color: "bg-green-100", emoji: "πŸ§“", + provider: "openai", }, { id: "verse", @@ -94,6 +119,220 @@ export const voices = [ description: "Poetic and expressive", color: "bg-rose-100", emoji: "πŸ‘¨β€πŸŽ¨", + provider: "openai", + }, +]; + +export const geminiVoices: VoiceType[] = [ + { + id: "Zephyr", + name: "Zephyr", + description: "Bright", + color: 
"bg-yellow-100", + provider: "gemini", + }, + { + id: "Puck", + name: "Puck", + description: "Upbeat", + color: "bg-orange-100", + provider: "gemini", + }, + { + id: "Charon", + name: "Charon", + description: "Informative", + color: "bg-blue-100", + provider: "gemini", + }, + { + id: "Kore", + name: "Kore", + description: "Firm", + color: "bg-gray-100", + provider: "gemini", + }, + { + id: "Fenrir", + name: "Fenrir", + description: "Excitable", + color: "bg-red-100", + provider: "gemini", + }, + { + id: "Leda", + name: "Leda", + description: "Youthful", + color: "bg-pink-100", + provider: "gemini", + }, + { + id: "Orus", + name: "Orus", + description: "Firm", + color: "bg-slate-100", + provider: "gemini", + }, + { + id: "Aoede", + name: "Aoede", + description: "Breezy", + color: "bg-sky-100", + provider: "gemini", + }, + { + id: "Callirrhoe", + name: "Callirrhoe", + description: "Easy-going", + color: "bg-green-100", + provider: "gemini", + }, + { + id: "Autonoe", + name: "Autonoe", + description: "Bright", + color: "bg-amber-100", + provider: "gemini", + }, + { + id: "Enceladus", + name: "Enceladus", + description: "Breathy", + color: "bg-cyan-100", + provider: "gemini", + }, + { + id: "Iapetus", + name: "Iapetus", + description: "Clear", + color: "bg-white", + provider: "gemini", + }, + { + id: "Umbriel", + name: "Umbriel", + description: "Easy-going", + color: "bg-emerald-100", + provider: "gemini", + }, + { + id: "Algieba", + name: "Algieba", + description: "Smooth", + color: "bg-violet-100", + provider: "gemini", + }, + { + id: "Despina", + name: "Despina", + description: "Smooth", + color: "bg-purple-100", + provider: "gemini", + }, + { + id: "Erinome", + name: "Erinome", + description: "Clear", + color: "bg-neutral-100", + provider: "gemini", + }, + { + id: "Algenib", + name: "Algenib", + description: "Gravelly", + color: "bg-stone-100", + provider: "gemini", + }, + { + id: "Rasalgethi", + name: "Rasalgethi", + description: "Informative", + color: 
"bg-indigo-100", + provider: "gemini", + }, + { + id: "Laomedeia", + name: "Laomedeia", + description: "Upbeat", + color: "bg-lime-100", + provider: "gemini", + }, + { + id: "Achernar", + name: "Achernar", + description: "Soft", + color: "bg-rose-100", + provider: "gemini", + }, + { + id: "Alnilam", + name: "Alnilam", + description: "Firm", + color: "bg-zinc-100", + provider: "gemini", + }, + { + id: "Schedar", + name: "Schedar", + description: "Even", + color: "bg-teal-100", + provider: "gemini", + }, + { + id: "Gacrux", + name: "Gacrux", + description: "Mature", + color: "bg-brown-100", + provider: "gemini", + }, + { + id: "Pulcherrima", + name: "Pulcherrima", + description: "Forward", + color: "bg-fuchsia-100", + provider: "gemini", + }, + { + id: "Achird", + name: "Achird", + description: "Friendly", + color: "bg-yellow-100", + provider: "gemini", + }, + { + id: "Zubenelgenubi", + name: "Zubenelgenubi", + description: "Casual", + color: "bg-orange-100", + provider: "gemini", + }, + { + id: "Vindemiatrix", + name: "Vindemiatrix", + description: "Gentle", + color: "bg-green-100", + provider: "gemini", + }, + { + id: "Sadachbia", + name: "Sadachbia", + description: "Lively", + color: "bg-red-100", + provider: "gemini", + }, + { + id: "Sadaltager", + name: "Sadaltager", + description: "Knowledgeable", + color: "bg-blue-100", + provider: "gemini", + }, + { + id: "Sulafat", + name: "Sulafat", + description: "Warm", + color: "bg-orange-100", + provider: "gemini", }, ]; diff --git a/frontend-nextjs/tailwind.config.ts b/frontend-nextjs/tailwind.config.ts index 1174b54..fb42b0e 100644 --- a/frontend-nextjs/tailwind.config.ts +++ b/frontend-nextjs/tailwind.config.ts @@ -10,6 +10,33 @@ const config = { "./src/**/*.{ts,tsx}", ], prefix: "", + safelist: [ + // Voice background colors + "bg-blue-100", + "bg-purple-100", + "bg-cyan-100", + "bg-gray-100", + "bg-indigo-100", + "bg-orange-100", + "bg-green-100", + "bg-rose-100", + "bg-yellow-100", + "bg-red-100", + "bg-pink-100", 
+ "bg-slate-100", + "bg-sky-100", + "bg-amber-100", + "bg-white", + "bg-emerald-100", + "bg-violet-100", + "bg-neutral-100", + "bg-stone-100", + "bg-lime-100", + "bg-zinc-100", + "bg-teal-100", + "bg-brown-100", + "bg-fuchsia-100", + ], theme: { container: { center: true, @@ -98,8 +125,10 @@ const config = { "infinite-scroll-inverse 60s linear infinite", }, boxShadow: { - cool: "0 4px 6px rgba(135, 206, 235, 0.2), 0 8px 24px rgba(70, 130, 180, 0.5)", - tron: "0 4px 6px rgba(255, 215, 0, 0.2), 0 8px 24px rgba(218, 165, 32, 0.5)", + cool: + "0 4px 6px rgba(135, 206, 235, 0.2), 0 8px 24px rgba(70, 130, 180, 0.5)", + tron: + "0 4px 6px rgba(255, 215, 0, 0.2), 0 8px 24px rgba(218, 165, 32, 0.5)", custom_focus: "0 0 20px rgba(0, 0, 0, 0.25)", // Custom shadow custom_unfocus: "0 0 8px rgba(0, 0, 0, 0.07)", // Custom shadow }, diff --git a/server-deno/google.ts b/server-deno/google.ts deleted file mode 100644 index 6b378fb..0000000 --- a/server-deno/google.ts +++ /dev/null @@ -1,388 +0,0 @@ -import { Buffer } from "node:buffer"; -import { createServer } from "node:http"; -import { WebSocketServer } from "npm:ws"; -import type { - RawData, - WebSocket as WSWebSocket, - WebSocketServer as _WebSocketServer, -} from "npm:@types/ws"; -import { - GoogleGenAI, - LiveServerMessage, - Modality, - Session, -} from "npm:@google/genai"; -import { authenticateUser } from "./utils.ts"; -import { - getChatHistory, - getSupabaseClient, - updateUserSessionTime, -} from "./supabase.ts"; -import { SupabaseClient } from "@supabase/supabase-js"; -import { Encoder } from "@evan/opus"; - -const isDev = Deno.env.get("DEV_MODE") === "True"; - -// Define your audio parameters -const SAMPLE_RATE = 24000; // For example, 24000 Hz -const CHANNELS = 1; // Mono (set to 2 if you have stereo) -const FRAME_DURATION = 120; // Frame length in ms - -const BYTES_PER_SAMPLE = 2; // 16-bit PCM: 2 bytes per sample -const FRAME_SIZE = (SAMPLE_RATE * FRAME_DURATION / 1000) * CHANNELS * - BYTES_PER_SAMPLE; // 960 
bytes for 24000 Hz mono 16-bit - -// Evan's library doesn’t require you to specify frame size here; -// it will automatically handle the frame size based on your PCM input. -// Create a global encoder instance (reuse this for every audio delta) -const encoder = new Encoder({ - channels: CHANNELS, - sample_rate: SAMPLE_RATE, - application: "voip", -}); - -encoder.expert_frame_duration = FRAME_DURATION; -encoder.bitrate = 12000; - -const server = createServer(); - -const wss: _WebSocketServer = new WebSocketServer({ noServer: true }); - -const supabaseUrl = Deno.env.get("SUPABASE_URL"); -const supabaseKey = Deno.env.get("SUPABASE_KEY"); -const geminiApiKey = Deno.env.get("GEMINI_API_KEY"); - -if (!supabaseUrl || !supabaseKey) { - throw new Error("SUPABASE_URL or SUPABASE_KEY is not set"); -} - -wss.on("connection", async (ws: WSWebSocket, payload: IPayload) => { - const { user, supabase } = payload; - - let connectionPcmFile: Deno.FsFile | null = null; - if (isDev) { - const filename = `debug_audio_${Date.now()}.pcm`; - connectionPcmFile = await Deno.open(filename, { - create: true, - write: true, - append: true, - }); - } - - // Send user details to client - ws.send( - JSON.stringify({ - type: "auth", - volume_control: user.device?.volume ?? 100, - is_ota: user.device?.is_ota ?? false, - is_reset: user.device?.is_reset ?? false, - }), - ); - - const isDoctor = user.user_info.user_type === "doctor"; - const chatHistory = await getChatHistory( - supabase, - user.user_id, - user.personality?.key ?? 
null, - isDoctor, - ); - // const firstMessage = createFirstMessage(chatHistory, payload); - // const systemPrompt = createSystemPrompt(chatHistory, payload); - let sessionStartTime: number; - - console.log(`Connecting with Gemini key "${geminiApiKey.slice(0, 3)}..."`); - - // Initialize Google GenAI - const ai = new GoogleGenAI({ apiKey: geminiApiKey }); - const model = "gemini-2.5-flash-preview-native-audio-dialog"; - const config = { - responseModalities: [Modality.AUDIO], - systemInstruction: "You are a surfer bro talking to Kai Lenny", - }; - - // Response queue for handling Google's callback-based responses - const responseQueue: LiveServerMessage[] = []; - let geminiSession: Session | null = null; - - async function waitMessage() { - let done = false; - let message: LiveServerMessage | undefined = undefined; - while (!done) { - message = responseQueue.shift(); - if (message) { - done = true; - } else { - await new Promise((resolve) => setTimeout(resolve, 10)); - } - } - return message; - } - - async function handleTurn() { - const turns: any[] = []; - let done = false; - while (!done) { - const message = await waitMessage(); - turns.push(message); - // if ( - // message.serverContent && - // message.serverContent.generationComplete - // ) { - - // } - if ( - message.serverContent && - message.serverContent.generationComplete - ) { - ws.send(JSON.stringify({ - type: "server", - msg: "RESPONSE.CREATED", - })); - done = true; - } - } - return turns; - } - - async function processGeminiTurns() { - try { - console.log("Processing Gemini turns"); - while (geminiSession) { - const turns = await handleTurn(); - - console.log("Turns:", turns); - - // Combine all audio data from this turn - const combinedAudio = turns.reduce( - (acc: number[], turn: any) => { - if (turn.data) { - const buffer = Buffer.from(turn.data, "base64"); - const intArray = new Int16Array( - buffer.buffer, - buffer.byteOffset, - buffer.byteLength / - Int16Array.BYTES_PER_ELEMENT, - ); - return 
acc.concat(Array.from(intArray)); - } - return acc; - }, - [], - ); - - if (combinedAudio.length > 0) { - console.log( - "Received complete audio turn, length:", - combinedAudio.length, - ); - - // Convert back to buffer and send to client - const audioBuffer = new Int16Array(combinedAudio); - const buffer = Buffer.from(audioBuffer.buffer); - - // PREVIEW AUDIO - // const wf = new WaveFile(); - // wf.fromScratch(1, SAMPLE_RATE, "16", audioBuffer); - - // const filename = `gemini_response_${Date.now()}.wav`; - // await Deno.writeFile(filename, wf.toBuffer()); - // console.log(`Audio saved as ${filename}`); - - // Send audio in chunks to client - for ( - let offset = 0; - offset < buffer.length; - offset += FRAME_SIZE - ) { - const frame = buffer.subarray( - offset, - offset + FRAME_SIZE, - ); - try { - const encodedPacket = encoder.encode(frame); - ws.send(encodedPacket); - } catch (_e) { - // Skip this frame but continue with others - } - } - } - - // // Handle text responses if any - // for (const turn of turns) { - // if (turn.text) { - // console.log("Received text:", turn.text); - // addConversation(supabase, "assistant", turn.text, user); - // } - // } - - // Send completion signal - ws.send(JSON.stringify({ - type: "server", - msg: "RESPONSE.COMPLETE", - })); - } - } catch (error) { - console.error("Error processing Gemini turns:", error); - } - } - - // Connect to Google Gemini Live - try { - geminiSession = await ai.live.connect({ - model: model, - callbacks: { - onopen: function () { - console.log("Gemini session opened"); - sessionStartTime = Date.now(); - }, - onmessage: function (message: LiveServerMessage) { - console.log("Received message:", message); - responseQueue.push(message); - }, - onerror: function (e: any) { - console.error("Gemini error:", e.message); - ws.send( - JSON.stringify({ - type: "server", - msg: "RESPONSE.ERROR", - }), - ); - }, - onclose: function (e: any) { - console.log("Gemini session closed:", e.reason); - }, - }, - config: 
config, - }); - - console.log("Connected to Gemini successfully!"); - // Send first message if available - const inputTurns = [{ - role: "user", - parts: [{ text: "Hello how are you?" }], - }]; - geminiSession?.sendClientContent({ turns: inputTurns }); - processGeminiTurns(); - } catch (e: unknown) { - console.log(`Error connecting to Gemini: ${e}`); - ws.close(); - return; - } - - ws.on("message", (data: any, isBinary: boolean) => { - try { - if (isBinary) { - // Handle binary audio data from ESP32 - const base64Data = data.toString("base64"); - - if (isDev && connectionPcmFile) { - connectionPcmFile.write(data); - } - - // Send audio to Gemini - geminiSession?.sendRealtimeInput({ - audio: { - data: base64Data, - mimeType: "audio/pcm;rate=24000", // Gemini expects 16kHz but 24kHz is fine - }, - }); - } else { - // Handle text/JSON messages - const message = JSON.parse(data.toString("utf-8")); - - if ( - message.type === "instruction" && - message.msg === "end_of_speech" - ) { - console.log("end_of_speech detected"); - // Gemini handles turn detection automatically, but we can send a signal - ws.send( - JSON.stringify({ - type: "server", - msg: "AUDIO.COMMITTED", - }), - ); - } - - if ( - message.type === "instruction" && - message.msg === "INTERRUPT" - ) { - console.log("interrupt detected"); - // For Gemini, we might need to close and reopen the session or handle differently - // This depends on Gemini's interrupt capabilities - } - } - } catch (e: unknown) { - console.error("Error handling message:", (e as Error).message); - } - }); - - ws.on("error", (error: any) => { - console.error("WebSocket error:", error); - geminiSession?.close(); - }); - - ws.on("close", async (code: number, reason: string) => { - console.log(`WebSocket closed with code ${code}, reason: ${reason}`); - if (sessionStartTime) { - const sessionDuration = Math.floor( - (Date.now() - sessionStartTime) / 1000, - ); - await updateUserSessionTime(supabase, user, sessionDuration); - } - 
geminiSession?.close(); - if (isDev && connectionPcmFile) { - connectionPcmFile.close(); - console.log("Closed debug audio file."); - } - }); -}); - -server.on("upgrade", async (req, socket, head) => { - console.log("upgrade"); - let user: IUser; - let supabase: SupabaseClient; - let authToken: string; - try { - const { authorization: authHeader, "x-wifi-rssi": rssi } = req.headers; - authToken = authHeader?.replace("Bearer ", "") ?? ""; - const wifiStrength = parseInt(rssi as string); // Convert to number - - // You can now use wifiStrength in your code - console.log("WiFi RSSI:", wifiStrength); // Will log something like -50 - - // Remove debug logging - if (!authToken) { - socket.write("HTTP/1.1 401 Unauthorized\r\n\r\n"); - socket.destroy(); - return; - } - - supabase = getSupabaseClient(authToken as string); - user = await authenticateUser(supabase, authToken as string); - console.log(user.email); - } catch (_e: any) { - socket.write("HTTP/1.1 401 Unauthorized\r\n\r\n"); - socket.destroy(); - return; - } - - wss.handleUpgrade(req, socket, head, (ws) => { - wss.emit("connection", ws, { - user, - supabase, - timestamp: new Date().toISOString(), - }); - }); -}); - -if (isDev) { // deno run -A --env-file=.env main.ts - const HOST = Deno.env.get("HOST") || "0.0.0.0"; - const PORT = Deno.env.get("PORT") || "8000"; - server.listen(Number(PORT), HOST, () => { - console.log(`Audio capture server running on ws://${HOST}:${PORT}`); - }); -} else { - server.listen(8080); -} diff --git a/server-deno/main.ts b/server-deno/main.ts index d1c350b..4866ef2 100644 --- a/server-deno/main.ts +++ b/server-deno/main.ts @@ -5,7 +5,12 @@ import type { WebSocketServer as _WebSocketServer, } from "npm:@types/ws"; import { authenticateUser } from "./utils.ts"; -import { getSupabaseClient } from "./supabase.ts"; +import { + createFirstMessage, + createSystemPrompt, + getChatHistory, + getSupabaseClient, +} from "./supabase.ts"; import { SupabaseClient } from "@supabase/supabase-js"; 
import { isDev } from "./utils.ts"; import { connectToOpenAI } from "./models/openai.ts"; @@ -16,8 +21,50 @@ const server = createServer(); const wss: _WebSocketServer = new WebSocketServer({ noServer: true }); wss.on("connection", async (ws: WSWebSocket, payload: IPayload) => { - // await connectToOpenAI(ws, payload); - await connectToGemini(ws, payload); + const { user, supabase } = payload; + + let connectionPcmFile: Deno.FsFile | null = null; + if (isDev) { + const filename = `debug_audio_${Date.now()}.pcm`; + connectionPcmFile = await Deno.open(filename, { + create: true, + write: true, + append: true, + }); + } + + const chatHistory = await getChatHistory( + supabase, + user.user_id, + user.personality?.key ?? null, + false, + ); + const firstMessage = createFirstMessage(payload); + const systemPrompt = createSystemPrompt(chatHistory, payload); + + const provider = user.personality?.provider; + switch (provider) { + case "openai": + await connectToOpenAI( + ws, + payload, + connectionPcmFile, + firstMessage, + systemPrompt, + ); + break; + case "gemini": + await connectToGemini( + ws, + payload, + connectionPcmFile, + firstMessage, + systemPrompt, + ); + break; + default: + throw new Error(`Unknown provider: ${provider}`); + } }); server.on("upgrade", async (req, socket, head) => { diff --git a/server-deno/models/gemini.ts b/server-deno/models/gemini.ts index 6f28cc8..d0d5ac1 100644 --- a/server-deno/models/gemini.ts +++ b/server-deno/models/gemini.ts @@ -1,73 +1,71 @@ import { Buffer } from "node:buffer"; import type { WebSocketServer as _WebSocketServer } from "npm:@types/ws"; import { + EndSensitivity, GoogleGenAI, LiveConnectConfig, LiveServerMessage, Modality, Session, } from "npm:@google/genai"; -import { getChatHistory, updateUserSessionTime } from "../supabase.ts"; -import { - encoder, - FRAME_SIZE, - geminiApiKey, - isDev, - SAMPLE_RATE, -} from "../utils.ts"; -import pkg from "npm:wavefile"; -const { WaveFile } = pkg; +import { encoder, FRAME_SIZE, 
geminiApiKey, isDev } from "../utils.ts"; +import { addConversation } from "../supabase.ts"; -export const connectToGemini = async (ws: WebSocket, payload: IPayload) => { +export const connectToGemini = async ( + ws: WebSocket, + payload: IPayload, + connectionPcmFile: Deno.FsFile | null, + firstMessage: string, + systemPrompt: string, +) => { const { user, supabase } = payload; + const { oai_voice, pitch_factor } = user.personality ?? { + oai_voice: "Sadachbia", + provider: "gemini", + pitch_factor: 1, + }; - let connectionPcmFile: Deno.FsFile | null = null; - if (isDev) { - const filename = `debug_audio_${Date.now()}.pcm`; - connectionPcmFile = await Deno.open(filename, { - create: true, - write: true, - append: true, - }); - } + const { is_ota, is_reset, volume } = user.device ?? { + is_ota: false, + is_reset: false, + volume: 10, + }; // Send user details to client ws.send( JSON.stringify({ type: "auth", - volume_control: user.device?.volume ?? 100, - is_ota: user.device?.is_ota ?? false, - is_reset: user.device?.is_reset ?? false, + volume_control: volume, + is_ota: is_ota, + is_reset: is_reset, + pitch_factor: pitch_factor, }), ); - const chatHistory = await getChatHistory( - supabase, - user.user_id, - user.personality?.key ?? 
null, - false, - ); - // const firstMessage = createFirstMessage(chatHistory, payload); - // const systemPrompt = createSystemPrompt(chatHistory, payload); - let sessionStartTime: number; - console.log(`Connecting with Gemini key "${geminiApiKey.slice(0, 3)}..."`); // Initialize Google GenAI const ai = new GoogleGenAI({ apiKey: geminiApiKey }); - const model = "gemini-2.0-flash-live-001"; + const model = "gemini-2.5-flash-preview-native-audio-dialog"; const config: LiveConnectConfig = { responseModalities: [Modality.AUDIO], - systemInstruction: "You are a surfer bro talking to Kai Lenny", - // generationConfig: { - // speechConfig: { - // voiceConfig: { - // prebuiltVoiceConfig: { - // voiceName: "Zephyr", - // }, - // }, - // }, - // }, + systemInstruction: systemPrompt, + speechConfig: { + voiceConfig: { + prebuiltVoiceConfig: { + voiceName: oai_voice, + }, + }, + }, + realtimeInputConfig: { + automaticActivityDetection: { + disabled: false, // Keep VAD enabled + endOfSpeechSensitivity: EndSensitivity.END_SENSITIVITY_HIGH, // How sensitive to detect speech ending + silenceDurationMs: 100, // How much silence before considering speech ended + }, + }, + outputAudioTranscription: {}, + inputAudioTranscription: {}, }; // Response queue for handling Google's callback-based responses @@ -94,21 +92,26 @@ export const connectToGemini = async (ws: WebSocket, payload: IPayload) => { while (!done) { const message = await waitMessage(); turns.push(message); - // if ( - // message.serverContent && - // message.serverContent.generationComplete - // ) { - // } if ( - message.serverContent && - message.serverContent.generationComplete + message.serverContent ) { - ws.send(JSON.stringify({ - type: "server", - msg: "RESPONSE.CREATED", - })); - done = true; + if (message.serverContent.generationComplete) { + ws.send(JSON.stringify({ + type: "server", + msg: "RESPONSE.CREATED", + })); + done = true; + } + + if (message.serverContent.turnComplete) { + ws.send( + JSON.stringify({ + 
type: "server", + msg: "AUDIO.COMMITTED", + }), + ); + } } } return turns; @@ -120,8 +123,6 @@ export const connectToGemini = async (ws: WebSocket, payload: IPayload) => { while (geminiSession) { const turns = await handleTurn(); - console.log("Turns:", turns); - // Combine all audio data from this turn const combinedAudio = turns.reduce( (acc: number[], turn: any) => { @@ -141,11 +142,6 @@ export const connectToGemini = async (ws: WebSocket, payload: IPayload) => { ); if (combinedAudio.length > 0) { - console.log( - "Received complete audio turn, length:", - combinedAudio.length, - ); - // Convert back to buffer and send to client const audioBuffer = new Int16Array(combinedAudio); const buffer = Buffer.from(audioBuffer.buffer); @@ -179,18 +175,47 @@ export const connectToGemini = async (ws: WebSocket, payload: IPayload) => { } // // Handle text responses if any - // for (const turn of turns) { - // if (turn.text) { - // console.log("Received text:", turn.text); - // addConversation(supabase, "assistant", turn.text, user); - // } - // } + let outputTranscriptionText = ""; + let inputTranscriptionText = ""; + for (const turn of turns as LiveServerMessage[]) { + if ( + turn.serverContent && + turn.serverContent.outputTranscription + ) { + outputTranscriptionText += + turn.serverContent.outputTranscription.text; + } + + if ( + turn.serverContent && + turn.serverContent.inputTranscription + ) { + inputTranscriptionText += + turn.serverContent.inputTranscription.text; + } + } // Send completion signal ws.send(JSON.stringify({ type: "server", msg: "RESPONSE.COMPLETE", })); + + // Add user transcription to supabase + await addConversation( + supabase, + "user", + inputTranscriptionText, + user, + ); + + // Add assistant transcription to supabase + await addConversation( + supabase, + "assistant", + outputTranscriptionText, + user, + ); } } catch (error) { console.error("Error processing Gemini turns:", error); @@ -204,10 +229,8 @@ export const connectToGemini = async (ws: 
WebSocket, payload: IPayload) => { callbacks: { onopen: function () { console.log("Gemini session opened"); - sessionStartTime = Date.now(); }, onmessage: function (message: LiveServerMessage) { - console.log("Received message:", message); responseQueue.push(message); }, onerror: function (e: any) { @@ -230,7 +253,7 @@ export const connectToGemini = async (ws: WebSocket, payload: IPayload) => { // Send first message if available const inputTurns = [{ role: "user", - parts: [{ text: "Hello how are you?" }], + parts: [{ text: firstMessage }], }]; geminiSession?.sendClientContent({ turns: inputTurns }); processGeminiTurns(); @@ -257,32 +280,6 @@ export const connectToGemini = async (ws: WebSocket, payload: IPayload) => { mimeType: "audio/pcm;rate=24000", // Gemini expects 16kHz but 24kHz is fine }, }); - } else { - // Handle text/JSON messages - const message = JSON.parse(data.toString("utf-8")); - - if ( - message.type === "instruction" && - message.msg === "end_of_speech" - ) { - console.log("end_of_speech detected"); - // Gemini handles turn detection automatically, but we can send a signal - ws.send( - JSON.stringify({ - type: "server", - msg: "AUDIO.COMMITTED", - }), - ); - } - - if ( - message.type === "instruction" && - message.msg === "INTERRUPT" - ) { - console.log("interrupt detected"); - // For Gemini, we might need to close and reopen the session or handle differently - // This depends on Gemini's interrupt capabilities - } } } catch (e: unknown) { console.error("Error handling message:", (e as Error).message); @@ -296,12 +293,6 @@ export const connectToGemini = async (ws: WebSocket, payload: IPayload) => { ws.on("close", async (code: number, reason: string) => { console.log(`WebSocket closed with code ${code}, reason: ${reason}`); - if (sessionStartTime) { - const sessionDuration = Math.floor( - (Date.now() - sessionStartTime) / 1000, - ); - await updateUserSessionTime(supabase, user, sessionDuration); - } geminiSession?.close(); if (isDev && 
connectionPcmFile) { connectionPcmFile.close(); diff --git a/server-deno/models/openai.ts b/server-deno/models/openai.ts index d3b24dd..d7be29d 100644 --- a/server-deno/models/openai.ts +++ b/server-deno/models/openai.ts @@ -7,14 +7,7 @@ import type { import { RealtimeClient } from "../realtime/client.js"; import { RealtimeUtils } from "../realtime/utils.js"; -import { - addConversation, - createFirstMessage, - createSystemPrompt, - getChatHistory, - getDeviceInfo, - updateUserSessionTime, -} from "../supabase.ts"; +import { addConversation, getDeviceInfo } from "../supabase.ts"; import { encoder, FRAME_SIZE, isDev, openaiApiKey } from "../utils.ts"; const sendFirstMessage = (client: RealtimeClient, firstMessage: string) => { @@ -39,18 +32,15 @@ const sendFirstMessage = (client: RealtimeClient, firstMessage: string) => { }); }; -export const connectToOpenAI = async (ws: WebSocket, payload: IPayload) => { +export const connectToOpenAI = async ( + ws: WebSocket, + payload: IPayload, + connectionPcmFile: Deno.FsFile | null, + firstMessage: string, + systemPrompt: string, +) => { const { user, supabase } = payload; - let connectionPcmFile: Deno.FsFile | null = null; - if (isDev) { - const filename = `debug_audio_${Date.now()}.pcm`; - connectionPcmFile = await Deno.open(filename, { - create: true, - write: true, - append: true, - }); - } // send user details to client // when DEV_MODE is true, we send the default values 100, false, false ws.send( @@ -63,18 +53,6 @@ export const connectToOpenAI = async (ws: WebSocket, payload: IPayload) => { }), ); - const isDoctor = user.user_info.user_type === "doctor"; - - const chatHistory = await getChatHistory( - supabase, - user.user_id, - user.personality?.key ?? 
null, - isDoctor, - ); - const firstMessage = createFirstMessage(chatHistory, payload); - console.log("firstMessage", firstMessage); - const systemPrompt = createSystemPrompt(chatHistory, payload); - let sessionStartTime: number; let currentItemId: string | null = null; let currentCallId: string | null = null; @@ -135,7 +113,6 @@ export const connectToOpenAI = async (ws: WebSocket, payload: IPayload) => { // Check if the event is session.created if (event.type === "session.created") { console.log("session created", event); - sessionStartTime = Date.now(); sendFirstMessage(client, firstMessage); } else if (event.type === "session.updated") { console.log("session updated", event); @@ -361,12 +338,6 @@ export const connectToOpenAI = async (ws: WebSocket, payload: IPayload) => { // Add more detailed close handling ws.on("close", async (code: number, reason: string) => { console.log(`WebSocket closed with code ${code}, reason: ${reason}`); - if (sessionStartTime) { - const sessionDuration = Math.floor( - (Date.now() - sessionStartTime) / 1000, - ); - await updateUserSessionTime(supabase, user, sessionDuration); - } client.disconnect(); if (isDev) { if (connectionPcmFile) { diff --git a/server-deno/supabase.ts b/server-deno/supabase.ts index 1897f2c..751de52 100644 --- a/server-deno/supabase.ts +++ b/server-deno/supabase.ts @@ -157,7 +157,6 @@ ${chatHistory} `; export const createFirstMessage = ( - chatHistory: IConversation[], payload: IPayload, ): string => { const { timestamp, user } = payload; @@ -218,21 +217,6 @@ export const addConversation = async ( } }; -export const updateUserSessionTime = async ( - supabase: SupabaseClient, - user: IUser, - sessionTime: number, -): Promise => { - const { error } = await supabase - .from("users") - .update({ - session_time: user.session_time + sessionTime, - }) - .eq("user_id", user.user_id); - - if (error) throw error; -}; - /** * Get the OpenAI API Key for the user * @param supabase - The Supabase client diff --git 
a/server-deno/types.d.ts b/server-deno/types.d.ts index 392de57..d30c94d 100644 --- a/server-deno/types.d.ts +++ b/server-deno/types.d.ts @@ -26,21 +26,58 @@ declare global { user_code: string; } + type ModelProvider = "openai" | "gemini"; + + type GeminiVoice = + | "Zephyr" + | "Puck" + | "Charon" + | "Kore" + | "Fenrir" + | "Leda" + | "Orus" + | "Aoede" + | "Callirrhoe" + | "Autonoe" + | "Enceladus" + | "Iapetus" + | "Umbriel" + | "Algieba" + | "Despina" + | "Erinome" + | "Algenib" + | "Rasalgethi" + | "Laomedeia" + | "Achernar" + | "Alnilam" + | "Schedar" + | "Gacrux" + | "Pulcherrima" + | "Achird" + | "Zubenelgenubi" + | "Vindemiatrix" + | "Sadachbia" + | "Sadaltager" + | "Sulafat"; + + type OaiVoice = + | "ash" + | "alloy" + | "echo" + | "shimmer" + | "ballad" + | "coral" + | "sage" + | "verse"; + interface IPersonality { personality_id: string; is_doctor: boolean; is_child_voice: boolean; is_story: boolean; key: string; - oai_voice: - | "ash" - | "alloy" - | "echo" - | "shimmer" - | "ballad" - | "coral" - | "sage" - | "verse"; + oai_voice: OaiVoice | GeminiVoice; + provider: ModelProvider; voice_description: string; title: string; subtitle: string; diff --git a/supabase/migrations/20250611011151_add_provider.sql b/supabase/migrations/20250611011151_add_provider.sql new file mode 100644 index 0000000..d3f8378 --- /dev/null +++ b/supabase/migrations/20250611011151_add_provider.sql @@ -0,0 +1,12 @@ +-- Add provider column to personalities table +ALTER TABLE personalities +ADD COLUMN provider TEXT CHECK (provider IN ('openai', 'gemini')) DEFAULT 'openai'; + +-- Update existing records to have a default provider +UPDATE personalities +SET provider = 'openai' +WHERE provider IS NULL; + +-- Make the column NOT NULL after setting defaults +ALTER TABLE personalities +ALTER COLUMN provider SET NOT NULL; \ No newline at end of file