This commit is contained in:
akdeb 2025-06-11 14:57:14 +01:00
parent 852a5e0ebf
commit 1de375492c
19 changed files with 594 additions and 626 deletions

View file

@ -2,7 +2,7 @@
# 🚀 ElatoAI: Realtime Speech AI Agents for ESP32
Realtime AI Speech powered by OpenAI Realtime API, ESP32, Secure WebSockets, and Deno Edge Functions for >15-minute uninterrupted global conversations
Realtime AI Speech powered by **OpenAI Realtime API** and **Gemini Live API**, ESP32, Secure WebSockets, and Deno Edge Functions for >15-minute uninterrupted global conversations
<div align="center">
@ -29,7 +29,17 @@ Realtime AI Speech powered by OpenAI Realtime API, ESP32, Secure WebSockets, and
</div>
## ⚡️ DIY Hardware Design
## ⚡️ Realtime AI Speech Models on an ESP32
<div align="center" class="flex flex-row gap-4">
<img src="assets/openai.png" alt="OpenAI Realtime API" width="45%">
<img src="assets/gemini.png" alt="Gemini Live API" width="45%">
</div>
## 👷‍♀️ DIY Hardware Design
<img src="assets/pcb-design.png" alt="Hardware Setup" width="100%">
@ -102,6 +112,7 @@ cp .env.example .env
# In .env, set your environment variables
# SUPABASE_KEY=<your-supabase-anon-key>
# OPENAI_API_KEY=<your-openai-api-key>
# GEMINI_API_KEY=<your-gemini-api-key>
# Run the server at port 8000
deno run -A --env-file=.env main.ts
@ -143,13 +154,13 @@ Once your Wifi credentials are configured, turn the device off and on again and
ElatoAI consists of three main components:
1. **Frontend Client** (`Next.js` hosted on Vercel) - to create and talk to your AI agents and 'send' it to your ESP32 device
2. **Edge Server Functions** (`Deno` running on Deno/Supabase Edge) - to handle the websocket connections from the ESP32 device and the OpenAI API calls
3. **ESP32 IoT Client** (`PlatformIO/Arduino`) - to receive the websocket connections from the Edge Server Functions and send audio to the OpenAI API via the Deno edge server.
2. **Edge Server Functions** (`Deno` running on Deno/Supabase Edge) - to handle the websocket connections from the ESP32 device and the OpenAI and Gemini API calls
3. **ESP32 IoT Client** (`PlatformIO/Arduino`) - to receive the websocket connections from the Edge Server Functions and send audio to the OpenAI and Gemini APIs via the Deno edge server.
## 🌟 Features
1. **Realtime Speech-to-Speech**: Instant speech conversion powered by OpenAI's Realtime APIs.
1. **Realtime Speech-to-Speech**: Instant speech conversion powered by OpenAI's Realtime API and Gemini's Live API.
2. **Create Custom AI Agents**: Create custom agents with different personalities and voices.
3. **Customizable Voices**: Choose from a variety of voices and personalities.
4. **Secure WebSockets**: Reliable, encrypted WebSocket communication.
@ -200,7 +211,9 @@ flowchart TD
UserInput --> ESP32
ESP32[ESP32 Device] -->|WebSocket| Edge[Deno Edge Function]
Edge -->|OpenAI API| OpenAI[OpenAI Realtime API]
Edge -->|Gemini API| Gemini[Gemini Live API]
OpenAI --> Edge
Gemini --> Edge
Edge -->|WebSocket| ESP32
ESP32 --> UserOutput
```

BIN
assets/gemini.png Normal file

Binary file not shown.

After

Width:  |  Height:  |  Size: 112 KiB

BIN
assets/openai.png Normal file

Binary file not shown.

After

Width:  |  Height:  |  Size: 24 KiB

Binary file not shown.

Before

Width:  |  Height:  |  Size: 849 KiB

After

Width:  |  Height:  |  Size: 861 KiB

View file

@ -167,6 +167,7 @@ export async function GET(request: NextRequest) {
}),
},
);
console.log(response);
const data = await response.json();
return NextResponse.json(data);
} catch (error) {

View file

@ -14,7 +14,7 @@ import { v4 as uuidv4 } from 'uuid';
import { toast } from "@/components/ui/use-toast";
import { useRouter } from "next/navigation";
import { z } from "zod";
import { emotionOptions, r2UrlAudio, voices } from "@/lib/data";
import { emotionOptions, geminiVoices, openaiVoices, r2UrlAudio, VoiceType } from "@/lib/data";
import EmojiComponent from "./EmojiComponent";
import { PitchFactors } from "@/lib/utils";
import { Slider } from "@/components/ui/slider";
@ -166,7 +166,7 @@ const SettingsDashboard: React.FC<SettingsDashboardProps> = ({
try {
const personality = await createPersonality(supabase, selectedUser.user_id, {
provider: formData.provider,
provider: formData.provider as ModelProvider,
title: formData.title,
subtitle: "",
character_prompt: formData.prompt,
@ -206,15 +206,18 @@ const SettingsDashboard: React.FC<SettingsDashboardProps> = ({
const [audioElement, setAudioElement] = useState<HTMLAudioElement | null>(null);
const previewVoice = (voiceId: string) => {
const previewVoice = (voice: VoiceType) => {
const { id, provider } = voice;
if (provider === 'openai') {
// Stop any currently playing preview
if (audioElement) {
audioElement.pause();
audioElement.currentTime = 0;
}
const audioSampleUrl = `${r2UrlAudio}/${voiceId}.wav`;
setPreviewingVoice(voiceId);
const audioSampleUrl = `${r2UrlAudio}/${id}.wav`;
setPreviewingVoice(id);
// Create and play audio element
const audio = new Audio(audioSampleUrl);
@ -233,11 +236,12 @@ const SettingsDashboard: React.FC<SettingsDashboardProps> = ({
// Fallback in case audio doesn't trigger onended
setTimeout(() => {
if (previewingVoice === voiceId) {
if (previewingVoice === id) {
setPreviewingVoice(null);
}
}, 10000); // 10 second fallback
};
}
}
const Heading = () => {
return (
@ -260,51 +264,70 @@ const SettingsDashboard: React.FC<SettingsDashboardProps> = ({
{currentStep === 'personality' ?
<div className="space-y-4">
{/* Voice Picker */}
<div className="space-y-2">
<Label htmlFor="voice">Pick a voice</Label>
<p className="text-sm text-gray-500">
Click a voice to preview how it sounds.
</p>
<div className="grid grid-cols-3 gap-3">
{voices.map((voice) => (
<div
key={voice.id}
className={`
relative rounded-lg border p-3 transition-all
${formData.voice === voice.id
? 'border-2 border-blue-500 shadow-sm ' + voice.color
: 'border-gray-200 hover:border-gray-300 cursor-pointer'
}
`}
onClick={() => {
handleInputChange('voice', voice.id);
previewVoice(voice.id);
}}
>
<div className="flex flex-col">
<div className="flex flex-col sm:flex-row items-center sm:items-start gap-3">
<div className="text-2xl mt-0.5">
<EmojiComponent emoji={voice.emoji} />
</div>
<div className="flex flex-col text-center sm:text-left">
<span className="font-medium">{voice.name}</span>
<span className="text-xs text-gray-600">{voice.description}</span>
</div>
</div>
{previewingVoice === voice.id && (
<div className="absolute top-2 right-2">
<div className="animate-pulse text-blue-500">
<Volume2 size={20} />
</div>
</div>
)}
</div>
</div>
))}
</div>
</div>
<div className="space-y-4">
<Label htmlFor="voice">Pick a voice</Label>
<p className="text-sm text-gray-500">
Click a voice to preview how it sounds.
</p>
<div className="grid grid-cols-3 gap-3 px-2">
{[...openaiVoices, ...geminiVoices].map((voice: VoiceType) => (
<div
key={voice.id}
className={`
relative rounded-xl border-2 p-4 transition-all cursor-pointer hover:scale-[1.02] hover:shadow-lg
${formData.voice === voice.id
? `border-blue-500 shadow-lg ${voice.color} ring-2 ring-blue-200`
: `border-gray-200 hover:border-gray-300 ${voice.color} hover:shadow-md`
}
`}
onClick={() => {
setFormData(prev => ({
...prev,
provider: voice.provider as ModelProvider,
voice: voice.id
}));
previewVoice(voice);
}}
>
<div className="flex flex-col">
<div className="flex flex-col items-center gap-3">
<div className="text-3xl">
<EmojiComponent emoji={voice.emoji} />
</div>
<div className="flex flex-col text-center">
<span className="font-semibold text-gray-900">{voice.name}</span>
<span className="text-xs text-gray-600 mt-1">{voice.description}</span>
<div className={`inline-flex items-center justify-center px-2 py-1 rounded-full text-xs font-medium mt-2 ${
voice.provider === 'openai'
? 'bg-emerald-500 text-white'
: 'bg-purple-500 text-white'
}`}>
{voice.provider === 'openai' ? 'OpenAI' : 'Gemini'}
</div>
</div>
</div>
{previewingVoice === voice.id && (
<div className="absolute top-3 right-3">
<div className="animate-pulse text-blue-600 bg-white rounded-full p-2 shadow-lg">
<Volume2 size={16} />
</div>
</div>
)}
{formData.voice === voice.id && (
<div className="absolute -top-2 -right-2">
<div className="bg-blue-500 text-white rounded-full p-1.5 shadow-lg">
<Check size={12} />
</div>
</div>
)}
</div>
</div>
))}
</div>
</div>
<div className="space-y-2">
<Label htmlFor="title">Title</Label>

View file

@ -2,16 +2,16 @@
import { Volume2 } from "lucide-react";
import { Label } from "@/components/ui/label";
import { emotionOptions, r2UrlAudio, voices } from "@/lib/data";
import { emotionOptions, r2UrlAudio, openaiVoices } from "@/lib/data";
import EmojiComponent from "../CreateCharacter/EmojiComponent";
import { useState } from "react";
import { Input } from "@/components/ui/input";
export const VoiceSettings = () => {
const [audioElement, setAudioElement] = useState<HTMLAudioElement | null>(null);
const [previewingVoice, setPreviewingVoice] = useState<string | null>(null);
const [previewingVoice, setPreviewingVoice] = useState<OaiVoice | null>(null);
const previewVoice = (voiceId: string) => {
const previewVoice = (voiceId: OaiVoice) => {
// If the same voice is clicked again while playing, pause it
if (previewingVoice === voiceId && audioElement) {
audioElement.pause();
@ -55,18 +55,18 @@ export const VoiceSettings = () => {
<div className="space-y-2">
<Label htmlFor="voice">Pick a voice</Label>
<div className="grid grid-cols-2 gap-3">
{voices.map((voice) => (
{openaiVoices.map((voice) => (
<div
key={voice.id}
className={`
rounded-lg border p-3 transition-all relative
${"" === voice.id
${voice.id === previewingVoice
? 'border-2 border-blue-500 shadow-sm ' + voice.color
: 'border-gray-200 hover:border-gray-300 cursor-pointer'
}
`}
onClick={() => {
previewVoice(voice.id);
previewVoice(voice.id as OaiVoice);
}}
>
<div className="flex flex-col">

View file

@ -12,6 +12,7 @@ import { useState } from "react";
import { Drawer, DrawerContent, DrawerTrigger } from "@/components/ui/drawer";
import { getPersonalityImageSrc } from "@/lib/utils";
import { EmojiComponent } from "./EmojiImage";
import { Badge } from "@/components/ui/badge";
interface ModifyCharacterSheetProps {
openPersonality: IPersonality;
@ -111,7 +112,12 @@ const ModifyCharacterSheet: React.FC<ModifyCharacterSheetProps> = ({
/>
</div>
)}
<div className="space-y-2 text-left w-full">
<div className="space-y-2 text-left w-full relative">
<div className="absolute top-0 right-0">
<Badge variant="outline">
{openPersonality.provider}
</Badge>
</div>
<div className="flex flex-row items-center gap-2">
<h3 className="text-xl font-semibold">
{openPersonality.title}

View file

@ -380,6 +380,7 @@ function App({ personalityIdState, isDoctor, userId }: AppProps) {
sessionStatus={sessionStatus}
onToggleConnection={onToggleConnection}
isDoctor={isDoctor}
personality={personality}
/>
</div>
<SheetContent

View file

@ -8,12 +8,14 @@ interface BottomToolbarProps {
sessionStatus: SessionStatus;
onToggleConnection: () => void;
isDoctor: boolean;
personality: IPersonality;
}
function BottomToolbar({
sessionStatus,
onToggleConnection,
isDoctor,
personality,
}: BottomToolbarProps) {
const isConnected = sessionStatus === "CONNECTED";
const isConnecting = sessionStatus === "CONNECTING";
@ -36,7 +38,7 @@ function BottomToolbar({
return "Doctor chat";
}
const isDisabled = isConnecting;
const isDisabled = isConnecting || personality.provider === "gemini";
function getConnectionButtonClasses() {
const baseClasses = "text-white text-base p-2 w-fit rounded-full shadow-lg flex flex-row items-center justify-center gap-2 px-4";
@ -72,7 +74,7 @@ function BottomToolbar({
</TooltipTrigger>
{isDisabled && (
<TooltipContent>
<p>Add an API key in Settings to chat with your AI character.</p>
{personality.provider === "gemini" ? <p>Talk to Gemini on your Elato device.</p> : <p>Add an API key in Settings to chat with your AI character.</p>}
</TooltipContent>
)}
</Tooltip>

View file

@ -38,13 +38,32 @@ export const DEVICE_COST = 55;
export const ORIGINAL_COST = 111;
export const SUBSCRIPTION_COST = 10;
export const voices = [
/**
 * Shape shared by every selectable voice. `TProvider` is the discriminant
 * and `TId` is that provider's own voice-identifier type.
 */
type ProviderVoice<TProvider extends string, TId> = {
    provider: TProvider;
    id: TId;
    name: string;
    description: string;
    color: string;
    emoji?: string;
};

/**
 * A voice offered in the picker, discriminated on `provider` so that `id`
 * narrows to the matching provider's voice identifiers.
 */
export type VoiceType =
    | ProviderVoice<"openai", OaiVoice>
    | ProviderVoice<"gemini", GeminiVoice>;
export const openaiVoices: VoiceType[] = [
{
id: "alloy",
name: "Alloy",
description: "Neutral and balanced",
color: "bg-blue-100",
emoji: "🧑",
provider: "openai",
},
{
id: "echo",
@ -52,6 +71,7 @@ export const voices = [
description: "Warm and melodic",
color: "bg-purple-100",
emoji: "👩‍🎤",
provider: "openai",
},
{
id: "shimmer",
@ -59,6 +79,7 @@ export const voices = [
description: "Clear and bright",
color: "bg-cyan-100",
emoji: "👱‍♀️",
provider: "openai",
},
{
id: "ash",
@ -66,6 +87,7 @@ export const voices = [
description: "Soft and thoughtful",
color: "bg-gray-100",
emoji: "🧔",
provider: "openai",
},
{
id: "ballad",
@ -73,6 +95,7 @@ export const voices = [
description: "Melodic and emotive",
color: "bg-indigo-100",
emoji: "🎭",
provider: "openai",
},
{
id: "coral",
@ -80,6 +103,7 @@ export const voices = [
description: "Warm and friendly",
color: "bg-orange-100",
emoji: "👩",
provider: "openai",
},
{
id: "sage",
@ -87,6 +111,7 @@ export const voices = [
description: "Wise and measured",
color: "bg-green-100",
emoji: "🧓",
provider: "openai",
},
{
id: "verse",
@ -94,6 +119,220 @@ export const voices = [
description: "Poetic and expressive",
color: "bg-rose-100",
emoji: "👨‍🎨",
provider: "openai",
},
];
/**
 * Voices offered for the Gemini provider. `id` is the voice identifier and
 * `name` is its display label (identical for every entry here).
 *
 * `color` must be a class Tailwind actually generates. The Gacrux entry
 * previously used `bg-brown-100`, which is not part of Tailwind's default
 * palette, so it now uses `bg-amber-100`.
 */
export const geminiVoices: VoiceType[] = [
    { id: "Zephyr", name: "Zephyr", description: "Bright", color: "bg-yellow-100", provider: "gemini" },
    { id: "Puck", name: "Puck", description: "Upbeat", color: "bg-orange-100", provider: "gemini" },
    { id: "Charon", name: "Charon", description: "Informative", color: "bg-blue-100", provider: "gemini" },
    { id: "Kore", name: "Kore", description: "Firm", color: "bg-gray-100", provider: "gemini" },
    { id: "Fenrir", name: "Fenrir", description: "Excitable", color: "bg-red-100", provider: "gemini" },
    { id: "Leda", name: "Leda", description: "Youthful", color: "bg-pink-100", provider: "gemini" },
    { id: "Orus", name: "Orus", description: "Firm", color: "bg-slate-100", provider: "gemini" },
    { id: "Aoede", name: "Aoede", description: "Breezy", color: "bg-sky-100", provider: "gemini" },
    { id: "Callirrhoe", name: "Callirrhoe", description: "Easy-going", color: "bg-green-100", provider: "gemini" },
    { id: "Autonoe", name: "Autonoe", description: "Bright", color: "bg-amber-100", provider: "gemini" },
    { id: "Enceladus", name: "Enceladus", description: "Breathy", color: "bg-cyan-100", provider: "gemini" },
    { id: "Iapetus", name: "Iapetus", description: "Clear", color: "bg-white", provider: "gemini" },
    { id: "Umbriel", name: "Umbriel", description: "Easy-going", color: "bg-emerald-100", provider: "gemini" },
    { id: "Algieba", name: "Algieba", description: "Smooth", color: "bg-violet-100", provider: "gemini" },
    { id: "Despina", name: "Despina", description: "Smooth", color: "bg-purple-100", provider: "gemini" },
    { id: "Erinome", name: "Erinome", description: "Clear", color: "bg-neutral-100", provider: "gemini" },
    { id: "Algenib", name: "Algenib", description: "Gravelly", color: "bg-stone-100", provider: "gemini" },
    { id: "Rasalgethi", name: "Rasalgethi", description: "Informative", color: "bg-indigo-100", provider: "gemini" },
    { id: "Laomedeia", name: "Laomedeia", description: "Upbeat", color: "bg-lime-100", provider: "gemini" },
    { id: "Achernar", name: "Achernar", description: "Soft", color: "bg-rose-100", provider: "gemini" },
    { id: "Alnilam", name: "Alnilam", description: "Firm", color: "bg-zinc-100", provider: "gemini" },
    { id: "Schedar", name: "Schedar", description: "Even", color: "bg-teal-100", provider: "gemini" },
    { id: "Gacrux", name: "Gacrux", description: "Mature", color: "bg-amber-100", provider: "gemini" },
    { id: "Pulcherrima", name: "Pulcherrima", description: "Forward", color: "bg-fuchsia-100", provider: "gemini" },
    { id: "Achird", name: "Achird", description: "Friendly", color: "bg-yellow-100", provider: "gemini" },
    { id: "Zubenelgenubi", name: "Zubenelgenubi", description: "Casual", color: "bg-orange-100", provider: "gemini" },
    { id: "Vindemiatrix", name: "Vindemiatrix", description: "Gentle", color: "bg-green-100", provider: "gemini" },
    { id: "Sadachbia", name: "Sadachbia", description: "Lively", color: "bg-red-100", provider: "gemini" },
    { id: "Sadaltager", name: "Sadaltager", description: "Knowledgeable", color: "bg-blue-100", provider: "gemini" },
    { id: "Sulafat", name: "Sulafat", description: "Warm", color: "bg-orange-100", provider: "gemini" },
];

View file

@ -10,6 +10,33 @@ const config = {
"./src/**/*.{ts,tsx}",
],
prefix: "",
safelist: [
// Voice background colors
"bg-blue-100",
"bg-purple-100",
"bg-cyan-100",
"bg-gray-100",
"bg-indigo-100",
"bg-orange-100",
"bg-green-100",
"bg-rose-100",
"bg-yellow-100",
"bg-red-100",
"bg-pink-100",
"bg-slate-100",
"bg-sky-100",
"bg-amber-100",
"bg-white",
"bg-emerald-100",
"bg-violet-100",
"bg-neutral-100",
"bg-stone-100",
"bg-lime-100",
"bg-zinc-100",
"bg-teal-100",
"bg-brown-100",
"bg-fuchsia-100",
],
theme: {
container: {
center: true,
@ -98,8 +125,10 @@ const config = {
"infinite-scroll-inverse 60s linear infinite",
},
boxShadow: {
cool: "0 4px 6px rgba(135, 206, 235, 0.2), 0 8px 24px rgba(70, 130, 180, 0.5)",
tron: "0 4px 6px rgba(255, 215, 0, 0.2), 0 8px 24px rgba(218, 165, 32, 0.5)",
cool:
"0 4px 6px rgba(135, 206, 235, 0.2), 0 8px 24px rgba(70, 130, 180, 0.5)",
tron:
"0 4px 6px rgba(255, 215, 0, 0.2), 0 8px 24px rgba(218, 165, 32, 0.5)",
custom_focus: "0 0 20px rgba(0, 0, 0, 0.25)", // Custom shadow
custom_unfocus: "0 0 8px rgba(0, 0, 0, 0.07)", // Custom shadow
},

View file

@ -1,388 +0,0 @@
import { Buffer } from "node:buffer";
import { createServer } from "node:http";
import { WebSocketServer } from "npm:ws";
import type {
RawData,
WebSocket as WSWebSocket,
WebSocketServer as _WebSocketServer,
} from "npm:@types/ws";
import {
GoogleGenAI,
LiveServerMessage,
Modality,
Session,
} from "npm:@google/genai";
import { authenticateUser } from "./utils.ts";
import {
getChatHistory,
getSupabaseClient,
updateUserSessionTime,
} from "./supabase.ts";
import { SupabaseClient } from "@supabase/supabase-js";
import { Encoder } from "@evan/opus";
// Dev mode is enabled only when DEV_MODE is exactly the string "True".
const isDev = Deno.env.get("DEV_MODE") === "True";
// Audio parameters for the PCM that is sliced into frames and Opus-encoded
// before being sent to the device.
const SAMPLE_RATE = 24000; // Samples per second (Hz)
const CHANNELS = 1; // Mono (set to 2 if you have stereo)
const FRAME_DURATION = 120; // Frame length in ms
const BYTES_PER_SAMPLE = 2; // 16-bit PCM: 2 bytes per sample
// Bytes of PCM per frame: (24000 Hz * 120 ms / 1000) = 2880 samples,
// times 1 channel, times 2 bytes per sample.
const FRAME_SIZE = (SAMPLE_RATE * FRAME_DURATION / 1000) * CHANNELS *
BYTES_PER_SAMPLE; // 5760 bytes for 24000 Hz mono 16-bit at 120 ms
// Original author's note: Evan's library doesn't require you to specify frame
// size here; it will automatically handle the frame size based on your PCM input.
// A single module-level encoder instance, reused for every audio chunk.
// NOTE(review): this instance is shared by all connections; confirm the
// encoder keeps no per-stream state that concurrent connections could corrupt.
const encoder = new Encoder({
channels: CHANNELS,
sample_rate: SAMPLE_RATE,
application: "voip",
});
// NOTE(review): presumably configures the encoder's frame duration (ms) and
// target bitrate (bits/s) — confirm units against the @evan/opus API.
encoder.expert_frame_duration = FRAME_DURATION;
encoder.bitrate = 12000;
// HTTP server plus a WebSocket server in noServer mode.
const server = createServer();
const wss: _WebSocketServer = new WebSocketServer({ noServer: true });
const supabaseUrl = Deno.env.get("SUPABASE_URL");
const supabaseKey = Deno.env.get("SUPABASE_KEY");
// NOTE(review): unlike the Supabase variables below, GEMINI_API_KEY is not
// validated here, so this value may be undefined when it is read.
const geminiApiKey = Deno.env.get("GEMINI_API_KEY");
// Fail fast at startup when the Supabase configuration is incomplete.
if (!supabaseUrl || !supabaseKey) {
throw new Error("SUPABASE_URL or SUPABASE_KEY is not set");
}
/**
 * Handles one authenticated device connection and bridges it to a Gemini
 * Live session.
 *
 * Device -> server: binary frames are forwarded to Gemini as realtime audio
 * input; JSON frames carry `instruction` messages (`end_of_speech`,
 * `INTERRUPT`).
 *
 * Server -> device: a JSON `auth` message, JSON `server` status messages
 * (`RESPONSE.CREATED`, `RESPONSE.COMPLETE`, `RESPONSE.ERROR`,
 * `AUDIO.COMMITTED`) and binary Opus packets holding Gemini's audio reply.
 *
 * Resources opened here (debug PCM file, Gemini session, turn-processing
 * loop) are released on every exit path: missing API key, failed Gemini
 * connect, socket error and socket close.
 */
wss.on("connection", async (ws: WSWebSocket, payload: IPayload) => {
    const { user, supabase } = payload;

    // In dev mode the raw audio received from the device is appended to a
    // per-connection debug file.
    let connectionPcmFile: Deno.FsFile | null = null;
    if (isDev) {
        const filename = `debug_audio_${Date.now()}.pcm`;
        connectionPcmFile = await Deno.open(filename, {
            create: true,
            write: true,
            append: true,
        });
    }

    // Closes the debug file at most once, whichever exit path runs first.
    const closeDebugFile = () => {
        if (connectionPcmFile) {
            connectionPcmFile.close();
            connectionPcmFile = null;
            console.log("Closed debug audio file.");
        }
    };

    // Send user details to client
    ws.send(
        JSON.stringify({
            type: "auth",
            volume_control: user.device?.volume ?? 100,
            is_ota: user.device?.is_ota ?? false,
            is_reset: user.device?.is_reset ?? false,
        }),
    );

    const isDoctor = user.user_info.user_type === "doctor";

    // Currently unused: kept for the prompt builders that are commented out
    // below.
    const chatHistory = await getChatHistory(
        supabase,
        user.user_id,
        user.personality?.key ?? null,
        isDoctor,
    );
    // const firstMessage = createFirstMessage(chatHistory, payload);
    // const systemPrompt = createSystemPrompt(chatHistory, payload);

    // Without a key, `.slice` below would throw inside this async handler and
    // surface as an unhandled rejection; reject the connection cleanly instead.
    if (!geminiApiKey) {
        console.error("GEMINI_API_KEY is not set");
        closeDebugFile();
        ws.close();
        return;
    }

    // Set when the Gemini session opens; used to bill session time on close.
    let sessionStartTime: number | undefined;
    // Set once the device socket errors or closes, so the polling loops stop.
    let connectionClosed = false;

    console.log(`Connecting with Gemini key "${geminiApiKey.slice(0, 3)}..."`);

    // Initialize Google GenAI
    const ai = new GoogleGenAI({ apiKey: geminiApiKey });
    const model = "gemini-2.5-flash-preview-native-audio-dialog";
    const config = {
        responseModalities: [Modality.AUDIO],
        systemInstruction: "You are a surfer bro talking to Kai Lenny",
    };

    // Response queue for handling Google's callback-based responses
    const responseQueue: LiveServerMessage[] = [];
    let geminiSession: Session | null = null;

    /**
     * Polls the response queue every 10 ms until a message is available.
     * Resolves to `undefined` once the device connection has closed, so the
     * poll does not run forever.
     */
    async function waitMessage(): Promise<LiveServerMessage | undefined> {
        while (!connectionClosed) {
            const message = responseQueue.shift();
            if (message) {
                return message;
            }
            await new Promise((resolve) => setTimeout(resolve, 10));
        }
        return undefined;
    }

    /**
     * Collects queued messages until Gemini reports `generationComplete`,
     * then notifies the device with `RESPONSE.CREATED`. Returns whatever was
     * collected so far if the connection closes first.
     */
    async function handleTurn(): Promise<LiveServerMessage[]> {
        const turns: LiveServerMessage[] = [];
        while (true) {
            const message = await waitMessage();
            if (!message) {
                break; // connection closed while waiting
            }
            turns.push(message);
            if (message.serverContent?.generationComplete) {
                ws.send(JSON.stringify({
                    type: "server",
                    msg: "RESPONSE.CREATED",
                }));
                break;
            }
        }
        return turns;
    }

    /**
     * For each completed Gemini turn: concatenates the turn's PCM audio,
     * Opus-encodes it frame by frame, streams the packets to the device and
     * then sends `RESPONSE.COMPLETE`. Runs until the connection closes.
     */
    async function processGeminiTurns(): Promise<void> {
        try {
            console.log("Processing Gemini turns");
            while (geminiSession && !connectionClosed) {
                const turns = await handleTurn();
                if (connectionClosed) {
                    break;
                }
                console.log("Turns:", turns);

                // Combine all audio data from this turn. Each chunk is
                // trimmed to whole 16-bit samples, matching the Int16 view
                // the previous implementation built.
                const chunks: Buffer[] = [];
                for (const turn of turns) {
                    if (turn.data) {
                        const chunk = Buffer.from(turn.data, "base64");
                        const wholeSamples = chunk.length -
                            (chunk.length % BYTES_PER_SAMPLE);
                        chunks.push(chunk.subarray(0, wholeSamples));
                    }
                }
                const buffer = Buffer.concat(chunks);

                if (buffer.length > 0) {
                    console.log(
                        "Received complete audio turn, length:",
                        buffer.length / BYTES_PER_SAMPLE,
                    );

                    // PREVIEW AUDIO
                    // const wf = new WaveFile();
                    // wf.fromScratch(1, SAMPLE_RATE, "16", audioBuffer);
                    // const filename = `gemini_response_${Date.now()}.wav`;
                    // await Deno.writeFile(filename, wf.toBuffer());
                    // console.log(`Audio saved as ${filename}`);

                    // Send audio in chunks to client
                    for (
                        let offset = 0;
                        offset < buffer.length;
                        offset += FRAME_SIZE
                    ) {
                        const frame = buffer.subarray(
                            offset,
                            offset + FRAME_SIZE,
                        );
                        try {
                            const encodedPacket = encoder.encode(frame);
                            ws.send(encodedPacket);
                        } catch (_e) {
                            // Deliberate best effort: skip a frame that fails
                            // to encode or send and continue with the rest.
                        }
                    }
                }

                // // Handle text responses if any
                // for (const turn of turns) {
                //     if (turn.text) {
                //         console.log("Received text:", turn.text);
                //         addConversation(supabase, "assistant", turn.text, user);
                //     }
                // }

                // Send completion signal
                ws.send(JSON.stringify({
                    type: "server",
                    msg: "RESPONSE.COMPLETE",
                }));
            }
        } catch (error) {
            console.error("Error processing Gemini turns:", error);
        }
    }

    // Connect to Google Gemini Live
    try {
        geminiSession = await ai.live.connect({
            model: model,
            callbacks: {
                onopen: function () {
                    console.log("Gemini session opened");
                    sessionStartTime = Date.now();
                },
                onmessage: function (message: LiveServerMessage) {
                    console.log("Received message:", message);
                    responseQueue.push(message);
                },
                onerror: function (e: ErrorEvent) {
                    console.error("Gemini error:", e.message);
                    ws.send(
                        JSON.stringify({
                            type: "server",
                            msg: "RESPONSE.ERROR",
                        }),
                    );
                },
                onclose: function (e: CloseEvent) {
                    console.log("Gemini session closed:", e.reason);
                },
            },
            config: config,
        });

        console.log("Connected to Gemini successfully!");

        // Send first message if available
        const inputTurns = [{
            role: "user",
            parts: [{ text: "Hello how are you?" }],
        }];
        geminiSession?.sendClientContent({ turns: inputTurns });

        // Runs in the background for the lifetime of the connection; it
        // handles its own errors, so the promise is intentionally not awaited.
        void processGeminiTurns();
    } catch (e: unknown) {
        console.log(`Error connecting to Gemini: ${e}`);
        // The "close" handler below is not registered yet, so release the
        // debug file here.
        closeDebugFile();
        ws.close();
        return;
    }

    // NOTE(review): these listeners are attached only after the awaits above,
    // so frames the device sends earlier are presumably not observed — confirm
    // that the device waits for the `auth` message before streaming.
    // deno-lint-ignore no-explicit-any
    ws.on("message", (data: any, isBinary: boolean) => {
        try {
            if (isBinary) {
                // Handle binary audio data from ESP32
                const base64Data = data.toString("base64");

                if (connectionPcmFile) {
                    // Debug capture is best effort; never let a failed write
                    // surface as an unhandled rejection.
                    void connectionPcmFile.write(data).catch(
                        (e: unknown) =>
                            console.error("Error writing debug audio:", e),
                    );
                }

                // Send audio to Gemini
                geminiSession?.sendRealtimeInput({
                    audio: {
                        data: base64Data,
                        // Declares the rate of the audio being sent. Original
                        // author's note: Gemini expects 16kHz but 24kHz is fine.
                        mimeType: "audio/pcm;rate=24000",
                    },
                });
            } else {
                // Handle text/JSON messages
                const message = JSON.parse(data.toString("utf-8"));

                if (
                    message.type === "instruction" &&
                    message.msg === "end_of_speech"
                ) {
                    console.log("end_of_speech detected");
                    // Gemini handles turn detection automatically, but we can send a signal
                    ws.send(
                        JSON.stringify({
                            type: "server",
                            msg: "AUDIO.COMMITTED",
                        }),
                    );
                }

                if (
                    message.type === "instruction" &&
                    message.msg === "INTERRUPT"
                ) {
                    console.log("interrupt detected");
                    // For Gemini, we might need to close and reopen the session or handle differently
                    // This depends on Gemini's interrupt capabilities
                }
            }
        } catch (e: unknown) {
            console.error("Error handling message:", (e as Error).message);
        }
    });

    // deno-lint-ignore no-explicit-any
    ws.on("error", (error: any) => {
        console.error("WebSocket error:", error);
        connectionClosed = true;
        geminiSession?.close();
    });

    ws.on("close", async (code: number, reason: string) => {
        console.log(`WebSocket closed with code ${code}, reason: ${reason}`);
        connectionClosed = true;
        try {
            if (sessionStartTime !== undefined) {
                const sessionDuration = Math.floor(
                    (Date.now() - sessionStartTime) / 1000,
                );
                await updateUserSessionTime(supabase, user, sessionDuration);
            }
        } finally {
            // Release the session and debug file even if the usage update
            // fails.
            geminiSession?.close();
            closeDebugFile();
        }
    });
});
/**
 * Authenticates an HTTP upgrade request before handing it to the WebSocket
 * server.
 *
 * Reads the bearer token from the `authorization` header and resolves the
 * user through Supabase. A missing token or any authentication failure is
 * answered with `401 Unauthorized` and the socket is destroyed. On success
 * the "connection" event is emitted with `{ user, supabase, timestamp }`.
 */
server.on("upgrade", async (req, socket, head) => {
    console.log("upgrade");
    let user: IUser;
    let supabase: SupabaseClient;
    try {
        const { authorization: authHeader, "x-wifi-rssi": rssi } = req.headers;
        const authToken = authHeader?.replace("Bearer ", "") ?? "";

        // The RSSI header is optional diagnostic data (e.g. -50); log it only
        // when it parses to a number instead of printing NaN.
        const wifiStrength = typeof rssi === "string"
            ? Number.parseInt(rssi, 10)
            : Number.NaN;
        if (!Number.isNaN(wifiStrength)) {
            console.log("WiFi RSSI:", wifiStrength);
        }

        if (!authToken) {
            socket.write("HTTP/1.1 401 Unauthorized\r\n\r\n");
            socket.destroy();
            return;
        }
        supabase = getSupabaseClient(authToken);
        user = await authenticateUser(supabase, authToken);
        console.log(user.email);
    } catch (_e: unknown) {
        // Any failure while authenticating is reported as unauthorized.
        socket.write("HTTP/1.1 401 Unauthorized\r\n\r\n");
        socket.destroy();
        return;
    }
    wss.handleUpgrade(req, socket, head, (ws) => {
        wss.emit("connection", ws, {
            user,
            supabase,
            timestamp: new Date().toISOString(),
        });
    });
});
// Start listening: a fixed port 8080 outside dev mode; in dev mode the
// HOST/PORT environment variables with local defaults.
if (!isDev) {
    server.listen(8080);
} else {
    // deno run -A --env-file=.env main.ts
    const devHost = Deno.env.get("HOST") || "0.0.0.0";
    const devPort = Deno.env.get("PORT") || "8000";
    const announce = () => {
        console.log(
            `Audio capture server running on ws://${devHost}:${devPort}`,
        );
    };
    server.listen(Number(devPort), devHost, announce);
}

View file

@ -5,7 +5,12 @@ import type {
WebSocketServer as _WebSocketServer,
} from "npm:@types/ws";
import { authenticateUser } from "./utils.ts";
import { getSupabaseClient } from "./supabase.ts";
import {
createFirstMessage,
createSystemPrompt,
getChatHistory,
getSupabaseClient,
} from "./supabase.ts";
import { SupabaseClient } from "@supabase/supabase-js";
import { isDev } from "./utils.ts";
import { connectToOpenAI } from "./models/openai.ts";
@ -16,8 +21,50 @@ const server = createServer();
const wss: _WebSocketServer = new WebSocketServer({ noServer: true });
wss.on("connection", async (ws: WSWebSocket, payload: IPayload) => {
// await connectToOpenAI(ws, payload);
await connectToGemini(ws, payload);
const { user, supabase } = payload;
let connectionPcmFile: Deno.FsFile | null = null;
if (isDev) {
const filename = `debug_audio_${Date.now()}.pcm`;
connectionPcmFile = await Deno.open(filename, {
create: true,
write: true,
append: true,
});
}
const chatHistory = await getChatHistory(
supabase,
user.user_id,
user.personality?.key ?? null,
false,
);
const firstMessage = createFirstMessage(payload);
const systemPrompt = createSystemPrompt(chatHistory, payload);
const provider = user.personality?.provider;
switch (provider) {
case "openai":
await connectToOpenAI(
ws,
payload,
connectionPcmFile,
firstMessage,
systemPrompt,
);
break;
case "gemini":
await connectToGemini(
ws,
payload,
connectionPcmFile,
firstMessage,
systemPrompt,
);
break;
default:
throw new Error(`Unknown provider: ${provider}`);
}
});
server.on("upgrade", async (req, socket, head) => {

View file

@ -1,73 +1,71 @@
import { Buffer } from "node:buffer";
import type { WebSocketServer as _WebSocketServer } from "npm:@types/ws";
import {
EndSensitivity,
GoogleGenAI,
LiveConnectConfig,
LiveServerMessage,
Modality,
Session,
} from "npm:@google/genai";
import { getChatHistory, updateUserSessionTime } from "../supabase.ts";
import {
encoder,
FRAME_SIZE,
geminiApiKey,
isDev,
SAMPLE_RATE,
} from "../utils.ts";
import pkg from "npm:wavefile";
const { WaveFile } = pkg;
import { encoder, FRAME_SIZE, geminiApiKey, isDev } from "../utils.ts";
import { addConversation } from "../supabase.ts";
export const connectToGemini = async (ws: WebSocket, payload: IPayload) => {
export const connectToGemini = async (
ws: WebSocket,
payload: IPayload,
connectionPcmFile: Deno.FsFile | null,
firstMessage: string,
systemPrompt: string,
) => {
const { user, supabase } = payload;
const { oai_voice, pitch_factor } = user.personality ?? {
oai_voice: "Sadachbia",
provider: "gemini",
pitch_factor: 1,
};
let connectionPcmFile: Deno.FsFile | null = null;
if (isDev) {
const filename = `debug_audio_${Date.now()}.pcm`;
connectionPcmFile = await Deno.open(filename, {
create: true,
write: true,
append: true,
});
}
const { is_ota, is_reset, volume } = user.device ?? {
is_ota: false,
is_reset: false,
volume: 10,
};
// Send user details to client
ws.send(
JSON.stringify({
type: "auth",
volume_control: user.device?.volume ?? 100,
is_ota: user.device?.is_ota ?? false,
is_reset: user.device?.is_reset ?? false,
volume_control: volume,
is_ota: is_ota,
is_reset: is_reset,
pitch_factor: pitch_factor,
}),
);
const chatHistory = await getChatHistory(
supabase,
user.user_id,
user.personality?.key ?? null,
false,
);
// const firstMessage = createFirstMessage(chatHistory, payload);
// const systemPrompt = createSystemPrompt(chatHistory, payload);
let sessionStartTime: number;
console.log(`Connecting with Gemini key "${geminiApiKey.slice(0, 3)}..."`);
// Initialize Google GenAI
const ai = new GoogleGenAI({ apiKey: geminiApiKey });
const model = "gemini-2.0-flash-live-001";
const model = "gemini-2.5-flash-preview-native-audio-dialog";
const config: LiveConnectConfig = {
responseModalities: [Modality.AUDIO],
systemInstruction: "You are a surfer bro talking to Kai Lenny",
// generationConfig: {
// speechConfig: {
// voiceConfig: {
// prebuiltVoiceConfig: {
// voiceName: "Zephyr",
// },
// },
// },
// },
systemInstruction: systemPrompt,
speechConfig: {
voiceConfig: {
prebuiltVoiceConfig: {
voiceName: oai_voice,
},
},
},
realtimeInputConfig: {
automaticActivityDetection: {
disabled: false, // Keep VAD enabled
endOfSpeechSensitivity: EndSensitivity.END_SENSITIVITY_HIGH, // How sensitive to detect speech ending
silenceDurationMs: 100, // How much silence before considering speech ended
},
},
outputAudioTranscription: {},
inputAudioTranscription: {},
};
// Response queue for handling Google's callback-based responses
@ -94,21 +92,26 @@ export const connectToGemini = async (ws: WebSocket, payload: IPayload) => {
while (!done) {
const message = await waitMessage();
turns.push(message);
// if (
// message.serverContent &&
// message.serverContent.generationComplete
// ) {
// }
if (
message.serverContent &&
message.serverContent.generationComplete
message.serverContent
) {
ws.send(JSON.stringify({
type: "server",
msg: "RESPONSE.CREATED",
}));
done = true;
if (message.serverContent.generationComplete) {
ws.send(JSON.stringify({
type: "server",
msg: "RESPONSE.CREATED",
}));
done = true;
}
if (message.serverContent.turnComplete) {
ws.send(
JSON.stringify({
type: "server",
msg: "AUDIO.COMMITTED",
}),
);
}
}
}
return turns;
@ -120,8 +123,6 @@ export const connectToGemini = async (ws: WebSocket, payload: IPayload) => {
while (geminiSession) {
const turns = await handleTurn();
console.log("Turns:", turns);
// Combine all audio data from this turn
const combinedAudio = turns.reduce(
(acc: number[], turn: any) => {
@ -141,11 +142,6 @@ export const connectToGemini = async (ws: WebSocket, payload: IPayload) => {
);
if (combinedAudio.length > 0) {
console.log(
"Received complete audio turn, length:",
combinedAudio.length,
);
// Convert back to buffer and send to client
const audioBuffer = new Int16Array(combinedAudio);
const buffer = Buffer.from(audioBuffer.buffer);
@ -179,18 +175,47 @@ export const connectToGemini = async (ws: WebSocket, payload: IPayload) => {
}
// // Handle text responses if any
// for (const turn of turns) {
// if (turn.text) {
// console.log("Received text:", turn.text);
// addConversation(supabase, "assistant", turn.text, user);
// }
// }
let outputTranscriptionText = "";
let inputTranscriptionText = "";
for (const turn of turns as LiveServerMessage[]) {
if (
turn.serverContent &&
turn.serverContent.outputTranscription
) {
outputTranscriptionText +=
turn.serverContent.outputTranscription.text;
}
if (
turn.serverContent &&
turn.serverContent.inputTranscription
) {
inputTranscriptionText +=
turn.serverContent.inputTranscription.text;
}
}
// Send completion signal
ws.send(JSON.stringify({
type: "server",
msg: "RESPONSE.COMPLETE",
}));
// Add user transcription to supabase
await addConversation(
supabase,
"user",
inputTranscriptionText,
user,
);
// Add assistant transcription to supabase
await addConversation(
supabase,
"assistant",
outputTranscriptionText,
user,
);
}
} catch (error) {
console.error("Error processing Gemini turns:", error);
@ -204,10 +229,8 @@ export const connectToGemini = async (ws: WebSocket, payload: IPayload) => {
callbacks: {
onopen: function () {
console.log("Gemini session opened");
sessionStartTime = Date.now();
},
onmessage: function (message: LiveServerMessage) {
console.log("Received message:", message);
responseQueue.push(message);
},
onerror: function (e: any) {
@ -230,7 +253,7 @@ export const connectToGemini = async (ws: WebSocket, payload: IPayload) => {
// Send first message if available
const inputTurns = [{
role: "user",
parts: [{ text: "Hello how are you?" }],
parts: [{ text: firstMessage }],
}];
geminiSession?.sendClientContent({ turns: inputTurns });
processGeminiTurns();
@ -257,32 +280,6 @@ export const connectToGemini = async (ws: WebSocket, payload: IPayload) => {
mimeType: "audio/pcm;rate=24000", // Gemini expects 16kHz but 24kHz is fine
},
});
} else {
// Handle text/JSON messages
const message = JSON.parse(data.toString("utf-8"));
if (
message.type === "instruction" &&
message.msg === "end_of_speech"
) {
console.log("end_of_speech detected");
// Gemini handles turn detection automatically, but we can send a signal
ws.send(
JSON.stringify({
type: "server",
msg: "AUDIO.COMMITTED",
}),
);
}
if (
message.type === "instruction" &&
message.msg === "INTERRUPT"
) {
console.log("interrupt detected");
// For Gemini, we might need to close and reopen the session or handle differently
// This depends on Gemini's interrupt capabilities
}
}
} catch (e: unknown) {
console.error("Error handling message:", (e as Error).message);
@ -296,12 +293,6 @@ export const connectToGemini = async (ws: WebSocket, payload: IPayload) => {
ws.on("close", async (code: number, reason: string) => {
console.log(`WebSocket closed with code ${code}, reason: ${reason}`);
if (sessionStartTime) {
const sessionDuration = Math.floor(
(Date.now() - sessionStartTime) / 1000,
);
await updateUserSessionTime(supabase, user, sessionDuration);
}
geminiSession?.close();
if (isDev && connectionPcmFile) {
connectionPcmFile.close();

View file

@ -7,14 +7,7 @@ import type {
import { RealtimeClient } from "../realtime/client.js";
import { RealtimeUtils } from "../realtime/utils.js";
import {
addConversation,
createFirstMessage,
createSystemPrompt,
getChatHistory,
getDeviceInfo,
updateUserSessionTime,
} from "../supabase.ts";
import { addConversation, getDeviceInfo } from "../supabase.ts";
import { encoder, FRAME_SIZE, isDev, openaiApiKey } from "../utils.ts";
const sendFirstMessage = (client: RealtimeClient, firstMessage: string) => {
@ -39,18 +32,15 @@ const sendFirstMessage = (client: RealtimeClient, firstMessage: string) => {
});
};
export const connectToOpenAI = async (ws: WebSocket, payload: IPayload) => {
export const connectToOpenAI = async (
ws: WebSocket,
payload: IPayload,
connectionPcmFile: Deno.FsFile | null,
firstMessage: string,
systemPrompt: string,
) => {
const { user, supabase } = payload;
let connectionPcmFile: Deno.FsFile | null = null;
if (isDev) {
const filename = `debug_audio_${Date.now()}.pcm`;
connectionPcmFile = await Deno.open(filename, {
create: true,
write: true,
append: true,
});
}
// send user details to client
// when DEV_MODE is true, we send the default values 100, false, false
ws.send(
@ -63,18 +53,6 @@ export const connectToOpenAI = async (ws: WebSocket, payload: IPayload) => {
}),
);
const isDoctor = user.user_info.user_type === "doctor";
const chatHistory = await getChatHistory(
supabase,
user.user_id,
user.personality?.key ?? null,
isDoctor,
);
const firstMessage = createFirstMessage(chatHistory, payload);
console.log("firstMessage", firstMessage);
const systemPrompt = createSystemPrompt(chatHistory, payload);
let sessionStartTime: number;
let currentItemId: string | null = null;
let currentCallId: string | null = null;
@ -135,7 +113,6 @@ export const connectToOpenAI = async (ws: WebSocket, payload: IPayload) => {
// Check if the event is session.created
if (event.type === "session.created") {
console.log("session created", event);
sessionStartTime = Date.now();
sendFirstMessage(client, firstMessage);
} else if (event.type === "session.updated") {
console.log("session updated", event);
@ -361,12 +338,6 @@ export const connectToOpenAI = async (ws: WebSocket, payload: IPayload) => {
// Add more detailed close handling
ws.on("close", async (code: number, reason: string) => {
console.log(`WebSocket closed with code ${code}, reason: ${reason}`);
if (sessionStartTime) {
const sessionDuration = Math.floor(
(Date.now() - sessionStartTime) / 1000,
);
await updateUserSessionTime(supabase, user, sessionDuration);
}
client.disconnect();
if (isDev) {
if (connectionPcmFile) {

View file

@ -157,7 +157,6 @@ ${chatHistory}
`;
export const createFirstMessage = (
chatHistory: IConversation[],
payload: IPayload,
): string => {
const { timestamp, user } = payload;
@ -218,21 +217,6 @@ export const addConversation = async (
}
};
/**
 * Adds a finished session's duration to the user's stored `session_time`.
 *
 * The new total is computed from the in-memory `user.session_time` plus
 * `sessionTime` and written to the row in `users` whose `user_id` matches.
 *
 * @param supabase - Supabase client used to issue the update.
 * @param user - User whose `session_time` and `user_id` are read.
 * @param sessionTime - Amount to add to the stored session time.
 * @throws The error object returned by Supabase when the update fails.
 */
export const updateUserSessionTime = async (
  supabase: SupabaseClient,
  user: IUser,
  sessionTime: number,
): Promise<void> => {
  const usersTable = supabase.from("users");
  const accumulatedTime = user.session_time + sessionTime;

  const result = await usersTable
    .update({ session_time: accumulatedTime })
    .eq("user_id", user.user_id);

  // Supabase reports failures in the response instead of throwing; surface them.
  if (result.error) {
    throw result.error;
  }
};
/**
* Get the OpenAI API Key for the user
* @param supabase - The Supabase client

View file

@ -26,21 +26,58 @@ declare global {
user_code: string;
}
/** AI model provider a personality can be assigned to (see `IPersonality.provider`). */
type ModelProvider = "openai" | "gemini";
/**
 * Voice names accepted for Gemini-backed personalities.
 *
 * NOTE(review): presumably mirrors the prebuilt voice names of the Gemini Live
 * API; confirm against the provider documentation before adding or removing
 * entries.
 */
type GeminiVoice =
| "Zephyr"
| "Puck"
| "Charon"
| "Kore"
| "Fenrir"
| "Leda"
| "Orus"
| "Aoede"
| "Callirrhoe"
| "Autonoe"
| "Enceladus"
| "Iapetus"
| "Umbriel"
| "Algieba"
| "Despina"
| "Erinome"
| "Algenib"
| "Rasalgethi"
| "Laomedeia"
| "Achernar"
| "Alnilam"
| "Schedar"
| "Gacrux"
| "Pulcherrima"
| "Achird"
| "Zubenelgenubi"
| "Vindemiatrix"
| "Sadachbia"
| "Sadaltager"
| "Sulafat";
/**
 * Voice identifiers accepted for OpenAI-backed personalities.
 *
 * NOTE(review): presumably the voice ids of the OpenAI Realtime API; confirm
 * against the provider documentation before editing.
 */
type OaiVoice =
| "ash"
| "alloy"
| "echo"
| "shimmer"
| "ballad"
| "coral"
| "sage"
| "verse";
interface IPersonality {
personality_id: string;
is_doctor: boolean;
is_child_voice: boolean;
is_story: boolean;
key: string;
oai_voice:
| "ash"
| "alloy"
| "echo"
| "shimmer"
| "ballad"
| "coral"
| "sage"
| "verse";
oai_voice: OaiVoice | GeminiVoice;
provider: ModelProvider;
voice_description: string;
title: string;
subtitle: string;

View file

@ -0,0 +1,12 @@
-- Add a required `provider` column to the personalities table.
--
-- ADD COLUMN with a constant DEFAULT already gives every existing row the
-- value 'openai', so no separate backfill UPDATE is needed and NOT NULL can be
-- declared in the same statement. The CHECK restricts the column to the
-- providers the server supports.
ALTER TABLE personalities
ADD COLUMN provider TEXT NOT NULL DEFAULT 'openai'
CHECK (provider IN ('openai', 'gemini'));