adding grok
This commit is contained in:
parent
bd7c9c8b83
commit
4c329073d8
15 changed files with 442 additions and 67 deletions
14
README.md
14
README.md
|
|
@ -17,7 +17,7 @@
|
|||
|
||||
# 👾 ElatoAI: Realtime Speech AI Agents for ESP32
|
||||
|
||||
Realtime AI Speech powered by SoTA AI voice models like **OpenAI Realtime API**, **Eleven Labs AI Agents**, **Gemini Live API**, **Hume AI EVI-4**, on ESP32, with Secure WebSockets, and Deno Edge Functions for >15-minute uninterrupted conversations globally.
|
||||
Realtime AI Speech powered by SoTA AI voice models like **OpenAI Realtime API**, **Eleven Labs AI Agents**, **Gemini Live API**, **Hume AI EVI-4**, **xAI's Grok Voice Agent API** on ESP32, with Secure WebSockets, and Deno Edge Functions for >15-minute uninterrupted conversations globally.
|
||||
|
||||
<div align="center" style="margin: 20px 0;">
|
||||
<!-- <a href="https://www.kickstarter.com/projects/elatoai/elato-make-toys-talk-with-ai-voices" target="_blank">
|
||||
|
|
@ -49,11 +49,12 @@ Realtime AI Speech powered by SoTA AI voice models like **OpenAI Realtime API**,
|
|||
</div>
|
||||
|
||||
|
||||
## ⚡️ `With SOTA Realtime AI Speech Models on an ESP32`
|
||||
## ⚡️ `With SoTA Realtime AI Speech Models on an ESP32`
|
||||
|
||||
<div align="center" class="flex flex-row gap-4">
|
||||
<img src="assets/openai.png" alt="OpenAI Realtime API" width="45%">
|
||||
<img src="assets/gemini.png" alt="Gemini Live API" width="45%">
|
||||
<img src="assets/grok.svg" alt="Grok AI" width="45%">
|
||||
<img src="assets/humeai.avif" alt="Hume AI EVI4" width="45%">
|
||||
<img src="assets/elevenlabs.svg" alt="Eleven Labs AI Agents" width="45%">
|
||||
</div>
|
||||
|
|
@ -87,7 +88,7 @@ Control your ESP32 AI device from your phone with the Elato AI webapp.
|
|||
|
||||
## `🌟 Full feature list`
|
||||
|
||||
1. **Realtime Speech-to-Speech**: Instant speech conversion powered by OpenAI's Realtime API, Gemini's Live API, Eleven Labs Conversational AI Agents and Hume AI EVI4.
|
||||
1. **Realtime Speech-to-Speech**: Instant speech conversion powered by OpenAI's Realtime API, Gemini's Live API, xAI's Grok Voice Agent API, Eleven Labs Conversational AI Agents and Hume AI EVI4.
|
||||
2. **Create Custom AI Agents**: Create custom agents with different personalities and voices.
|
||||
3. **Customizable Voices**: Choose from a variety of voices and personalities.
|
||||
4. **Secure WebSockets**: Reliable, encrypted WebSocket communication.
|
||||
|
|
@ -145,7 +146,7 @@ cp .env.example .env.local
|
|||
|
||||
# In .env.local, set your environment variables
|
||||
# NEXT_PUBLIC_SUPABASE_ANON_KEY=<your-supabase-anon-key>
|
||||
# OPENAI_API_KEY=<your-openai-api-key>
|
||||
# OPENAI_API_KEY=<your-openai-api-key> (to test OpenAI on the browser)
|
||||
|
||||
# Run the development server
|
||||
npm run dev
|
||||
|
|
@ -171,6 +172,7 @@ cp .env.example .env
|
|||
# SUPABASE_KEY=<your-supabase-anon-key>
|
||||
# OPENAI_API_KEY=<your-openai-api-key>
|
||||
# GEMINI_API_KEY=<your-gemini-api-key>
|
||||
# XAI_API_KEY=<your-xai-api-key>
|
||||
# ELEVENLABS_API_KEY=<your-elevenlabs-api-key>
|
||||
# HUME_API_KEY=<your-hume-api-key>
|
||||
|
||||
|
|
@ -226,10 +228,12 @@ flowchart TD
|
|||
ESP32[ESP32 Device] -->|WebSocket| Edge[Deno Edge Function]
|
||||
Edge -->|OpenAI API| OpenAI[OpenAI Realtime API]
|
||||
Edge -->|Gemini API| Gemini[Gemini Live API]
|
||||
Edge -->|xAI API| xAI[xAI Grok Voice Agent API]
|
||||
Edge -->|ElevenLabs API| ElevenLabs[ElevenLabs AI Agents]
|
||||
Edge -->|Hume API| Hume[Hume AI EVI4]
|
||||
OpenAI --> Edge
|
||||
Gemini --> Edge
|
||||
xAI --> Edge
|
||||
ElevenLabs --> Edge
|
||||
Hume --> Edge
|
||||
Edge -->|WebSocket| ESP32
|
||||
|
|
@ -308,6 +312,8 @@ lib_deps =
|
|||
5. ~~Plug in Eleven Labs API for voice generation~~
|
||||
6. Add Azure OpenAI Support (easy pickings)
|
||||
7. Add Cartesia Support (easy pickings)
|
||||
8. Add Amazon Nova Support
|
||||
9. Add Deepgram
|
||||
|
||||
We welcome contributions
|
||||
- Fork this repository.
|
||||
|
|
|
|||
1
assets/grok.svg
Normal file
1
assets/grok.svg
Normal file
|
|
@ -0,0 +1 @@
|
|||
<svg width="2500" height="938" fill="none" xmlns="http://www.w3.org/2000/svg" class="opacity-80 hover:opacity-100 fill-black dark:fill-white [&>path]:hidden sm:[&>path]:block [&>#mark]:block [&>#furigana]:opacity-60 [&>#subtitle]:opacity-60" data--h-bstatus="0OBSERVED" viewBox="0.3640000000000012 0.5000000000000024 87.27199999999999 31.999999999999996"><path d="M76.446 24.708V8.416h2.576v10.752l5.447-6.257h3.122l-4.9 5.362 4.945 6.435H84.56l-4.006-5.53-1.532-.01v5.54h-2.576zM68.636 24.982c-3.829 0-5.902-2.716-5.902-6.184 0-3.491 2.073-6.184 5.902-6.184 3.852 0 5.903 2.693 5.903 6.184 0 3.468-2.051 6.184-5.903 6.184zm-3.213-6.184c0 2.692 1.458 4.039 3.213 4.039 1.778 0 3.214-1.347 3.214-4.04 0-2.692-1.436-4.06-3.214-4.06-1.755 0-3.213 1.368-3.213 4.06zM55.566 24.708v-9.926l2.165-1.871h4.604v2.19H58.14v9.607h-2.575zM45.719 25.009c-4.909 0-7.836-3.564-7.836-8.424 0-4.906 3.032-8.557 7.931-8.557 3.83 0 6.633 1.962 7.294 5.613h-2.94c-.434-2.076-2.166-3.24-4.353-3.24-3.533 0-5.083 3.058-5.083 6.184 0 3.126 1.55 6.16 5.083 6.16 3.373 0 4.854-2.441 4.968-4.472H45.7v-2.362h7.68l-.013 1.235c0 4.59-1.87 7.863-7.65 7.863zM13.237 21.04l11.082-8.19c.543-.4 1.32-.244 1.578.38 1.363 3.288.754 7.241-1.957 9.955-2.71 2.714-6.482 3.31-9.93 1.954l-3.765 1.745c5.401 3.697 11.96 2.782 16.059-1.324 3.251-3.255 4.258-7.692 3.317-11.693l.008.009c-1.365-5.878.336-8.227 3.82-13.031.082-.114.165-.228.247-.345l-4.585 4.59v-.014L13.234 21.044M10.95 23.031c-3.877-3.707-3.208-9.446.1-12.755 2.446-2.449 6.454-3.448 9.952-1.979L24.76 6.56c-.677-.49-1.545-1.017-2.54-1.387A12.465 12.465 0 0 0 8.675 7.901c-3.519 3.523-4.625 8.94-2.725 13.561 1.42 3.454-.907 5.898-3.251 8.364-.83.874-1.664 1.748-2.335 2.674l10.583-9.466" fill="currentColor" data--h-bstatus="0OBSERVED"/></svg>
|
||||
|
After Width: | Height: | Size: 1.8 KiB |
|
|
@ -33,11 +33,11 @@ volatile bool sleepRequested = false;
|
|||
*/
|
||||
|
||||
#ifdef DEV_MODE
|
||||
const char *ws_server = "192.168.1.37";
|
||||
const char *ws_server = "172.20.10.2";
|
||||
const uint16_t ws_port = 8000;
|
||||
const char *ws_path = "/";
|
||||
// Backend server details
|
||||
const char *backend_server = "192.168.1.37";
|
||||
const char *backend_server = "172.20.10.2";
|
||||
const uint16_t backend_port = 3000;
|
||||
|
||||
#elif defined(PROD_MODE)
|
||||
|
|
|
|||
|
|
@ -13,7 +13,7 @@ import { v4 as uuidv4 } from 'uuid';
|
|||
import { toast } from "@/components/ui/use-toast";
|
||||
import { useRouter } from "next/navigation";
|
||||
import { z } from "zod";
|
||||
import { emotionOptions, geminiVoices, openaiVoices, r2UrlAudio } from "@/lib/data";
|
||||
import { emotionOptions, geminiVoices, grokVoices, openaiVoices, r2UrlAudio } from "@/lib/data";
|
||||
import EmojiComponent from "./EmojiComponent";
|
||||
import { PitchFactors } from "@/lib/utils";
|
||||
import { Slider } from "@/components/ui/slider";
|
||||
|
|
@ -26,7 +26,7 @@ interface SettingsDashboardProps {
|
|||
}
|
||||
|
||||
const formSchema = z.object({
|
||||
provider: z.enum(["openai", "gemini"]),
|
||||
provider: z.enum(["openai", "gemini", "grok"]),
|
||||
title: z.string().min(2, "Minimum 2 characters").max(50, "Maximum 50 characters"),
|
||||
description: z.string().min(50, "Minimum 50 characters").max(200, "Maximum 200 characters"),
|
||||
prompt: z.string().min(100, "Minimum 100 characters").max(1000, "Maximum 1000 characters"),
|
||||
|
|
@ -66,6 +66,7 @@ const SettingsDashboard: React.FC<SettingsDashboardProps> = ({
|
|||
const [touchedFields, setTouchedFields] = useState<Record<string, boolean>>({});
|
||||
const [formErrors, setFormErrors] = useState<Partial<Record<keyof FormData | 'features', string>>>({});
|
||||
const [previewingVoice, setPreviewingVoice] = useState<string | null>(null);
|
||||
const [expandedProvider, setExpandedProvider] = useState<ModelProvider | null>("openai");
|
||||
|
||||
const handleBlur = (field: keyof FormData | 'features') => {
|
||||
// Mark the field as touched
|
||||
|
|
@ -249,6 +250,19 @@ const SettingsDashboard: React.FC<SettingsDashboardProps> = ({
|
|||
}
|
||||
}
|
||||
|
||||
const getProviderBadge = (provider: ModelProvider) => {
|
||||
if (provider === "openai") {
|
||||
return { label: "OpenAI", className: "bg-emerald-500 text-white" };
|
||||
}
|
||||
if (provider === "gemini") {
|
||||
return { label: "Gemini", className: "bg-purple-500 text-white" };
|
||||
}
|
||||
if (provider === "grok") {
|
||||
return { label: "Grok", className: "bg-slate-900 text-white" };
|
||||
}
|
||||
return { label: provider, className: "bg-gray-600 text-white" };
|
||||
};
|
||||
|
||||
const Heading = () => {
|
||||
return (
|
||||
<div className="flex flex-col gap-2">
|
||||
|
|
@ -273,60 +287,98 @@ const SettingsDashboard: React.FC<SettingsDashboardProps> = ({
|
|||
<div className="space-y-4">
|
||||
<Label htmlFor="voice">Pick a voice</Label>
|
||||
<p className="text-sm text-gray-500">
|
||||
Click a voice to preview how it sounds.
|
||||
Choose from a list of voices and model providers to create your AI character.
|
||||
</p>
|
||||
|
||||
<div className="overflow-x-auto px-2">
|
||||
<div className="flex gap-3 w-max py-2">
|
||||
{[...openaiVoices, ...geminiVoices].map((voice: VoiceType) => (
|
||||
<div
|
||||
key={voice.id}
|
||||
className={`relative rounded-xl border-2 p-4 transition-all cursor-pointer hover:scale-[1.02] hover:shadow-lg w-48 flex-shrink-0 ${formData.voice === voice.id
|
||||
? `border-blue-500 shadow-lg ${voice.color} ring-2 ring-blue-200`
|
||||
: `border-gray-200 hover:border-gray-300 ${voice.color} hover:shadow-md`
|
||||
}`}
|
||||
onClick={() => {
|
||||
setFormData(prev => ({
|
||||
<div className="grid grid-cols-3 gap-3">
|
||||
{([
|
||||
{ provider: "openai" as ModelProvider, label: "OpenAI" },
|
||||
{ provider: "gemini" as ModelProvider, label: "Gemini" },
|
||||
{ provider: "grok" as ModelProvider, label: "Grok" },
|
||||
]).map((p) => (
|
||||
<button
|
||||
key={p.provider}
|
||||
type="button"
|
||||
className={`text-left bg-white shadow-md rounded-xl border-2 p-4 transition-all hover:shadow-md ${expandedProvider === p.provider
|
||||
? "border-blue-500 ring-2 ring-blue-200"
|
||||
: "border-gray-200 hover:border-gray-300"
|
||||
}`}
|
||||
onClick={() => {
|
||||
setExpandedProvider(prev => prev === p.provider ? null : p.provider);
|
||||
setFormData(prev => {
|
||||
const switchingProvider = prev.provider !== p.provider;
|
||||
return {
|
||||
...prev,
|
||||
provider: voice.provider as ModelProvider,
|
||||
voice: voice.id
|
||||
}));
|
||||
previewVoice(voice);
|
||||
}}
|
||||
>
|
||||
<div className="flex flex-col">
|
||||
<div className="flex flex-col items-center gap-3">
|
||||
<div className="text-3xl">
|
||||
<EmojiComponent emoji={voice.emoji} />
|
||||
</div>
|
||||
<div className="flex flex-col text-center">
|
||||
<span className="font-semibold text-gray-900">{voice.name}</span>
|
||||
<span className="text-xs text-gray-600 mt-1">{voice.description}</span>
|
||||
<div className={`inline-flex items-center justify-center px-2 py-1 rounded-full text-xs font-medium mt-2 ${voice.provider === 'openai' ? 'bg-emerald-500 text-white' : 'bg-purple-500 text-white'
|
||||
}`}>
|
||||
{voice.provider === 'openai' ? 'OpenAI' : 'Gemini'}
|
||||
</div>
|
||||
</div>
|
||||
</div>
|
||||
{previewingVoice === voice.id && (
|
||||
<div className="absolute top-3 right-3">
|
||||
<div className="animate-pulse text-blue-600 bg-white rounded-full p-2 shadow-lg">
|
||||
<Volume2 size={16} />
|
||||
</div>
|
||||
</div>
|
||||
)}
|
||||
{formData.voice === voice.id && (
|
||||
<div className="absolute -top-2 -right-2">
|
||||
<div className="bg-blue-500 text-white rounded-full p-1.5 shadow-lg">
|
||||
<Check size={12} />
|
||||
</div>
|
||||
</div>
|
||||
)}
|
||||
provider: p.provider,
|
||||
voice: switchingProvider ? "" : prev.voice,
|
||||
};
|
||||
});
|
||||
}}
|
||||
>
|
||||
<div className="flex flex-col gap-1">
|
||||
<div className="flex flex-col sm:flex-row gap-2 items-center justify-between">
|
||||
<span className="font-semibold text-gray-900">{p.label}</span>
|
||||
<span className="text-xs text-gray-500">
|
||||
{p.provider === "openai" ? openaiVoices.length : p.provider === "gemini" ? geminiVoices.length : grokVoices.length} voices
|
||||
</span>
|
||||
</div>
|
||||
</div>
|
||||
))}
|
||||
</div>
|
||||
</button>
|
||||
))}
|
||||
</div>
|
||||
|
||||
{expandedProvider && (
|
||||
<div className="overflow-x-auto px-2">
|
||||
<div className="flex gap-3 w-max py-2">
|
||||
{(expandedProvider === "openai" ? openaiVoices : expandedProvider === "gemini" ? geminiVoices : grokVoices).map((voice: VoiceType) => (
|
||||
<div
|
||||
key={voice.id}
|
||||
className={`relative rounded-xl border-2 p-4 transition-all cursor-pointer hover:scale-[1.02] hover:shadow-lg w-48 flex-shrink-0 ${formData.voice === voice.id
|
||||
? `border-blue-500 shadow-lg ${voice.color} ring-2 ring-blue-200`
|
||||
: `border-gray-200 hover:border-gray-300 ${voice.color} hover:shadow-md`
|
||||
}`}
|
||||
onClick={() => {
|
||||
setFormData(prev => ({
|
||||
...prev,
|
||||
provider: voice.provider as ModelProvider,
|
||||
voice: voice.id
|
||||
}));
|
||||
previewVoice(voice);
|
||||
}}
|
||||
>
|
||||
<div className="flex flex-col">
|
||||
<div className="flex flex-col items-center gap-3">
|
||||
<div className="text-3xl">
|
||||
<EmojiComponent emoji={voice.emoji} />
|
||||
</div>
|
||||
<div className="flex flex-col text-center">
|
||||
<span className="font-semibold text-gray-900">{voice.name}</span>
|
||||
<span className="text-xs text-gray-600 mt-1">{voice.description}</span>
|
||||
<div className={`inline-flex items-center justify-center px-2 py-1 rounded-full text-xs font-medium mt-2 ${getProviderBadge(voice.provider as ModelProvider).className}`}>
|
||||
{getProviderBadge(voice.provider as ModelProvider).label}
|
||||
</div>
|
||||
</div>
|
||||
</div>
|
||||
{previewingVoice === voice.id && (
|
||||
<div className="absolute top-3 right-3">
|
||||
<div className="animate-pulse text-blue-600 bg-white rounded-full p-2 shadow-lg">
|
||||
<Volume2 size={16} />
|
||||
</div>
|
||||
</div>
|
||||
)}
|
||||
{formData.voice === voice.id && (
|
||||
<div className="absolute -top-2 -right-2">
|
||||
<div className="bg-blue-500 text-white rounded-full p-1.5 shadow-lg">
|
||||
<Check size={12} />
|
||||
</div>
|
||||
</div>
|
||||
)}
|
||||
</div>
|
||||
</div>
|
||||
))}
|
||||
</div>
|
||||
</div>
|
||||
)}
|
||||
</div>
|
||||
|
||||
{/* ElevenLabs Alternative */}
|
||||
|
|
|
|||
|
|
@ -54,7 +54,7 @@ export default function ProductsSection() {
|
|||
<div className="container px-4 md:px-6 max-w-screen-sm mx-auto">
|
||||
<div className="text-center mb-10">
|
||||
<h2 className="text-3xl md:text-4xl font-bold mb-6 text-gray-800">
|
||||
Our Product
|
||||
Our Products
|
||||
</h2>
|
||||
<p className="text-lg text-gray-600 mt-2">
|
||||
Everything you need to bring conversational AI to your world
|
||||
|
|
|
|||
|
|
@ -4,15 +4,16 @@ import React from "react";
|
|||
|
||||
interface YoutubeDemoProps {
|
||||
caption: string;
|
||||
youtubeId: string;
|
||||
}
|
||||
|
||||
|
||||
export default function YoutubeDemo({ caption }: YoutubeDemoProps) {
|
||||
export default function YoutubeDemo({ caption, youtubeId }: YoutubeDemoProps) {
|
||||
return <div className="w-full max-w-3xl mx-auto">
|
||||
<div className="relative" style={{ paddingBottom: '56.25%' }}>
|
||||
<iframe
|
||||
className="absolute top-0 left-0 w-full h-full rounded-xl shadow-lg"
|
||||
src="https://www.youtube.com/embed/o1eIAwVll5I"
|
||||
src={`https://www.youtube.com/embed/${youtubeId}`}
|
||||
title="Elato Demo"
|
||||
allow="accelerometer; autoplay; clipboard-write; encrypted-media; gyroscope; picture-in-picture"
|
||||
allowFullScreen
|
||||
|
|
|
|||
|
|
@ -124,7 +124,10 @@ export default async function LandingPage() {
|
|||
</div>
|
||||
</div>
|
||||
</section>
|
||||
<YoutubeDemo caption="Elato AI ESP32-S3 Demo" />
|
||||
<YoutubeDemo caption="Elato AI ESP32-S3 Demo" youtubeId="o1eIAwVll5I" />
|
||||
<br/><br />
|
||||
<YoutubeDemo caption="How to run ElatoAI on your own device" youtubeId="bXrNRpGOJWw" />
|
||||
|
||||
{/* Products Section */}
|
||||
<ProductsSection />
|
||||
|
||||
|
|
|
|||
|
|
@ -106,6 +106,44 @@ export const openaiVoices: VoiceType[] = [
|
|||
},
|
||||
];
|
||||
|
||||
export const grokVoices: VoiceType[] = [
|
||||
{
|
||||
id: "Ara",
|
||||
name: "Ara",
|
||||
description: "Bright",
|
||||
color: "bg-yellow-100",
|
||||
provider: "grok",
|
||||
},
|
||||
{
|
||||
id: "Eve",
|
||||
name: "Eve",
|
||||
description: "Upbeat",
|
||||
color: "bg-orange-100",
|
||||
provider: "grok",
|
||||
},
|
||||
{
|
||||
id: "Leo",
|
||||
name: "Leo",
|
||||
description: "Confident",
|
||||
color: "bg-blue-100",
|
||||
provider: "grok",
|
||||
},
|
||||
{
|
||||
id: "Rex",
|
||||
name: "Rex",
|
||||
description: "Direct",
|
||||
color: "bg-gray-100",
|
||||
provider: "grok",
|
||||
},
|
||||
{
|
||||
id: "Sal",
|
||||
name: "Sal",
|
||||
description: "Warm",
|
||||
color: "bg-green-100",
|
||||
provider: "grok",
|
||||
}
|
||||
];
|
||||
|
||||
export const geminiVoices: VoiceType[] = [
|
||||
{
|
||||
id: "Zephyr",
|
||||
|
|
|
|||
21
frontend-nextjs/types/types.d.ts
vendored
21
frontend-nextjs/types/types.d.ts
vendored
|
|
@ -114,9 +114,24 @@ declare global {
|
|||
description: string;
|
||||
color: string;
|
||||
emoji?: string;
|
||||
}
|
||||
| {
|
||||
provider: "grok";
|
||||
id: GrokVoice;
|
||||
name: string;
|
||||
description: string;
|
||||
color: string;
|
||||
emoji?: string;
|
||||
};
|
||||
|
||||
type ModelProvider = "openai" | "gemini" | "elevenlabs" | "hume";
|
||||
type ModelProvider = "openai" | "gemini" | "grok" | "elevenlabs" | "hume";
|
||||
|
||||
type GrokVoice =
|
||||
| "Ara"
|
||||
| "Eve"
|
||||
| "Leo"
|
||||
| "Rex"
|
||||
| "Sal";
|
||||
|
||||
type GeminiVoice =
|
||||
| "Zephyr"
|
||||
|
|
@ -162,8 +177,8 @@ declare global {
|
|||
|
||||
// characters <-> personalities table
|
||||
/**
|
||||
* oai_voice is for the name of any voice. both gemini and openai use this.
|
||||
* forgot to refactor this, update it for your setup
|
||||
* oai_voice holds the voice name for any provider; Grok, Gemini and OpenAI all use it.
|
||||
* I forgot to refactor this name, so please consider updating it for your setup :)
|
||||
*/
|
||||
interface IPersonality {
|
||||
personality_id?: string;
|
||||
|
|
|
|||
|
|
@ -9,6 +9,7 @@ ENCRYPTION_KEY=<ENCRYPTION_KEY>
|
|||
# Model provider API Keys
|
||||
OPENAI_API_KEY=<OPENAI_API_KEY>
|
||||
GEMINI_API_KEY=<GEMINI_API_KEY>
|
||||
XAI_API_KEY=<XAI_API_KEY>
|
||||
ELEVENLABS_API_KEY=<ELEVENLABS_API_KEY>
|
||||
HUME_API_KEY=<HUME_API_KEY>
|
||||
|
||||
|
|
|
|||
|
|
@ -17,6 +17,7 @@ import { connectToOpenAI } from "./models/openai.ts";
|
|||
import { connectToGemini } from "./models/gemini.ts";
|
||||
import { connectToElevenLabs } from "./models/elevenlabs.ts";
|
||||
import { connectToHume } from "./models/hume.ts";
|
||||
import { connectToGrok } from "./models/grok.ts";
|
||||
|
||||
const server = createServer();
|
||||
|
||||
|
|
@ -58,7 +59,7 @@ wss.on("connection", async (ws: WSWebSocket, payload: IPayload) => {
|
|||
ws.send(
|
||||
JSON.stringify({
|
||||
type: "auth",
|
||||
volume_control: user.device?.volume ?? 20,
|
||||
volume_control: user.device?.volume ?? 100,
|
||||
is_ota: user.device?.is_ota ?? false,
|
||||
is_reset: user.device?.is_reset ?? false,
|
||||
pitch_factor: user.personality?.pitch_factor ?? 1,
|
||||
|
|
@ -84,6 +85,15 @@ wss.on("connection", async (ws: WSWebSocket, payload: IPayload) => {
|
|||
systemPrompt,
|
||||
);
|
||||
break;
|
||||
case "grok":
|
||||
await connectToGrok(
|
||||
ws,
|
||||
payload,
|
||||
connectionPcmFile,
|
||||
firstMessage,
|
||||
systemPrompt,
|
||||
);
|
||||
break;
|
||||
case "elevenlabs":
|
||||
const agentId = user.personality?.oai_voice ?? "";
|
||||
|
||||
|
|
@ -145,7 +155,7 @@ server.on("upgrade", async (req, socket, head) => {
|
|||
});
|
||||
});
|
||||
|
||||
if (isDev) { // deno run -A --env-file=.env main.ts
|
||||
if (isDev) { // RUN WITH: deno run -A --env-file=.env main.ts
|
||||
const HOST = Deno.env.get("HOST") || "0.0.0.0";
|
||||
const PORT = Deno.env.get("PORT") || "8000";
|
||||
server.listen(Number(PORT), HOST, () => {
|
||||
|
|
|
|||
240
server-deno/models/grok.ts
Normal file
240
server-deno/models/grok.ts
Normal file
|
|
@ -0,0 +1,240 @@
|
|||
|
||||
import { Buffer } from 'node:buffer';
|
||||
import type { RawData } from 'npm:@types/ws';
|
||||
import { WebSocket } from 'npm:ws';
|
||||
import { addConversation, getDeviceInfo } from '../supabase.ts';
|
||||
import { encoder, FRAME_SIZE, isDev, xaiApiKey } from '../utils.ts';
|
||||
|
||||
const XAI_REALTIME_URL = 'wss://api.x.ai/v1/realtime';
|
||||
const DEFAULT_GROK_VOICE = 'Ara';
|
||||
|
||||
export const connectToGrok = async (
|
||||
ws: WebSocket,
|
||||
payload: IPayload,
|
||||
connectionPcmFile: Deno.FsFile | null,
|
||||
firstMessage: string,
|
||||
systemPrompt: string,
|
||||
) => {
|
||||
const { user, supabase } = payload;
|
||||
|
||||
if (!xaiApiKey) {
|
||||
throw new Error('XAI_API_KEY is not set');
|
||||
}
|
||||
|
||||
const voice = user.personality?.oai_voice ?? DEFAULT_GROK_VOICE;
|
||||
|
||||
const grokWs = new WebSocket(XAI_REALTIME_URL, {
|
||||
headers: {
|
||||
Authorization: `Bearer ${xaiApiKey}`,
|
||||
'Content-Type': 'application/json',
|
||||
},
|
||||
});
|
||||
|
||||
let isConnected = false;
|
||||
const messageQueue: RawData[] = [];
|
||||
|
||||
let createdSent = false;
|
||||
let outputTranscript = '';
|
||||
let audioRemainder = Buffer.alloc(0);
|
||||
|
||||
const sendResponseCreated = async () => {
|
||||
try {
|
||||
const device = await getDeviceInfo(supabase, user.user_id);
|
||||
ws.send(
|
||||
JSON.stringify({
|
||||
type: 'server',
|
||||
msg: 'RESPONSE.CREATED',
|
||||
volume_control: device?.volume ?? 100,
|
||||
}),
|
||||
);
|
||||
} catch {
|
||||
ws.send(JSON.stringify({ type: 'server', msg: 'RESPONSE.CREATED' }));
|
||||
}
|
||||
};
|
||||
|
||||
const sendFirstMessage = () => {
|
||||
if (!firstMessage) return;
|
||||
grokWs.send(
|
||||
JSON.stringify({
|
||||
type: 'conversation.item.create',
|
||||
item: {
|
||||
type: 'message',
|
||||
role: 'user',
|
||||
content: [{ type: 'input_text', text: firstMessage }],
|
||||
},
|
||||
}),
|
||||
);
|
||||
grokWs.send(JSON.stringify({ type: 'response.create' }));
|
||||
};
|
||||
|
||||
grokWs.on('open', () => {
|
||||
isConnected = true;
|
||||
|
||||
grokWs.send(
|
||||
JSON.stringify({
|
||||
type: 'session.update',
|
||||
session: {
|
||||
voice,
|
||||
instructions: systemPrompt,
|
||||
turn_detection: { type: "server_vad" },
|
||||
audio: {
|
||||
input: { format: { type: 'audio/pcm', rate: 16000 } },
|
||||
output: { format: { type: 'audio/pcm', rate: 24000 } },
|
||||
},
|
||||
},
|
||||
}),
|
||||
);
|
||||
|
||||
sendFirstMessage();
|
||||
|
||||
while (messageQueue.length > 0) {
|
||||
const queuedMessage = messageQueue.shift();
|
||||
if (queuedMessage) {
|
||||
messageHandler(queuedMessage, true);
|
||||
}
|
||||
}
|
||||
});
|
||||
|
||||
grokWs.on('message', async (data: Buffer) => {
|
||||
let event: any;
|
||||
try {
|
||||
event = JSON.parse(data.toString('utf-8'));
|
||||
} catch {
|
||||
return;
|
||||
}
|
||||
|
||||
try {
|
||||
switch (event.type) {
|
||||
case 'response.created':
|
||||
if (!createdSent) {
|
||||
await sendResponseCreated();
|
||||
createdSent = true;
|
||||
}
|
||||
break;
|
||||
|
||||
case 'response.output_audio_transcript.delta':
|
||||
if (typeof event.delta === 'string') {
|
||||
outputTranscript += event.delta;
|
||||
}
|
||||
break;
|
||||
|
||||
case 'response.output_audio.delta':
|
||||
if (typeof event.delta === 'string') {
|
||||
const pcmChunk = Buffer.from(event.delta, 'base64');
|
||||
audioRemainder = Buffer.concat([audioRemainder, pcmChunk]);
|
||||
|
||||
while (audioRemainder.length >= FRAME_SIZE) {
|
||||
const frame = audioRemainder.subarray(0, FRAME_SIZE);
|
||||
audioRemainder = audioRemainder.subarray(FRAME_SIZE);
|
||||
try {
|
||||
const packet = encoder.encode(frame);
|
||||
ws.send(packet);
|
||||
} catch {
|
||||
// Skip frame
|
||||
}
|
||||
}
|
||||
}
|
||||
break;
|
||||
|
||||
case 'conversation.item.input_audio_transcription.completed':
|
||||
if (typeof event.transcript === 'string' && event.transcript.length > 0) {
|
||||
await addConversation(supabase, 'user', event.transcript, user);
|
||||
}
|
||||
break;
|
||||
|
||||
case 'input_audio_buffer.committed':
|
||||
ws.send(JSON.stringify({ type: 'server', msg: 'AUDIO.COMMITTED' }));
|
||||
break;
|
||||
|
||||
case 'response.done':
|
||||
if (outputTranscript) {
|
||||
await addConversation(supabase, 'assistant', outputTranscript, user);
|
||||
outputTranscript = '';
|
||||
}
|
||||
ws.send(JSON.stringify({ type: 'server', msg: 'RESPONSE.COMPLETE' }));
|
||||
createdSent = false;
|
||||
break;
|
||||
|
||||
case 'error':
|
||||
ws.send(JSON.stringify({ type: 'server', msg: 'RESPONSE.ERROR' }));
|
||||
createdSent = false;
|
||||
break;
|
||||
}
|
||||
} catch (err) {
|
||||
console.error('Error processing Grok event:', err);
|
||||
ws.send(JSON.stringify({ type: 'server', msg: 'RESPONSE.ERROR' }));
|
||||
createdSent = false;
|
||||
}
|
||||
});
|
||||
|
||||
grokWs.on('close', () => {
|
||||
ws.close();
|
||||
});
|
||||
|
||||
grokWs.on('error', (error: any) => {
|
||||
console.error('Grok WebSocket error:', error);
|
||||
ws.send(JSON.stringify({ type: 'server', msg: 'RESPONSE.ERROR' }));
|
||||
});
|
||||
|
||||
const messageHandler = async (data: RawData, isBinary: boolean) => {
|
||||
if (isBinary) {
|
||||
const base64Data = (data as Buffer).toString('base64');
|
||||
grokWs.send(JSON.stringify({ type: 'input_audio_buffer.append', audio: base64Data }));
|
||||
|
||||
if (isDev && connectionPcmFile) {
|
||||
await connectionPcmFile.write(data as Buffer);
|
||||
}
|
||||
return;
|
||||
}
|
||||
|
||||
let message: any;
|
||||
try {
|
||||
message = JSON.parse((data as Buffer).toString('utf-8'));
|
||||
} catch {
|
||||
return;
|
||||
}
|
||||
|
||||
if (message?.type !== 'instruction') return;
|
||||
|
||||
if (message.msg === 'end_of_speech') {
|
||||
grokWs.send(JSON.stringify({ type: 'input_audio_buffer.commit' }));
|
||||
grokWs.send(JSON.stringify({ type: 'response.create' }));
|
||||
grokWs.send(JSON.stringify({ type: 'input_audio_buffer.clear' }));
|
||||
} else if (message.msg === 'INTERRUPT') {
|
||||
grokWs.send(JSON.stringify({ type: 'input_audio_buffer.clear' }));
|
||||
}
|
||||
};
|
||||
|
||||
ws.on('message', (data: RawData, isBinary: boolean) => {
|
||||
if (!isConnected) {
|
||||
messageQueue.push(data);
|
||||
} else {
|
||||
messageHandler(data, isBinary);
|
||||
}
|
||||
});
|
||||
|
||||
ws.on('error', (error: any) => {
|
||||
console.error('ESP32 WebSocket error:', error);
|
||||
grokWs.close();
|
||||
});
|
||||
|
||||
ws.on('close', async (code: number, reason: string) => {
|
||||
console.log(`ESP32 WebSocket closed with code ${code}, reason: ${reason}`);
|
||||
grokWs.close();
|
||||
if (isDev && connectionPcmFile) {
|
||||
connectionPcmFile.close();
|
||||
}
|
||||
});
|
||||
|
||||
return new Promise<void>((resolve, reject) => {
|
||||
const timeout = setTimeout(() => reject(new Error('Grok connection timeout')), 10000);
|
||||
grokWs.on('open', () => {
|
||||
clearTimeout(timeout);
|
||||
resolve();
|
||||
});
|
||||
grokWs.on('error', (error: any) => {
|
||||
clearTimeout(timeout);
|
||||
reject(error);
|
||||
});
|
||||
});
|
||||
};
|
||||
9
server-deno/types.d.ts
vendored
9
server-deno/types.d.ts
vendored
|
|
@ -26,7 +26,14 @@ declare global {
|
|||
user_code: string;
|
||||
}
|
||||
|
||||
type ModelProvider = "openai" | "gemini" | "elevenlabs" | "hume";
|
||||
type ModelProvider = "openai" | "gemini" | "elevenlabs" | "hume" | "grok";
|
||||
|
||||
type GrokVoice =
|
||||
| "Ara"
|
||||
| "Eve"
|
||||
| "Leo"
|
||||
| "Rex"
|
||||
| "Sal"
|
||||
|
||||
type GeminiVoice =
|
||||
| "Zephyr"
|
||||
|
|
|
|||
|
|
@ -28,6 +28,7 @@ export const openaiApiKey = Deno.env.get("OPENAI_API_KEY");
|
|||
export const geminiApiKey = Deno.env.get("GEMINI_API_KEY");
|
||||
export const elevenLabsApiKey = Deno.env.get("ELEVENLABS_API_KEY");
|
||||
export const humeApiKey = Deno.env.get('HUME_API_KEY');
|
||||
export const xaiApiKey = Deno.env.get('XAI_API_KEY');
|
||||
|
||||
export { encoder, FRAME_SIZE };
|
||||
|
||||
|
|
|
|||
|
|
@ -1,6 +1,6 @@
|
|||
-- Add provider column to personalities table
|
||||
ALTER TABLE personalities
|
||||
ADD COLUMN provider TEXT CHECK (provider IN ('openai', 'gemini', 'elevenlabs', 'hume')) DEFAULT 'openai';
|
||||
ADD COLUMN provider TEXT CHECK (provider IN ('openai', 'gemini', 'grok', 'elevenlabs', 'hume')) DEFAULT 'openai';
|
||||
|
||||
-- Update existing records to have a default provider
|
||||
UPDATE personalities
|
||||
|
|
|
|||
Loading…
Reference in a new issue