AIvoices/server-deno/models/gemini.ts
2025-06-11 14:57:14 +01:00

302 lines
7.2 KiB
TypeScript

import { Buffer } from "node:buffer";
import type { WebSocketServer as _WebSocketServer } from "npm:@types/ws";
import {
EndSensitivity,
GoogleGenAI,
LiveConnectConfig,
LiveServerMessage,
Modality,
Session,
} from "npm:@google/genai";
import { encoder, FRAME_SIZE, geminiApiKey, isDev } from "../utils.ts";
import { addConversation } from "../supabase.ts";
/**
 * Bridges one device WebSocket to a Gemini Live audio session.
 *
 * Sends the device its `auth` settings, opens a Gemini Live session configured
 * with `systemPrompt` and the user's voice, sends `firstMessage` as the opening
 * user turn, and then relays audio both ways: binary frames received on `ws`
 * are forwarded to Gemini, and the audio of each Gemini response is re-framed
 * into `FRAME_SIZE` slices, passed through `encoder` and sent back on `ws`.
 * The input/output transcriptions of every turn are stored with
 * `addConversation`.
 *
 * @param ws - Device socket. NOTE(review): it is typed as the DOM `WebSocket`
 *   but is driven through the event-emitter style `on(...)` API — confirm the
 *   intended type with the caller.
 * @param payload - Supplies `user` (personality and device settings) and the
 *   `supabase` client handed to `addConversation`.
 * @param connectionPcmFile - Optional debug file; when `isDev` is set the raw
 *   incoming audio is written to it and it is closed when `ws` closes.
 * @param firstMessage - Text sent to Gemini as the first user turn.
 * @param systemPrompt - System instruction for the Gemini session.
 * @returns A promise that resolves once the session is connected and the
 *   socket handlers are installed, or after `ws` has been closed because the
 *   connection attempt failed.
 */
export const connectToGemini = async (
  ws: WebSocket,
  payload: IPayload,
  connectionPcmFile: Deno.FsFile | null,
  firstMessage: string,
  systemPrompt: string,
) => {
  const { user, supabase } = payload;
  // Defaults used when the user has no personality / device record.
  const { oai_voice, pitch_factor } = user.personality ?? {
    oai_voice: "Sadachbia",
    provider: "gemini",
    pitch_factor: 1,
  };
  const { is_ota, is_reset, volume } = user.device ?? {
    is_ota: false,
    is_reset: false,
    volume: 10,
  };

  /** Sends a `{ type: "server", msg }` control event to the device. */
  function sendServerEvent(msg: string) {
    ws.send(JSON.stringify({ type: "server", msg }));
  }

  // Send user details to client
  ws.send(
    JSON.stringify({
      type: "auth",
      volume_control: volume,
      is_ota: is_ota,
      is_reset: is_reset,
      pitch_factor: pitch_factor,
    }),
  );

  console.log(`Connecting with Gemini key "${geminiApiKey.slice(0, 3)}..."`);

  // Initialize Google GenAI
  const ai = new GoogleGenAI({ apiKey: geminiApiKey });
  const model = "gemini-2.5-flash-preview-native-audio-dialog";
  const config: LiveConnectConfig = {
    responseModalities: [Modality.AUDIO],
    systemInstruction: systemPrompt,
    speechConfig: {
      voiceConfig: {
        prebuiltVoiceConfig: {
          voiceName: oai_voice,
        },
      },
    },
    realtimeInputConfig: {
      automaticActivityDetection: {
        disabled: false, // Keep VAD enabled
        endOfSpeechSensitivity: EndSensitivity.END_SENSITIVITY_HIGH, // How sensitive to detect speech ending
        silenceDurationMs: 100, // How much silence before considering speech ended
      },
    },
    // Empty objects switch transcription on with default settings.
    outputAudioTranscription: {},
    inputAudioTranscription: {},
  };

  // Gemini delivers messages through callbacks; they are queued here and
  // consumed by a single turn-processing loop.
  const responseQueue: LiveServerMessage[] = [];
  let geminiSession: Session | null = null;
  // Set once either side has gone away, so the turn loop can terminate
  // instead of waiting for messages that will never arrive.
  let sessionEnded = false;
  // Resolver of the promise the turn loop is currently parked on, if any.
  let wakeConsumer: (() => void) | null = null;

  /** Wakes the turn loop if it is waiting for a message. */
  function notifyConsumer() {
    const wake = wakeConsumer;
    wakeConsumer = null;
    wake?.();
  }

  /** Marks the session as finished and releases the waiting turn loop. */
  function stopProcessing() {
    sessionEnded = true;
    notifyConsumer();
  }

  /** Closes the dev-mode debug audio file, when there is one. */
  function closeDebugFile() {
    if (isDev && connectionPcmFile) {
      connectionPcmFile.close();
      console.log("Closed debug audio file.");
    }
  }

  /**
   * Resolves with the next queued Gemini message, or with `undefined` once the
   * session has ended and the queue has been drained.
   */
  async function waitMessage(): Promise<LiveServerMessage | undefined> {
    for (;;) {
      const message = responseQueue.shift();
      if (message) {
        return message;
      }
      if (sessionEnded) {
        return undefined;
      }
      // Park until onmessage or stopProcessing() wakes us. The queue is
      // re-checked before parking, so no wake-up can be missed.
      await new Promise<void>((resolve) => {
        wakeConsumer = resolve;
      });
    }
  }

  /**
   * Collects messages until one reports `generationComplete`, notifying the
   * device as the completion flags are seen.
   *
   * NOTE(review): the SDK documents that an interrupted turn carries no
   * `generationComplete`; such a turn's messages therefore stay buffered and
   * are merged into the next completed turn — confirm this is intended.
   *
   * @returns The messages of the turn, or `null` when the session ended
   *   before the turn completed.
   */
  async function handleTurn(): Promise<LiveServerMessage[] | null> {
    const turns: LiveServerMessage[] = [];
    for (;;) {
      const message = await waitMessage();
      if (!message) {
        return null;
      }
      turns.push(message);

      const content = message.serverContent;
      if (!content) {
        continue;
      }
      const generationDone = Boolean(content.generationComplete);
      if (generationDone) {
        sendServerEvent("RESPONSE.CREATED");
      }
      if (content.turnComplete) {
        sendServerEvent("AUDIO.COMMITTED");
      }
      if (generationDone) {
        return turns;
      }
    }
  }

  /**
   * Concatenates the base64 audio payloads of a turn into one buffer.
   * Each payload is cut down to whole 16-bit samples before joining.
   */
  function collectAudio(turns: LiveServerMessage[]): Buffer {
    const chunks: Buffer[] = [];
    for (const turn of turns) {
      const encoded = turn.data;
      if (!encoded) {
        continue;
      }
      const chunk = Buffer.from(encoded, "base64");
      const usable = chunk.length -
        (chunk.length % Int16Array.BYTES_PER_ELEMENT);
      if (usable > 0) {
        chunks.push(chunk.subarray(0, usable));
      }
    }
    return Buffer.concat(chunks);
  }

  /**
   * Sends `audio` to the device in `FRAME_SIZE`-byte slices, each passed
   * through `encoder`. A slice that fails to encode or send is skipped so the
   * remaining slices still go out.
   */
  function sendAudioFrames(audio: Buffer) {
    let dropped = 0;
    for (let offset = 0; offset < audio.length; offset += FRAME_SIZE) {
      const frame = audio.subarray(offset, offset + FRAME_SIZE);
      try {
        ws.send(encoder.encode(frame));
      } catch (_e) {
        // Skip this frame but continue with others
        dropped++;
      }
    }
    if (isDev && dropped > 0) {
      console.warn(`Skipped ${dropped} audio frame(s) that failed to encode or send`);
    }
  }

  /**
   * Turn loop: for every completed Gemini turn, forwards the audio to the
   * device, signals completion and stores both transcriptions. Runs until the
   * session ends.
   */
  async function processGeminiTurns() {
    try {
      console.log("Processing Gemini turns");
      for (;;) {
        const turns = await handleTurn();
        if (!turns) {
          break;
        }

        // Combine all audio data from this turn and send it to the client
        const audio = collectAudio(turns);
        if (audio.length > 0) {
          sendAudioFrames(audio);
        }

        // Gather the transcriptions; `text` is optional on each fragment, so
        // missing pieces must not be appended as the string "undefined".
        let outputTranscriptionText = "";
        let inputTranscriptionText = "";
        for (const turn of turns) {
          outputTranscriptionText +=
            turn.serverContent?.outputTranscription?.text ?? "";
          inputTranscriptionText +=
            turn.serverContent?.inputTranscription?.text ?? "";
        }

        // Send completion signal
        sendServerEvent("RESPONSE.COMPLETE");

        // A storage failure must not end audio processing for the rest of the
        // session, so it is contained to this turn.
        try {
          // Add user transcription to supabase
          await addConversation(
            supabase,
            "user",
            inputTranscriptionText,
            user,
          );
          // Add assistant transcription to supabase
          await addConversation(
            supabase,
            "assistant",
            outputTranscriptionText,
            user,
          );
        } catch (error) {
          console.error("Error saving conversation:", error);
        }
      }
    } catch (error) {
      console.error("Error processing Gemini turns:", error);
    }
  }

  // Connect to Google Gemini Live
  try {
    const session = await ai.live.connect({
      model: model,
      callbacks: {
        onopen: function () {
          console.log("Gemini session opened");
        },
        onmessage: function (message: LiveServerMessage) {
          responseQueue.push(message);
          notifyConsumer();
        },
        onerror: function (e: { message?: string }) {
          console.error("Gemini error:", e.message);
          sendServerEvent("RESPONSE.ERROR");
        },
        onclose: function (e: { reason?: string }) {
          console.log("Gemini session closed:", e.reason);
          // No further messages will arrive; let the turn loop finish.
          stopProcessing();
        },
      },
      config: config,
    });
    geminiSession = session;
    console.log("Connected to Gemini successfully!");

    // The socket handlers are installed only after connecting, so a device
    // that disconnected in the meantime would never trigger the cleanup.
    if (
      ws.readyState === WebSocket.CLOSING ||
      ws.readyState === WebSocket.CLOSED
    ) {
      stopProcessing();
      session.close();
      closeDebugFile();
      return;
    }

    // Send the first message as the opening user turn
    const inputTurns = [{
      role: "user",
      parts: [{ text: firstMessage }],
    }];
    session.sendClientContent({ turns: inputTurns });

    // Deliberately not awaited: the loop runs for the life of the session and
    // handles its own errors.
    void processGeminiTurns();
  } catch (e: unknown) {
    console.log(`Error connecting to Gemini: ${e}`);
    // Release a session that was opened before the failure occurred.
    stopProcessing();
    geminiSession?.close();
    ws.close();
    return;
  }

  ws.on("message", (data: Buffer, isBinary: boolean) => {
    try {
      if (isBinary) {
        // Handle binary audio data from ESP32
        const base64Data = data.toString("base64");
        if (isDev && connectionPcmFile) {
          // Fire-and-forget debug write; a failure is logged instead of
          // surfacing as an unhandled rejection.
          connectionPcmFile.write(data).catch((err: unknown) => {
            console.error("Error writing debug audio:", err);
          });
        }
        // Send audio to Gemini
        geminiSession?.sendRealtimeInput({
          audio: {
            data: base64Data,
            mimeType: "audio/pcm;rate=24000", // Gemini expects 16kHz but 24kHz is fine
          },
        });
      }
    } catch (e: unknown) {
      console.error(
        "Error handling message:",
        e instanceof Error ? e.message : e,
      );
    }
  });

  ws.on("error", (error: unknown) => {
    console.error("WebSocket error:", error);
    stopProcessing();
    geminiSession?.close();
  });

  ws.on("close", (code: number, reason: string) => {
    console.log(`WebSocket closed with code ${code}, reason: ${reason}`);
    stopProcessing();
    geminiSession?.close();
    closeDebugFile();
  });
};