676 lines
22 KiB
JavaScript
676 lines
22 KiB
JavaScript
import { RealtimeEventHandler } from './event_handler.js';
|
|
import { RealtimeAPI } from './api.js';
|
|
import { RealtimeConversation } from './conversation.js';
|
|
import { RealtimeUtils } from './utils.js';
|
|
|
|
/**
 * Valid audio formats
 * @typedef {"pcm16"|"g711_ulaw"|"g711_alaw"} AudioFormatType
 */
|
|
|
|
/**
 * @typedef {Object} AudioTranscriptionType
 * @property {"whisper-1"} model
 */
|
|
|
|
/**
 * @typedef {Object} TurnDetectionServerVadType
 * @property {"server_vad"} type
 * @property {number} [threshold]
 * @property {number} [prefix_padding_ms]
 * @property {number} [silence_duration_ms]
 */
|
|
|
|
/**
 * Tool definitions
 * @typedef {Object} ToolDefinitionType
 * @property {"function"} [type]
 * @property {string} name
 * @property {string} description
 * @property {{[key: string]: any}} parameters
 */
|
|
|
|
/**
 * @typedef {Object} SessionResourceType
 * @property {string} [model]
 * @property {string[]} [modalities]
 * @property {string} [instructions]
 * @property {"alloy"|"ash"|"ballad"|"coral"|"echo"|"sage"|"shimmer"|"verse"} [voice]
 * @property {AudioFormatType} [input_audio_format]
 * @property {AudioFormatType} [output_audio_format]
 * @property {AudioTranscriptionType|null} [input_audio_transcription]
 * @property {TurnDetectionServerVadType|null} [turn_detection]
 * @property {ToolDefinitionType[]} [tools]
 * @property {"auto"|"none"|"required"|{type:"function",name:string}} [tool_choice]
 * @property {number} [temperature]
 * @property {number|"inf"} [max_response_output_tokens]
 */
|
|
|
|
/**
 * @typedef {"in_progress"|"completed"|"incomplete"} ItemStatusType
 */
|
|
|
|
/**
 * @typedef {Object} InputTextContentType
 * @property {"input_text"} type
 * @property {string} text
 */
|
|
|
|
/**
 * @typedef {Object} InputAudioContentType
 * @property {"input_audio"} type
 * @property {string} [audio] base64-encoded audio data
 * @property {string|null} [transcript]
 */
|
|
|
|
/**
 * @typedef {Object} TextContentType
 * @property {"text"} type
 * @property {string} text
 */
|
|
|
|
/**
 * @typedef {Object} AudioContentType
 * @property {"audio"} type
 * @property {string} [audio] base64-encoded audio data
 * @property {string|null} [transcript]
 */
|
|
|
|
/**
 * @typedef {Object} SystemItemType
 * @property {string|null} [previous_item_id]
 * @property {"message"} type
 * @property {ItemStatusType} status
 * @property {"system"} role
 * @property {Array<InputTextContentType>} content
 */
|
|
|
|
/**
 * @typedef {Object} UserItemType
 * @property {string|null} [previous_item_id]
 * @property {"message"} type
 * @property {ItemStatusType} status
 * @property {"user"} role
 * @property {Array<InputTextContentType|InputAudioContentType>} content
 */
|
|
|
|
/**
 * @typedef {Object} AssistantItemType
 * @property {string|null} [previous_item_id]
 * @property {"message"} type
 * @property {ItemStatusType} status
 * @property {"assistant"} role
 * @property {Array<TextContentType|AudioContentType>} content
 */
|
|
|
|
/**
 * @typedef {Object} FunctionCallItemType
 * @property {string|null} [previous_item_id]
 * @property {"function_call"} type
 * @property {ItemStatusType} status
 * @property {string} call_id
 * @property {string} name
 * @property {string} arguments
 */
|
|
|
|
/**
 * @typedef {Object} FunctionCallOutputItemType
 * @property {string|null} [previous_item_id]
 * @property {"function_call_output"} type
 * @property {string} call_id
 * @property {string} output
 */
|
|
|
|
/**
 * @typedef {Object} FormattedToolType
 * @property {"function"} type
 * @property {string} name
 * @property {string} call_id
 * @property {string} arguments
 */
|
|
|
|
/**
 * @typedef {Object} FormattedPropertyType
 * @property {Int16Array} [audio]
 * @property {string} [text]
 * @property {string} [transcript]
 * @property {FormattedToolType} [tool]
 * @property {string} [output]
 * @property {any} [file]
 */
|
|
|
|
/**
 * @typedef {Object} FormattedItemType
 * @property {string} id
 * @property {string} object
 * @property {"user"|"assistant"|"system"} [role]
 * @property {FormattedPropertyType} formatted
 */
|
|
|
|
/**
 * @typedef {SystemItemType|UserItemType|AssistantItemType|FunctionCallItemType|FunctionCallOutputItemType} BaseItemType
 */
|
|
|
|
/**
 * @typedef {FormattedItemType & BaseItemType} ItemType
 */
|
|
|
|
/**
 * @typedef {Object} IncompleteResponseStatusType
 * @property {"incomplete"} type
 * @property {"interruption"|"max_output_tokens"|"content_filter"} reason
 */
|
|
|
|
/**
 * @typedef {Object} FailedResponseStatusType
 * @property {"failed"} type
 * @property {{code: string, message: string}|null} error
 */
|
|
|
|
/**
 * @typedef {Object} UsageType
 * @property {number} total_tokens
 * @property {number} input_tokens
 * @property {number} output_tokens
 */
|
|
|
|
/**
 * @typedef {Object} ResponseResourceType
 * @property {"in_progress"|"completed"|"incomplete"|"cancelled"|"failed"} status
 * @property {IncompleteResponseStatusType|FailedResponseStatusType|null} status_details
 * @property {ItemType[]} output
 * @property {UsageType|null} usage
 */
|
|
|
|
/**
 * RealtimeClient Class
 *
 * Owns a RealtimeAPI connection (`this.realtime`) and a RealtimeConversation
 * (`this.conversation`), keeps the session configuration and registered
 * tools, and re-dispatches API events as `realtime.event` and
 * `conversation.*` events on itself.
 * @class
 */
export class RealtimeClient extends RealtimeEventHandler {
  /**
   * Create a new RealtimeClient instance
   * @param {{url?: string, apiKey?: string, dangerouslyAllowAPIKeyInBrowser?: boolean, debug?: boolean}} [settings]
   */
  constructor({ url, apiKey, dangerouslyAllowAPIKeyInBrowser, debug } = {}) {
    super();
    this.defaultSessionConfig = {
      modalities: ['text', 'audio'],
      instructions: '',
      voice: 'verse',
      input_audio_format: 'pcm16',
      output_audio_format: 'pcm16',
      input_audio_transcription: null,
      turn_detection: null,
      tools: [],
      tool_choice: 'auto',
      temperature: 0.8,
      max_response_output_tokens: 4096,
    };
    this.sessionConfig = {};
    this.transcriptionModels = [
      {
        model: 'whisper-1',
      },
    ];
    this.defaultServerVadConfig = {
      type: 'server_vad',
      threshold: 0.5, // 0.0 to 1.0,
      prefix_padding_ms: 300, // How much audio to include in the audio stream before the speech starts.
      silence_duration_ms: 200, // How long to wait to mark the speech as stopped.
    };
    this.realtime = new RealtimeAPI({
      url,
      apiKey,
      dangerouslyAllowAPIKeyInBrowser,
      debug,
    });
    this.conversation = new RealtimeConversation();
    this._resetConfig();
    this._addAPIEventHandlers();
  }

  /**
   * Resets sessionConfig to a copy of the defaults and clears the
   * session-created flag, the registered tools and the local input audio buffer
   * @private
   * @returns {true}
   */
  _resetConfig() {
    this.sessionCreated = false;
    this.tools = {};
    // Deep copy so later edits to sessionConfig never leak into the defaults.
    this.sessionConfig = JSON.parse(JSON.stringify(this.defaultSessionConfig));
    this.inputAudioBuffer = new Int16Array(0);
    return true;
  }

  /**
   * Sets up event handlers for a fully-functional application control flow
   * @private
   * @returns {true}
   */
  _addAPIEventHandlers() {
    // Event Logging handlers: every client/server event is re-dispatched as
    // a timestamped `realtime.event`.
    const forwardAs = (source) => (event) => {
      this.dispatch('realtime.event', {
        time: new Date().toISOString(),
        source,
        event,
      });
    };
    this.realtime.on('client.*', forwardAs('client'));
    this.realtime.on('server.*', forwardAs('server'));

    // Handles session created event, can optionally wait for it
    this.realtime.on(
      'server.session.created',
      () => (this.sessionCreated = true),
    );

    // Setup for application control flow
    const handler = (event, ...args) => {
      const { item, delta } = this.conversation.processEvent(event, ...args);
      return { item, delta };
    };
    const handlerWithDispatch = (event, ...args) => {
      const { item, delta } = handler(event, ...args);
      if (item) {
        // FIXME: If statement is only here because item.input_audio_transcription.completed
        //        can fire before `item.created`, resulting in empty item.
        //        This happens in VAD mode with empty audio
        this.dispatch('conversation.updated', { item, delta });
      }
      return { item, delta };
    };
    // Runs the registered handler for a tool call and reports its result
    // (or the failure) back as a `function_call_output` item, then asks for
    // a new response.
    const callTool = async (tool) => {
      let output;
      try {
        const jsonArguments = JSON.parse(tool.arguments);
        const toolConfig = this.tools[tool.name];
        if (!toolConfig) {
          throw new Error(`Tool "${tool.name}" has not been added`);
        }
        const result = await toolConfig.handler(jsonArguments);
        output = JSON.stringify(result);
      } catch (e) {
        // Handlers may throw non-Error values; fall back to their string form
        // so the error text is never silently dropped.
        output = JSON.stringify({ error: e?.message ?? String(e) });
      }
      this.realtime.send('conversation.item.create', {
        item: {
          type: 'function_call_output',
          call_id: tool.call_id,
          output,
        },
      });
      this.createResponse();
    };

    // Handlers to update internal conversation state
    this.realtime.on('server.response.created', handler);
    this.realtime.on('server.response.output_item.added', handler);
    this.realtime.on('server.response.content_part.added', handler);
    this.realtime.on('server.input_audio_buffer.speech_started', (event) => {
      handler(event);
      this.dispatch('conversation.interrupted');
    });
    this.realtime.on(
      'server.input_audio_buffer.speech_stopped',
      (event) => handler(event, this.inputAudioBuffer),
    );

    // Handlers to update application state
    this.realtime.on('server.conversation.item.created', (event) => {
      const { item } = handlerWithDispatch(event);
      if (!item) {
        // Nothing to announce if the conversation produced no item.
        return;
      }
      this.dispatch('conversation.item.appended', { item });
      if (item.status === 'completed') {
        this.dispatch('conversation.item.completed', { item });
      }
    });
    this.realtime.on('server.conversation.item.truncated', handlerWithDispatch);
    this.realtime.on('server.conversation.item.deleted', handlerWithDispatch);
    this.realtime.on(
      'server.conversation.item.input_audio_transcription.completed',
      handlerWithDispatch,
    );
    this.realtime.on(
      'server.response.audio_transcript.delta',
      handlerWithDispatch,
    );
    this.realtime.on('server.response.audio.delta', handlerWithDispatch);
    this.realtime.on('server.response.text.delta', handlerWithDispatch);
    this.realtime.on(
      'server.response.function_call_arguments.delta',
      handlerWithDispatch,
    );
    this.realtime.on('server.response.output_item.done', async (event) => {
      const { item } = handlerWithDispatch(event);
      if (!item) {
        return;
      }
      if (item.status === 'completed') {
        this.dispatch('conversation.item.completed', { item });
      }
      if (item.formatted.tool) {
        // Intentionally not awaited: the tool runs in the background.
        callTool(item.formatted.tool);
      }
    });

    return true;
  }

  /**
   * Tells us whether the realtime socket is connected
   * (delegates to `this.realtime.isConnected()`)
   * @returns {boolean}
   */
  isConnected() {
    return this.realtime.isConnected();
  }

  /**
   * Resets the client instance entirely: disconnects, clears event handlers
   * on both the client and the API, restores default config and re-installs
   * the API event handlers
   * @returns {true}
   */
  reset() {
    this.disconnect();
    this.clearEventHandlers();
    this.realtime.clearEventHandlers();
    this._resetConfig();
    this._addAPIEventHandlers();
    return true;
  }

  /**
   * Connects to the Realtime WebSocket API
   * Updates session config and conversation config
   *
   * Note: `turn_detection` defaults to `null`, so omitting it overwrites any
   * turn detection previously stored via `updateSession()`.
   * @param {{model?: string, turn_detection?: TurnDetectionServerVadType|null, voice?: string, instructions?: string, input_audio_transcription?: AudioTranscriptionType|null}} [options]
   * @returns {Promise<true>}
   * @throws {Error} If the client is already connected
   */
  async connect({
    model = 'gpt-4o-mini-realtime-preview-2024-12-17',
    turn_detection = null,
    voice,
    instructions,
    input_audio_transcription,
  } = {}) {
    if (this.isConnected()) {
      throw new Error(`Already connected, use .disconnect() first`);
    }
    await this.realtime.connect({
      model,
    });
    this.updateSession({
      voice,
      turn_detection,
      instructions,
      input_audio_transcription,
    });
    return true;
  }

  /**
   * Waits for a session.created event to be executed before proceeding
   * (polls the `sessionCreated` flag on a 1ms timer)
   * @returns {Promise<true>}
   * @throws {Error} If the client is not connected when called
   */
  async waitForSessionCreated() {
    if (!this.isConnected()) {
      throw new Error(`Not connected, use .connect() first`);
    }
    while (!this.sessionCreated) {
      await new Promise((r) => setTimeout(() => r(), 1));
    }
    return true;
  }

  /**
   * Disconnects from the Realtime API and clears the conversation history
   * @returns {void}
   */
  disconnect() {
    this.sessionCreated = false;
    if (this.realtime.isConnected()) {
      this.realtime.disconnect();
    }
    this.conversation.clear();
  }

  /**
   * Gets the active turn detection mode
   * @returns {"server_vad"|null}
   */
  getTurnDetectionType() {
    return this.sessionConfig.turn_detection?.type || null;
  }

  /**
   * Add a tool and handler, then push the updated session configuration
   * @param {ToolDefinitionType} definition
   * @param {function} handler
   * @returns {{definition: ToolDefinitionType, handler: function}}
   * @throws {Error} If the name is missing, already registered, or the handler is not a function
   */
  addTool(definition, handler) {
    if (!definition?.name) {
      throw new Error(`Missing tool name in definition`);
    }
    const name = definition?.name;
    if (this.tools[name]) {
      throw new Error(
        `Tool "${name}" already added. Please use .removeTool("${name}") before trying to add again.`,
      );
    }
    if (typeof handler !== 'function') {
      throw new Error(`Tool "${name}" handler must be a function`);
    }
    this.tools[name] = { definition, handler };
    this.updateSession();
    return this.tools[name];
  }

  /**
   * Removes a tool, then pushes the updated session configuration so a
   * connected session stops advertising it (mirrors `addTool`)
   * @param {string} name
   * @returns {true}
   * @throws {Error} If no tool with that name is registered
   */
  removeTool(name) {
    if (!this.tools[name]) {
      throw new Error(`Tool "${name}" does not exist, can not be removed.`);
    }
    delete this.tools[name];
    this.updateSession();
    return true;
  }

  /**
   * Deletes an item
   * @param {string} id
   * @returns {true}
   */
  deleteItem(id) {
    this.realtime.send('conversation.item.delete', { item_id: id });
    return true;
  }

  /**
   * Updates session configuration
   * If the client is not yet connected, will save details and instantiate upon connection
   *
   * Only fields that are not `undefined` are written to `sessionConfig`.
   * The `tools` sent to the server are the definitions passed in this call
   * followed by the definitions of all tools registered via `addTool`.
   * @param {SessionResourceType} [sessionConfig]
   * @returns {true}
   * @throws {Error} If a passed tool definition shares a name with a registered tool;
   *   in that case `sessionConfig` is left unchanged
   */
  updateSession({
    modalities = void 0,
    instructions = void 0,
    voice = void 0,
    input_audio_format = void 0,
    output_audio_format = void 0,
    input_audio_transcription = void 0,
    turn_detection = void 0,
    tools = void 0,
    tool_choice = void 0,
    temperature = void 0,
    max_response_output_tokens = void 0,
  } = {}) {
    // Validate the passed tool definitions first so that a rejected call
    // does not leave sessionConfig partially updated.
    const passedTools = (tools || []).map((toolDefinition) => {
      const definition = {
        type: 'function',
        ...toolDefinition,
      };
      if (this.tools[definition?.name]) {
        throw new Error(
          `Tool "${definition?.name}" has already been defined`,
        );
      }
      return definition;
    });

    const updates = {
      modalities,
      instructions,
      voice,
      input_audio_format,
      output_audio_format,
      input_audio_transcription,
      turn_detection,
      tools,
      tool_choice,
      temperature,
      max_response_output_tokens,
    };
    for (const [key, value] of Object.entries(updates)) {
      // `null` is a meaningful value (e.g. turn_detection), only skip undefined.
      if (value !== void 0) {
        this.sessionConfig[key] = value;
      }
    }

    // Load tools from tool definitions + already loaded tools
    const registeredTools = Object.keys(this.tools).map((key) => ({
      type: 'function',
      ...this.tools[key].definition,
    }));
    const session = { ...this.sessionConfig };
    session.tools = [...passedTools, ...registeredTools];
    if (this.realtime.isConnected()) {
      this.realtime.send('session.update', { session });
    }
    return true;
  }

  /**
   * Sends user message content and generates a response
   *
   * Raw `ArrayBuffer` / `Int16Array` audio on `input_audio` parts is
   * base64-encoded on a copy; the caller's content objects are not modified.
   * With empty content only the response is requested.
   * @param {Array<InputTextContentType|InputAudioContentType>} content
   * @returns {true}
   */
  sendUserMessageContent(content = []) {
    if (content.length) {
      const encodedContent = content.map((part) => {
        const hasRawAudio =
          part.type === 'input_audio' &&
          (part.audio instanceof ArrayBuffer ||
            part.audio instanceof Int16Array);
        return hasRawAudio
          ? { ...part, audio: RealtimeUtils.arrayBufferToBase64(part.audio) }
          : part;
      });
      this.realtime.send('conversation.item.create', {
        item: {
          type: 'message',
          role: 'user',
          content: encodedContent,
        },
      });
    }
    this.createResponse();
    return true;
  }

  /**
   * Appends user audio to the existing audio buffer
   * (sends it to the API and also merges it into the local `inputAudioBuffer`);
   * empty input is ignored
   * @param {Int16Array|ArrayBuffer} arrayBuffer
   * @returns {true}
   */
  appendInputAudio(arrayBuffer) {
    if (arrayBuffer.byteLength > 0) {
      this.realtime.send('input_audio_buffer.append', {
        audio: RealtimeUtils.arrayBufferToBase64(arrayBuffer),
      });
      this.inputAudioBuffer = RealtimeUtils.mergeInt16Arrays(
        this.inputAudioBuffer,
        arrayBuffer,
      );
    }
    return true;
  }

  /**
   * Forces a model response generation
   *
   * When no turn detection is active and local audio is buffered, the
   * buffer is committed, queued on the conversation and reset first.
   * @returns {true}
   */
  createResponse() {
    if (
      this.getTurnDetectionType() === null &&
      this.inputAudioBuffer.byteLength > 0
    ) {
      this.realtime.send('input_audio_buffer.commit');
      this.conversation.queueInputAudio(this.inputAudioBuffer);
      this.inputAudioBuffer = new Int16Array(0);
    }
    this.realtime.send('response.create');
    return true;
  }

  /**
   * Cancels the ongoing server generation and truncates ongoing generation, if applicable
   * If no id provided, will simply call `cancel_generation` command
   * @param {string} id The id of the message to cancel
   * @param {number} [sampleCount] The number of samples to truncate past for the ongoing generation
   * @returns {{item: (AssistantItemType | null)}}
   * @throws {Error} If the item is unknown, is not an assistant message, or has no audio content
   */
  cancelResponse(id, sampleCount = 0) {
    if (!id) {
      this.realtime.send('response.cancel');
      return { item: null };
    }
    const item = this.conversation.getItem(id);
    if (!item) {
      throw new Error(`Could not find item "${id}"`);
    }
    if (item.type !== 'message') {
      throw new Error(`Can only cancelResponse messages with type "message"`);
    }
    if (item.role !== 'assistant') {
      throw new Error(
        `Can only cancelResponse messages with role "assistant"`,
      );
    }
    this.realtime.send('response.cancel');
    const audioIndex = item.content.findIndex((c) => c.type === 'audio');
    if (audioIndex === -1) {
      throw new Error(`Could not find audio on item to cancel`);
    }
    this.realtime.send('conversation.item.truncate', {
      item_id: id,
      content_index: audioIndex,
      // Convert the sample offset to milliseconds using the conversation's frequency.
      audio_end_ms: Math.floor(
        (sampleCount / this.conversation.defaultFrequency) * 1000,
      ),
    });
    return { item };
  }

  /**
   * Utility for waiting for the next `conversation.item.appended` event to be triggered by the server
   * @returns {Promise<{item: ItemType}>}
   */
  async waitForNextItem() {
    const event = await this.waitForNext('conversation.item.appended');
    const { item } = event;
    return { item };
  }

  /**
   * Utility for waiting for the next `conversation.item.completed` event to be triggered by the server
   * @returns {Promise<{item: ItemType}>}
   */
  async waitForNextCompletedItem() {
    const event = await this.waitForNext('conversation.item.completed');
    const { item } = event;
    return { item };
  }
}
|