|
Add this skill
npx mdskills install sickn33/azure-ai-voicelive-tsComprehensive SDK reference with excellent code examples, event handling patterns, and authentication setup
Real-time voice AI SDK for building bidirectional voice assistants with Azure AI in Node.js and browser environments.
npm install @azure/ai-voicelive @azure/identity
# TypeScript users
npm install @types/node
Current Version: 1.0.0-beta.3
Supported Environments:
AZURE_VOICELIVE_ENDPOINT=https://.cognitiveservices.azure.com
# Optional: API key if not using Entra ID
AZURE_VOICELIVE_API_KEY=
# Optional: Logging
AZURE_LOG_LEVEL=info
import { DefaultAzureCredential } from "@azure/identity";
import { VoiceLiveClient } from "@azure/ai-voicelive";
const credential = new DefaultAzureCredential();
const endpoint = "https://your-resource.cognitiveservices.azure.com";
const client = new VoiceLiveClient(endpoint, credential);
import { AzureKeyCredential } from "@azure/core-auth";
import { VoiceLiveClient } from "@azure/ai-voicelive";
const endpoint = "https://your-resource.cognitiveservices.azure.com";
const credential = new AzureKeyCredential("your-api-key");
const client = new VoiceLiveClient(endpoint, credential);
VoiceLiveClient
└── VoiceLiveSession (WebSocket connection)
├── updateSession() → Configure session options
├── subscribe() → Event handlers (Azure SDK pattern)
├── sendAudio() → Stream audio input
├── addConversationItem() → Add messages/function outputs
└── sendEvent() → Send raw protocol events
import { DefaultAzureCredential } from "@azure/identity";
import { VoiceLiveClient } from "@azure/ai-voicelive";
const credential = new DefaultAzureCredential();
const endpoint = process.env.AZURE_VOICELIVE_ENDPOINT!;
// Create client and start session
const client = new VoiceLiveClient(endpoint, credential);
const session = await client.startSession("gpt-4o-mini-realtime-preview");
// Configure session
await session.updateSession({
modalities: ["text", "audio"],
instructions: "You are a helpful AI assistant. Respond naturally.",
voice: {
type: "azure-standard",
name: "en-US-AvaNeural",
},
turnDetection: {
type: "server_vad",
threshold: 0.5,
prefixPaddingMs: 300,
silenceDurationMs: 500,
},
inputAudioFormat: "pcm16",
outputAudioFormat: "pcm16",
});
// Subscribe to events
const subscription = session.subscribe({
onResponseAudioDelta: async (event, context) => {
// Handle streaming audio output
const audioData = event.delta;
playAudioChunk(audioData);
},
onResponseTextDelta: async (event, context) => {
// Handle streaming text
process.stdout.write(event.delta);
},
onInputAudioTranscriptionCompleted: async (event, context) => {
console.log("User said:", event.transcript);
},
});
// Send audio from microphone
function sendAudioChunk(audioBuffer: ArrayBuffer) {
session.sendAudio(audioBuffer);
}
await session.updateSession({
// Modalities
modalities: ["audio", "text"],
// System instructions
instructions: "You are a customer service representative.",
// Voice selection
voice: {
type: "azure-standard", // or "azure-custom", "openai"
name: "en-US-AvaNeural",
},
// Turn detection (VAD)
turnDetection: {
type: "server_vad", // or "azure_semantic_vad"
threshold: 0.5,
prefixPaddingMs: 300,
silenceDurationMs: 500,
},
// Audio formats
inputAudioFormat: "pcm16",
outputAudioFormat: "pcm16",
// Tools (function calling)
tools: [
{
type: "function",
name: "get_weather",
description: "Get current weather",
parameters: {
type: "object",
properties: {
location: { type: "string" }
},
required: ["location"]
}
}
],
toolChoice: "auto",
});
The SDK uses a subscription-based event handling pattern:
const subscription = session.subscribe({
// Connection lifecycle
onConnected: async (args, context) => {
console.log("Connected:", args.connectionId);
},
onDisconnected: async (args, context) => {
console.log("Disconnected:", args.code, args.reason);
},
onError: async (args, context) => {
console.error("Error:", args.error.message);
},
// Session events
onSessionCreated: async (event, context) => {
console.log("Session created:", context.sessionId);
},
onSessionUpdated: async (event, context) => {
console.log("Session updated");
},
// Audio input events (VAD)
onInputAudioBufferSpeechStarted: async (event, context) => {
console.log("Speech started at:", event.audioStartMs);
},
onInputAudioBufferSpeechStopped: async (event, context) => {
console.log("Speech stopped at:", event.audioEndMs);
},
// Transcription events
onConversationItemInputAudioTranscriptionCompleted: async (event, context) => {
console.log("User said:", event.transcript);
},
onConversationItemInputAudioTranscriptionDelta: async (event, context) => {
process.stdout.write(event.delta);
},
// Response events
onResponseCreated: async (event, context) => {
console.log("Response started");
},
onResponseDone: async (event, context) => {
console.log("Response complete");
},
// Streaming text
onResponseTextDelta: async (event, context) => {
process.stdout.write(event.delta);
},
onResponseTextDone: async (event, context) => {
console.log("\n--- Text complete ---");
},
// Streaming audio
onResponseAudioDelta: async (event, context) => {
const audioData = event.delta;
playAudioChunk(audioData);
},
onResponseAudioDone: async (event, context) => {
console.log("Audio complete");
},
// Audio transcript (what assistant said)
onResponseAudioTranscriptDelta: async (event, context) => {
process.stdout.write(event.delta);
},
// Function calling
onResponseFunctionCallArgumentsDone: async (event, context) => {
if (event.name === "get_weather") {
const args = JSON.parse(event.arguments);
const result = await getWeather(args.location);
await session.addConversationItem({
type: "function_call_output",
callId: event.callId,
output: JSON.stringify(result),
});
await session.sendEvent({ type: "response.create" });
}
},
// Catch-all for debugging
onServerEvent: async (event, context) => {
console.log("Event:", event.type);
},
});
// Clean up when done
await subscription.close();
// Define tools in session config
await session.updateSession({
modalities: ["audio", "text"],
instructions: "Help users with weather information.",
tools: [
{
type: "function",
name: "get_weather",
description: "Get current weather for a location",
parameters: {
type: "object",
properties: {
location: {
type: "string",
description: "City and state or country",
},
},
required: ["location"],
},
},
],
toolChoice: "auto",
});
// Handle function calls
const subscription = session.subscribe({
onResponseFunctionCallArgumentsDone: async (event, context) => {
if (event.name === "get_weather") {
const args = JSON.parse(event.arguments);
const weatherData = await fetchWeather(args.location);
// Send function result
await session.addConversationItem({
type: "function_call_output",
callId: event.callId,
output: JSON.stringify(weatherData),
});
// Trigger response generation
await session.sendEvent({ type: "response.create" });
}
},
});
| Voice Type | Config | Example |
|---|---|---|
| Azure Standard | { type: "azure-standard", name: "..." } | "en-US-AvaNeural" |
| Azure Custom | { type: "azure-custom", name: "...", endpointId: "..." } | Custom voice endpoint |
| Azure Personal | { type: "azure-personal", speakerProfileId: "..." } | Personal voice clone |
| OpenAI | { type: "openai", name: "..." } | "alloy", "echo", "shimmer" |
| Model | Description | Use Case |
|---|---|---|
gpt-4o-realtime-preview | GPT-4o with real-time audio | High-quality conversational AI |
gpt-4o-mini-realtime-preview | Lightweight GPT-4o | Fast, efficient interactions |
phi4-mm-realtime | Phi multimodal | Cost-effective applications |
// Server VAD (default)
turnDetection: {
type: "server_vad",
threshold: 0.5,
prefixPaddingMs: 300,
silenceDurationMs: 500,
}
// Azure Semantic VAD (smarter detection)
turnDetection: {
type: "azure_semantic_vad",
}
// Azure Semantic VAD (English optimized)
turnDetection: {
type: "azure_semantic_vad_en",
}
// Azure Semantic VAD (Multilingual)
turnDetection: {
type: "azure_semantic_vad_multilingual",
}
| Format | Sample Rate | Use Case |
|---|---|---|
pcm16 | 24kHz | Default, high quality |
pcm16-8000hz | 8kHz | Telephony |
pcm16-16000hz | 16kHz | Voice assistants |
g711_ulaw | 8kHz | Telephony (US) |
g711_alaw | 8kHz | Telephony (EU) |
| Type | Purpose |
|---|---|
VoiceLiveClient | Main client for creating sessions |
VoiceLiveSession | Active WebSocket session |
VoiceLiveSessionHandlers | Event handler interface |
VoiceLiveSubscription | Active event subscription |
ConnectionContext | Context for connection events |
SessionContext | Context for session events |
ServerEventUnion | Union of all server events |
import {
VoiceLiveError,
VoiceLiveConnectionError,
VoiceLiveAuthenticationError,
VoiceLiveProtocolError,
} from "@azure/ai-voicelive";
const subscription = session.subscribe({
onError: async (args, context) => {
const { error } = args;
if (error instanceof VoiceLiveConnectionError) {
console.error("Connection error:", error.message);
} else if (error instanceof VoiceLiveAuthenticationError) {
console.error("Auth error:", error.message);
} else if (error instanceof VoiceLiveProtocolError) {
console.error("Protocol error:", error.message);
}
},
onServerError: async (event, context) => {
console.error("Server error:", event.error?.message);
},
});
import { setLogLevel } from "@azure/logger";
// Enable verbose logging
setLogLevel("info");
// Or via environment variable
// AZURE_LOG_LEVEL=info
// Browser requires bundler (Vite, webpack, etc.)
import { VoiceLiveClient } from "@azure/ai-voicelive";
import { InteractiveBrowserCredential } from "@azure/identity";
// Use browser-compatible credential
const credential = new InteractiveBrowserCredential({
clientId: "your-client-id",
tenantId: "your-tenant-id",
});
const client = new VoiceLiveClient(endpoint, credential);
// Request microphone access
const stream = await navigator.mediaDevices.getUserMedia({ audio: true });
const audioContext = new AudioContext({ sampleRate: 24000 });
// Process audio and send to session
// ... (see samples for full implementation)
DefaultAzureCredential — Never hardcode API keys["text", "audio"] for voice assistantssubscription.close() when doneInstall via CLI
npx mdskills install sickn33/azure-ai-voicelive-tsAzure AI Voicelive Ts is a free, open-source AI agent skill. |
Install Azure AI Voicelive Ts with a single command:
npx mdskills install sickn33/azure-ai-voicelive-tsThis downloads the skill files into your project and your AI agent picks them up automatically.
Azure AI Voicelive Ts works with Claude Code, Claude Desktop, Cursor, Vscode Copilot, Windsurf, Continue Dev, Codex, Gemini Cli, Amp, Roo Code, Goose, Opencode, Trae, Qodo, Command Code. Skills use the open SKILL.md format which is compatible with any AI coding agent that reads markdown instructions.