Files
app/components/MicrophoneRecorder.tsx
Albert e4c5960d7a feat: Step 7 & 9 - AI Chat + Voice client integration
Implement AI-powered chat interface with voice input capabilities.

Step 7 (Chat Interface):
- Create ChatInterface component with Vercel AI SDK useChat hook
- Create /api/chat route using Google Gemini (gemini-1.5-flash); a route sketch follows this list
- Implement thoughtful interviewer system prompt
- Add real-time message streaming
- Auto-scroll to latest messages
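
A minimal sketch of the /api/chat route described above, assuming the Vercel AI SDK (`ai` package) with the `@ai-sdk/google` provider reading `GOOGLE_GENERATIVE_AI_API_KEY` from the environment. The system prompt text is an illustrative stand-in, and the exact streaming helper name varies between AI SDK releases:

```ts
// app/api/chat/route.ts (sketch)
import { streamText } from 'ai';
import { google } from '@ai-sdk/google';

export async function POST(req: Request) {
  const { messages } = await req.json();

  const result = await streamText({
    // gemini-1.5-flash, as named in the commit message
    model: google('gemini-1.5-flash'),
    // Illustrative stand-in for the "thoughtful interviewer" system prompt
    system: 'You are a thoughtful interviewer. Ask one open-ended follow-up question at a time.',
    messages,
  });

  // Stream tokens back so useChat can render them as they arrive
  return result.toDataStreamResponse();
}
```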

Step 9 (Voice Client):
- Create MicrophoneRecorder component
- Integrate real-time voice transcription via Deepgram
- Direct WebSocket connection using temporary tokens
- Real-time transcript display in chat input
- Auto-submit on speech_final event
- Add @tabler/icons-react for microphone icons

Architecture:
- Client requests temporary Deepgram token from /api/voice-token
- MediaRecorder captures audio in 250ms chunks
- WebSocket sends audio directly to Deepgram
- Transcripts update chat input in real-time
- Final transcript auto-submits to AI chat (wiring sketched below)
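
A sketch of how the last two items can be wired inside ChatInterface, assuming `useChat` is imported from `ai/react` (the exact path depends on the AI SDK version) and that the MicrophoneRecorder component shown further down supplies the two callbacks:

```tsx
// app/components/ChatInterface.tsx (simplified sketch)
'use client';
import { useChat } from 'ai/react';
import { MicrophoneRecorder } from './MicrophoneRecorder';

export function ChatInterface() {
  const { messages, input, setInput, handleInputChange, handleSubmit, append } = useChat({
    api: '/api/chat',
  });

  return (
    <div>
      {messages.map((m) => (
        <p key={m.id}>{m.role}: {m.content}</p>
      ))}
      <form onSubmit={handleSubmit}>
        <input value={input} onChange={handleInputChange} />
        <MicrophoneRecorder
          // Interim transcripts mirror into the chat input in real time
          onTranscriptUpdate={setInput}
          // On speech_final, send the finished utterance straight to the model
          onTranscriptFinalized={(transcript) => append({ role: 'user', content: transcript })}
        />
      </form>
    </div>
  );
}
```

`setInput` keeps the text box in sync with interim results, while `append` submits the finalized utterance to /api/chat without a manual form submit.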

Security:
- Deepgram API key never exposed to client
- Temporary tokens expire in 60 seconds (see the token route sketch below)
- Chat requires authentication via SurrealDB JWT
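
A sketch of the /api/voice-token route implied by the points above, assuming the `@deepgram/sdk` v3 client with `DEEPGRAM_API_KEY` and `DEEPGRAM_PROJECT_ID` in the server environment; the SurrealDB JWT check is reduced to a placeholder comment:

```ts
// app/api/voice-token/route.ts (sketch)
import { createClient } from '@deepgram/sdk';
import { NextResponse } from 'next/server';

export async function POST() {
  // Placeholder: verify the caller's SurrealDB JWT here before minting a key

  const deepgram = createClient(process.env.DEEPGRAM_API_KEY!);

  // Short-lived key so the long-lived API key never reaches the browser
  const { result, error } = await deepgram.manage.createProjectKey(
    process.env.DEEPGRAM_PROJECT_ID!,
    {
      comment: 'Temporary browser key for live transcription',
      scopes: ['usage:write'],
      time_to_live_in_seconds: 60,
    }
  );

  if (error) {
    return NextResponse.json({ error: error.message }, { status: 500 });
  }

  // The client passes `key` as the WebSocket "token" subprotocol value
  return NextResponse.json({ key: result.key });
}
```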

Testing:
- Add test covering the voice recording flow (sketched below)
- Tests cover happy path with mocked WebSocket
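
A sketch of how the happy path can be exercised with a mocked WebSocket, assuming Vitest, React Testing Library, and a shared test setup that provides jsdom polyfills (e.g. ResizeObserver) for Mantine; the actual test in this commit is not shown here:

```tsx
// app/components/MicrophoneRecorder.test.tsx (sketch)
import { render, fireEvent, waitFor } from '@testing-library/react';
import { MantineProvider } from '@mantine/core';
import { vi, it, expect } from 'vitest';
import { MicrophoneRecorder } from './MicrophoneRecorder';

// Minimal stand-ins for browser APIs that jsdom does not provide
class FakeWebSocket {
  static OPEN = 1;
  static instances: FakeWebSocket[] = [];
  onopen: (() => void) | null = null;
  onmessage: ((e: { data: string }) => void) | null = null;
  onclose: (() => void) | null = null;
  onerror: ((e: unknown) => void) | null = null;
  readyState = 1;
  send = vi.fn();
  close = vi.fn(() => this.onclose?.());
  constructor(public url: string, public protocols?: string[]) {
    FakeWebSocket.instances.push(this);
  }
}

class FakeMediaRecorder {
  ondataavailable: ((e: { data: Blob }) => void) | null = null;
  start = vi.fn();
  stop = vi.fn();
  constructor(public stream: unknown, public options?: unknown) {}
}

it('feeds a mocked Deepgram transcript into the chat callbacks', async () => {
  vi.stubGlobal('WebSocket', FakeWebSocket);
  vi.stubGlobal('MediaRecorder', FakeMediaRecorder);
  vi.stubGlobal('fetch', vi.fn().mockResolvedValue({ json: async () => ({ key: 'temp-key' }) }));
  Object.defineProperty(navigator, 'mediaDevices', {
    value: { getUserMedia: vi.fn().mockResolvedValue({ getTracks: () => [] }) },
    configurable: true,
  });

  const onTranscriptUpdate = vi.fn();
  const onTranscriptFinalized = vi.fn();
  const { getByRole } = render(
    <MantineProvider>
      <MicrophoneRecorder
        onTranscriptUpdate={onTranscriptUpdate}
        onTranscriptFinalized={onTranscriptFinalized}
      />
    </MantineProvider>
  );

  // Start recording; the component fetches a token and opens the socket
  fireEvent.click(getByRole('button'));
  await waitFor(() => expect(FakeWebSocket.instances).toHaveLength(1));

  // Simulate Deepgram: the socket opens, then a final transcript arrives
  const socket = FakeWebSocket.instances[0];
  socket.onopen?.();
  socket.onmessage?.({
    data: JSON.stringify({
      channel: { alternatives: [{ transcript: 'Hello there.' }] },
      is_final: true,
      speech_final: true,
    }),
  });

  expect(onTranscriptUpdate).toHaveBeenCalledWith('Hello there.');
  expect(onTranscriptFinalized).toHaveBeenCalledWith('Hello there.');
});
```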

Known Issue:
- Page compilation needs debugging (useChat import path verified)

🤖 Generated with [Claude Code](https://claude.com/claude-code)

Co-Authored-By: Claude <noreply@anthropic.com>
2025-11-09 00:27:40 +00:00

155 lines
4.2 KiB
TypeScript

'use client';
import { ActionIcon, Tooltip } from '@mantine/core';
import { IconMicrophone, IconMicrophoneOff } from '@tabler/icons-react';
import { useState, useRef } from 'react';
// Define the shape of the Deepgram transcript
interface DeepgramTranscript {
  channel: {
    alternatives: Array<{
      transcript: string;
    }>;
  };
  is_final: boolean;
  speech_final: boolean;
}

type Props = {
  /**
   * Callback function to update the chat input with the new transcript.
   * @param transcript - The full, combined transcript
   */
  onTranscriptUpdate: (transcript: string) => void;
  /**
   * Callback function to signal the final transcript for this "thought".
   * @param transcript - The final, punctuated transcript
   */
  onTranscriptFinalized: (transcript: string) => void;
};

export function MicrophoneRecorder({ onTranscriptUpdate, onTranscriptFinalized }: Props) {
  const [isRecording, setIsRecording] = useState(false);
  const mediaRecorderRef = useRef<MediaRecorder | null>(null);
  const socketRef = useRef<WebSocket | null>(null);
  // Combined transcript for the current utterance (finalized segments + latest interim)
  const transcriptRef = useRef<string>('');
  // Finalized segments only, so interim results never overwrite confirmed text
  const finalSegmentsRef = useRef<string>('');

  const stopRecording = () => {
    if (mediaRecorderRef.current) {
      mediaRecorderRef.current.stop();
      mediaRecorderRef.current = null;
    }
    if (socketRef.current) {
      socketRef.current.close();
      socketRef.current = null;
    }
    setIsRecording(false);
    // Hand the combined transcript to the parent, then reset for the next utterance
    if (transcriptRef.current) {
      onTranscriptFinalized(transcriptRef.current);
    }
    transcriptRef.current = '';
    finalSegmentsRef.current = '';
  };

  const startRecording = async () => {
    transcriptRef.current = ''; // Reset transcript state for a new utterance
    finalSegmentsRef.current = '';
    try {
      // 1. Get the temporary Deepgram key from the server
      const response = await fetch('/api/voice-token', { method: 'POST' });
      const data = await response.json();
      if (data.error) {
        throw new Error(data.error);
      }
      const { key } = data;

      // 2. Access the microphone
      const stream = await navigator.mediaDevices.getUserMedia({ audio: true });

      // 3. Open a direct WebSocket to Deepgram, authenticating with the
      //    temporary key via the "token" subprotocol
      const socket = new WebSocket(
        'wss://api.deepgram.com/v1/listen?interim_results=true&punctuate=true',
        ['token', key]
      );
      socketRef.current = socket;

      socket.onopen = () => {
        // 4. Create the MediaRecorder once the socket is ready
        const mediaRecorder = new MediaRecorder(stream, {
          mimeType: 'audio/webm',
        });
        mediaRecorderRef.current = mediaRecorder;

        // 5. Forward each audio chunk over the open socket
        mediaRecorder.ondataavailable = (event) => {
          if (event.data.size > 0 && socket.readyState === WebSocket.OPEN) {
            socket.send(event.data);
          }
        };

        // Start recording, emitting a chunk every 250ms
        mediaRecorder.start(250);
        setIsRecording(true);
      };

      // 6. Receive transcripts
      socket.onmessage = (event) => {
        const data = JSON.parse(event.data) as DeepgramTranscript;
        const transcript = data.channel.alternatives[0]?.transcript || '';
        if (transcript) {
          if (data.is_final) {
            // Append finalized segments so long utterances are not overwritten
            finalSegmentsRef.current = `${finalSegmentsRef.current} ${transcript}`.trim();
            transcriptRef.current = finalSegmentsRef.current;
          } else {
            // Show the confirmed text plus the current interim guess
            transcriptRef.current = `${finalSegmentsRef.current} ${transcript}`.trim();
          }
          onTranscriptUpdate(transcriptRef.current);
        }
        // speech_final means Deepgram detected the end of this utterance
        if (data.speech_final) {
          stopRecording();
        }
      };

      socket.onclose = () => {
        // Always release the microphone when the connection ends
        stream.getTracks().forEach((track) => track.stop());
        // If the socket closed on its own (network drop, Deepgram timeout),
        // run the normal teardown; the ref check prevents re-entry after a manual stop
        if (mediaRecorderRef.current) {
          stopRecording();
        }
      };

      socket.onerror = (err) => {
        console.error('WebSocket error:', err);
        stopRecording();
      };
    } catch (error) {
      console.error('Error starting recording:', error);
      setIsRecording(false);
    }
  };

  const handleToggleRecord = () => {
    if (isRecording) {
      stopRecording();
    } else {
      startRecording();
    }
  };

  return (
    <Tooltip label={isRecording ? 'Stop Recording' : 'Start Recording'}>
      <ActionIcon
        onClick={handleToggleRecord}
        size="lg"
        radius="xl"
        color={isRecording ? 'red' : 'gray'}
        variant="filled"
      >
        {isRecording ? <IconMicrophoneOff /> : <IconMicrophone />}
      </ActionIcon>
    </Tooltip>
  );
}