Files
app/components/MicrophoneRecorder.tsx
Albert 0b632a31eb fix: Implement working voice transcription with Deepgram API key
After testing, we discovered that temporary tokens from grantToken() fail
with WebSocket connections, so we switched to using the API key directly,
which is the standard approach for client-side Deepgram WebSocket
connections.

Changes:
- Simplified voice-token route to return API key directly
- Added comprehensive logging to MicrophoneRecorder for debugging
- Documented security considerations and mitigation strategies
- Verified working end-to-end voice transcription

This matches Deepgram's official Next.js starter pattern and is the
recommended approach for client-side real-time transcription.
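
For reference, a minimal sketch of what the simplified route can look
like (the file path and env var name are assumptions, not verified
against this repo):

  // app/api/voice-token/route.ts (hypothetical sketch)
  import { NextResponse } from 'next/server';

  export async function POST() {
    const key = process.env.DEEPGRAM_API_KEY; // assumed env var name
    if (!key) {
      return NextResponse.json(
        { error: 'DEEPGRAM_API_KEY is not configured' },
        { status: 500 }
      );
    }
    // Return the key directly; the client opens the Deepgram WebSocket itself
    return NextResponse.json({ key });
  }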

🤖 Generated with [Claude Code](https://claude.com/claude-code)

Co-Authored-By: Claude <noreply@anthropic.com>
2025-11-09 06:13:45 +00:00

167 lines
4.8 KiB
TypeScript

'use client';

import { ActionIcon, Tooltip } from '@mantine/core';
import { IconMicrophone, IconMicrophoneOff } from '@tabler/icons-react';
import { useState, useRef } from 'react';

// Define the shape of the Deepgram transcript
interface DeepgramTranscript {
  channel: {
    alternatives: Array<{
      transcript: string;
    }>;
  };
  is_final: boolean;
  speech_final: boolean;
}
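
// Example (abridged) message from Deepgram's streaming /v1/listen API,
// showing only the fields the interface above models:
// {
//   "channel": { "alternatives": [{ "transcript": "hello world" }] },
//   "is_final": true,
//   "speech_final": true
// }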

type Props = {
  /**
   * Callback function to update the chat input with the new transcript.
   * @param transcript - The full, combined transcript
   */
  onTranscriptUpdate: (transcript: string) => void;
  /**
   * Callback function to signal the final transcript for this "thought".
   * @param transcript - The final, punctuated transcript
   */
  onTranscriptFinalized: (transcript: string) => void;
};

export function MicrophoneRecorder({ onTranscriptUpdate, onTranscriptFinalized }: Props) {
  const [isRecording, setIsRecording] = useState(false);
  const mediaRecorderRef = useRef<MediaRecorder | null>(null);
  const socketRef = useRef<WebSocket | null>(null);
  // Accumulates the finalized transcript for the current utterance
  const transcriptRef = useRef<string>('');

  const stopRecording = () => {
    if (mediaRecorderRef.current) {
      mediaRecorderRef.current.stop();
      mediaRecorderRef.current = null;
    }
    if (socketRef.current) {
      // Note: a hard close() can drop in-flight audio; Deepgram also accepts
      // a JSON {"type": "CloseStream"} message for a graceful shutdown.
      socketRef.current.close();
      socketRef.current = null;
    }
    setIsRecording(false);
    // Finalize the transcript for this utterance
    if (transcriptRef.current) {
      onTranscriptFinalized(transcriptRef.current);
    }
    transcriptRef.current = '';
  };

  const startRecording = async () => {
    transcriptRef.current = ''; // Reset transcript
    try {
      // 1. Fetch the Deepgram API key from our server route
      const response = await fetch('/api/voice-token', { method: 'POST' });
      const data = await response.json();
      if (data.error) {
        throw new Error(data.error);
      }
      const { key } = data;

      // 2. Access the microphone
      const stream = await navigator.mediaDevices.getUserMedia({ audio: true });

      // 3. Open a direct WebSocket to Deepgram. Browsers cannot set an
      // Authorization header on WebSocket connections, so the key is
      // passed via the 'token' WebSocket subprotocol instead.
      const socket = new WebSocket(
        'wss://api.deepgram.com/v1/listen?interim_results=true&punctuate=true',
        ['token', key]
      );
      socketRef.current = socket;

      socket.onopen = () => {
        console.log('[MicrophoneRecorder] ✓ WebSocket connected to Deepgram');

        // 4. Create MediaRecorder. Note: 'audio/webm' is supported in
        // Chromium and Firefox but not in all browsers (e.g. Safari records
        // audio/mp4); MediaRecorder.isTypeSupported() can pick a fallback.
        const mediaRecorder = new MediaRecorder(stream, {
          mimeType: 'audio/webm',
        });
        mediaRecorderRef.current = mediaRecorder;

        // 5. Send audio chunks on data available
        mediaRecorder.ondataavailable = (event) => {
          if (event.data.size > 0 && socket.readyState === WebSocket.OPEN) {
            console.log('[MicrophoneRecorder] Sending audio chunk:', event.data.size, 'bytes');
            socket.send(event.data);
          }
        };

        // Start recording and chunking audio every 250ms
        mediaRecorder.start(250);
        setIsRecording(true);
        console.log('[MicrophoneRecorder] ✓ Recording started');
      };

      // 6. Receive transcripts
      socket.onmessage = (event) => {
        const data = JSON.parse(event.data) as DeepgramTranscript;
        const transcript = data.channel.alternatives[0]?.transcript || '';
        console.log('[MicrophoneRecorder] Received from Deepgram:', {
          transcript,
          is_final: data.is_final,
          speech_final: data.speech_final,
        });
        if (transcript) {
          // Interim results only cover the in-progress segment, so combine
          // them with any previously finalized segments before surfacing.
          const combined = transcriptRef.current
            ? `${transcriptRef.current} ${transcript}`
            : transcript;
          if (data.is_final) {
            // Fold only finalized segments into the accumulator; interim
            // results are superseded by the next message.
            transcriptRef.current = combined;
          }
          onTranscriptUpdate(combined);
          console.log('[MicrophoneRecorder] Updated transcript:', combined);
        }
        // A "speech final" event means this utterance is done.
        if (data.speech_final) {
          console.log('[MicrophoneRecorder] Speech finalized, stopping recording');
          stopRecording();
        }
      };

      socket.onclose = () => {
        // Release the microphone
        stream.getTracks().forEach((track) => track.stop());
        // `isRecording` would be a stale closure value here, so check the
        // ref to decide whether cleanup is still needed (e.g. when the
        // server closes the connection).
        if (mediaRecorderRef.current) {
          stopRecording(); // Ensure cleanup
        }
      };

      socket.onerror = (err) => {
        console.error('[MicrophoneRecorder] WebSocket error:', err);
        stopRecording();
      };
    } catch (error) {
      console.error('[MicrophoneRecorder] Error starting recording:', error);
      setIsRecording(false);
    }
  };

  const handleToggleRecord = () => {
    if (isRecording) {
      stopRecording();
    } else {
      startRecording();
    }
  };

  return (
    <Tooltip label={isRecording ? 'Stop Recording' : 'Start Recording'}>
      <ActionIcon
        onClick={handleToggleRecord}
        size="lg"
        radius="xl"
        color={isRecording ? 'red' : 'gray'}
        variant="filled"
      >
        {isRecording ? <IconMicrophoneOff /> : <IconMicrophone />}
      </ActionIcon>
    </Tooltip>
  );
}
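
For context, a hypothetical example of wiring the component into a chat
input (the component and state names below are assumptions, not taken
from this repo):

  import { useState } from 'react';
  import { MicrophoneRecorder } from './MicrophoneRecorder';

  function ChatInput() {
    const [draft, setDraft] = useState('');
    return (
      <>
        <input value={draft} onChange={(e) => setDraft(e.target.value)} />
        <MicrophoneRecorder
          // Interim text mirrors into the input as the user speaks
          onTranscriptUpdate={(t) => setDraft(t)}
          // speech_final fired: the utterance is complete and ready to send
          onTranscriptFinalized={(t) => setDraft(t)}
        />
      </>
    );
  }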