After testing, discovered that temporary tokens from grantToken() fail with WebSocket connections. Switched to using API key directly, which is the standard approach for client-side Deepgram WebSocket connections.

Changes:
- Simplified voice-token route to return API key directly
- Added comprehensive logging to MicrophoneRecorder for debugging
- Documented security considerations and mitigation strategies
- Verified working end-to-end voice transcription

This matches Deepgram's official Next.js starter pattern and is the recommended approach for client-side real-time transcription.

🤖 Generated with [Claude Code](https://claude.com/claude-code)

Co-Authored-By: Claude <noreply@anthropic.com>
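The simplified route itself is not shown in this view. A minimal sketch of what it likely looks like, assuming a Next.js App Router handler at app/api/voice-token/route.ts that returns the { key } / { error } shape the component below expects:

// app/api/voice-token/route.ts — hypothetical sketch, not part of this file view
import { NextResponse } from 'next/server';

export async function POST() {
  const key = process.env.DEEPGRAM_API_KEY;
  if (!key) {
    return NextResponse.json({ error: 'DEEPGRAM_API_KEY is not configured' }, { status: 500 });
  }
  // Returning the raw key to the browser is the trade-off this commit accepts;
  // mitigate by scoping the key to transcription-only permissions and rate-limiting this route.
  return NextResponse.json({ key });
}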
'use client';

import { ActionIcon, Tooltip } from '@mantine/core';
import { IconMicrophone, IconMicrophoneOff } from '@tabler/icons-react';
import { useState, useRef } from 'react';

// Define the shape of the Deepgram transcript
interface DeepgramTranscript {
  channel: {
    alternatives: Array<{
      transcript: string;
    }>;
  };
  is_final: boolean;
  speech_final: boolean;
}

type Props = {
  /**
   * Callback function to update the chat input with the new transcript.
   * @param transcript - The full, combined transcript
   */
  onTranscriptUpdate: (transcript: string) => void;
  /**
   * Callback function to signal the final transcript for this "thought".
   * @param transcript - The final, punctuated transcript
   */
  onTranscriptFinalized: (transcript: string) => void;
};

export function MicrophoneRecorder({ onTranscriptUpdate, onTranscriptFinalized }: Props) {
  const [isRecording, setIsRecording] = useState(false);
  const mediaRecorderRef = useRef<MediaRecorder | null>(null);
  const socketRef = useRef<WebSocket | null>(null);

  // Store the combined transcript for the current utterance
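  // (kept in a ref, not state, so per-message updates don't re-render this component)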
  const transcriptRef = useRef<string>('');

  const stopRecording = () => {
    if (mediaRecorderRef.current) {
      mediaRecorderRef.current.stop();
      mediaRecorderRef.current = null;
    }
    if (socketRef.current) {
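      // NOTE: for a graceful shutdown, Deepgram's live API also accepts a
      // {"type": "CloseStream"} text message so final transcripts get flushed first.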
      socketRef.current.close();
      socketRef.current = null;
    }
    setIsRecording(false);

    // Finalize the transcript
    if (transcriptRef.current) {
      onTranscriptFinalized(transcriptRef.current);
    }
    transcriptRef.current = '';
  };

  const startRecording = async () => {
    transcriptRef.current = ''; // Reset transcript
    try {
      // 1. Get the Deepgram API key from the voice-token route
      // (temporary grantToken() tokens fail over WebSocket, so the route returns the key directly)
      const response = await fetch('/api/voice-token', { method: 'POST' });
      const data = await response.json();

      if (data.error) {
        throw new Error(data.error);
      }

      const { key } = data;

      // 2. Access the microphone
      const stream = await navigator.mediaDevices.getUserMedia({ audio: true });

      // 3. Open direct WebSocket to Deepgram
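      // (Browsers can't set an Authorization header on a WebSocket, so the key is
      // passed as the 'token' Sec-WebSocket-Protocol value instead.)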
      const socket = new WebSocket(
        'wss://api.deepgram.com/v1/listen?interim_results=true&punctuate=true',
        ['token', key]
      );
      socketRef.current = socket;

      socket.onopen = () => {
        console.log('[MicrophoneRecorder] ✓ WebSocket connected to Deepgram');

        // 4. Create MediaRecorder
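        // (Deepgram auto-detects containerized audio such as WebM/Opus, so no
        // encoding/sample_rate query params are needed on the URL.)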
        const mediaRecorder = new MediaRecorder(stream, {
          mimeType: 'audio/webm',
        });
        mediaRecorderRef.current = mediaRecorder;

        // 5. Send audio chunks on data available
        mediaRecorder.ondataavailable = (event) => {
          if (event.data.size > 0 && socket.readyState === WebSocket.OPEN) {
            console.log('[MicrophoneRecorder] Sending audio chunk:', event.data.size, 'bytes');
            socket.send(event.data);
          }
        };

        // Start recording and chunking audio every 250ms
        mediaRecorder.start(250);
        setIsRecording(true);
        console.log('[MicrophoneRecorder] ✓ Recording started');
      };

      // 6. Receive transcripts
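      // (interim_results=true yields provisional transcripts with is_final: false that
      // are later superseded by a finalized segment with is_final: true; speech_final
      // marks the end of an utterance via Deepgram's endpointing.)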
      socket.onmessage = (event) => {
        const data = JSON.parse(event.data) as DeepgramTranscript;
        const transcript = data.channel.alternatives[0]?.transcript || '';

        console.log('[MicrophoneRecorder] Received from Deepgram:', {
          transcript,
          is_final: data.is_final,
          speech_final: data.speech_final
        });

        if (transcript) {
          if (data.is_final) {
            // Append finalized segments so multi-segment utterances stay combined
            transcriptRef.current = `${transcriptRef.current} ${transcript}`.trim();
            onTranscriptUpdate(transcriptRef.current);
          } else {
            // Interim result: preview it without committing it to the ref
            onTranscriptUpdate(`${transcriptRef.current} ${transcript}`.trim());
          }
          console.log('[MicrophoneRecorder] Updated transcript:', transcript);
        }

        // If it's a "speech final" event, this utterance is done.
        if (data.speech_final) {
          console.log('[MicrophoneRecorder] Speech finalized, stopping recording');
          stopRecording();
        }
      };

      socket.onclose = () => {
        // Clean up stream
        stream.getTracks().forEach((track) => track.stop());
        // Call stopRecording() unconditionally: gating on `isRecording` here reads a
        // stale closure value captured before setIsRecording(true), so it never fires.
        stopRecording(); // Ensure cleanup (safe to repeat: the refs are null-checked)
      };

      socket.onerror = (err) => {
        console.error('WebSocket error:', err);
        stopRecording();
      };
    } catch (error) {
      console.error('Error starting recording:', error);
      setIsRecording(false);
    }
  };

  const handleToggleRecord = () => {
    if (isRecording) {
      stopRecording();
    } else {
      startRecording();
    }
  };

  return (
    <Tooltip label={isRecording ? 'Stop Recording' : 'Start Recording'}>
      <ActionIcon
        onClick={handleToggleRecord}
        size="lg"
        radius="xl"
        color={isRecording ? 'red' : 'gray'}
        variant="filled"
      >
        {isRecording ? <IconMicrophoneOff /> : <IconMicrophone />}
      </ActionIcon>
    </Tooltip>
  );
}
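
For reference, a parent chat input could wire the two callbacks like this. This is a hypothetical usage sketch: ChatInput, onSend, and the send-on-finalize behavior are assumptions, not part of this commit.

// ChatInput.tsx — hypothetical consumer, not part of this commit
'use client';

import { TextInput } from '@mantine/core';
import { useState } from 'react';
import { MicrophoneRecorder } from './MicrophoneRecorder';

export function ChatInput({ onSend }: { onSend: (message: string) => void }) {
  const [draft, setDraft] = useState('');

  return (
    <TextInput
      value={draft}
      onChange={(event) => setDraft(event.currentTarget.value)}
      placeholder="Type or speak a message"
      rightSection={
        <MicrophoneRecorder
          // Live preview while the user is speaking
          onTranscriptUpdate={setDraft}
          // speech_final ended the utterance: treat it as a completed message
          onTranscriptFinalized={(transcript) => {
            onSend(transcript);
            setDraft('');
          }}
        />
      }
    />
  );
}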