/**
 * Voice Mode Hook
 *
 * Clean React integration with the voice state machine.
 * Each effect responds to a state by performing an action and sending an event back.
 */

import { useEffect, useRef } from 'react';
import { useMachine } from '@xstate/react';
import { voiceMachine } from '@/lib/voice-machine';

interface UseVoiceModeProps {
  messages: any[];
  status: 'ready' | 'submitted' | 'streaming' | 'error';
  onSubmit: (text: string) => void;
}

export function useVoiceMode({ messages, status, onSubmit }: UseVoiceModeProps) {
  const [state, send] = useMachine(voiceMachine);

  // Refs for side effects
  const audioRef = useRef<HTMLAudioElement | null>(null);
  const mediaRecorderRef = useRef<MediaRecorder | null>(null);
  const socketRef = useRef<WebSocket | null>(null);

  // Helper: Get text from a message, supporting both `parts` and `content` shapes
  const getMessageText = (msg: any): string => {
    if ('parts' in msg && Array.isArray(msg.parts)) {
      const textPart = msg.parts.find((p: any) => p.type === 'text');
      return textPart?.text || '';
    }
    return msg.content || '';
  };

  // STATE: checkingForGreeting
  // Action: Check if there's an unspoken AI message, send event
  useEffect(() => {
    if (!state.matches('checkingForGreeting')) return;

    const assistantMessages = messages.filter((m) => m.role === 'assistant');
    if (assistantMessages.length === 0) {
      send({ type: 'START_LISTENING' });
      return;
    }

    const latest = assistantMessages[assistantMessages.length - 1];
    if (state.context.lastSpokenMessageId === latest.id) {
      send({ type: 'START_LISTENING' });
      return;
    }

    const text = getMessageText(latest);
    if (text) {
      send({ type: 'AI_RESPONSE_RECEIVED', messageId: latest.id, text });
    } else {
      send({ type: 'START_LISTENING' });
    }
  }, [state, messages, send]);

  // STATE: listening
  // Action: Start microphone and WebSocket
  useEffect(() => {
    if (!state.matches('listening')) return;

    let cleanup: (() => void) | null = null;

    (async () => {
      try {
        // Get Deepgram token
        const response = await fetch('/api/voice-token', { method: 'POST' });
        const data = await response.json();
        if (data.error) throw new Error(data.error);

        // Get microphone
        const stream = await navigator.mediaDevices.getUserMedia({ audio: true });

        // Connect WebSocket with VAD and utterance end detection
        const socket = new WebSocket(
          'wss://api.deepgram.com/v1/listen?interim_results=true&punctuate=true&vad_events=true&utterance_end_ms=1000',
          ['token', data.key]
        );
        socketRef.current = socket;

        socket.onopen = () => {
          const mediaRecorder = new MediaRecorder(stream, { mimeType: 'audio/webm' });
          mediaRecorderRef.current = mediaRecorder;

          mediaRecorder.ondataavailable = (event) => {
            if (event.data.size > 0 && socket.readyState === WebSocket.OPEN) {
              socket.send(event.data);
            }
          };

          mediaRecorder.start(250);
        };

        socket.onmessage = (event) => {
          const data = JSON.parse(event.data);

          // Handle UtteranceEnd - Deepgram detected end of utterance
          if (data.type === 'UtteranceEnd') {
            console.log('[Voice] Utterance ended, sending UTTERANCE_END event');
            send({ type: 'UTTERANCE_END' });
            return;
          }

          // Handle transcript events
          if (!data.channel?.alternatives) return;
          const transcript = data.channel.alternatives[0]?.transcript || '';
          if (!transcript) return;

          // Detect if user started or resumed speaking based on receiving a transcript
          console.log('[Voice] Transcript received:', transcript);
          send({ type: 'USER_STARTED_SPEAKING' });

          // Append finalized phrases to the transcript
          if (data.is_final) {
            send({ type: 'FINALIZED_PHRASE', phrase: transcript });
          }
        };

        cleanup = () => {
          socket.close();
          mediaRecorderRef.current?.stop();
          stream.getTracks().forEach((track) => track.stop());
        };
      } catch (error) {
        console.error('[Voice] Error starting listening:', error);
        send({ type: 'ERROR', message: String(error) });
      }
    })();

    // `cleanup` is assigned inside the async IIFE, so return a closure;
    // returning `cleanup || undefined` directly would capture `null` before setup finishes.
    return () => {
      cleanup?.();
    };
  }, [state, send]);

  // STATE: timingOut is now handled by XState's built-in `after` delay
  // No useEffect needed - the state machine automatically transitions after 3 seconds

  // STATE: submittingUser
  // Action: Submit transcript, send event when done
  useEffect(() => {
    if (!state.matches('submittingUser')) return;

    const transcript = state.context.transcript.trim();
    if (!transcript) {
      send({ type: 'ERROR', message: 'No transcript to submit' });
      return;
    }

    // Close WebSocket and stop the recorder
    if (socketRef.current) {
      socketRef.current.close();
      socketRef.current = null;
    }
    if (mediaRecorderRef.current) {
      mediaRecorderRef.current.stop();
      mediaRecorderRef.current = null;
    }

    // Submit
    onSubmit(transcript);
    send({ type: 'USER_MESSAGE_SUBMITTED' });
  }, [state, send, onSubmit]);

  // STATE: waitingForAI
  // Action: Poll messages for AI response
  useEffect(() => {
    if (!state.matches('waitingForAI')) return;
    if (status !== 'ready') return;

    const transcript = state.context.transcript.trim();
    if (!transcript) return;

    // Check if AI has responded to the transcript we just submitted
    const lastMsg = messages[messages.length - 1];
    const secondLastMsg = messages[messages.length - 2];

    if (
      lastMsg &&
      lastMsg.role === 'assistant' &&
      secondLastMsg &&
      secondLastMsg.role === 'user' &&
      getMessageText(secondLastMsg) === transcript
    ) {
      const text = getMessageText(lastMsg);
      if (text) {
        send({ type: 'AI_RESPONSE_RECEIVED', messageId: lastMsg.id, text });
      }
    }
  }, [state, messages, status, send]);

  // STATE: generatingTTS
  // Action: Generate TTS audio
  useEffect(() => {
    if (!state.matches('generatingTTS')) return;

    // Get the AI text from the event that triggered this state
    const assistantMessages = messages.filter((m) => m.role === 'assistant');
    const latest = assistantMessages[assistantMessages.length - 1];
    if (!latest) return;

    const text = getMessageText(latest);
    if (!text) return;

    (async () => {
      try {
        const response = await fetch('/api/tts', {
          method: 'POST',
          headers: { 'Content-Type': 'application/json' },
          body: JSON.stringify({ text }),
        });

        if (!response.ok) throw new Error('TTS generation failed');

        const audioBlob = await response.blob();
        const audioUrl = URL.createObjectURL(audioBlob);

        send({ type: 'TTS_GENERATION_COMPLETE', audioUrl });
      } catch (error) {
        console.error('[Voice] TTS generation error:', error);
        send({ type: 'ERROR', message: String(error) });
      }
    })();
  }, [state, messages, send]);

  // STATE: playingTTS
  // Action: Play audio, send event when finished
  useEffect(() => {
    if (!state.matches('playingTTS')) {
      // Stop audio if we leave this state
      if (audioRef.current) {
        audioRef.current.pause();
        audioRef.current.currentTime = 0;
      }
      return;
    }

    const audioUrl = state.context.audioUrl;
    if (!audioUrl) {
      send({ type: 'ERROR', message: 'No audio URL' });
      return;
    }

    // Create or reuse audio element
    if (!audioRef.current) {
      audioRef.current = new Audio();
    }

    audioRef.current.src = audioUrl;
    audioRef.current.onended = () => {
      URL.revokeObjectURL(audioUrl);
      send({ type: 'TTS_PLAYBACK_FINISHED' });
    };
    audioRef.current.onerror = () => {
      URL.revokeObjectURL(audioUrl);
      send({ type: 'ERROR', message: 'Audio playback error' });
    };

    audioRef.current.play().catch((error) => {
      console.error('[Voice] Audio play error:', error);
      send({ type: 'ERROR', message: String(error) });
    });

    return () => {
      if (audioRef.current) {
        audioRef.current.pause();
        audioRef.current.currentTime = 0;
      }
    };
  }, [state, send]);

  return {
    state,
    send,
    transcript: state.context.transcript,
    error: state.context.error,
  };
}
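
/*
 * Usage sketch (illustrative only): how a chat page might wire this hook up.
 * The surrounding chat state is an assumption - here shown coming from something
 * like the AI SDK's `useChat`, but any source of `messages`, `status`, and a
 * submit handler works. Nothing below is defined in this file.
 *
 *   const { messages, status, sendMessage } = useChat();
 *   const { state, transcript, error } = useVoiceMode({
 *     messages,
 *     status,
 *     onSubmit: (text) => sendMessage({ text }),
 *   });
 *
 *   // `state.value` exposes the current machine state (e.g. 'listening',
 *   // 'playingTTS') for rendering a voice-mode indicator, while `transcript`
 *   // and `error` come straight from the machine's context.
 */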