File: COMMIT_09_VOICE_CLIENT.md

Commit 9: Real-time Voice: Client Integration

Objective

Integrate the client-side microphone recording. This component will:

  1. Call the /api/voice-token route (from Commit 08) to get a temporary key.
  2. Use navigator.mediaDevices.getUserMedia to access the microphone.
  3. Open a direct WebSocket to Deepgram using the temporary key (see the handshake sketch after this list).
  4. Use MediaRecorder to capture audio chunks.
  5. Stream audio chunks to Deepgram and receive transcripts back in real-time.
  6. Feed the received transcripts into the useChat input.
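
Steps 1 and 3 are the only non-standard part of the flow: browsers cannot attach an Authorization header to a WebSocket, so the temporary key is passed via the `token` subprotocol. A minimal sketch of that handshake in isolation (assuming the Commit 08 route responds with `{ key: string }`):

TypeScript

// Sketch only: fetch a short-lived key from our backend, then open the
// Deepgram socket using subprotocol-based auth.
async function openDeepgramSocket(): Promise<WebSocket> {
  const response = await fetch('/api/voice-token', { method: 'POST' });
  const { key } = (await response.json()) as { key: string };

  // Browsers can't set custom headers on a WebSocket, so Deepgram accepts
  // the key as the second entry of the `token` subprotocol pair.
  return new WebSocket(
    'wss://api.deepgram.com/v1/listen?interim_results=true&punctuate=true',
    ['token', key]
  );
}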

Implementation Specification

1. Create components/MicrophoneRecorder.tsx

Create a new component at /components/MicrophoneRecorder.tsx:

TypeScript

'use client';

import { ActionIcon, Tooltip } from '@mantine/core';
import { IconMicrophone, IconMicrophoneOff } from '@tabler/icons-react';
import { useState, useRef } from 'react';

// Define the shape of the Deepgram transcript message.
// Note: `alternatives` is an array; we only read the top alternative (index 0).
interface DeepgramTranscript {
  channel: {
    alternatives: {
      transcript: string;
    }[];
  };
  is_final: boolean;
  speech_final: boolean;
}

type Props = {
  /**
   * Callback function to update the chat input with the new transcript.
   * @param transcript The full, combined transcript.
   */
  onTranscriptUpdate: (transcript: string) => void;
  /**
   * Callback function to signal the final transcript for this "thought".
   * @param transcript The final, punctuated transcript.
   */
  onTranscriptFinalized: (transcript: string) => void;
};

export function MicrophoneRecorder({ onTranscriptUpdate, onTranscriptFinalized }: Props) {
  const [isRecording, setIsRecording] = useState(false);
  const mediaRecorderRef = useRef<MediaRecorder | null>(null);
  const socketRef = useRef<WebSocket | null>(null);

  // Store the combined transcript for the current utterance
  const transcriptRef = useRef<string>('');

  const stopRecording = () => {
    if (mediaRecorderRef.current) {
      mediaRecorderRef.current.stop();
      mediaRecorderRef.current = null;
    }
    if (socketRef.current) {
      socketRef.current.close();
      socketRef.current = null;
    }
    setIsRecording(false);

    // Finalize the transcript
    if (transcriptRef.current) {
      onTranscriptFinalized(transcriptRef.current);
    }
    transcriptRef.current = '';
  };

  const startRecording = async () => {
    transcriptRef.current = ''; // Reset transcript
    try {
      // 1. Get the temporary Deepgram key
      const response = await fetch('/api/voice-token', { method: 'POST' });
      const { key, error } = await response.json();

      if (error) {
        throw new Error(error);
      }

      // 2. Access the microphone
      const stream = await navigator.mediaDevices.getUserMedia({ audio: true });

      // 3. Open direct WebSocket to Deepgram, authenticating via the `token` subprotocol
      const socket = new WebSocket(
        'wss://api.deepgram.com/v1/listen?interim_results=true&punctuate=true',
        ['token', key]
      );
      socketRef.current = socket;

      socket.onopen = () => {
        // 4. Create MediaRecorder
        const mediaRecorder = new MediaRecorder(stream, {
          mimeType: 'audio/webm',
        });
        mediaRecorderRef.current = mediaRecorder;

        // 5. Send audio chunks on data available
        mediaRecorder.ondataavailable = (event) => {
          if (event.data.size > 0 && socket.readyState === WebSocket.OPEN) {
            socket.send(event.data);
          }
        };

        // Start recording and chunking audio every 250ms
        mediaRecorder.start(250);
        setIsRecording(true);
      };

      // 6. Receive transcripts
      socket.onmessage = (event) => {
        const data = JSON.parse(event.data) as DeepgramTranscript;
        const transcript = data.channel.alternatives[0]?.transcript ?? '';

        if (transcript) {
          transcriptRef.current = transcript;
          onTranscriptUpdate(transcript);
        }

        // If it's a "speech final" event, this utterance is done.
        if (data.speech_final) {
          stopRecording();
        }
      };

      socket.onclose = () => {
        // Clean up stream
        stream.getTracks().forEach((track) => track.stop());
        // Check the ref (not React state, which is stale inside this closure)
        if (mediaRecorderRef.current) {
          stopRecording(); // Ensure cleanup
        }
      };

      socket.onerror = (err) => {
        console.error('WebSocket error:', err);
        stopRecording();
      };
    } catch (error) {
      console.error('Error starting recording:', error);
      setIsRecording(false);
    }
  };

  const handleToggleRecord = () => {
    if (isRecording) {
      stopRecording();
    } else {
      startRecording();
    }
  };

  return (
    <Tooltip label={isRecording ? 'Stop Recording' : 'Start Recording'}>
      <ActionIcon
        onClick={handleToggleRecord}
        size="lg"
        radius="xl"
        color={isRecording ? 'red' : 'gray'}
        variant="filled"
      >
        {isRecording ? <IconMicrophoneOff /> : <IconMicrophone />}
      </ActionIcon>
    </Tooltip>
  );
}
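
A note on step 4: hard-coding mimeType: 'audio/webm' works in Chromium and Firefox but not in every browser (older Safari in particular). A small, hedged fallback you could fold into socket.onopen is to probe MediaRecorder.isTypeSupported first; Deepgram generally auto-detects containerized audio, but verify the formats you ship against their docs. The candidate list below is an assumption, not an exhaustive matrix.

TypeScript

// Sketch: pick the first audio container this browser's MediaRecorder supports.
function pickSupportedMimeType(): string | undefined {
  const candidates = ['audio/webm', 'audio/mp4', 'audio/ogg;codecs=opus'];
  return candidates.find((type) => MediaRecorder.isTypeSupported(type));
}

// Usage inside socket.onopen:
// const mediaRecorder = new MediaRecorder(stream, { mimeType: pickSupportedMimeType() });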

2. Update Chat UI (app/chat/page.tsx)

Update /app/chat/page.tsx to include the new component:

TypeScript

'use client';

//... (other imports)
import { MicrophoneRecorder } from '@/components/MicrophoneRecorder';

export default function ChatPage() {
  //... (other hooks: router, viewport)

  const {
    messages,
    input,
    handleInputChange,
    handleSubmit,
    setInput, // Get the setInput setter from useChat
    data,
    isLoading,
  } = useChat({
    //... (rest of useChat config)
  });

  //... (useEffect for auto-scroll)

  return (
    <Container size="md" h="100vh" style={{ display: 'flex', flexDirection: 'column' }}>
      {/*... (Title and ScrollArea)... */}

      <form onSubmit={handleSubmit}>
        <Paper withBorder p="sm" radius="xl" my="md">
          <Group>
            <TextInput
              value={input}
              onChange={handleInputChange}
              placeholder="Speak or type your thoughts..."
              style={{ flex: 1 }}
              variant="unstyled"
              disabled={isLoading}
            />

            {/* Add the Microphone Recorder Here */}
            <MicrophoneRecorder
              onTranscriptUpdate={(transcript) => {
                // Update the input field in real-time
                setInput(transcript);
              }}
              onTranscriptFinalized={(transcript) => {
                // Automatically submit the chat when speech is final
                // We pass the final transcript in the options
                handleSubmit(new Event('submit'), {
                  data: {
                    finalTranscript: transcript,
                  },
                });
              }}
            />

            <Button type="submit" radius="xl" loading={isLoading}>
              Send
            </Button>
          </Group>
        </Paper>
      </form>
    </Container>
  );
}
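
One caveat with the onTranscriptFinalized handler above: the interim setInput calls are asynchronous React state updates, so the auto-submit can occasionally fire before input reflects the final transcript. If that becomes a problem, an alternative sketch (assuming append is also destructured from useChat, which the Vercel AI SDK provides) sends the final transcript directly:

TypeScript

// Sketch only: bypass the text input for the auto-submit path.
<MicrophoneRecorder
  onTranscriptUpdate={(transcript) => setInput(transcript)}
  onTranscriptFinalized={(transcript) => {
    setInput(''); // clear the visible input
    append({ role: 'user', content: transcript }); // submit the utterance as a user message
  }}
/>

Note that this path no longer sends the finalTranscript request data, so only swap it in if nothing downstream reads that field.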

Test Specification

1. Create Test File (tests/magnitude/09-voice.mag.ts)

Create a file at /tests/magnitude/09-voice.mag.ts:

TypeScript

import { test } from 'magnitude-test';

test('[Happy Path] User can record voice and see transcript', async (agent) => {
  // Act: Go to chat page
  await agent.act('Navigate to /chat');

  // Check: Verify initial state
  await agent.check('The chat input field is empty');
  await agent.check('A "Start Recording" button is visible');

  // Act: Click the record button
  // We must mock the /api/voice-token response and the
  // MediaDevices/WebSocket browser APIs.
  await agent.act('Click the "Start Recording" button');

  // Check: UI updates to recording state
  await agent.check('A "Stop Recording" button is visible');

  // Act: Simulate receiving a transcript from the (mocked) Deepgram WebSocket
  await agent.act(
    'Simulate an interim transcript "Hello world" from the Deepgram WebSocket'
  );

  // Check: The input field is updated
  await agent.check('The chat input field contains "Hello world"');

  // Act: Simulate a final transcript
  await agent.act(
    'Simulate a final transcript "Hello world." from the Deepgram WebSocket'
  );

  // Check: The "Stop Recording" button is gone
  await agent.check('A "Start Recording" button is visible again');

  // Check: The chat input is cleared (because it was submitted)
  await agent.check('The chat input field is empty');

  // Check: The finalized transcript appears as a user message
  await agent.check('The message "Hello world." appears in the chat list');
});
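
The "Simulate ... from the Deepgram WebSocket" steps only make sense if the browser APIs are stubbed before the page loads. A hedged sketch of those stubs follows; how they get injected is harness-specific (for example, a Playwright-style init script), the FakeSocket class and __emitTranscript hook are test-only inventions rather than part of the app, and the /api/voice-token route would still need to be mocked separately.

TypeScript

// Runs in the page context before the app boots. Everything here is a stub.
(navigator.mediaDevices as any).getUserMedia = async () => new MediaStream();

// Stand-in recorder so no real audio pipeline is required.
(window as any).MediaRecorder = class {
  ondataavailable: ((event: unknown) => void) | null = null;
  constructor(_stream?: unknown, _options?: unknown) {}
  start(_timesliceMs?: number) {}
  stop() {}
};

// Stand-in WebSocket that remembers the last instance so the test can push
// fake Deepgram messages into it.
class FakeSocket {
  static OPEN = 1;
  readyState = FakeSocket.OPEN;
  onopen: (() => void) | null = null;
  onmessage: ((event: { data: string }) => void) | null = null;
  onclose: (() => void) | null = null;
  onerror: ((event: unknown) => void) | null = null;
  constructor(_url?: string, _protocols?: string[]) {
    (window as any).__lastSocket = this;
    setTimeout(() => this.onopen?.(), 0);
  }
  send(_chunk: unknown) {}
  close() { this.onclose?.(); }
}
(window as any).WebSocket = FakeSocket;

// Hook backing the "Simulate ... transcript" test steps.
(window as any).__emitTranscript = (transcript: string, speechFinal: boolean) => {
  (window as any).__lastSocket.onmessage?.({
    data: JSON.stringify({
      channel: { alternatives: [{ transcript }] },
      is_final: speechFinal,
      speech_final: speechFinal,
    }),
  });
};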