# **File: COMMIT_09_VOICE_CLIENT.md**

## **Commit 9: Real-time Voice: Client Integration**

### **Objective**

Integrate client-side microphone recording. This component will:

1. Call the /api/voice-token route (from Commit 08) to get a temporary key (the assumed response shape is sketched just after this list).
2. Use navigator.mediaDevices.getUserMedia to access the microphone.
3. Open a *direct* WebSocket to Deepgram using the temporary key.
4. Use MediaRecorder to capture audio chunks.
5. Stream audio chunks to Deepgram and receive transcripts back in real time.
6. Feed the received transcripts into the useChat input.
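
For context, the component below destructures { key, error } from the /api/voice-token response. A minimal sketch of the response shape this assumes (the route itself was built in Commit 08; the field names here simply mirror the destructuring and are an assumption, not a confirmed contract):

```typescript
// Assumed response body of POST /api/voice-token (from Commit 08).
// Exactly one of the two fields is expected to be present.
interface VoiceTokenResponse {
  /** Temporary Deepgram key used to authenticate the browser's WebSocket. */
  key?: string;
  /** Error message returned when a key could not be issued. */
  error?: string;
}
```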

### **Implementation Specification**

**1. Create components/MicrophoneRecorder.tsx**

Create a new component at /components/MicrophoneRecorder.tsx:

```typescript
'use client';

import { ActionIcon, Tooltip } from '@mantine/core';
import { IconMicrophone, IconMicrophoneOff } from '@tabler/icons-react';
import { useState, useRef } from 'react';

// Define the shape of the Deepgram transcript (only the fields we read).
interface DeepgramTranscript {
  channel: {
    // Deepgram returns an array of alternatives; we read the first (best) one.
    alternatives: {
      transcript: string;
    }[];
  };
  is_final: boolean;
  speech_final: boolean;
}

type Props = {
  /**
   * Callback function to update the chat input with the new transcript.
   * @param transcript The full, combined transcript.
   */
  onTranscriptUpdate: (transcript: string) => void;
  /**
   * Callback function to signal the final transcript for this "thought".
   * @param transcript The final, punctuated transcript.
   */
  onTranscriptFinalized: (transcript: string) => void;
};

export function MicrophoneRecorder({ onTranscriptUpdate, onTranscriptFinalized }: Props) {
  const [isRecording, setIsRecording] = useState(false);
  const mediaRecorderRef = useRef<MediaRecorder | null>(null);
  const socketRef = useRef<WebSocket | null>(null);

  // Store the combined transcript for the current utterance
  const transcriptRef = useRef<string>('');

  const stopRecording = () => {
    if (mediaRecorderRef.current) {
      mediaRecorderRef.current.stop();
      mediaRecorderRef.current = null;
    }
    if (socketRef.current) {
      socketRef.current.close();
      socketRef.current = null;
    }
    setIsRecording(false);

    // Finalize the transcript
    if (transcriptRef.current) {
      onTranscriptFinalized(transcriptRef.current);
    }
    transcriptRef.current = '';
  };

  const startRecording = async () => {
    transcriptRef.current = ''; // Reset transcript
    try {
      // 1. Get the temporary Deepgram key
      const response = await fetch('/api/voice-token', { method: 'POST' });
      const { key, error } = await response.json();

      if (error) {
        throw new Error(error);
      }

      // 2. Access the microphone
      const stream = await navigator.mediaDevices.getUserMedia({ audio: true });

      // 3. Open direct WebSocket to Deepgram, authenticating via the subprotocol
      const socket = new WebSocket(
        'wss://api.deepgram.com/v1/listen?interim_results=true&punctuate=true',
        ['token', key]
      );
      socketRef.current = socket;

      socket.onopen = () => {
        // 4. Create MediaRecorder
        const mediaRecorder = new MediaRecorder(stream, {
          mimeType: 'audio/webm',
        });
        mediaRecorderRef.current = mediaRecorder;

        // 5. Send audio chunks on data available
        mediaRecorder.ondataavailable = (event) => {
          if (event.data.size > 0 && socket.readyState === WebSocket.OPEN) {
            socket.send(event.data);
          }
        };

        // Start recording and chunking audio every 250ms
        mediaRecorder.start(250);
        setIsRecording(true);
      };

      // 6. Receive transcripts
      socket.onmessage = (event) => {
        const data = JSON.parse(event.data) as DeepgramTranscript;
        const transcript = data.channel.alternatives[0]?.transcript;

        if (transcript) {
          transcriptRef.current = transcript;
          onTranscriptUpdate(transcript);
        }

        // If it's a "speech final" event, this utterance is done.
        if (data.speech_final) {
          stopRecording();
        }
      };

      socket.onclose = () => {
        // Clean up stream
        stream.getTracks().forEach((track) => track.stop());
        if (isRecording) {
          stopRecording(); // Ensure cleanup
        }
      };

      socket.onerror = (err) => {
        console.error('WebSocket error:', err);
        stopRecording();
      };
    } catch (error) {
      console.error('Error starting recording:', error);
      setIsRecording(false);
    }
  };

  const handleToggleRecord = () => {
    if (isRecording) {
      stopRecording();
    } else {
      startRecording();
    }
  };

  return (
    <Tooltip label={isRecording ? 'Stop Recording' : 'Start Recording'}>
      <ActionIcon
        onClick={handleToggleRecord}
        size="lg"
        radius="xl"
        color={isRecording ? 'red' : 'gray'}
        variant="filled"
      >
        {isRecording ? <IconMicrophoneOff /> : <IconMicrophone />}
      </ActionIcon>
    </Tooltip>
  );
}
```
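
For reference, the messages arriving in socket.onmessage are JSON shaped like the following, trimmed here to the fields the component reads and typed with the DeepgramTranscript interface above; the values are illustrative, and real messages carry additional fields (words, confidence, timing) that this component ignores:

```typescript
// Illustrative, trimmed Deepgram live-transcription message.
const exampleMessage: DeepgramTranscript = {
  channel: {
    alternatives: [{ transcript: 'Hello world.' }],
  },
  is_final: true,      // this result will not be revised further
  speech_final: true,  // Deepgram detected the end of the utterance
};
```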

**2. Update Chat UI (app/chat/page.tsx)**

Update /app/chat/page.tsx to include the new component:

```typescript
'use client';

// ... (other imports)
import { MicrophoneRecorder } from '@/components/MicrophoneRecorder';

export default function ChatPage() {
  // ... (other hooks: router, viewport)

  const {
    messages,
    input,
    handleInputChange,
    handleSubmit,
    setInput, // Get the setInput setter from useChat
    data,
    isLoading,
  } = useChat({
    // ... (rest of useChat config)
  });

  // ... (useEffect for auto-scroll)

  return (
    <Container size="md" h="100vh" style={{ display: 'flex', flexDirection: 'column' }}>
      {/* ... (Title and ScrollArea) ... */}

      <form onSubmit={handleSubmit}>
        <Paper withBorder p="sm" radius="xl" my="md">
          <Group>
            <TextInput
              value={input}
              onChange={handleInputChange}
              placeholder="Speak or type your thoughts..."
              style={{ flex: 1 }}
              variant="unstyled"
              disabled={isLoading}
            />

            {/* Add the Microphone Recorder here */}
            <MicrophoneRecorder
              onTranscriptUpdate={(transcript) => {
                // Update the input field in real-time
                setInput(transcript);
              }}
              onTranscriptFinalized={(transcript) => {
                // Automatically submit the chat when speech is final.
                // We pass the final transcript in the options.
                handleSubmit(new Event('submit'), {
                  data: {
                    finalTranscript: transcript,
                  },
                });
              }}
            />

            <Button type="submit" radius="xl" loading={isLoading}>
              Send
            </Button>
          </Group>
        </Paper>
      </form>
    </Container>
  );
}
```
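
With this wiring, interim transcripts stream into the text box through setInput, and the utterance is submitted automatically once Deepgram marks it speech_final, while the Send button still handles typed input. One assumption to verify against the installed AI SDK version: handleSubmit is typed to take a React form event, so the synthetic new Event('submit') may need a cast to React's form event type to satisfy TypeScript.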

### **Test Specification**

**1. Create Test File (tests/magnitude/09-voice.mag.ts)**

Create a file at /tests/magnitude/09-voice.mag.ts:

```typescript
import { test } from 'magnitude-test';

test('[Happy Path] User can record voice and see transcript', async (agent) => {
  // Act: Go to chat page
  await agent.act('Navigate to /chat');

  // Check: Verify initial state
  await agent.check('The chat input field is empty');
  await agent.check('A "Start Recording" button is visible');

  // Act: Click the record button
  // We must mock the /api/voice-token response and the
  // MediaDevices/WebSocket browser APIs (see the sketch after this file).
  await agent.act('Click the "Start Recording" button');

  // Check: UI updates to recording state
  await agent.check('A "Stop Recording" button is visible');

  // Act: Simulate receiving a transcript from the (mocked) Deepgram WebSocket
  await agent.act(
    'Simulate an interim transcript "Hello world" from the Deepgram WebSocket'
  );

  // Check: The input field is updated
  await agent.check('The chat input field contains "Hello world"');

  // Act: Simulate a final transcript
  await agent.act(
    'Simulate a final transcript "Hello world." from the Deepgram WebSocket'
  );

  // Check: The "Stop Recording" button is gone
  await agent.check('A "Start Recording" button is visible again');

  // Check: The chat input is cleared (because it was submitted)
  await agent.check('The chat input field is empty');

  // Check: The finalized transcript appears as a user message
  await agent.check('The message "Hello world." appears in the chat list');
});
```
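
The test assumes the /api/voice-token response and the browser media/WebSocket APIs are mocked. How mocks are injected is left to the test runner and is not specified here; the following is only a sketch of the browser-side stubs such a mock could install before the page loads. All names are illustrative, and a fuller version would let the test drive interim and final messages separately instead of hard-coding one final transcript.

```typescript
// Sketch: browser-side stubs for the voice test. Run this in the page context
// before the app loads; the injection mechanism depends on the test runner.
export function installVoiceMocks() {
  // Pretend the user granted microphone access.
  navigator.mediaDevices.getUserMedia = async () => new MediaStream();

  // MediaRecorder stub: emits one fake audio chunk as soon as recording starts.
  class FakeMediaRecorder {
    ondataavailable: ((ev: { data: Blob }) => void) | null = null;
    constructor(_stream: MediaStream, _options?: unknown) {}
    start(_timesliceMs?: number) {
      this.ondataavailable?.({ data: new Blob(['fake-audio'], { type: 'audio/webm' }) });
    }
    stop() {}
  }
  (window as any).MediaRecorder = FakeMediaRecorder;

  // WebSocket stub: "opens" immediately and answers the first audio chunk with a
  // canned, Deepgram-shaped final transcript. The static OPEN constant keeps the
  // component's readyState === WebSocket.OPEN check working.
  class FakeDeepgramSocket {
    static readonly OPEN = 1;
    readyState = FakeDeepgramSocket.OPEN;
    onopen: (() => void) | null = null;
    onmessage: ((ev: { data: string }) => void) | null = null;
    onclose: (() => void) | null = null;
    onerror: ((err: unknown) => void) | null = null;

    constructor(_url: string, _protocols?: string | string[]) {
      // Fire onopen after the component has attached its handlers.
      queueMicrotask(() => this.onopen?.());
    }

    send(_chunk: Blob) {
      this.onmessage?.({
        data: JSON.stringify({
          channel: { alternatives: [{ transcript: 'Hello world.' }] },
          is_final: true,
          speech_final: true,
        }),
      });
    }

    close() {
      this.onclose?.();
    }
  }
  (window as any).WebSocket = FakeDeepgramSocket;
}
```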