# **File: COMMIT_09_VOICE_CLIENT.md**

## **Commit 9: Real-time Voice: Client Integration**

### **Objective**

Integrate client-side microphone recording. This component will:

1. Call the /api/voice-token route (from Commit 08) to get a temporary key (the assumed response shape is sketched just after this list).
2. Use navigator.mediaDevices.getUserMedia to access the microphone.
3. Open a *direct* WebSocket to Deepgram using the temporary key.
4. Use MediaRecorder to capture audio chunks.
5. Stream audio chunks to Deepgram and receive transcripts back in real time.
6. Feed the received transcripts into the useChat input.
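
For context, the component below destructures { key, error } from the /api/voice-token response. A minimal sketch of the response shape this assumes (the route itself was built in Commit 08; the field names here simply mirror the destructuring and are an assumption, not a confirmed contract):

```typescript
// Assumed response body of POST /api/voice-token (from Commit 08).
// Exactly one of the two fields is expected to be present.
interface VoiceTokenResponse {
  /** Temporary Deepgram key used to authenticate the browser's WebSocket. */
  key?: string;
  /** Error message returned when a key could not be issued. */
  error?: string;
}
```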

### **Implementation Specification**

**1. Create components/MicrophoneRecorder.tsx**

Create a new component at /components/MicrophoneRecorder.tsx:

```typescript
'use client';

import { ActionIcon, Tooltip } from '@mantine/core';
import { IconMicrophone, IconMicrophoneOff } from '@tabler/icons-react';
import { useState, useRef } from 'react';

// Define the shape of the Deepgram transcript (only the fields we read).
interface DeepgramTranscript {
  channel: {
    // Deepgram returns an array of alternatives; we read the first (best) one.
    alternatives: {
      transcript: string;
    }[];
  };
  is_final: boolean;
  speech_final: boolean;
}

type Props = {
  /**
   * Callback function to update the chat input with the new transcript.
   * @param transcript The full, combined transcript.
   */
  onTranscriptUpdate: (transcript: string) => void;
  /**
   * Callback function to signal the final transcript for this "thought".
   * @param transcript The final, punctuated transcript.
   */
  onTranscriptFinalized: (transcript: string) => void;
};

export function MicrophoneRecorder({ onTranscriptUpdate, onTranscriptFinalized }: Props) {
  const [isRecording, setIsRecording] = useState(false);
  const mediaRecorderRef = useRef<MediaRecorder | null>(null);
  const socketRef = useRef<WebSocket | null>(null);

  // Store the combined transcript for the current utterance
  const transcriptRef = useRef<string>('');

  const stopRecording = () => {
    if (mediaRecorderRef.current) {
      mediaRecorderRef.current.stop();
      mediaRecorderRef.current = null;
    }
    if (socketRef.current) {
      socketRef.current.close();
      socketRef.current = null;
    }
    setIsRecording(false);

    // Finalize the transcript
    if (transcriptRef.current) {
      onTranscriptFinalized(transcriptRef.current);
    }
    transcriptRef.current = '';
  };

  const startRecording = async () => {
    transcriptRef.current = ''; // Reset transcript
    try {
      // 1. Get the temporary Deepgram key
      const response = await fetch('/api/voice-token', { method: 'POST' });
      const { key, error } = await response.json();

      if (error) {
        throw new Error(error);
      }

      // 2. Access the microphone
      const stream = await navigator.mediaDevices.getUserMedia({ audio: true });

      // 3. Open direct WebSocket to Deepgram, authenticating via the subprotocol
      const socket = new WebSocket(
        'wss://api.deepgram.com/v1/listen?interim_results=true&punctuate=true',
        ['token', key]
      );
      socketRef.current = socket;

      socket.onopen = () => {
        // 4. Create MediaRecorder
        const mediaRecorder = new MediaRecorder(stream, {
          mimeType: 'audio/webm',
        });
        mediaRecorderRef.current = mediaRecorder;

        // 5. Send audio chunks on data available
        mediaRecorder.ondataavailable = (event) => {
          if (event.data.size > 0 && socket.readyState === WebSocket.OPEN) {
            socket.send(event.data);
          }
        };

        // Start recording and chunking audio every 250ms
        mediaRecorder.start(250);
        setIsRecording(true);
      };

      // 6. Receive transcripts
      socket.onmessage = (event) => {
        const data = JSON.parse(event.data) as DeepgramTranscript;
        const transcript = data.channel.alternatives[0]?.transcript;

        if (transcript) {
          transcriptRef.current = transcript;
          onTranscriptUpdate(transcript);
        }

        // If it's a "speech final" event, this utterance is done.
        if (data.speech_final) {
          stopRecording();
        }
      };

      socket.onclose = () => {
        // Clean up stream
        stream.getTracks().forEach((track) => track.stop());
        if (isRecording) {
          stopRecording(); // Ensure cleanup
        }
      };

      socket.onerror = (err) => {
        console.error('WebSocket error:', err);
        stopRecording();
      };
    } catch (error) {
      console.error('Error starting recording:', error);
      setIsRecording(false);
    }
  };

  const handleToggleRecord = () => {
    if (isRecording) {
      stopRecording();
    } else {
      startRecording();
    }
  };

  return (
    <Tooltip label={isRecording ? 'Stop Recording' : 'Start Recording'}>
      <ActionIcon
        onClick={handleToggleRecord}
        size="lg"
        radius="xl"
        color={isRecording ? 'red' : 'gray'}
        variant="filled"
      >
        {isRecording ? <IconMicrophoneOff /> : <IconMicrophone />}
      </ActionIcon>
    </Tooltip>
  );
}
```
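
For reference, the messages arriving in socket.onmessage are JSON shaped like the following, trimmed here to the fields the component reads and typed with the DeepgramTranscript interface above; the values are illustrative, and real messages carry additional fields (words, confidence, timing) that this component ignores:

```typescript
// Illustrative, trimmed Deepgram live-transcription message.
const exampleMessage: DeepgramTranscript = {
  channel: {
    alternatives: [{ transcript: 'Hello world.' }],
  },
  is_final: true,      // this result will not be revised further
  speech_final: true,  // Deepgram detected the end of the utterance
};
```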

**2. Update Chat UI (app/chat/page.tsx)**

Update /app/chat/page.tsx to include the new component:

```typescript
'use client';

// ... (other imports)
import { MicrophoneRecorder } from '@/components/MicrophoneRecorder';

export default function ChatPage() {
  // ... (other hooks: router, viewport)

  const {
    messages,
    input,
    handleInputChange,
    handleSubmit,
    setInput, // Get the setInput setter from useChat
    data,
    isLoading,
  } = useChat({
    // ... (rest of useChat config)
  });

  // ... (useEffect for auto-scroll)

  return (
    <Container size="md" h="100vh" style={{ display: 'flex', flexDirection: 'column' }}>
      {/* ... (Title and ScrollArea) ... */}

      <form onSubmit={handleSubmit}>
        <Paper withBorder p="sm" radius="xl" my="md">
          <Group>
            <TextInput
              value={input}
              onChange={handleInputChange}
              placeholder="Speak or type your thoughts..."
              style={{ flex: 1 }}
              variant="unstyled"
              disabled={isLoading}
            />

            {/* Add the Microphone Recorder here */}
            <MicrophoneRecorder
              onTranscriptUpdate={(transcript) => {
                // Update the input field in real-time
                setInput(transcript);
              }}
              onTranscriptFinalized={(transcript) => {
                // Automatically submit the chat when speech is final.
                // We pass the final transcript in the options.
                handleSubmit(new Event('submit'), {
                  data: {
                    finalTranscript: transcript,
                  },
                });
              }}
            />

            <Button type="submit" radius="xl" loading={isLoading}>
              Send
            </Button>
          </Group>
        </Paper>
      </form>
    </Container>
  );
}
```
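
With this wiring, interim transcripts stream into the text box through setInput, and the utterance is submitted automatically once Deepgram marks it speech_final, while the Send button still handles typed input. One assumption to verify against the installed AI SDK version: handleSubmit is typed to take a React form event, so the synthetic new Event('submit') may need a cast to React's form event type to satisfy TypeScript.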

### **Test Specification**

**1. Create Test File (tests/magnitude/09-voice.mag.ts)**

Create a file at /tests/magnitude/09-voice.mag.ts:

```typescript
import { test } from 'magnitude-test';

test('[Happy Path] User can record voice and see transcript', async (agent) => {
  // Act: Go to chat page
  await agent.act('Navigate to /chat');

  // Check: Verify initial state
  await agent.check('The chat input field is empty');
  await agent.check('A "Start Recording" button is visible');

  // Act: Click the record button
  // We must mock the /api/voice-token response and the
  // MediaDevices/WebSocket browser APIs (see the sketch after this file).
  await agent.act('Click the "Start Recording" button');

  // Check: UI updates to recording state
  await agent.check('A "Stop Recording" button is visible');

  // Act: Simulate receiving a transcript from the (mocked) Deepgram WebSocket
  await agent.act(
    'Simulate an interim transcript "Hello world" from the Deepgram WebSocket'
  );

  // Check: The input field is updated
  await agent.check('The chat input field contains "Hello world"');

  // Act: Simulate a final transcript
  await agent.act(
    'Simulate a final transcript "Hello world." from the Deepgram WebSocket'
  );

  // Check: The "Stop Recording" button is gone
  await agent.check('A "Start Recording" button is visible again');

  // Check: The chat input is cleared (because it was submitted)
  await agent.check('The chat input field is empty');

  // Check: The finalized transcript appears as a user message
  await agent.check('The message "Hello world." appears in the chat list');
});
```
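
The test assumes the /api/voice-token response and the browser media/WebSocket APIs are mocked. How mocks are injected is left to the test runner and is not specified here; the following is only a sketch of the browser-side stubs such a mock could install before the page loads. All names are illustrative, and a fuller version would let the test drive interim and final messages separately instead of hard-coding one final transcript.

```typescript
// Sketch: browser-side stubs for the voice test. Run this in the page context
// before the app loads; the injection mechanism depends on the test runner.
export function installVoiceMocks() {
  // Pretend the user granted microphone access.
  navigator.mediaDevices.getUserMedia = async () => new MediaStream();

  // MediaRecorder stub: emits one fake audio chunk as soon as recording starts.
  class FakeMediaRecorder {
    ondataavailable: ((ev: { data: Blob }) => void) | null = null;
    constructor(_stream: MediaStream, _options?: unknown) {}
    start(_timesliceMs?: number) {
      this.ondataavailable?.({ data: new Blob(['fake-audio'], { type: 'audio/webm' }) });
    }
    stop() {}
  }
  (window as any).MediaRecorder = FakeMediaRecorder;

  // WebSocket stub: "opens" immediately and answers the first audio chunk with a
  // canned, Deepgram-shaped final transcript. The static OPEN constant keeps the
  // component's readyState === WebSocket.OPEN check working.
  class FakeDeepgramSocket {
    static readonly OPEN = 1;
    readyState = FakeDeepgramSocket.OPEN;
    onopen: (() => void) | null = null;
    onmessage: ((ev: { data: string }) => void) | null = null;
    onclose: (() => void) | null = null;
    onerror: ((err: unknown) => void) | null = null;

    constructor(_url: string, _protocols?: string | string[]) {
      // Fire onopen after the component has attached its handlers.
      queueMicrotask(() => this.onopen?.());
    }

    send(_chunk: Blob) {
      this.onmessage?.({
        data: JSON.stringify({
          channel: { alternatives: [{ transcript: 'Hello world.' }] },
          is_final: true,
          speech_final: true,
        }),
      });
    }

    close() {
      this.onclose?.();
    }
  }
  (window as any).WebSocket = FakeDeepgramSocket;
}
```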