/*
 * NOTE(review): the lines below were commit-message / file-listing residue
 * accidentally pasted into this source file ("Increase logo size…",
 * "205 lines / 5.1 KiB / TypeScript"). They are preserved here inside a
 * comment so the file compiles; confirm and remove if not needed.
 *
 * - Increase logo size (48x48 desktop, 56x56 mobile) for better visibility
 * - Add logo as favicon
 * - Add logo to mobile header
 * - Move user menu to navigation bars (sidebar on desktop, bottom bar on mobile)
 * - Fix desktop chat layout - container structure prevents voice controls cutoff
 * - Fix mobile bottom bar - use icon-only ActionIcons instead of truncated text buttons
 * - Hide Create Node/New Conversation buttons on mobile to save header space
 * - Make fixed header and voice controls work properly with containers
 */

/**
 * Voice Mode State Machine - Clean, Canonical Design
 *
 * This machine represents the voice conversation flow.
 * All logic is in the machine definition, not in React effects.
 */
// NOTE(review): `fromPromise` is imported but not used in this file — confirm
// whether an invoked actor was planned before removing it.
import { setup, assign, fromPromise } from 'xstate';
interface VoiceContext {
|
|
transcript: string;
|
|
lastSpokenMessageId: string | null;
|
|
error: string | null;
|
|
audioUrl: string | null;
|
|
aiText: string | null;
|
|
}
|
|
|
|
type VoiceEvent =
|
|
| { type: 'START_VOICE' }
|
|
| { type: 'STOP_VOICE' }
|
|
| { type: 'START_LISTENING' }
|
|
| { type: 'USER_STARTED_SPEAKING' }
|
|
| { type: 'FINALIZED_PHRASE'; phrase: string }
|
|
| { type: 'UTTERANCE_END' }
|
|
| { type: 'SILENCE_TIMEOUT' }
|
|
| { type: 'USER_MESSAGE_SUBMITTED' }
|
|
| { type: 'AI_RESPONSE_RECEIVED'; messageId: string; text: string }
|
|
| { type: 'TTS_GENERATION_COMPLETE'; audioUrl: string }
|
|
| { type: 'TTS_PLAYBACK_STARTED' }
|
|
| { type: 'TTS_PLAYBACK_FINISHED' }
|
|
| { type: 'SKIP_AUDIO' }
|
|
| { type: 'ERROR'; message: string };
|
|
|
|
export const voiceMachine = setup({
|
|
types: {
|
|
context: {} as VoiceContext,
|
|
events: {} as VoiceEvent,
|
|
},
|
|
actions: {
|
|
setTranscript: assign({
|
|
transcript: ({ event }) =>
|
|
event.type === 'FINALIZED_PHRASE' ? event.phrase : '',
|
|
}),
|
|
appendPhrase: assign({
|
|
transcript: ({ context, event }) =>
|
|
event.type === 'FINALIZED_PHRASE'
|
|
? context.transcript + (context.transcript ? ' ' : '') + event.phrase
|
|
: context.transcript,
|
|
}),
|
|
clearTranscript: assign({
|
|
transcript: '',
|
|
}),
|
|
setLastSpoken: assign({
|
|
lastSpokenMessageId: ({ event }) =>
|
|
event.type === 'AI_RESPONSE_RECEIVED' ? event.messageId : null,
|
|
aiText: ({ event }) =>
|
|
event.type === 'AI_RESPONSE_RECEIVED' ? event.text : null,
|
|
}),
|
|
setAudioUrl: assign({
|
|
audioUrl: ({ event }) =>
|
|
event.type === 'TTS_GENERATION_COMPLETE' ? event.audioUrl : null,
|
|
}),
|
|
clearAudio: assign({
|
|
audioUrl: null,
|
|
aiText: null,
|
|
}),
|
|
setError: assign({
|
|
error: ({ event }) => (event.type === 'ERROR' ? event.message : null),
|
|
}),
|
|
clearError: assign({
|
|
error: null,
|
|
}),
|
|
},
|
|
}).createMachine({
|
|
id: 'voice',
|
|
initial: 'idle',
|
|
context: {
|
|
transcript: '',
|
|
lastSpokenMessageId: null,
|
|
error: null,
|
|
audioUrl: null,
|
|
aiText: null,
|
|
},
|
|
states: {
|
|
idle: {
|
|
tags: ['voiceIdle'],
|
|
on: {
|
|
START_VOICE: 'checkingForGreeting',
|
|
STOP_VOICE: 'idle',
|
|
},
|
|
},
|
|
|
|
checkingForGreeting: {
|
|
tags: ['checking'],
|
|
// This state checks if there's an unspoken AI message
|
|
// In React, an effect will check messages and send appropriate event
|
|
on: {
|
|
AI_RESPONSE_RECEIVED: {
|
|
target: 'generatingTTS',
|
|
actions: 'setLastSpoken',
|
|
},
|
|
START_LISTENING: 'listening',
|
|
},
|
|
},
|
|
|
|
listening: {
|
|
tags: ['listening'],
|
|
entry: ['clearTranscript', 'clearAudio'],
|
|
on: {
|
|
USER_STARTED_SPEAKING: 'userSpeaking',
|
|
STOP_VOICE: 'idle',
|
|
},
|
|
},
|
|
|
|
userSpeaking: {
|
|
tags: ['userSpeaking'],
|
|
on: {
|
|
FINALIZED_PHRASE: {
|
|
target: 'userSpeaking',
|
|
actions: 'appendPhrase',
|
|
reenter: true,
|
|
},
|
|
UTTERANCE_END: 'timingOut',
|
|
STOP_VOICE: 'idle',
|
|
},
|
|
},
|
|
|
|
timingOut: {
|
|
tags: ['timingOut'],
|
|
entry: () => console.log('[Voice Machine] Entered timingOut state, 3-second timer starting'),
|
|
after: {
|
|
3000: {
|
|
target: 'submittingUser',
|
|
actions: () => console.log('[Voice Machine] 3 seconds elapsed, transitioning to submittingUser'),
|
|
},
|
|
},
|
|
on: {
|
|
USER_STARTED_SPEAKING: 'userSpeaking', // User started talking again, cancel timeout
|
|
// Don't handle FINALIZED_PHRASE here - just let the timer run
|
|
STOP_VOICE: 'idle',
|
|
},
|
|
},
|
|
|
|
submittingUser: {
|
|
tags: ['submitting'],
|
|
// React effect submits the transcript
|
|
on: {
|
|
USER_MESSAGE_SUBMITTED: 'waitingForAI',
|
|
ERROR: {
|
|
target: 'idle',
|
|
actions: 'setError',
|
|
},
|
|
STOP_VOICE: 'idle',
|
|
},
|
|
},
|
|
|
|
waitingForAI: {
|
|
tags: ['waitingForAI'],
|
|
// React effect polls/waits for AI response
|
|
on: {
|
|
AI_RESPONSE_RECEIVED: {
|
|
target: 'generatingTTS',
|
|
actions: 'setLastSpoken',
|
|
},
|
|
ERROR: {
|
|
target: 'idle',
|
|
actions: 'setError',
|
|
},
|
|
STOP_VOICE: 'idle',
|
|
},
|
|
},
|
|
|
|
generatingTTS: {
|
|
tags: ['aiGenerating', 'canSkipAudio'],
|
|
// React effect generates TTS
|
|
on: {
|
|
TTS_GENERATION_COMPLETE: {
|
|
target: 'playingTTS',
|
|
actions: 'setAudioUrl',
|
|
},
|
|
SKIP_AUDIO: 'listening',
|
|
ERROR: {
|
|
target: 'listening',
|
|
actions: 'setError',
|
|
},
|
|
STOP_VOICE: 'idle',
|
|
},
|
|
},
|
|
|
|
playingTTS: {
|
|
tags: ['aiSpeaking', 'canSkipAudio'],
|
|
// React effect plays audio
|
|
on: {
|
|
TTS_PLAYBACK_FINISHED: 'listening',
|
|
SKIP_AUDIO: 'listening',
|
|
ERROR: {
|
|
target: 'listening',
|
|
actions: 'setError',
|
|
},
|
|
STOP_VOICE: 'idle',
|
|
},
|
|
},
|
|
},
|
|
});
|