first commit
agent-livekit/voice_handler.py (new file, 261 lines)
@@ -0,0 +1,261 @@
"""
Voice Handler for LiveKit Agent

This module handles speech recognition and text-to-speech functionality
for the LiveKit Chrome automation agent.
"""

import asyncio
import logging
import io
import wave
from typing import Optional, Dict, Any
import numpy as np

from livekit import rtc
from livekit.plugins import openai, deepgram


class VoiceHandler:
    """Handles voice recognition and synthesis for the LiveKit agent"""

    def __init__(self, config: Optional[Dict[str, Any]] = None):
        self.config = config or {}
        self.logger = logging.getLogger(__name__)

        # Speech recognition settings
        self.stt_provider = self.config.get('speech', {}).get('provider', 'openai')
        self.language = self.config.get('speech', {}).get('language', 'en-US')
        self.confidence_threshold = self.config.get('speech', {}).get('confidence_threshold', 0.7)

        # Text-to-speech settings
        self.tts_provider = self.config.get('tts', {}).get('provider', 'openai')
        self.voice = self.config.get('tts', {}).get('voice', 'alloy')
        self.speed = self.config.get('tts', {}).get('speed', 1.0)

        # Audio processing
        self.sample_rate = 16000
        self.channels = 1
        self.chunk_size = 1024

        # Components
        self.stt_engine = None
        self.tts_engine = None
        self.audio_buffer = []

    async def initialize(self):
        """Initialize speech recognition and synthesis engines"""
        try:
            # Check if OpenAI API key is available
            import os
            openai_key = os.getenv('OPENAI_API_KEY')

            # Initialize STT engine
            if self.stt_provider == 'openai' and openai_key:
                self.stt_engine = openai.STT(
                    language=self.language,
                    detect_language=True
                )
            elif self.stt_provider == 'deepgram':
                self.stt_engine = deepgram.STT(
                    language=self.language,
                    model="nova-2"
                )
            else:
                self.logger.warning(f"STT provider {self.stt_provider} not available or API key missing")

            # Initialize TTS engine
            if self.tts_provider == 'openai' and openai_key:
                self.tts_engine = openai.TTS(
                    voice=self.voice,
                    speed=self.speed
                )
            else:
                self.logger.warning(f"TTS provider {self.tts_provider} not available or API key missing")

            self.logger.info(f"Voice handler initialized with STT: {self.stt_provider}, TTS: {self.tts_provider}")

        except Exception as e:
            # Don't raise; the agent can still run without voice support (e.g. missing API keys)
            self.logger.warning(f"Voice handler initialization failed (expected without API keys): {e}")

    async def process_audio_frame(self, frame: rtc.AudioFrame) -> Optional[str]:
        """Process an audio frame and return recognized text"""
        try:
            # Convert frame to numpy array
            audio_data = np.frombuffer(frame.data, dtype=np.int16)

            # Add to buffer
            self.audio_buffer.extend(audio_data)

            # Process once we have enough data (about 1 second of audio)
            if len(self.audio_buffer) >= self.sample_rate:
                text = await self._recognize_speech(self.audio_buffer)
                self.audio_buffer = []  # Clear buffer
                return text

        except Exception as e:
            self.logger.error(f"Error processing audio frame: {e}")

        return None

    async def _recognize_speech(self, audio_data: list) -> Optional[str]:
        """Recognize speech from audio data"""
        try:
            if not self.stt_engine:
                return None

            # Convert to the audio format expected by the STT engine
            audio_array = np.array(audio_data, dtype=np.int16)

            # Create audio stream
            stream = self._create_audio_stream(audio_array)

            # Recognize speech (the OpenAI and Deepgram engines expose the same interface)
            result = await self.stt_engine.recognize(stream)

            # Check confidence and return text
            if hasattr(result, 'confidence') and result.confidence < self.confidence_threshold:
                return None

            text = result.text.strip() if hasattr(result, 'text') else str(result).strip()

            if text:
                self.logger.info(f"Recognized speech: {text}")
                return text

        except Exception as e:
            self.logger.error(f"Error recognizing speech: {e}")

        return None

    def _create_audio_stream(self, audio_data: np.ndarray) -> io.BytesIO:
        """Create an in-memory WAV stream from a numpy array"""
        # Convert to bytes
        audio_bytes = audio_data.tobytes()

        # Create WAV file in memory
        wav_buffer = io.BytesIO()
        with wave.open(wav_buffer, 'wb') as wav_file:
            wav_file.setnchannels(self.channels)
            wav_file.setsampwidth(2)  # 16-bit
            wav_file.setframerate(self.sample_rate)
            wav_file.writeframes(audio_bytes)

        wav_buffer.seek(0)
        return wav_buffer

    async def speak_response(self, text: str, room: Optional[rtc.Room] = None) -> bool:
        """Convert text to speech and play it"""
        try:
            if not self.tts_engine:
                self.logger.warning("TTS engine not initialized")
                return False

            self.logger.info(f"Speaking: {text}")

            # Generate speech
            if self.tts_provider == 'openai':
                audio_stream = await self.tts_engine.synthesize(text)
            else:
                return False

            # If a room is provided, publish the audio track
            if room:
                await self._publish_audio_track(room, audio_stream)

            return True

        except Exception as e:
            self.logger.error(f"Error speaking response: {e}")
            return False

    async def provide_action_feedback(self, action: str, result: str, room: Optional[rtc.Room] = None) -> bool:
        """Provide immediate voice feedback about automation actions"""
        try:
            # Create concise feedback based on the action type
            feedback_text = self._generate_action_feedback(action, result)

            if feedback_text:
                return await self.speak_response(feedback_text, room)

            return True

        except Exception as e:
            self.logger.error(f"Error providing action feedback: {e}")
            return False

    def _generate_action_feedback(self, action: str, result: str) -> str:
        """Generate concise feedback text for different actions"""
        try:
            # Parse the result to determine success or failure
            success = "success" in result.lower() or "clicked" in result.lower() or "filled" in result.lower()

            if action == "click":
                return "Clicked" if success else "Click failed"
            elif action == "fill":
                return "Field filled" if success else "Fill failed"
            elif action == "navigate":
                return "Navigated" if success else "Navigation failed"
            elif action == "search":
                return "Search completed" if success else "Search failed"
            elif action == "type":
                return "Text entered" if success else "Text entry failed"
            else:
                return "Action completed" if success else "Action failed"

        except Exception:
            return "Action processed"

    async def _publish_audio_track(self, room: rtc.Room, audio_stream):
        """Publish an audio track to the room"""
        try:
            # Create audio source and track
            source = rtc.AudioSource(self.sample_rate, self.channels)
            track = rtc.LocalAudioTrack.create_audio_track("agent-voice", source)

            # Publish track
            options = rtc.TrackPublishOptions()
            options.source = rtc.TrackSource.SOURCE_MICROPHONE

            publication = await room.local_participant.publish_track(track, options)

            # Stream audio data
            async for frame in audio_stream:
                await source.capture_frame(frame)

            # Unpublish when done
            await room.local_participant.unpublish_track(publication.sid)

        except Exception as e:
            self.logger.error(f"Error publishing audio track: {e}")

    async def set_language(self, language: str):
        """Change the recognition language"""
        self.language = language
        # Reinitialize the STT engine with the new language
        await self.initialize()

    async def set_voice(self, voice: str):
        """Change the TTS voice"""
        self.voice = voice
        # Reinitialize the TTS engine with the new voice
        await self.initialize()

    def get_supported_languages(self) -> list:
        """Get the list of supported languages"""
        return [
            'en-US', 'en-GB', 'es-ES', 'fr-FR', 'de-DE',
            'it-IT', 'pt-BR', 'ru-RU', 'ja-JP', 'ko-KR', 'zh-CN'
        ]

    def get_supported_voices(self) -> list:
        """Get the list of supported voices"""
        if self.tts_provider == 'openai':
            return ['alloy', 'echo', 'fable', 'onyx', 'nova', 'shimmer']
        return []
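

# --- Minimal usage sketch (illustrative, not part of the agent's runtime path) ---
# Assumes OPENAI_API_KEY is set in the environment; the config keys below mirror
# the ones read in VoiceHandler.__init__. Without a key, initialize() only logs a
# warning and speak_response() returns False.
if __name__ == "__main__":
    async def _demo():
        handler = VoiceHandler({
            'speech': {'provider': 'openai', 'language': 'en-US'},
            'tts': {'provider': 'openai', 'voice': 'alloy', 'speed': 1.0},
        })
        await handler.initialize()
        print("Supported voices:", handler.get_supported_voices())
        # With no room argument, speech is synthesized but not published anywhere
        spoke = await handler.speak_response("Voice handler is ready.")
        print("Spoke greeting:", spoke)

    asyncio.run(_demo())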