1020 lines
46 KiB
Python
1020 lines
46 KiB
Python
#!/usr/bin/env python3
|
|
"""
|
|
LiveKit Agent for MCP Chrome Bridge Integration
|
|
|
|
This agent provides real-time audio/video communication with Chrome automation capabilities.
|
|
|
|
For detailed information about MCP tool response handling, see:
|
|
docs/MCP_RESPONSE_HANDLING.md
|
|
"""
|
|
|
|
import logging
|
|
import os
|
|
import yaml
|
|
import asyncio
|
|
import re
|
|
import json
|
|
from typing import Optional
|
|
from dataclasses import dataclass
|
|
from dotenv import load_dotenv
|
|
|
|
# Load environment variables from .env file
|
|
load_dotenv()
|
|
|
|
from livekit import rtc
|
|
from livekit.agents import (
|
|
Agent,
|
|
AgentSession,
|
|
JobContext,
|
|
WorkerOptions,
|
|
cli,
|
|
function_tool,
|
|
RunContext
|
|
)
|
|
from livekit.plugins import openai, deepgram, silero
|
|
|
|
from mcp_chrome_client import MCPChromeClient
|
|
from screen_share import ScreenShareHandler
|
|
from debug_utils import SelectorDebugger, BrowserStateMonitor
|
|
|
|
|
|
@dataclass
class AgentConfig:
    """Configuration for the LiveKit agent.

    Fields are positional in declaration order, so the order below is part
    of the constructor contract and must not be rearranged.
    """

    # LiveKit connection settings (presumably server URL and credentials;
    # verify against the code that builds this config).
    livekit_url: str
    api_key: str
    api_secret: str

    # Room/agent identity.
    room_name: str
    agent_name: str

    # MCP server settings; LiveKitChromeAgent.__init__ forwards these four
    # values to MCPChromeClient under the same key names.
    mcp_server_type: str
    mcp_server_url: str
    mcp_server_command: str
    mcp_server_args: list

    # NOTE(review): not read anywhere in the visible part of this file.
    browser_profile: str
|
|
|
|
|
|
class LiveKitChromeAgent:
|
|
"""Main LiveKit agent class for Chrome automation"""
|
|
|
|
def __init__(self, config: AgentConfig) -> None:
    """Store the configuration and construct the agent's collaborators.

    Nothing is connected here; network/browser setup happens later in
    ``initialize()``.

    Args:
        config: Agent settings. Only the ``mcp_server_*`` values are
            forwarded to the MCP Chrome client.
    """
    self.config = config
    self.logger = logging.getLogger(__name__)

    # Initialize components
    # NOTE(review): config.browser_profile is not forwarded to the MCP
    # client here -- confirm whether that is intentional.
    chrome_config = {
        'mcp_server_type': config.mcp_server_type,
        'mcp_server_url': config.mcp_server_url,
        'mcp_server_command': config.mcp_server_command,
        'mcp_server_args': config.mcp_server_args,
    }
    self.mcp_client = MCPChromeClient(chrome_config)
    self.screen_share = ScreenShareHandler()

    # Debug utilities (both share the MCP client and this agent's logger)
    self.selector_debugger = SelectorDebugger(self.mcp_client, self.logger)
    self.browser_monitor = BrowserStateMonitor(self.mcp_client, self.logger)

    # LiveKit components; the room is assigned in entrypoint()
    self.room: Optional[rtc.Room] = None
    self.participant: Optional[rtc.RemoteParticipant] = None
    self.agent_session: Optional[AgentSession] = None
|
|
|
|
async def initialize(self):
    """Connect the MCP Chrome client, then initialize screen sharing.

    Raises:
        Exception: Any error raised by ``mcp_client.connect()`` or
            ``screen_share.initialize()`` is logged and re-raised unchanged.
    """
    try:
        await self.mcp_client.connect()
        await self.screen_share.initialize()
    except Exception as e:
        # logger.exception records the traceback as well as the message,
        # which a plain logger.error call would drop.
        self.logger.exception("Failed to initialize agent: %s", e)
        raise
    self.logger.info("Agent initialized successfully")
|
|
|
|
async def entrypoint(self, ctx: JobContext):
|
|
"""Main entry point for the LiveKit agent"""
|
|
self.logger.info(f"Starting agent for room: {ctx.room.name}")
|
|
|
|
# Connect to the room first
|
|
await ctx.connect()
|
|
|
|
# Initialize room and components
|
|
self.room = ctx.room
|
|
await self.initialize()
|
|
|
|
# Create Chrome automation tools
|
|
@function_tool
|
|
async def navigate_to_url(context: RunContext, url: str):
|
|
"""Navigate to a specific URL in the browser"""
|
|
try:
|
|
result = await self.mcp_client._navigate_mcp(url)
|
|
await self.screen_share.update_screen()
|
|
return result
|
|
except Exception as e:
|
|
return f"Error navigating to {url}: {str(e)}"
|
|
|
|
@function_tool
|
|
async def go_to_google(context: RunContext):
|
|
"""Open Google in a new tab"""
|
|
try:
|
|
result = await self.mcp_client._go_to_google_mcp()
|
|
await self.screen_share.update_screen()
|
|
return result
|
|
except Exception as e:
|
|
return f"Error opening Google: {str(e)}"
|
|
|
|
@function_tool
|
|
async def go_to_facebook(context: RunContext):
|
|
"""Open Facebook in a new tab"""
|
|
try:
|
|
result = await self.mcp_client._go_to_facebook_mcp()
|
|
await self.screen_share.update_screen()
|
|
return result
|
|
except Exception as e:
|
|
return f"Error opening Facebook: {str(e)}"
|
|
|
|
@function_tool
|
|
async def go_to_twitter(context: RunContext):
|
|
"""Open Twitter/X in a new tab"""
|
|
try:
|
|
result = await self.mcp_client._go_to_twitter_mcp()
|
|
await self.screen_share.update_screen()
|
|
return result
|
|
except Exception as e:
|
|
return f"Error opening Twitter: {str(e)}"
|
|
|
|
@function_tool
|
|
async def search_google(context: RunContext, query: str):
|
|
"""Search for something on Google and return results"""
|
|
try:
|
|
result = await self.mcp_client._search_google_mcp(query)
|
|
await self.screen_share.update_screen()
|
|
return result
|
|
except Exception as e:
|
|
return f"Error searching Google for '{query}': {str(e)}"
|
|
|
|
# Candidate selectors for a search input, tried in order after the caller's
# own selector (updated for modern Google).
search_input_selectors = (
    "#APjFqb",  # Main Google search box ID
    "textarea[name='q']",  # Google search textarea
    "[role='combobox']",  # Role-based selector
    ".gLFyf",  # Google search box class
    "textarea[aria-label*='Search']",  # Aria-label based
    "input[name='q']",  # Fallback for other sites
    "input[type='search']",
    "#search",
    "[role='searchbox']",
    "input[placeholder*='search' i]",
    "input[aria-label*='search' i]",
)

# Candidate selectors for a search/submit button.
search_button_selectors = (
    "button[type='submit']",
    "input[type='submit']",
    "button[aria-label*='search' i]",
    ".search-button",
    "[role='button'][aria-label*='search' i]",
    # NOTE(review): ':contains()' is a jQuery extension, not standard CSS;
    # confirm the click backend accepts it or drop this entry.
    "button:contains('Search')",
    "input[value*='search' i]",
)

async def _click_first_matching(selectors, label=""):
    """Click the first selector whose click command does not raise.

    Duplicate selectors are dropped (first occurrence wins) so the same
    selector is never retried. Returns that click's result, or None when
    every attempt raised.
    """
    for selector in dict.fromkeys(selectors):
        try:
            result = await self.mcp_client.execute_voice_command(f"click {selector}")
        except Exception as e:
            self.logger.debug(f"Failed to click {label}selector {selector}: {e}")
            continue
        self.logger.info(f"Successfully clicked {label}selector: {selector}")
        return result
    return None

async def _replace_focused_text(query):
    """Select everything in the focused field, type ``query``, return the type result."""
    # Select-all first so the typed query overwrites any existing text.
    clear_result = await self.mcp_client.execute_voice_command("keyboard ctrl+a")  # Select all
    self.logger.debug(f"Clear result: {clear_result}")
    await asyncio.sleep(0.2)

    type_result = await self.mcp_client.execute_voice_command(f"type {query}")
    self.logger.info(f"Type result: {type_result}")
    await asyncio.sleep(1)
    return type_result

# The tool docstrings below are kept verbatim: function_tool exposes them to
# the LLM as the tool description.
@function_tool
async def search_with_text_input(query: str, search_selector: str = "#APjFqb, textarea[name='q'], [role='combobox'], input[name='q']"):
    """Fill search input field with text and submit using Enter key"""
    try:
        # Try the caller's selector first, then the shared fallbacks.
        click_result = await _click_first_matching(
            [search_selector, *search_input_selectors]
        )
        if not click_result:
            return "Error: Could not find any search input field to click"

        self.logger.info(f"Click result: {click_result}")
        await asyncio.sleep(0.5)

        # Clear any existing text and fill the search input field
        type_result = await _replace_focused_text(query)

        # Press Enter to submit search
        enter_result = await self.mcp_client.execute_voice_command("keyboard enter")
        self.logger.info(f"Enter result: {enter_result}")
        await asyncio.sleep(2)  # Wait for search to process

        await self.screen_share.update_screen()
        return f"Search submitted with query: '{query}' using text input and Enter key. Results: Click={click_result}, Type={type_result}, Enter={enter_result}"
    except Exception as e:
        self.logger.error(f"Error in search_with_text_input: {e}")
        return f"Error submitting search with text input: {str(e)}"

@function_tool
async def search_with_button_click(query: str, input_selector: str = "#APjFqb, textarea[name='q'], [role='combobox']", button_selector: str = "button[type='submit'], input[type='submit'], .search-button"):
    """Fill search input and click search button"""
    try:
        # Try the caller's selector first, then the shared fallbacks.
        click_result = await _click_first_matching(
            [input_selector, *search_input_selectors], label="input "
        )
        if not click_result:
            return "Error: Could not find any search input field to click"

        self.logger.info(f"Input click result: {click_result}")
        await asyncio.sleep(0.5)

        # Clear any existing text and type new query
        type_result = await _replace_focused_text(query)

        # Try multiple button selectors for better compatibility
        button_result = await _click_first_matching(
            [button_selector, *search_button_selectors], label="button "
        )
        if not button_result:
            # Fallback to Enter key if no button found
            self.logger.info("No search button found, falling back to Enter key")
            button_result = await self.mcp_client.execute_voice_command("keyboard enter")

        self.logger.info(f"Button click result: {button_result}")
        await asyncio.sleep(2)  # Wait for search to process

        await self.screen_share.update_screen()
        return f"Search button clicked with query: '{query}'. Results: Input={click_result}, Type={type_result}, Button={button_result}"
    except Exception as e:
        self.logger.error(f"Error in search_with_button_click: {e}")
        return f"Error clicking search button: {str(e)}"
|
|
|
|
@function_tool
|
|
async def click_element(context: RunContext, selector: str):
|
|
"""Click on an element using CSS selector"""
|
|
try:
|
|
result = await self.mcp_client._click_mcp(selector)
|
|
await self.screen_share.update_screen()
|
|
return result
|
|
except Exception as e:
|
|
return f"Error clicking element {selector}: {str(e)}"
|
|
|
|
@function_tool
|
|
async def type_text(context: RunContext, text: str):
|
|
"""Type text into the currently focused element"""
|
|
try:
|
|
result = await self.mcp_client._type_text_mcp(text)
|
|
await self.screen_share.update_screen()
|
|
return result
|
|
except Exception as e:
|
|
return f"Error typing text: {str(e)}"
|
|
|
|
@function_tool
|
|
async def get_search_results(context: RunContext):
|
|
"""Extract and return current search results from the page"""
|
|
try:
|
|
result = await self.mcp_client._get_search_results_mcp()
|
|
return result
|
|
except Exception as e:
|
|
return f"Error getting search results: {str(e)}"
|
|
|
|
@function_tool
|
|
async def get_form_fields(context: RunContext):
|
|
"""Get all form fields on the current page"""
|
|
try:
|
|
result = await self.mcp_client.get_form_fields()
|
|
return result
|
|
except Exception as e:
|
|
return f"Error getting form fields: {str(e)}"
|
|
|
|
@function_tool
|
|
async def fill_form_field(context: RunContext, field_selector: str, value: str):
|
|
"""Fill a specific form field with a value using target element tracking"""
|
|
try:
|
|
# Use enhanced fill method that tracks target elements
|
|
result = await self.mcp_client.fill_input_field(field_selector, value)
|
|
await self.screen_share.update_screen()
|
|
return result
|
|
except Exception as e:
|
|
return f"Error filling form field {field_selector}: {str(e)}"
|
|
|
|
@function_tool
|
|
async def get_form_field_info(context: RunContext, field_selector: str):
|
|
"""Get detailed information about a specific form field"""
|
|
try:
|
|
result = await self.mcp_client.get_form_field_info(field_selector)
|
|
return result
|
|
except Exception as e:
|
|
return f"Error getting form field info for {field_selector}: {str(e)}"
|
|
|
|
@function_tool
|
|
async def fill_form_step_by_step(context: RunContext, form_data: str):
|
|
"""Fill form fields one by one with provided data (JSON format)"""
|
|
try:
|
|
result = await self.mcp_client.fill_form_step_by_step(form_data)
|
|
await self.screen_share.update_screen()
|
|
return result
|
|
except Exception as e:
|
|
return f"Error filling form step by step: {str(e)}"
|
|
|
|
@function_tool
|
|
async def fill_qubecare_login(context: RunContext, email: str, password: str):
|
|
"""Fill QuBeCare login form with email and password"""
|
|
try:
|
|
result = await self.mcp_client.fill_qubecare_login(email, password)
|
|
await self.screen_share.update_screen()
|
|
return result
|
|
except Exception as e:
|
|
return f"Error filling QuBeCare login form: {str(e)}"
|
|
|
|
@function_tool
|
|
async def submit_form(context: RunContext, form_selector: str = "form"):
|
|
"""Submit a form on the current page"""
|
|
try:
|
|
result = await self.mcp_client.submit_form(form_selector)
|
|
await self.screen_share.update_screen()
|
|
return result
|
|
except Exception as e:
|
|
return f"Error submitting form: {str(e)}"
|
|
|
|
@function_tool
|
|
async def fill_field_by_name(context: RunContext, field_name: str, value: str):
|
|
"""Fill a form field using enhanced discovery with intelligent fallback (chrome_get_interactive_elements -> chrome_get_web_content)"""
|
|
try:
|
|
result = await self.mcp_client.smart_fill_with_target_tracking(field_name, value)
|
|
await self.screen_share.update_screen()
|
|
return result
|
|
except Exception as e:
|
|
return f"Error filling field by name: {str(e)}"
|
|
|
|
@function_tool
|
|
async def fill_field_with_voice_command(context: RunContext, voice_command: str):
|
|
"""
|
|
Process natural language voice commands for form filling.
|
|
Examples: 'fill email with john@example.com', 'enter password secret123', 'type hello in search box'
|
|
"""
|
|
try:
|
|
# Use the MCP client's voice command processing which includes dynamic discovery
|
|
result = await self.mcp_client.execute_voice_command(voice_command)
|
|
await self.screen_share.update_screen()
|
|
return result
|
|
except Exception as e:
|
|
return f"Error processing voice command: {str(e)}"
|
|
|
|
@function_tool
|
|
async def discover_and_fill_field(context: RunContext, field_description: str, value: str):
|
|
"""
|
|
Dynamically discover and fill a form field using enhanced discovery with intelligent fallback.
|
|
Uses chrome_get_interactive_elements first, then chrome_get_web_content if that fails.
|
|
"""
|
|
try:
|
|
# Use the enhanced smart fill method with fallback
|
|
result = await self.mcp_client.smart_fill_with_target_tracking(field_description, value)
|
|
await self.screen_share.update_screen()
|
|
return result
|
|
except Exception as e:
|
|
return f"Error in enhanced field discovery: {str(e)}"
|
|
|
|
@function_tool
|
|
async def fill_field_realtime_only(context: RunContext, field_name: str, value: str):
|
|
"""
|
|
Fill a form field using enhanced discovery with intelligent fallback - NO CACHE.
|
|
Uses chrome_get_interactive_elements first, then chrome_get_web_content if that fails.
|
|
"""
|
|
try:
|
|
# Use the enhanced smart fill method with fallback
|
|
result = await self.mcp_client.smart_fill_with_target_tracking(field_name, value)
|
|
await self.screen_share.update_screen()
|
|
return result
|
|
except Exception as e:
|
|
return f"Error in enhanced field filling: {str(e)}"
|
|
|
|
@function_tool
|
|
async def get_realtime_form_fields(context: RunContext):
|
|
"""
|
|
Get form fields using ONLY real-time MCP discovery - no cached data.
|
|
Always fetches fresh form elements from the current page.
|
|
"""
|
|
try:
|
|
result = await self.mcp_client._get_form_fields_mcp()
|
|
return result
|
|
except Exception as e:
|
|
return f"Error getting real-time form fields: {str(e)}"
|
|
|
|
@function_tool
|
|
async def get_page_content(context: RunContext):
|
|
"""Get the current page content including text and structure"""
|
|
try:
|
|
result = await self.mcp_client._get_page_content_mcp()
|
|
return result
|
|
except Exception as e:
|
|
return f"Error getting page content: {str(e)}"
|
|
|
|
@function_tool
|
|
async def get_interactive_elements(context: RunContext):
|
|
"""Get all interactive elements (buttons, links, etc.) on the current page"""
|
|
try:
|
|
result = await self.mcp_client._get_interactive_elements_mcp()
|
|
return result
|
|
except Exception as e:
|
|
return f"Error getting interactive elements: {str(e)}"
|
|
|
|
@function_tool
|
|
async def smart_click_element(context: RunContext, element_description: str):
|
|
"""
|
|
Smart click with enhanced discovery and intelligent fallback (chrome_get_interactive_elements -> chrome_get_web_content).
|
|
Examples: 'Login button', 'Sign up link', 'Submit', 'Menu'
|
|
"""
|
|
try:
|
|
result = await self.mcp_client.smart_click_with_target_tracking(element_description)
|
|
await self.screen_share.update_screen()
|
|
return result
|
|
except Exception as e:
|
|
return f"Error in smart click: {str(e)}"
|
|
|
|
@function_tool
|
|
async def process_voice_command(context: RunContext, command: str):
|
|
"""
|
|
Process natural language voice commands with enhanced real-time capabilities.
|
|
This is the main entry point for all voice-based web automation.
|
|
|
|
Examples:
|
|
- "fill email with john@example.com"
|
|
- "click login button"
|
|
- "enter password secret123"
|
|
- "what's on this page"
|
|
- "show me form fields"
|
|
- "search for python tutorials"
|
|
"""
|
|
try:
|
|
result = await self.mcp_client.process_natural_language_command(command)
|
|
await self.screen_share.update_screen()
|
|
return result
|
|
except Exception as e:
|
|
return f"Error processing voice command: {str(e)}"
|
|
|
|
@function_tool
|
|
async def get_cached_input_fields(context: RunContext):
|
|
"""Get the currently cached input fields that were auto-detected"""
|
|
try:
|
|
result = await self.mcp_client.get_cached_input_fields()
|
|
return result
|
|
except Exception as e:
|
|
return f"Error getting cached input fields: {str(e)}"
|
|
|
|
@function_tool
|
|
async def refresh_input_fields(context: RunContext):
|
|
"""Manually refresh the input field cache for the current page"""
|
|
try:
|
|
result = await self.mcp_client.refresh_input_fields()
|
|
return result
|
|
except Exception as e:
|
|
return f"Error refreshing input fields: {str(e)}"
|
|
|
|
@function_tool
|
|
async def type_in_focused(context: RunContext, text: str):
|
|
"""Type text in the currently focused element or find a suitable input field"""
|
|
try:
|
|
result = await self.mcp_client._type_in_focused_element(text)
|
|
await self.screen_share.update_screen()
|
|
return result
|
|
except Exception as e:
|
|
return f"Error typing in focused element: {str(e)}"
|
|
|
|
# Legacy methods for backward compatibility
|
|
@function_tool
|
|
async def get_cached_form_fields(context: RunContext):
|
|
"""Legacy method - Get cached input fields (redirects to get_cached_input_fields)"""
|
|
try:
|
|
result = await self.mcp_client.get_cached_form_fields()
|
|
return result
|
|
except Exception as e:
|
|
return f"Error getting cached form fields: {str(e)}"
|
|
|
|
@function_tool
|
|
async def refresh_form_fields(context: RunContext):
|
|
"""Legacy method - Refresh input fields (redirects to refresh_input_fields)"""
|
|
try:
|
|
result = await self.mcp_client.refresh_form_fields()
|
|
return result
|
|
except Exception as e:
|
|
return f"Error refreshing form fields: {str(e)}"
|
|
|
|
def _format_workflow_result(field_name, result):
    """Render the workflow result dict as a status line plus detail lines."""
    # Optional keys are read with .get so a missing timing or action list
    # cannot turn a finished workflow into a formatting error.
    elapsed = result.get("execution_time")
    if isinstance(elapsed, (int, float)):
        timing = f"Execution time: {elapsed:.2f}s"
    else:
        timing = "Execution time: unknown"

    if result.get("success"):
        status = "✓ SUCCESS"
        details = [
            f"Field '{field_name}' filled successfully using {result.get('detection_method', 'unknown')} method",
            timing,
        ]

        executed = result.get("actions_executed") or []
        if executed:
            failed_actions = [a for a in executed if not a.get("success")]
            succeeded = len(executed) - len(failed_actions)
            details.append(f"Actions executed: {succeeded}/{len(executed)} successful")

            if failed_actions:
                details.append("Failed actions:")
                details.extend(
                    f"  - {action.get('action_type', 'unknown')}: {action.get('error', 'Unknown error')}"
                    for action in failed_actions
                )
    else:
        status = "✗ FAILED"
        details = [
            f"Field '{field_name}' could not be filled",
            timing,
        ]

    errors = result.get("errors") or []
    if errors:
        details.append("Errors:")
        details.extend(f"  - {error}" for error in errors)

    return f"{status}\n" + "\n".join(details)

# The docstring is kept verbatim: function_tool exposes it to the LLM as the
# tool description.
@function_tool
async def execute_field_workflow(context: RunContext, field_name: str, field_value: str, actions: str = ""):
    """
    Execute enhanced field detection and filling workflow with automatic MCP-based field detection.

    This implements the complete workflow for handling missing webpage fields:
    1. Automatically detect and retrieve the correct CSS selector using MCP tools
    2. Use the retrieved selector to locate and fill the field with the appropriate data
    3. Execute required actions (form submission, button click, navigation) after successful field filling

    Args:
    field_name: Name or identifier of the field to find (e.g., "email", "password", "search")
    field_value: Value to fill in the field
    actions: JSON string of actions to execute after field filling. Format:
    '[{"type": "submit", "target": "form"}, {"type": "click", "target": "button[type=submit]"}]'

    Action types supported:
    - submit: Submit a form (target: form selector, optional)
    - click: Click an element (target: CSS selector, required)
    - navigate: Navigate to URL (target: URL, required)
    - wait: Wait for time (target: seconds as string, default: 1.0)
    - keyboard: Send keyboard input (target: keys like "Enter", "Tab", required)

    Returns detailed workflow execution results including success status and any errors.
    """
    try:
        # Parse actions if provided (json is imported at module level)
        parsed_actions = []
        if actions and actions.strip():
            try:
                parsed_actions = json.loads(actions)
            except json.JSONDecodeError as e:
                return f"Error parsing actions JSON: {str(e)}"

            # Accept a single action object for convenience; anything that
            # is not a list of actions is rejected before reaching the
            # workflow.
            if isinstance(parsed_actions, dict):
                parsed_actions = [parsed_actions]
            if not isinstance(parsed_actions, list):
                return "Error parsing actions JSON: expected a JSON array of action objects"

        # Execute the workflow
        result = await self.mcp_client.execute_field_workflow(
            field_name=field_name,
            field_value=field_value,
            actions=parsed_actions,
            max_retries=3,
        )

        # Update screen after workflow execution
        await self.screen_share.update_screen()

        # Format the result for better readability
        return _format_workflow_result(field_name, result)

    except Exception as e:
        return f"Error executing field workflow: {str(e)}"
|
|
|
|
# Debugging and troubleshooting tools
|
|
@function_tool
|
|
async def debug_voice_command(context: RunContext, command: str):
|
|
"""Debug a voice command to see how it's parsed and executed step by step"""
|
|
try:
|
|
debug_result = await self.selector_debugger.debug_voice_command(command)
|
|
return f"Debug results for '{command}':\n{json.dumps(debug_result, indent=2, default=str)}"
|
|
except Exception as e:
|
|
return f"Error debugging voice command: {str(e)}"
|
|
|
|
@function_tool
|
|
async def validate_browser_connection(context: RunContext):
|
|
"""Check browser connection status and responsiveness"""
|
|
try:
|
|
validation_result = await self.mcp_client.validate_browser_connection()
|
|
return f"Browser validation results:\n{json.dumps(validation_result, indent=2, default=str)}"
|
|
except Exception as e:
|
|
return f"Error validating browser connection: {str(e)}"
|
|
|
|
# The docstring is kept verbatim: function_tool exposes it to the LLM as the
# tool description.
@function_tool
async def test_selectors(context: RunContext, selectors: str):
    """Test a list of CSS selectors (comma-separated) to see which ones work"""
    try:
        # Drop blank entries so input such as "a, ,b" or a trailing comma
        # does not send empty selectors to the debugger.
        selector_list = [part.strip() for part in selectors.split(',') if part.strip()]
        if not selector_list:
            return "Error testing selectors: no selectors provided"

        test_results = await self.selector_debugger.test_common_selectors(selector_list)
        return f"Selector test results:\n{json.dumps(test_results, indent=2, default=str)}"
    except Exception as e:
        return f"Error testing selectors: {str(e)}"
|
|
|
|
@function_tool
|
|
async def capture_browser_state(context: RunContext):
|
|
"""Capture current browser state for debugging"""
|
|
try:
|
|
state = await self.browser_monitor.capture_state()
|
|
issues = self.browser_monitor.detect_issues(state)
|
|
result = {
|
|
"state": state,
|
|
"detected_issues": issues
|
|
}
|
|
return f"Browser state captured:\n{json.dumps(result, indent=2, default=str)}"
|
|
except Exception as e:
|
|
return f"Error capturing browser state: {str(e)}"
|
|
|
|
@function_tool
|
|
async def get_debug_summary(context: RunContext):
|
|
"""Get a summary of all debugging sessions"""
|
|
try:
|
|
summary = self.selector_debugger.get_debug_summary()
|
|
return f"Debug summary:\n{json.dumps(summary, indent=2, default=str)}"
|
|
except Exception as e:
|
|
return f"Error getting debug summary: {str(e)}"
|
|
|
|
# Create agent with Chrome automation capabilities
|
|
agent = Agent(
|
|
instructions="""You are an advanced Chrome automation assistant with real-time voice command processing that can help users navigate the web, search for information, and interact with web pages intelligently using natural language.
|
|
|
|
## Enhanced Speech Recognition & Voice Commands
|
|
I automatically correct common speech errors and process natural language commands:
|
|
- "google" → opens Google.com
|
|
- "facebook" or "facbook" → opens Facebook.com
|
|
- "tweets", "tweet", or "twitter" → opens Twitter/X.com
|
|
- "qubeCare", "https://app.qubecare.ai/provider/login", or "qubeCare" → opens https://app.qubecare.ai/provider/login
|
|
|
|
## Real-Time Voice Command Processing
|
|
I understand and execute natural language voice commands in real-time:
|
|
|
|
### Form Filling Commands:
|
|
- "fill email with john@example.com" → finds and fills email field
|
|
- "enter password secret123" → finds and fills password field
|
|
- "type hello world in search" → finds search field and types text
|
|
- "username john_doe" → fills username field
|
|
- "phone 123-456-7890" → fills phone field
|
|
|
|
### Clicking Commands:
|
|
- "click login button" → finds and clicks login button
|
|
- "press submit" → finds and clicks submit button
|
|
- "tap on sign up link" → finds and clicks sign up link
|
|
- "click menu" → finds and clicks menu element
|
|
|
|
### Content Retrieval Commands:
|
|
- "what's on this page" → gets page content
|
|
- "show me the form fields" → lists all form fields
|
|
- "what can I click" → shows interactive elements
|
|
- "get page content" → retrieves page text
|
|
|
|
## Core Automation Capabilities
|
|
|
|
### Navigation Commands:
|
|
- "go to google" or "google" - Opens Google
|
|
- "go to facebook" or "facebook" - Opens Facebook
|
|
- "go to twitter", "tweets", or "tweet" - Opens Twitter/X
|
|
- "navigate to [URL]" - Opens any website
|
|
- "go back" - Navigate to previous page
|
|
- "go forward" - Navigate to next page
|
|
- "refresh page" - Reload current page
|
|
|
|
### Search Workflow:
|
|
1. **Open search engine**: Navigate to Google or specified site
|
|
2. **Find search elements**: Automatically detect search input fields
|
|
3. **Fill search query**: Type the search terms
|
|
4. **Submit search**: Press Enter or click search button
|
|
5. **Extract results**: Get search results and clickable elements
|
|
6. **Click relevant results**: Find and click on relevant search results
|
|
|
|
### Advanced Search Methods:
|
|
- **search_with_text_input**: Fill search field and press Enter (preferred method)
|
|
- **search_with_button_click**: Fill search field and click search button
|
|
- **search_google**: Complete Google search with results extraction
|
|
|
|
### Element Interaction:
|
|
- **Find elements**: Automatically detect clickable elements on pages
|
|
- **Click elements**: Click buttons, links, and interactive elements
|
|
- **Type text**: Fill forms and input fields
|
|
- **Extract content**: Get text content from web pages
|
|
|
|
### Input Field Handling:
|
|
- **get_form_fields**: Discover all form fields on the current page
|
|
- **fill_form_field**: Fill a specific form field with a value
|
|
- **get_form_field_info**: Get detailed information about a form field
|
|
- **fill_form_step_by_step**: Fill multiple form fields one by one with JSON data
|
|
- **submit_form**: Submit a form after filling all required fields
|
|
- **fill_field_by_name**: Fill any input field using natural language with dynamic discovery
|
|
- **fill_field_with_voice_command**: Process natural language voice commands for form filling
|
|
- **discover_and_fill_field**: Dynamically discover and fill fields using real-time MCP tools
|
|
- **get_cached_input_fields**: View auto-detected input fields from the current page
|
|
- **refresh_input_fields**: Manually refresh the input field cache
|
|
- **type_in_focused**: Type text in the currently focused element or find suitable input field
|
|
- **execute_field_workflow**: Enhanced workflow for missing fields with automatic MCP detection and actions
|
|
|
|
### Real-Time Content Analysis:
|
|
- **get_page_content**: Get current page content including text and structure
|
|
- **get_interactive_elements**: Get all interactive elements (buttons, links, etc.) on the page
|
|
- **get_realtime_form_fields**: Get form fields using real-time MCP discovery (no cache)
|
|
- **smart_click_element**: Smart click that finds elements by text content, labels, or descriptions
|
|
|
|
### Real-Time Form Discovery (NO CACHE):
|
|
The agent features REAL-TIME form field discovery that:
|
|
- **NEVER uses cached selectors** - always gets fresh selectors using MCP tools
|
|
- **Real-time discovery only** - uses chrome_get_interactive_elements and chrome_get_content_web_form
|
|
- **No hardcoded selectors** - all form elements discovered dynamically on every request
|
|
- **Multiple retry strategies** when fields are not found on first attempt
|
|
- **Maps natural language to form fields** intelligently (e.g., "email" → email input, "search" → search box)
|
|
- **Adapts to any website** by analyzing current page structure in real-time
|
|
- **Robust error handling** with multiple fallback discovery methods
|
|
|
|
### Real-Time Functions:
|
|
- **fill_field_realtime_only**: Guarantees fresh selector discovery on every call
|
|
- **get_realtime_form_fields**: Gets form fields using only real-time MCP discovery
|
|
- **discover_and_fill_field**: Pure real-time discovery without any cache dependency
|
|
|
|
## Search Process Details:
|
|
When performing searches:
|
|
1. Navigate to the search engine (usually Google)
|
|
2. Locate search input field using selectors: `input[name='q']`, `textarea[name='q']`
|
|
3. Fill the search field with the query text
|
|
4. Press Enter key to submit the search
|
|
5. Wait for results to load (3 seconds)
|
|
6. Extract search results using content selectors
|
|
7. Find clickable elements for further interaction
|
|
8. Click on relevant results when requested
|
|
|
|
## Element Finding Strategy:
|
|
- Use `chrome_get_interactive_elements` to find all clickable elements
|
|
- Search for elements by text content when needed
|
|
- Use multiple CSS selector strategies for reliability
|
|
- Handle dynamic content and wait for page loads
|
|
|
|
## Error Handling:
|
|
- Retry failed operations with alternative selectors
|
|
- Provide clear feedback on automation steps
|
|
- Handle timeouts and navigation delays
|
|
- Log all actions for debugging
|
|
|
|
Always provide helpful information from search results and explain what actions are being performed during automation.
|
|
|
|
## Input Field Handling Workflow:
|
|
When working with any input fields:
|
|
1. **Auto-detection**: All input fields are automatically detected when navigating to new pages
|
|
2. **Natural language filling**: Use `fill_field_by_name` with natural language like "fill search with python"
|
|
3. **Quick typing**: Use `type_in_focused` to type in currently focused element or find suitable input
|
|
4. **View cached fields**: Use `get_cached_input_fields` to see auto-detected fields
|
|
5. **Manual discovery**: Use `get_form_fields` to manually discover all available form fields
|
|
6. **Get field details**: Use `get_form_field_info` for specific field information
|
|
7. **Fill individual fields**: Use `fill_form_field` to fill one field at a time with exact selectors
|
|
8. **Fill multiple fields**: Use `fill_form_step_by_step` with JSON data for batch filling
|
|
9. **Submit form**: Use `submit_form` to submit the completed form
|
|
|
|
## Natural Language Input Filling:
|
|
The agent now supports natural language commands for any input field:
|
|
- "fill search with python programming" - fills search field
|
|
- "enter password secret123" - fills password field
|
|
- "put John Smith in name field" - fills name field
|
|
- "phone 1234567890" - fills phone field
|
|
- "type hello world" - types in focused element or finds suitable input
|
|
- "search field machine learning" - fills search field
|
|
- "text input hello" - fills text input
|
|
|
|
All input fields (search, text, email, password, etc.) are automatically detected when pages load and cached for quick access.
|
|
|
|
## Form Data Format:
|
|
For `fill_form_step_by_step`, use JSON format like:
|
|
```json
|
|
{
|
|
"input[name='email']": "user@example.com",
|
|
"input[name='password']": "password123",
|
|
"select[name='country']": "United States",
|
|
"textarea[name='message']": "Hello world"
|
|
}
|
|
```
|
|
|
|
Always explain each step when filling forms and confirm successful completion.
|
|
|
|
## Enhanced Field Workflow:
|
|
The `execute_field_workflow` function implements an advanced workflow for handling missing webpage fields:
|
|
|
|
### Workflow Steps:
|
|
1. **Automatic Field Detection**: Uses MCP tools to detect fields through multiple strategies:
|
|
- Cached fields (fastest, most reliable)
|
|
- Enhanced detection with intelligent selectors
|
|
- Label analysis (context-based)
|
|
- Content analysis (page text analysis)
|
|
- Fallback patterns (last resort)
|
|
|
|
2. **Field Filling**: Once detected, fills the field with the provided value
|
|
|
|
3. **Action Execution**: Executes specified actions after successful field filling:
|
|
- `submit`: Submit a form
|
|
- `click`: Click an element
|
|
- `navigate`: Navigate to a URL
|
|
- `wait`: Wait for specified time
|
|
- `keyboard`: Send keyboard input
|
|
|
|
### Usage Examples:
|
|
```
|
|
execute_field_workflow("email", "user@example.com", '[{"type": "submit"}]')
|
|
execute_field_workflow("search", "python tutorial", '[{"type": "keyboard", "target": "Enter"}]')
|
|
execute_field_workflow("password", "secret123", '[{"type": "click", "target": "button[type=submit]"}]')
|
|
```
|
|
|
|
This workflow provides robust error handling and detailed execution results.""",
|
|
tools=[navigate_to_url, go_to_google, go_to_facebook, go_to_twitter, search_google, search_with_text_input, search_with_button_click, click_element, type_text, get_search_results, get_form_fields, fill_form_field, get_form_field_info, fill_form_step_by_step, fill_qubecare_login, submit_form, fill_field_by_name, fill_field_with_voice_command, discover_and_fill_field, fill_field_realtime_only, get_realtime_form_fields, get_page_content, get_interactive_elements, smart_click_element, process_voice_command, get_cached_input_fields, refresh_input_fields, type_in_focused, get_cached_form_fields, refresh_form_fields, execute_field_workflow, debug_voice_command, validate_browser_connection, test_selectors, capture_browser_state, get_debug_summary]
|
|
)
|
|
|
|
# Create agent session with voice pipeline and balanced VAD for better speech recognition
|
|
self.agent_session = AgentSession(
|
|
vad=silero.VAD.load(
|
|
# Balanced settings to prevent speech fragmentation and "astic astic" issues
|
|
min_speech_duration=0.3, # Longer duration to capture complete words
|
|
min_silence_duration=0.5, # Longer silence to prevent word splitting
|
|
prefix_padding_duration=0.3, # More padding to capture word beginnings
|
|
max_buffered_speech=15.0, # Larger buffer for complete phrases
|
|
activation_threshold=0.6, # Lower threshold for better word capture
|
|
sample_rate=16000, # Standard rate for Silero
|
|
force_cpu=True, # Force CPU for consistency and avoid GPU overhead
|
|
),
|
|
stt=deepgram.STT(model="nova-2"),
|
|
llm=openai.LLM(model="gpt-4o-mini"),
|
|
tts=deepgram.TTS(),
|
|
)
|
|
|
|
# Start screen sharing if enabled
|
|
await self.screen_share.start_sharing(ctx.room)
|
|
|
|
# Start the agent session
|
|
await self.agent_session.start(agent=agent, room=ctx.room)
|
|
|
|
# Generate initial greeting
|
|
await self.agent_session.generate_reply(
|
|
instructions="""Greet the user warmly and explain that you are an advanced Chrome automation assistant with real-time voice command processing and comprehensive web automation capabilities.
|
|
|
|
Mention that you can:
|
|
- Navigate to websites with natural voice commands (Google, Facebook, Twitter/X)
|
|
- Perform intelligent web searches with automatic result extraction
|
|
- Find and click on web elements using natural language descriptions
|
|
- Handle complex web interactions with real-time element discovery
|
|
- Process natural language voice commands for all web automation tasks
|
|
|
|
Highlight the REAL-TIME voice command processing: "I understand and execute natural language voice commands in real-time! You can say things like:
|
|
- 'fill email with john@example.com' - I'll find and fill the email field
|
|
- 'click login button' - I'll find and click the login button
|
|
- 'enter password secret123' - I'll find and fill the password field
|
|
- 'what's on this page' - I'll get the page content for you
|
|
- 'show me the form fields' - I'll list all available form fields
|
|
- 'click submit' - I'll find and click the submit button
|
|
|
|
My system features COMPLETE REAL-TIME processing - I NEVER use cached selectors! Every voice command triggers fresh discovery using MCP tools to find elements in real-time from the current page. Whether you're asking me to fill a form, click a button, or get page content, I analyze the page structure live and adapt to any website dynamically."
|
|
|
|
Explain that the speech recognition automatically corrects common pronunciation errors for popular websites.
|
|
|
|
Ask what they would like to do - search for something, visit a website, or interact with a page they're already on."""
|
|
)
|
|
|
|
|
|
def substitute_env_vars(text: str) -> str:
    """Expand ``${VAR_NAME}`` placeholders in *text* from the process environment.

    A placeholder whose variable is not set in the environment is left in the
    returned text exactly as written.
    """
    placeholder = re.compile(r'\$\{([^}]+)\}')
    return placeholder.sub(
        # Fall back to the full matched "${...}" text when the variable is unset.
        lambda found: os.environ.get(found.group(1), found.group(0)),
        text,
    )
|
|
|
|
|
|
def substitute_env_vars_in_dict(data):
    """Recursively expand ``${VAR}`` placeholders in every string inside *data*.

    Dicts and lists are rebuilt with their values/items processed recursively
    (dict keys are not touched), strings go through ``substitute_env_vars``,
    and any other value is returned as-is.
    """
    if isinstance(data, str):
        return substitute_env_vars(data)

    if isinstance(data, list):
        return list(map(substitute_env_vars_in_dict, data))

    if isinstance(data, dict):
        expanded = {}
        for key, value in data.items():
            expanded[key] = substitute_env_vars_in_dict(value)
        return expanded

    # Numbers, booleans, None, etc. carry no placeholders.
    return data
|
|
|
|
|
|
def load_config(
    config_path: str = "livekit_config.yaml",
    mcp_config_path: str = "mcp_livekit_config.yaml",
) -> AgentConfig:
    """Build the agent configuration from YAML files and the environment.

    Args:
        config_path: Main YAML config. It must provide the ``livekit`` and
            ``chrome`` sections read below.
        mcp_config_path: Optional YAML file describing MCP servers. When it
            defines a non-empty ``mcp_servers.chrome-http`` entry, that entry
            selects an HTTP MCP server and takes precedence over the
            ``chrome`` section of the main config.

    Returns:
        A populated ``AgentConfig``.

    Raises:
        FileNotFoundError: If ``config_path`` does not exist. A missing
            ``mcp_config_path`` is not an error.
        KeyError: If a required key is missing from the main config.
    """
    default_mcp_url = 'http://127.0.0.1:12306/mcp'

    # YAML is UTF-8 by specification; do not depend on the platform default encoding.
    with open(config_path, 'r', encoding='utf-8') as f:
        config_data = yaml.safe_load(f)

    # Substitute ${VAR} placeholders in the entire config
    config_data = substitute_env_vars_in_dict(config_data)

    # Credentials: a non-empty environment variable wins over the config file
    api_key = os.getenv('LIVEKIT_API_KEY') or config_data['livekit']['api_key']
    api_secret = os.getenv('LIVEKIT_API_SECRET') or config_data['livekit']['api_secret']

    # Load the dedicated MCP server configuration if the file is present
    try:
        with open(mcp_config_path, 'r', encoding='utf-8') as f:
            mcp_config_data = yaml.safe_load(f)
    except FileNotFoundError:
        # The MCP file is optional; fall back to the main config below.
        mcp_config_data = None

    # safe_load() returns None for an empty file, and a bare "mcp_servers:" key
    # also loads as None; treat both as "nothing configured" instead of
    # crashing on None.get(...).
    mcp_config_data = substitute_env_vars_in_dict(mcp_config_data) or {}
    mcp_servers = mcp_config_data.get('mcp_servers') or {}
    chrome_http_config = mcp_servers.get('chrome-http') or {}

    if chrome_http_config:
        # Use the chrome-http server configuration
        chrome_config = {
            'mcp_server_type': 'http',
            'mcp_server_url': chrome_http_config.get('url', default_mcp_url),
            'mcp_server_command': None,
            'mcp_server_args': [],
        }
    else:
        # Fall back to the "chrome" section of the main config file
        chrome_section = config_data['chrome']
        chrome_config = {
            'mcp_server_type': chrome_section.get('mcp_server_type', 'http'),
            'mcp_server_url': chrome_section.get('mcp_server_url', default_mcp_url),
            'mcp_server_command': chrome_section.get('mcp_server_command'),
            'mcp_server_args': chrome_section.get('mcp_server_args', []),
        }

    return AgentConfig(
        livekit_url=config_data['livekit']['url'],
        api_key=api_key,
        api_secret=api_secret,
        room_name=config_data['livekit']['room']['name'],
        agent_name=config_data['livekit']['agent']['name'],
        mcp_server_type=chrome_config['mcp_server_type'],
        mcp_server_url=chrome_config['mcp_server_url'],
        mcp_server_command=chrome_config['mcp_server_command'],
        mcp_server_args=chrome_config['mcp_server_args'],
        browser_profile=config_data['chrome']['browser_profile'],
    )
|
|
|
|
|
|
async def entrypoint(ctx: JobContext):
    """Job entry point: configure logging, build the Chrome agent and run it.

    Loads the configuration with ``load_config()`` defaults, wraps it in a
    ``LiveKitChromeAgent`` and awaits that agent's own ``entrypoint`` with
    the given job context.
    """
    logging.basicConfig(level=logging.INFO)

    chrome_agent = LiveKitChromeAgent(load_config())
    await chrome_agent.entrypoint(ctx)
|
|
|
|
|
|
def main():
    """Launch the LiveKit worker through the agents CLI with ``entrypoint`` as job handler."""
    worker_options = WorkerOptions(entrypoint_fnc=entrypoint)
    cli.run_app(worker_options)


if __name__ == "__main__":
    # Start the worker only when executed as a script, not on import.
    main()
|