Files
broswer-automation/agent-livekit/livekit_agent.py
nasir@endelospay.com d97cad1736 first commit
2025-08-12 02:54:17 +05:00

1020 lines
46 KiB
Python

#!/usr/bin/env python3
"""
LiveKit Agent for MCP Chrome Bridge Integration
This agent provides real-time audio/video communication with Chrome automation capabilities.
For detailed information about MCP tool response handling, see:
docs/MCP_RESPONSE_HANDLING.md
"""
import logging
import os
import yaml
import asyncio
import re
import json
from typing import Optional
from dataclasses import dataclass
from dotenv import load_dotenv
# Load environment variables from .env file
load_dotenv()
from livekit import rtc
from livekit.agents import (
Agent,
AgentSession,
JobContext,
WorkerOptions,
cli,
function_tool,
RunContext
)
from livekit.plugins import openai, deepgram, silero
from mcp_chrome_client import MCPChromeClient
from screen_share import ScreenShareHandler
from debug_utils import SelectorDebugger, BrowserStateMonitor
@dataclass
class AgentConfig:
    """Configuration for the LiveKit agent.

    Holds the LiveKit connection settings, the MCP server settings that
    LiveKitChromeAgent forwards to MCPChromeClient, and a browser profile
    name. All fields are required and positional order is part of the
    interface.
    """
    # LiveKit connection / identity
    livekit_url: str
    api_key: str
    api_secret: str
    room_name: str
    agent_name: str
    # MCP server settings (copied into the client's config dict)
    mcp_server_type: str
    mcp_server_url: str
    # The HTTP server configuration built in load_config sets this to None,
    # so the annotation must admit None.
    mcp_server_command: Optional[str]
    mcp_server_args: list
    # NOTE(review): not read anywhere in the visible part of this file.
    browser_profile: str
class LiveKitChromeAgent:
"""Main LiveKit agent class for Chrome automation"""
def __init__(self, config: AgentConfig):
    """Store the configuration and build the agent's collaborators.

    Creates the MCP Chrome client from the MCP-related config fields, the
    screen share handler, and the two debug helpers. Room, participant and
    session start out as None.
    """
    self.config = config
    self.logger = logging.getLogger(__name__)

    # The MCP client takes a plain dict holding only the MCP-related fields.
    mcp_field_names = (
        'mcp_server_type',
        'mcp_server_url',
        'mcp_server_command',
        'mcp_server_args',
    )
    mcp_settings = {name: getattr(config, name) for name in mcp_field_names}
    self.mcp_client = MCPChromeClient(mcp_settings)
    self.screen_share = ScreenShareHandler()

    # Debug utilities share the MCP client and this agent's logger.
    self.selector_debugger = SelectorDebugger(self.mcp_client, self.logger)
    self.browser_monitor = BrowserStateMonitor(self.mcp_client, self.logger)

    # LiveKit state; room and agent_session are assigned in entrypoint().
    self.room: Optional[rtc.Room] = None
    self.participant: Optional[rtc.RemoteParticipant] = None
    self.agent_session: Optional[AgentSession] = None
async def initialize(self):
    """Connect the MCP client and initialize screen sharing.

    Raises:
        Exception: whatever either step raises; it is logged and re-raised.
    """
    try:
        await self.mcp_client.connect()
        await self.screen_share.initialize()
    except Exception as exc:
        self.logger.error(f"Failed to initialize agent: {exc}")
        raise
    else:
        self.logger.info("Agent initialized successfully")
async def entrypoint(self, ctx: JobContext):
"""Main entry point for the LiveKit agent.

Connects to the job's room, initializes the MCP client and screen share
handler, defines the browser-automation tools as closures over ``self``,
builds the Agent and AgentSession, starts the session, and requests an
initial greeting.

Args:
ctx: Job context; provides ``connect()`` and the ``room`` to join.
"""
self.logger.info(f"Starting agent for room: {ctx.room.name}")
# Connect to the room first
await ctx.connect()
# Initialize room and components
self.room = ctx.room
# May raise: initialize() logs and re-raises any setup failure.
await self.initialize()
# Create Chrome automation tools
# Each tool below is a closure over ``self`` and returns an error string
# instead of raising.
@function_tool
async def navigate_to_url(context: RunContext, url: str):
    """Navigate to a specific URL in the browser"""
    # Delegates to the MCP client, then refreshes the shared screen.
    # Failures are returned as text rather than raised.
    try:
        outcome = await self.mcp_client._navigate_mcp(url)
        await self.screen_share.update_screen()
    except Exception as exc:
        return f"Error navigating to {url}: {exc!s}"
    return outcome
@function_tool
async def go_to_google(context: RunContext):
    """Open Google in a new tab"""
    # Delegates to the MCP client, then refreshes the shared screen.
    try:
        outcome = await self.mcp_client._go_to_google_mcp()
        await self.screen_share.update_screen()
    except Exception as exc:
        return f"Error opening Google: {exc!s}"
    return outcome
@function_tool
async def go_to_facebook(context: RunContext):
    """Open Facebook in a new tab"""
    # Delegates to the MCP client, then refreshes the shared screen.
    try:
        outcome = await self.mcp_client._go_to_facebook_mcp()
        await self.screen_share.update_screen()
    except Exception as exc:
        return f"Error opening Facebook: {exc!s}"
    return outcome
@function_tool
async def go_to_twitter(context: RunContext):
    """Open Twitter/X in a new tab"""
    # Delegates to the MCP client, then refreshes the shared screen.
    try:
        outcome = await self.mcp_client._go_to_twitter_mcp()
        await self.screen_share.update_screen()
    except Exception as exc:
        return f"Error opening Twitter: {exc!s}"
    return outcome
@function_tool
async def search_google(context: RunContext, query: str):
    """Search for something on Google and return results"""
    # Delegates to the MCP client, then refreshes the shared screen.
    try:
        outcome = await self.mcp_client._search_google_mcp(query)
        await self.screen_share.update_screen()
    except Exception as exc:
        return f"Error searching Google for '{query}': {exc!s}"
    return outcome
@function_tool
async def search_with_text_input(query: str, search_selector: str = "#APjFqb, textarea[name='q'], [role='combobox'], input[name='q']"):
    """Fill search input field with text and submit using Enter key"""
    # Flow: click the first selector that does not raise, select all existing
    # text, type the query, press Enter, then refresh the shared screen.
    # Every step is sent through execute_voice_command.
    fallback_selectors = (
        "#APjFqb",  # Main Google search box ID
        "textarea[name='q']",  # Google search textarea
        "[role='combobox']",  # Role-based selector
        ".gLFyf",  # Google search box class
        "textarea[aria-label*='Search']",  # Aria-label based
        "input[name='q']",  # Fallback for other sites
        "input[type='search']",
        "#search",
        "[role='searchbox']",
        "input[placeholder*='search' i]",
        "input[aria-label*='search' i]",
    )
    try:
        # The caller-supplied selector is always attempted first.
        clicked = None
        for candidate in (search_selector, *fallback_selectors):
            try:
                clicked = await self.mcp_client.execute_voice_command(f"click {candidate}")
            except Exception as click_error:
                self.logger.debug(f"Failed to click selector {candidate}: {click_error}")
            else:
                self.logger.info(f"Successfully clicked selector: {candidate}")
                break

        # A falsy click result is treated the same as "nothing could be clicked".
        if not clicked:
            return "Error: Could not find any search input field to click"
        self.logger.info(f"Click result: {clicked}")
        await asyncio.sleep(0.5)

        # Select all so the typed query replaces whatever was in the field.
        selected = await self.mcp_client.execute_voice_command("keyboard ctrl+a")
        self.logger.debug(f"Clear result: {selected}")
        await asyncio.sleep(0.2)

        typed = await self.mcp_client.execute_voice_command(f"type {query}")
        self.logger.info(f"Type result: {typed}")
        await asyncio.sleep(1)

        # Submit with Enter and give the page time to react.
        submitted = await self.mcp_client.execute_voice_command("keyboard enter")
        self.logger.info(f"Enter result: {submitted}")
        await asyncio.sleep(2)
        await self.screen_share.update_screen()
        return f"Search submitted with query: '{query}' using text input and Enter key. Results: Click={clicked}, Type={typed}, Enter={submitted}"
    except Exception as exc:
        self.logger.error(f"Error in search_with_text_input: {exc}")
        return f"Error submitting search with text input: {exc!s}"
@function_tool
async def search_with_button_click(query: str, input_selector: str = "#APjFqb, textarea[name='q'], [role='combobox']", button_selector: str = "button[type='submit'], input[type='submit'], .search-button"):
    """Fill search input and click search button"""
    # Flow: click the search input, select all, type the query, click a search
    # button (falling back to the Enter key), then refresh the shared screen.

    async def click_first_match(candidates, label):
        """Click the first selector that does not raise; return its result or None."""
        # dict.fromkeys drops repeated selectors while keeping their order, so
        # a selector that already failed is not attempted a second time.
        for selector in dict.fromkeys(candidates):
            try:
                outcome = await self.mcp_client.execute_voice_command(f"click {selector}")
            except Exception as click_error:
                self.logger.debug(f"Failed to click {label} selector {selector}: {click_error}")
                continue
            self.logger.info(f"Successfully clicked {label} selector: {selector}")
            return outcome
        return None

    try:
        # Try multiple input selectors for better compatibility (updated for modern Google)
        click_result = await click_first_match(
            [
                input_selector,
                "#APjFqb",  # Main Google search box ID
                "textarea[name='q']",  # Google search textarea
                "[role='combobox']",  # Role-based selector
                ".gLFyf",  # Google search box class
                "textarea[aria-label*='Search']",  # Aria-label based
                "input[name='q']",  # Fallback for other sites
                "input[type='search']",
                "#search",
                "[role='searchbox']",
                "input[placeholder*='search' i]",
                "input[aria-label*='search' i]",
            ],
            "input",
        )
        # A falsy click result is treated the same as "nothing could be clicked".
        if not click_result:
            return "Error: Could not find any search input field to click"
        self.logger.info(f"Input click result: {click_result}")
        await asyncio.sleep(0.5)

        # Select all so the typed query replaces whatever was in the field.
        clear_result = await self.mcp_client.execute_voice_command("keyboard ctrl+a")
        self.logger.debug(f"Clear result: {clear_result}")
        await asyncio.sleep(0.2)
        type_result = await self.mcp_client.execute_voice_command(f"type {query}")
        self.logger.info(f"Type result: {type_result}")
        await asyncio.sleep(1)

        # Try multiple button selectors for better compatibility
        button_result = await click_first_match(
            [
                button_selector,
                "button[type='submit']",
                "input[type='submit']",
                "button[aria-label*='search' i]",
                ".search-button",
                "[role='button'][aria-label*='search' i]",
                # NOTE(review): `:contains()` is a jQuery extension, not standard
                # CSS; this entry probably never matches unless the MCP client
                # handles it specially. Confirm before relying on it.
                "button:contains('Search')",
                "input[value*='search' i]",
            ],
            "button",
        )
        if not button_result:
            # Fallback to Enter key if no button found
            self.logger.info("No search button found, falling back to Enter key")
            button_result = await self.mcp_client.execute_voice_command("keyboard enter")
        self.logger.info(f"Button click result: {button_result}")
        await asyncio.sleep(2)  # Wait for search to process
        await self.screen_share.update_screen()
        return f"Search button clicked with query: '{query}'. Results: Input={click_result}, Type={type_result}, Button={button_result}"
    except Exception as e:
        self.logger.error(f"Error in search_with_button_click: {e}")
        return f"Error clicking search button: {str(e)}"
@function_tool
async def click_element(context: RunContext, selector: str):
    """Click on an element using CSS selector"""
    # Delegates to the MCP client, then refreshes the shared screen.
    try:
        outcome = await self.mcp_client._click_mcp(selector)
        await self.screen_share.update_screen()
    except Exception as exc:
        return f"Error clicking element {selector}: {exc!s}"
    return outcome
@function_tool
async def type_text(context: RunContext, text: str):
    """Type text into the currently focused element"""
    # Delegates to the MCP client, then refreshes the shared screen.
    try:
        outcome = await self.mcp_client._type_text_mcp(text)
        await self.screen_share.update_screen()
    except Exception as exc:
        return f"Error typing text: {exc!s}"
    return outcome
@function_tool
async def get_search_results(context: RunContext):
    """Extract and return current search results from the page"""
    # Read-only query: no screen refresh afterwards.
    try:
        return await self.mcp_client._get_search_results_mcp()
    except Exception as exc:
        return f"Error getting search results: {exc!s}"
@function_tool
async def get_form_fields(context: RunContext):
    """Get all form fields on the current page"""
    # Read-only query: no screen refresh afterwards.
    try:
        return await self.mcp_client.get_form_fields()
    except Exception as exc:
        return f"Error getting form fields: {exc!s}"
@function_tool
async def fill_form_field(context: RunContext, field_selector: str, value: str):
    """Fill a specific form field with a value using target element tracking"""
    # Delegates to the client's fill_input_field, then refreshes the screen.
    try:
        outcome = await self.mcp_client.fill_input_field(field_selector, value)
        await self.screen_share.update_screen()
    except Exception as exc:
        return f"Error filling form field {field_selector}: {exc!s}"
    return outcome
@function_tool
async def get_form_field_info(context: RunContext, field_selector: str):
    """Get detailed information about a specific form field"""
    # Read-only query: no screen refresh afterwards.
    try:
        return await self.mcp_client.get_form_field_info(field_selector)
    except Exception as exc:
        return f"Error getting form field info for {field_selector}: {exc!s}"
@function_tool
async def fill_form_step_by_step(context: RunContext, form_data: str):
    """Fill form fields one by one with provided data (JSON format)"""
    # form_data is forwarded unparsed; the MCP client interprets it.
    try:
        outcome = await self.mcp_client.fill_form_step_by_step(form_data)
        await self.screen_share.update_screen()
    except Exception as exc:
        return f"Error filling form step by step: {exc!s}"
    return outcome
@function_tool
async def fill_qubecare_login(context: RunContext, email: str, password: str):
    """Fill QuBeCare login form with email and password"""
    # Credentials are passed straight to the MCP client. The error text
    # includes only the exception message, not the arguments.
    try:
        outcome = await self.mcp_client.fill_qubecare_login(email, password)
        await self.screen_share.update_screen()
    except Exception as exc:
        return f"Error filling QuBeCare login form: {exc!s}"
    return outcome
@function_tool
async def submit_form(context: RunContext, form_selector: str = "form"):
    """Submit a form on the current page"""
    # Delegates to the MCP client, then refreshes the shared screen.
    try:
        outcome = await self.mcp_client.submit_form(form_selector)
        await self.screen_share.update_screen()
    except Exception as exc:
        return f"Error submitting form: {exc!s}"
    return outcome
@function_tool
async def fill_field_by_name(context: RunContext, field_name: str, value: str):
    """Fill a form field using enhanced discovery with intelligent fallback (chrome_get_interactive_elements -> chrome_get_web_content)"""
    # Delegates to the client's smart_fill_with_target_tracking.
    try:
        outcome = await self.mcp_client.smart_fill_with_target_tracking(field_name, value)
        await self.screen_share.update_screen()
    except Exception as exc:
        return f"Error filling field by name: {exc!s}"
    return outcome
@function_tool
async def fill_field_with_voice_command(context: RunContext, voice_command: str):
    """
    Process natural language voice commands for form filling.
    Examples: 'fill email with john@example.com', 'enter password secret123', 'type hello in search box'
    """
    # The raw command is handed to the MCP client's voice command processing.
    try:
        outcome = await self.mcp_client.execute_voice_command(voice_command)
        await self.screen_share.update_screen()
    except Exception as exc:
        return f"Error processing voice command: {exc!s}"
    return outcome
@function_tool
async def discover_and_fill_field(context: RunContext, field_description: str, value: str):
    """
    Dynamically discover and fill a form field using enhanced discovery with intelligent fallback.
    Uses chrome_get_interactive_elements first, then chrome_get_web_content if that fails.
    """
    # Calls the same client method as fill_field_by_name; only the error text differs.
    try:
        outcome = await self.mcp_client.smart_fill_with_target_tracking(field_description, value)
        await self.screen_share.update_screen()
    except Exception as exc:
        return f"Error in enhanced field discovery: {exc!s}"
    return outcome
@function_tool
async def fill_field_realtime_only(context: RunContext, field_name: str, value: str):
    """
    Fill a form field using enhanced discovery with intelligent fallback - NO CACHE.
    Uses chrome_get_interactive_elements first, then chrome_get_web_content if that fails.
    """
    # NOTE(review): calls the same smart_fill_with_target_tracking as the other
    # fill tools; the "no cache" guarantee is not visible here. Confirm in the client.
    try:
        outcome = await self.mcp_client.smart_fill_with_target_tracking(field_name, value)
        await self.screen_share.update_screen()
    except Exception as exc:
        return f"Error in enhanced field filling: {exc!s}"
    return outcome
@function_tool
async def get_realtime_form_fields(context: RunContext):
    """
    Get form fields using ONLY real-time MCP discovery - no cached data.
    Always fetches fresh form elements from the current page.
    """
    # Read-only query: no screen refresh afterwards.
    try:
        return await self.mcp_client._get_form_fields_mcp()
    except Exception as exc:
        return f"Error getting real-time form fields: {exc!s}"
@function_tool
async def get_page_content(context: RunContext):
    """Get the current page content including text and structure"""
    # Read-only query: no screen refresh afterwards.
    try:
        return await self.mcp_client._get_page_content_mcp()
    except Exception as exc:
        return f"Error getting page content: {exc!s}"
@function_tool
async def get_interactive_elements(context: RunContext):
    """Get all interactive elements (buttons, links, etc.) on the current page"""
    # Read-only query: no screen refresh afterwards.
    try:
        return await self.mcp_client._get_interactive_elements_mcp()
    except Exception as exc:
        return f"Error getting interactive elements: {exc!s}"
@function_tool
async def smart_click_element(context: RunContext, element_description: str):
    """
    Smart click with enhanced discovery and intelligent fallback (chrome_get_interactive_elements -> chrome_get_web_content).
    Examples: 'Login button', 'Sign up link', 'Submit', 'Menu'
    """
    # Delegates to the client's smart_click_with_target_tracking.
    try:
        outcome = await self.mcp_client.smart_click_with_target_tracking(element_description)
        await self.screen_share.update_screen()
    except Exception as exc:
        return f"Error in smart click: {exc!s}"
    return outcome
@function_tool
async def process_voice_command(context: RunContext, command: str):
    """
    Process natural language voice commands with enhanced real-time capabilities.
    This is the main entry point for all voice-based web automation.
    Examples:
    - "fill email with john@example.com"
    - "click login button"
    - "enter password secret123"
    - "what's on this page"
    - "show me form fields"
    - "search for python tutorials"
    """
    # Uses process_natural_language_command, not execute_voice_command.
    try:
        outcome = await self.mcp_client.process_natural_language_command(command)
        await self.screen_share.update_screen()
    except Exception as exc:
        return f"Error processing voice command: {exc!s}"
    return outcome
@function_tool
async def get_cached_input_fields(context: RunContext):
    """Get the currently cached input fields that were auto-detected"""
    # Read-only query: no screen refresh afterwards.
    try:
        return await self.mcp_client.get_cached_input_fields()
    except Exception as exc:
        return f"Error getting cached input fields: {exc!s}"
@function_tool
async def refresh_input_fields(context: RunContext):
    """Manually refresh the input field cache for the current page"""
    # No screen refresh afterwards.
    try:
        return await self.mcp_client.refresh_input_fields()
    except Exception as exc:
        return f"Error refreshing input fields: {exc!s}"
@function_tool
async def type_in_focused(context: RunContext, text: str):
    """Type text in the currently focused element or find a suitable input field"""
    # Delegates to the MCP client, then refreshes the shared screen.
    try:
        outcome = await self.mcp_client._type_in_focused_element(text)
        await self.screen_share.update_screen()
    except Exception as exc:
        return f"Error typing in focused element: {exc!s}"
    return outcome
# Legacy methods for backward compatibility
@function_tool
async def get_cached_form_fields(context: RunContext):
    """Legacy method - Get cached input fields (redirects to get_cached_input_fields)"""
    # Kept for backward compatibility; calls the client's legacy-named method.
    try:
        return await self.mcp_client.get_cached_form_fields()
    except Exception as exc:
        return f"Error getting cached form fields: {exc!s}"
@function_tool
async def refresh_form_fields(context: RunContext):
    """Legacy method - Refresh input fields (redirects to refresh_input_fields)"""
    # Kept for backward compatibility; calls the client's legacy-named method.
    try:
        return await self.mcp_client.refresh_form_fields()
    except Exception as exc:
        return f"Error refreshing form fields: {exc!s}"
@function_tool
async def execute_field_workflow(context: RunContext, field_name: str, field_value: str, actions: str = ""):
    """
    Execute enhanced field detection and filling workflow with automatic MCP-based field detection.
    This implements the complete workflow for handling missing webpage fields:
    1. Automatically detect and retrieve the correct CSS selector using MCP tools
    2. Use the retrieved selector to locate and fill the field with the appropriate data
    3. Execute required actions (form submission, button click, navigation) after successful field filling
    Args:
    field_name: Name or identifier of the field to find (e.g., "email", "password", "search")
    field_value: Value to fill in the field
    actions: JSON string of actions to execute after field filling. Format:
    '[{"type": "submit", "target": "form"}, {"type": "click", "target": "button[type=submit]"}]'
    Action types supported:
    - submit: Submit a form (target: form selector, optional)
    - click: Click an element (target: CSS selector, required)
    - navigate: Navigate to URL (target: URL, required)
    - wait: Wait for time (target: seconds as string, default: 1.0)
    - keyboard: Send keyboard input (target: keys like "Enter", "Tab", required)
    Returns detailed workflow execution results including success status and any errors.
    """
    try:
        # Parse actions if provided. `actions` may arrive as None when the
        # caller passes an explicit null, so test truthiness before .strip().
        parsed_actions = []
        if actions and actions.strip():
            try:
                parsed_actions = json.loads(actions)
            except json.JSONDecodeError as e:
                return f"Error parsing actions JSON: {str(e)}"
            # The client expects a list of action objects. Accept a single
            # object for convenience and reject any other JSON value.
            if isinstance(parsed_actions, dict):
                parsed_actions = [parsed_actions]
            elif not isinstance(parsed_actions, list):
                return "Error parsing actions JSON: expected a JSON array of action objects"

        # Execute the workflow
        result = await self.mcp_client.execute_field_workflow(
            field_name=field_name,
            field_value=field_value,
            actions=parsed_actions,
            max_retries=3
        )
        # Update screen after workflow execution
        await self.screen_share.update_screen()

        # Format the result for better readability. Keys are read with .get so
        # a missing optional key does not turn a completed workflow into a
        # generic "Error executing field workflow" message.
        elapsed = float(result.get("execution_time") or 0.0)
        if result.get("success"):
            status = "✓ SUCCESS"
            details = [
                f"Field '{field_name}' filled successfully using {result.get('detection_method', 'unknown')} method",
                f"Execution time: {elapsed:.2f}s"
            ]
            executed = result.get("actions_executed") or []
            if executed:
                failed_actions = [a for a in executed if not a.get("success")]
                succeeded = len(executed) - len(failed_actions)
                details.append(f"Actions executed: {succeeded}/{len(executed)} successful")
                if failed_actions:
                    details.append("Failed actions:")
                    details.extend(
                        f" - {action.get('action_type', 'unknown')}: {action.get('error', 'Unknown error')}"
                        for action in failed_actions
                    )
        else:
            status = "✗ FAILED"
            details = [
                f"Field '{field_name}' could not be filled",
                f"Execution time: {elapsed:.2f}s"
            ]
            errors = result.get("errors") or []
            if errors:
                details.append("Errors:")
                details.extend(f" - {error}" for error in errors)
        return f"{status}\n" + "\n".join(details)
    except Exception as e:
        return f"Error executing field workflow: {str(e)}"
# Debugging and troubleshooting tools
@function_tool
async def debug_voice_command(context: RunContext, command: str):
    """Debug a voice command to see how it's parsed and executed step by step"""
    # The debugger's result is rendered as indented JSON; default=str
    # stringifies values that are not JSON-serializable.
    try:
        report = json.dumps(
            await self.selector_debugger.debug_voice_command(command),
            indent=2,
            default=str,
        )
        return f"Debug results for '{command}':\n{report}"
    except Exception as exc:
        return f"Error debugging voice command: {exc!s}"
@function_tool
async def validate_browser_connection(context: RunContext):
    """Check browser connection status and responsiveness"""
    # The client's validation result is rendered as indented JSON.
    try:
        report = json.dumps(
            await self.mcp_client.validate_browser_connection(),
            indent=2,
            default=str,
        )
        return f"Browser validation results:\n{report}"
    except Exception as exc:
        return f"Error validating browser connection: {exc!s}"
@function_tool
async def test_selectors(context: RunContext, selectors: str):
    """Test a list of CSS selectors (comma-separated) to see which ones work"""
    try:
        # Drop blank entries: "".split(',') yields [""] and stray commas yield
        # empty strings, none of which are usable selectors.
        selector_list = [part.strip() for part in selectors.split(',') if part.strip()]
        if not selector_list:
            return "Error testing selectors: no selectors provided"
        test_results = await self.selector_debugger.test_common_selectors(selector_list)
        return f"Selector test results:\n{json.dumps(test_results, indent=2, default=str)}"
    except Exception as e:
        return f"Error testing selectors: {str(e)}"
@function_tool
async def capture_browser_state(context: RunContext):
    """Capture current browser state for debugging"""
    # Captures a snapshot, asks the monitor for issues in it, and returns
    # both as indented JSON.
    try:
        snapshot = await self.browser_monitor.capture_state()
        payload = {
            "state": snapshot,
            "detected_issues": self.browser_monitor.detect_issues(snapshot),
        }
        rendered = json.dumps(payload, indent=2, default=str)
        return f"Browser state captured:\n{rendered}"
    except Exception as exc:
        return f"Error capturing browser state: {exc!s}"
@function_tool
async def get_debug_summary(context: RunContext):
    """Get a summary of all debugging sessions"""
    # get_debug_summary on the debugger is called synchronously (no await).
    try:
        rendered = json.dumps(
            self.selector_debugger.get_debug_summary(),
            indent=2,
            default=str,
        )
        return f"Debug summary:\n{rendered}"
    except Exception as exc:
        return f"Error getting debug summary: {exc!s}"
# Create agent with Chrome automation capabilities
# `instructions` is the prompt text given to the Agent; `tools` registers every
# function tool defined above in this method. The prompt is runtime text: edit
# it deliberately, not cosmetically.
# NOTE(review): the prompt says results load in "3 seconds" while the search
# tools in this method sleep 2 seconds, and it states both that selectors are
# NEVER cached and that input fields are "cached for quick access" — confirm
# the intended wording.
agent = Agent(
instructions="""You are an advanced Chrome automation assistant with real-time voice command processing that can help users navigate the web, search for information, and interact with web pages intelligently using natural language.
## Enhanced Speech Recognition & Voice Commands
I automatically correct common speech errors and process natural language commands:
- "google" → opens Google.com
- "facebook" or "facbook" → opens Facebook.com
- "tweets", "tweet", or "twitter" → opens Twitter/X.com
- "qubeCare", "https://app.qubecare.ai/provider/login", or "qubeCare" → opens https://app.qubecare.ai/provider/login
## Real-Time Voice Command Processing
I understand and execute natural language voice commands in real-time:
### Form Filling Commands:
- "fill email with john@example.com" → finds and fills email field
- "enter password secret123" → finds and fills password field
- "type hello world in search" → finds search field and types text
- "username john_doe" → fills username field
- "phone 123-456-7890" → fills phone field
### Clicking Commands:
- "click login button" → finds and clicks login button
- "press submit" → finds and clicks submit button
- "tap on sign up link" → finds and clicks sign up link
- "click menu" → finds and clicks menu element
### Content Retrieval Commands:
- "what's on this page" → gets page content
- "show me the form fields" → lists all form fields
- "what can I click" → shows interactive elements
- "get page content" → retrieves page text
## Core Automation Capabilities
### Navigation Commands:
- "go to google" or "google" - Opens Google
- "go to facebook" or "facebook" - Opens Facebook
- "go to twitter", "tweets", or "tweet" - Opens Twitter/X
- "navigate to [URL]" - Opens any website
- "go back" - Navigate to previous page
- "go forward" - Navigate to next page
- "refresh page" - Reload current page
### Search Workflow:
1. **Open search engine**: Navigate to Google or specified site
2. **Find search elements**: Automatically detect search input fields
3. **Fill search query**: Type the search terms
4. **Submit search**: Press Enter or click search button
5. **Extract results**: Get search results and clickable elements
6. **Click relevant results**: Find and click on relevant search results
### Advanced Search Methods:
- **search_with_text_input**: Fill search field and press Enter (preferred method)
- **search_with_button_click**: Fill search field and click search button
- **search_google**: Complete Google search with results extraction
### Element Interaction:
- **Find elements**: Automatically detect clickable elements on pages
- **Click elements**: Click buttons, links, and interactive elements
- **Type text**: Fill forms and input fields
- **Extract content**: Get text content from web pages
### Input Field Handling:
- **get_form_fields**: Discover all form fields on the current page
- **fill_form_field**: Fill a specific form field with a value
- **get_form_field_info**: Get detailed information about a form field
- **fill_form_step_by_step**: Fill multiple form fields one by one with JSON data
- **submit_form**: Submit a form after filling all required fields
- **fill_field_by_name**: Fill any input field using natural language with dynamic discovery
- **fill_field_with_voice_command**: Process natural language voice commands for form filling
- **discover_and_fill_field**: Dynamically discover and fill fields using real-time MCP tools
- **get_cached_input_fields**: View auto-detected input fields from the current page
- **refresh_input_fields**: Manually refresh the input field cache
- **type_in_focused**: Type text in the currently focused element or find suitable input field
- **execute_field_workflow**: Enhanced workflow for missing fields with automatic MCP detection and actions
### Real-Time Content Analysis:
- **get_page_content**: Get current page content including text and structure
- **get_interactive_elements**: Get all interactive elements (buttons, links, etc.) on the page
- **get_realtime_form_fields**: Get form fields using real-time MCP discovery (no cache)
- **smart_click_element**: Smart click that finds elements by text content, labels, or descriptions
### Real-Time Form Discovery (NO CACHE):
The agent features REAL-TIME form field discovery that:
- **NEVER uses cached selectors** - always gets fresh selectors using MCP tools
- **Real-time discovery only** - uses chrome_get_interactive_elements and chrome_get_content_web_form
- **No hardcoded selectors** - all form elements discovered dynamically on every request
- **Multiple retry strategies** when fields are not found on first attempt
- **Maps natural language to form fields** intelligently (e.g., "email" → email input, "search" → search box)
- **Adapts to any website** by analyzing current page structure in real-time
- **Robust error handling** with multiple fallback discovery methods
### Real-Time Functions:
- **fill_field_realtime_only**: Guarantees fresh selector discovery on every call
- **get_realtime_form_fields**: Gets form fields using only real-time MCP discovery
- **discover_and_fill_field**: Pure real-time discovery without any cache dependency
## Search Process Details:
When performing searches:
1. Navigate to the search engine (usually Google)
2. Locate search input field using selectors: `input[name='q']`, `textarea[name='q']`
3. Fill the search field with the query text
4. Press Enter key to submit the search
5. Wait for results to load (3 seconds)
6. Extract search results using content selectors
7. Find clickable elements for further interaction
8. Click on relevant results when requested
## Element Finding Strategy:
- Use `chrome_get_interactive_elements` to find all clickable elements
- Search for elements by text content when needed
- Use multiple CSS selector strategies for reliability
- Handle dynamic content and wait for page loads
## Error Handling:
- Retry failed operations with alternative selectors
- Provide clear feedback on automation steps
- Handle timeouts and navigation delays
- Log all actions for debugging
Always provide helpful information from search results and explain what actions are being performed during automation.
## Input Field Handling Workflow:
When working with any input fields:
1. **Auto-detection**: All input fields are automatically detected when navigating to new pages
2. **Natural language filling**: Use `fill_field_by_name` with natural language like "fill search with python"
3. **Quick typing**: Use `type_in_focused` to type in currently focused element or find suitable input
4. **View cached fields**: Use `get_cached_input_fields` to see auto-detected fields
5. **Manual discovery**: Use `get_form_fields` to manually discover all available form fields
6. **Get field details**: Use `get_form_field_info` for specific field information
7. **Fill individual fields**: Use `fill_form_field` to fill one field at a time with exact selectors
8. **Fill multiple fields**: Use `fill_form_step_by_step` with JSON data for batch filling
9. **Submit form**: Use `submit_form` to submit the completed form
## Natural Language Input Filling:
The agent now supports natural language commands for any input field:
- "fill search with python programming" - fills search field
- "enter password secret123" - fills password field
- "put John Smith in name field" - fills name field
- "phone 1234567890" - fills phone field
- "type hello world" - types in focused element or finds suitable input
- "search field machine learning" - fills search field
- "text input hello" - fills text input
All input fields (search, text, email, password, etc.) are automatically detected when pages load and cached for quick access.
## Form Data Format:
For `fill_form_step_by_step`, use JSON format like:
```json
{
"input[name='email']": "user@example.com",
"input[name='password']": "password123",
"select[name='country']": "United States",
"textarea[name='message']": "Hello world"
}
```
Always explain each step when filling forms and confirm successful completion.
## Enhanced Field Workflow:
The `execute_field_workflow` function implements an advanced workflow for handling missing webpage fields:
### Workflow Steps:
1. **Automatic Field Detection**: Uses MCP tools to detect fields through multiple strategies:
- Cached fields (fastest, most reliable)
- Enhanced detection with intelligent selectors
- Label analysis (context-based)
- Content analysis (page text analysis)
- Fallback patterns (last resort)
2. **Field Filling**: Once detected, fills the field with the provided value
3. **Action Execution**: Executes specified actions after successful field filling:
- `submit`: Submit a form
- `click`: Click an element
- `navigate`: Navigate to a URL
- `wait`: Wait for specified time
- `keyboard`: Send keyboard input
### Usage Examples:
```
execute_field_workflow("email", "user@example.com", '[{"type": "submit"}]')
execute_field_workflow("search", "python tutorial", '[{"type": "keyboard", "target": "Enter"}]')
execute_field_workflow("password", "secret123", '[{"type": "click", "target": "button[type=submit]"}]')
```
This workflow provides robust error handling and detailed execution results.""",
# Every tool defined above in this method must be listed here to be exposed.
tools=[navigate_to_url, go_to_google, go_to_facebook, go_to_twitter, search_google, search_with_text_input, search_with_button_click, click_element, type_text, get_search_results, get_form_fields, fill_form_field, get_form_field_info, fill_form_step_by_step, fill_qubecare_login, submit_form, fill_field_by_name, fill_field_with_voice_command, discover_and_fill_field, fill_field_realtime_only, get_realtime_form_fields, get_page_content, get_interactive_elements, smart_click_element, process_voice_command, get_cached_input_fields, refresh_input_fields, type_in_focused, get_cached_form_fields, refresh_form_fields, execute_field_workflow, debug_voice_command, validate_browser_connection, test_selectors, capture_browser_state, get_debug_summary]
)
# Create agent session with voice pipeline and balanced VAD for better speech recognition
# Pipeline components: Silero VAD, Deepgram STT ("nova-2"), OpenAI LLM
# ("gpt-4o-mini"), Deepgram TTS (default settings).
self.agent_session = AgentSession(
vad=silero.VAD.load(
# Balanced settings to prevent speech fragmentation and "astic astic" issues
# NOTE(review): the relative wording in the comments below ("longer",
# "lower", "larger", "more") is not checked against the plugin's defaults
# anywhere in this file — confirm against the Silero plugin before tuning.
min_speech_duration=0.3, # Longer duration to capture complete words
min_silence_duration=0.5, # Longer silence to prevent word splitting
prefix_padding_duration=0.3, # More padding to capture word beginnings
max_buffered_speech=15.0, # Larger buffer for complete phrases
activation_threshold=0.6, # Lower threshold for better word capture
sample_rate=16000, # Standard rate for Silero
force_cpu=True, # Force CPU for consistency and avoid GPU overhead
),
stt=deepgram.STT(model="nova-2"),
llm=openai.LLM(model="gpt-4o-mini"),
tts=deepgram.TTS(),
)
# Start screen sharing if enabled
# NOTE(review): no enabled/disabled check is made here; start_sharing is
# always called. Any such check would have to live inside ScreenShareHandler.
await self.screen_share.start_sharing(ctx.room)
# Start the agent session
await self.agent_session.start(agent=agent, room=ctx.room)
# Generate initial greeting
# The text below is an instruction to the LLM describing what the spoken
# greeting should cover, not the greeting itself.
await self.agent_session.generate_reply(
instructions="""Greet the user warmly and explain that you are an advanced Chrome automation assistant with real-time voice command processing and comprehensive web automation capabilities.
Mention that you can:
- Navigate to websites with natural voice commands (Google, Facebook, Twitter/X)
- Perform intelligent web searches with automatic result extraction
- Find and click on web elements using natural language descriptions
- Handle complex web interactions with real-time element discovery
- Process natural language voice commands for all web automation tasks
Highlight the REAL-TIME voice command processing: "I understand and execute natural language voice commands in real-time! You can say things like:
- 'fill email with john@example.com' - I'll find and fill the email field
- 'click login button' - I'll find and click the login button
- 'enter password secret123' - I'll find and fill the password field
- 'what's on this page' - I'll get the page content for you
- 'show me the form fields' - I'll list all available form fields
- 'click submit' - I'll find and click the submit button
My system features COMPLETE REAL-TIME processing - I NEVER use cached selectors! Every voice command triggers fresh discovery using MCP tools to find elements in real-time from the current page. Whether you're asking me to fill a form, click a button, or get page content, I analyze the page structure live and adapt to any website dynamically."
Explain that the speech recognition automatically corrects common pronunciation errors for popular websites.
Ask what they would like to do - search for something, visit a website, or interact with a page they're already on."""
)
def substitute_env_vars(text: str) -> str:
    """Expand ``${VAR_NAME}`` placeholders in *text* from the environment.

    Each placeholder is replaced with the value of the environment variable
    of that name. A placeholder whose variable is not set is left in the
    text exactly as written.
    """
    placeholder_pattern = re.compile(r'\$\{([^}]+)\}')

    def _expand(placeholder):
        # group(1) is the bare variable name, group(0) the whole "${...}" token
        value = os.getenv(placeholder.group(1))
        if value is None:
            # Unset variable: keep the original token untouched
            return placeholder.group(0)
        return value

    return placeholder_pattern.sub(_expand, text)
def substitute_env_vars_in_dict(data):
    """Return a copy of *data* with environment variables expanded in strings.

    Dictionaries and lists are walked recursively and rebuilt; dictionary
    keys are kept as they are and only values are processed. Strings are
    passed through ``substitute_env_vars``. Any other value (numbers,
    ``None``, booleans, ...) is returned unchanged.
    """
    if isinstance(data, str):
        return substitute_env_vars(data)

    if isinstance(data, list):
        return [substitute_env_vars_in_dict(element) for element in data]

    if isinstance(data, dict):
        resolved = {}
        for name, entry in data.items():
            resolved[name] = substitute_env_vars_in_dict(entry)
        return resolved

    # Not a container or string: nothing to substitute
    return data
def load_config(config_path: str = "livekit_config.yaml") -> AgentConfig:
    """Load the agent configuration from YAML files and the environment.

    The main YAML file at *config_path* is parsed and ``${VAR}`` placeholders
    in it are expanded. The LiveKit API key and secret are taken from the
    ``LIVEKIT_API_KEY`` / ``LIVEKIT_API_SECRET`` environment variables when
    those are set and non-empty, otherwise from the ``livekit`` section of
    the file.

    MCP server settings come from the ``chrome-http`` entry under
    ``mcp_servers`` in ``mcp_livekit_config.yaml`` (opened relative to the
    current working directory) when that file exists and defines such an
    entry. Otherwise they come from the ``chrome`` section of the main file.

    Args:
        config_path: Path of the main YAML configuration file.

    Returns:
        The populated ``AgentConfig``.

    Raises:
        FileNotFoundError: If *config_path* does not exist.
        KeyError: If a required key (``livekit.url``, ``livekit.room.name``,
            ``livekit.agent.name``, ``chrome.browser_profile``, or the
            credentials when not supplied via the environment) is missing.
    """
    default_mcp_url = 'http://127.0.0.1:12306/mcp'

    # Explicit encoding so parsing does not depend on the platform locale
    with open(config_path, 'r', encoding='utf-8') as f:
        config_data = yaml.safe_load(f)

    # Substitute environment variables in the entire config
    config_data = substitute_env_vars_in_dict(config_data)

    # Environment variables take precedence for sensitive data
    api_key = os.getenv('LIVEKIT_API_KEY') or config_data['livekit']['api_key']
    api_secret = os.getenv('LIVEKIT_API_SECRET') or config_data['livekit']['api_secret']

    # Load MCP server configuration from mcp_livekit_config.yaml if available
    mcp_config_path = "mcp_livekit_config.yaml"
    try:
        with open(mcp_config_path, 'r', encoding='utf-8') as f:
            mcp_config_data = yaml.safe_load(f)
    except FileNotFoundError:
        # Optional file: fall back to the main config below
        mcp_config_data = None

    # yaml.safe_load returns None for an empty document, and a key written
    # without a value is also None; treat both as "nothing configured"
    # instead of failing on None.get(...).
    mcp_config_data = substitute_env_vars_in_dict(mcp_config_data) or {}
    mcp_servers = mcp_config_data.get('mcp_servers') or {}
    chrome_http_config = mcp_servers.get('chrome-http') or {}

    if chrome_http_config:
        # Use chrome-http server configuration
        chrome_config = {
            'mcp_server_type': 'http',
            'mcp_server_url': chrome_http_config.get('url', default_mcp_url),
            'mcp_server_command': None,
            'mcp_server_args': []
        }
    else:
        # Fallback to config from main config file
        chrome_section = config_data['chrome']
        chrome_config = {
            'mcp_server_type': chrome_section.get('mcp_server_type', 'http'),
            'mcp_server_url': chrome_section.get('mcp_server_url', default_mcp_url),
            'mcp_server_command': chrome_section.get('mcp_server_command'),
            'mcp_server_args': chrome_section.get('mcp_server_args', [])
        }

    return AgentConfig(
        livekit_url=config_data['livekit']['url'],
        api_key=api_key,
        api_secret=api_secret,
        room_name=config_data['livekit']['room']['name'],
        agent_name=config_data['livekit']['agent']['name'],
        mcp_server_type=chrome_config['mcp_server_type'],
        mcp_server_url=chrome_config['mcp_server_url'],
        mcp_server_command=chrome_config['mcp_server_command'],
        mcp_server_args=chrome_config['mcp_server_args'],
        browser_profile=config_data['chrome']['browser_profile']
    )
async def entrypoint(ctx: JobContext):
    """Job entry point handed to the LiveKit worker.

    Configures logging at INFO level, builds a ``LiveKitChromeAgent`` from
    the configuration returned by ``load_config()``, and awaits the agent's
    own ``entrypoint`` with the given job context.
    """
    logging.basicConfig(level=logging.INFO)

    # Configuration is loaded with the default path on every job start
    chrome_agent = LiveKitChromeAgent(load_config())

    await chrome_agent.entrypoint(ctx)
def main():
    """Start the LiveKit worker via the LiveKit CLI runner."""
    worker_options = WorkerOptions(entrypoint_fnc=entrypoint)
    cli.run_app(worker_options)


# Run the worker only when this file is executed as a script
if __name__ == "__main__":
    main()