Files
broswer-automation/test_info_extraction.py

159 lines
6.5 KiB
Python
Raw Blame History

This file contains invisible Unicode characters

This file contains invisible Unicode characters that are indistinguishable to humans but may be processed differently by a computer. If you think that this is intentional, you can safely ignore this warning. Use the Escape button to reveal them.

This file contains Unicode characters that might be confused with other characters. If you think that this is intentional, you can safely ignore this warning. Use the Escape button to reveal them.

#!/usr/bin/env python3
"""
Test script to verify the information extraction functionality
"""
import asyncio
import re
# Phone patterns in priority order (most specific first). A later pattern is
# not allowed to re-match text that an earlier pattern already claimed, so a
# number such as "+92-68-5555123" is not also reported as "68-5555123".
_PHONE_PATTERNS = (
    re.compile(r'\+\d{1,3}[-.\s]?\d{1,4}[-.\s]?\d{1,4}[-.\s]?\d{1,9}'),  # International format
    re.compile(r'\(?[0-9]{3}\)?[-.\s]?[0-9]{3}[-.\s]?[0-9]{4}'),  # US format
    re.compile(r'\d{2,4}[-.\s]?\d{6,8}'),  # General format
)

_EMAIL_PATTERN = re.compile(r'[a-zA-Z0-9._%+-]+@[a-zA-Z0-9.-]+\.[a-zA-Z]{2,}')
_URL_PATTERN = re.compile(r'https?://[^\s<>"]+|www\.[^\s<>"]+')
# Sentence punctuation that the URL pattern swallows when a link ends a clause.
_URL_TRAILING_PUNCTUATION = '.,;:!?'

# A clock time ("8:00", "8:00 AM", "8 AM"), optionally followed by the end of a
# range ("- 5:00 PM", "to 5 PM"), so a whole opening interval is captured
# instead of stopping at the first time token.
_MERIDIEM = r'(?:AM|PM)\b'
_CLOCK = (
    r'(?:\d{1,2}:\d{2}(?:[ \t]*' + _MERIDIEM + r')?'
    r'|\d{1,2}[ \t]*' + _MERIDIEM + r')'
)
_CLOCK_RANGE = _CLOCK + r'(?:[ \t]*(?:[-\u2013]|to\b)[ \t]*' + _CLOCK + r')?'
# Whole-word day names only, so "money" or "Sunrise" do not count as days.
_DAY = (
    r'\b(?:Mon(?:day)?|Tue(?:s|sday)?|Wed(?:nesday)?|Thu(?:r|rs|rsday)?'
    r'|Fri(?:day)?|Sat(?:urday)?|Sun(?:day)?)\b'
)
# Tried in order; the first pattern that matches wins. The lazy filler is
# confined to one line and one sentence ([^.\n]).
_HOURS_PATTERNS = tuple(
    re.compile(pattern, re.IGNORECASE)
    for pattern in (
        _DAY + r'[^.\n]*?' + _CLOCK_RANGE,
        r'\bHours?:?\s*[^.\n]*?' + _CLOCK_RANGE,
        r'\bOpen:?\s*[^.\n]*?' + _CLOCK_RANGE,
        r'\d{1,2}:\d{2}\s*(?:AM|PM)\s*-\s*\d{1,2}:\d{2}\s*(?:AM|PM)',
        r'\d{1,2}\s*(?:AM|PM)\s*-\s*\d{1,2}\s*(?:AM|PM)',
    )
)

_STREET_SUFFIX = (
    r'(?:Street|St|Avenue|Ave|Road|Rd|Boulevard|Blvd|Drive|Dr|Lane|Ln|Way'
    r'|Circle|Cir|Court|Ct|Place|Pl)'
)
# Tried in order; the first pattern that matches wins. Every pattern stays on
# a single line. A pattern with a capture group reports that group, otherwise
# the whole match is reported.
_ADDRESS_PATTERNS = (
    # "<number> <words> <suffix>[, City, ST 12345]" with the suffix as a whole word.
    re.compile(
        r'\b\d+[ \t]+(?:[A-Za-z]+[ \t]+)+?' + _STREET_SUFFIX
        + r'\b[^,\n]*(?:,[ \t]*[A-Za-z0-9 \t]+)*',
        re.IGNORECASE,
    ),
    # City, State ZIP
    re.compile(r'[A-Za-z][A-Za-z \t]*,[ \t]*[A-Z]{2}[ \t]+\d{5}\b', re.IGNORECASE),
    # Pakistan addresses: stop at the first country marker on the line.
    re.compile(r'\b\d+[ \t]+[A-Za-z0-9 \t,.-]+?\b(?:Pakistan|PK)\b', re.IGNORECASE),
    # Last resort: an explicitly labelled "Address: ..." line. Anchored to the
    # start of the line so "Email address: ..." is not mistaken for one.
    re.compile(r'^[ \t]*Address[ \t]*:[ \t]*(\S[^\n]*)', re.IGNORECASE | re.MULTILINE),
)

_BUSINESS_KEYWORDS = (
    'post office', 'bank', 'hospital', 'school', 'office', 'center', 'department',
)


def _unique(items):
    """Return *items* as a list without duplicates, keeping first-seen order."""
    return list(dict.fromkeys(items))


def _extract_phones(text):
    """Return phone-like strings, skipping fragments of already-matched numbers."""
    claimed = []  # (start, end) spans taken by higher-priority matches
    phones = []
    for pattern in _PHONE_PATTERNS:
        for match in pattern.finditer(text):
            start, end = match.span()
            if any(start < taken_end and taken_start < end
                   for taken_start, taken_end in claimed):
                continue
            claimed.append((start, end))
            phones.append(match.group())
    return _unique(phones)


def _extract_hours(text):
    """Return the first opening-hours snippet found, or '' when there is none."""
    for pattern in _HOURS_PATTERNS:
        match = pattern.search(text)
        if match:
            return match.group().strip()
    return ''


def _extract_addresses(text):
    """Return the matches of the first address pattern that finds anything."""
    for pattern in _ADDRESS_PATTERNS:
        # pattern.groups is 0 (use the whole match) or 1 (use the capture group).
        found = _unique(
            candidate
            for candidate in (m.group(pattern.groups).strip() for m in pattern.finditer(text))
            if candidate
        )
        if found:
            return found
    return []


def _guess_business_name(search_results, query):
    """Return one of the first five result lines that mentions the query's keyword.

    Only the first keyword (in ``_BUSINESS_KEYWORDS`` order) that occurs in the
    query is considered; '' is returned when no short line mentions it.
    """
    query_lower = query.lower()
    keyword = next((k for k in _BUSINESS_KEYWORDS if k in query_lower), None)
    if keyword is None:
        return ''
    for line in search_results.split('\n')[:5]:  # Check first few lines
        candidate = line.strip()
        if keyword in line.lower() and len(candidate) < 100:
            return candidate
    return ''


async def test_extract_search_information(search_results: str, query: str) -> str:
    """Summarise the contact details found in raw search-result text.

    Scans ``search_results`` with regular expressions for phone numbers, email
    addresses, URLs, opening hours and postal addresses, and formats whatever
    was found into a Markdown-style reply that mentions ``query``.

    Args:
        search_results: Raw text of the search results to scan.
        query: The user's search query; echoed in the reply and used to guess
            the business name from the first few result lines.

    Returns:
        The formatted summary when at least one phone, email, website, hours
        or address value was found; otherwise ``search_results`` unchanged.
        ``search_results`` is also returned unchanged if extraction raises.

    The function awaits nothing; it is a coroutine only so callers can
    ``await`` it. Values are listed in first-seen order, so the output is
    stable from run to run.
    """
    try:
        phones = _extract_phones(search_results)
        emails = _unique(_EMAIL_PATTERN.findall(search_results))
        websites = _unique(
            url.rstrip(_URL_TRAILING_PUNCTUATION)
            for url in _URL_PATTERN.findall(search_results)
        )
        hours = _extract_hours(search_results)
        addresses = _extract_addresses(search_results)

        # If no specific information extracted, return original results
        if not any((phones, emails, websites, hours, addresses)):
            return search_results

        business_name = _guess_business_name(search_results, query)

        # Format the response
        parts = [f"I found information for your search '{query}':\n\n"]
        if business_name:
            parts.append(f"🏢 **{business_name}**\n\n")
        if phones:
            parts.append(f"📞 **Phone**: {', '.join(phones)}\n")
        if emails:
            parts.append(f"📧 **Email**: {', '.join(emails)}\n")
        if addresses:
            parts.append(f"📍 **Address**: {', '.join(addresses[:2])}\n")  # Limit to 2 addresses
        if websites:
            parts.append(f"🌐 **Website**: {', '.join(websites[:2])}\n")  # Limit to 2 URLs
        if hours:
            parts.append(f"🕒 **Hours**: {hours}\n")

        # Add a summary: the first reasonably long line that is not a bare link.
        summary_line = next(
            (
                stripped
                for stripped in (line.strip() for line in search_results.split('\n'))
                if len(stripped) > 20 and not stripped.startswith('http')
            ),
            None,
        )
        if summary_line:
            parts.append(f"\n **Additional Info**: {summary_line[:200]}...\n")

        parts.append(
            "\nWould you like me to help you with anything specific, "
            "like getting directions or finding more details?"
        )
        return ''.join(parts)
    except Exception as exc:
        # Deliberate best-effort boundary: on any failure (e.g. a non-string
        # argument) report it and fall back to the raw results.
        print(f"Error extracting search information: {exc}")
        return search_results


# The name starts with "test_" but this is a helper taking real arguments, not
# a pytest test case; without this marker pytest collects it and errors on the
# missing "search_results"/"query" fixtures.
test_extract_search_information.__test__ = False
# Demo driver: feeds two canned search-result snippets through the extractor.
async def main():
    """Run the extractor on two hard-coded samples and print each result.

    A line of 50 '=' characters (with blank lines around it) is printed
    between the two cases.
    """
    # Case 1: post office search
    post_office_text = """
Post Office Fortabbas - Pakistan Post
Contact Information
Phone: +92-68-5555123
Email: fortabbas@pakistanpost.gov.pk
Address: Main Bazaar Road, Fortabbas, Punjab, Pakistan
Hours: Monday to Friday 8:00 AM - 5:00 PM
Services: Mail delivery, postal services, money orders
Website: www.pakistanpost.gov.pk
"""
    # Case 2: business search
    bank_text = """
ABC Bank Branch
Contact: (555) 123-4567
Location: 123 Main Street, Anytown, NY 12345
Business Hours: Mon-Fri 9:00 AM - 6:00 PM, Sat 9:00 AM - 2:00 PM
Email: info@abcbank.com
Website: https://www.abcbank.com
Services: Banking, loans, investments
"""
    scenarios = (
        ("Test 1 - Post Office Search:", post_office_text, "phone number post office Fortabbas"),
        ("Test 2 - Bank Search:", bank_text, "ABC Bank contact information"),
    )
    for position, (heading, sample_text, sample_query) in enumerate(scenarios):
        if position:
            # Separator goes between cases only, never after the last one.
            print("\n" + "="*50 + "\n")
        outcome = await test_extract_search_information(sample_text, sample_query)
        print(heading)
        print(outcome)
# Script entry point: run the demo only when executed directly, not on import.
if __name__ == '__main__':
    asyncio.run(main())