159 lines
6.5 KiB
Python
159 lines
6.5 KiB
Python
#!/usr/bin/env python3
|
||
"""
|
||
Test script to verify the information extraction functionality
|
||
"""
|
||
|
||
import asyncio
|
||
import re
|
||
|
||
def _unique_in_order(items):
    """Return ``items`` as a list without duplicates, keeping first-seen order."""
    return list(dict.fromkeys(items))


def _drop_contained(candidates):
    """Drop every candidate that is a proper substring of another candidate."""
    return [
        item for item in candidates
        if not any(item != other and item in other for other in candidates)
    ]


async def test_extract_search_information(search_results: str, query: str) -> str:
    """Summarise contact details found in raw search-result text.

    Phone numbers, e-mail addresses, URLs, opening hours and postal addresses
    are pulled out of ``search_results`` with regular expressions and rendered
    as a short Markdown-style message.

    Args:
        search_results: Raw text of the search results.
        query: The user's search query. It is echoed in the reply and used to
            pick a business-name line from the first five result lines.

    Returns:
        The formatted summary when at least one phone, e-mail, website, hours
        or address value was found; otherwise ``search_results`` unchanged.
        If extraction raises, the error is printed and ``search_results`` is
        returned as-is (best-effort behaviour).
    """
    try:
        extracted = {
            'phones': [],
            'emails': [],
            'addresses': [],
            'websites': [],
            'business_name': '',
            'hours': '',
            'summary': ''
        }

        # --- Phone numbers -------------------------------------------------
        phone_patterns = [
            r'(\+\d{1,3}[-\.\s]?\d{1,4}[-\.\s]?\d{1,4}[-\.\s]?\d{1,9})',  # International format
            r'(\(?[0-9]{3}\)?[-\.\s]?[0-9]{3}[-\.\s]?[0-9]{4})',  # US format
            r'(\d{2,4}[-\.\s]?\d{6,8})',  # General format
        ]
        phones = []
        for pattern in phone_patterns:
            phones.extend(re.findall(pattern, search_results))
        # The patterns overlap, so a looser one can re-match the tail of a
        # number a stricter one already captured ("68-5555123" inside
        # "+92-68-5555123"). Keep only the longest form of each number.
        extracted['phones'] = _drop_contained(_unique_in_order(phones))

        # --- E-mail addresses ----------------------------------------------
        email_pattern = r'([a-zA-Z0-9._%+-]+@[a-zA-Z0-9.-]+\.[a-zA-Z]{2,})'
        extracted['emails'] = _unique_in_order(re.findall(email_pattern, search_results))

        # --- Websites / URLs -----------------------------------------------
        url_pattern = r'(https?://[^\s<>"]+|www\.[^\s<>"]+)'
        # Sentence punctuation directly after a URL is not part of the URL.
        urls = [url.rstrip('.,;:!?') for url in re.findall(url_pattern, search_results)]
        extracted['websites'] = _unique_in_order(url for url in urls if url)

        # --- Business hours ------------------------------------------------
        # A single time ("8:00", "8:00 AM", "8 AM") optionally followed by a
        # closing time, so "8:00 AM - 5:00 PM" is captured whole instead of
        # being cut off after the opening time.
        time_of_day = r'(?:\d{1,2}:\d{2}(?:\s*(?:AM|PM))?|\d{1,2}\s*(?:AM|PM))'
        time_span = time_of_day + r'(?:\s*(?:[-\u2013]|to)\s*' + time_of_day + r')?'
        # Whole-word day names only, so "Mon" no longer matches inside "Money".
        day_name = r'\b(?:Mon|Tues?|Wed(?:nes)?|Thu(?:rs?)?|Fri|Sat(?:ur)?|Sun)(?:day)?s?\b'
        hours_patterns = [
            '(' + day_name + r'[^.]*?' + time_span + ')',
            r'(Hours?:?\s*[^.]*?' + time_span + ')',
            r'(Open:?\s*[^.]*?' + time_span + ')',
            r'(\d{1,2}:\d{2}\s*(?:AM|PM|am|pm)\s*-\s*\d{1,2}:\d{2}\s*(?:AM|PM|am|pm))',
            r'(\d{1,2}\s*(?:AM|PM|am|pm)\s*-\s*\d{1,2}\s*(?:AM|PM|am|pm))'
        ]
        # Patterns are listed by priority; the first one that matches wins.
        for pattern in hours_patterns:
            hours_match = re.search(pattern, search_results, re.IGNORECASE)
            if hours_match:
                extracted['hours'] = hours_match.group(1).strip()
                break

        # --- Postal addresses ----------------------------------------------
        # Every pattern stays on one line, and street suffixes must be whole
        # words; otherwise "20 best restaurants" would match via the "st" in
        # "best" and the match could swallow following lines.
        address_patterns = [
            re.compile(
                r'(\d+[ \t]+[A-Za-z \t]+'
                r'\b(?:Street|St|Avenue|Ave|Road|Rd|Boulevard|Blvd|Drive|Dr|Lane|Ln'
                r'|Way|Circle|Cir|Court|Ct|Place|Pl)\b'
                r'[^,\n]*(?:,[ \t]*[A-Za-z][A-Za-z \t]*)*'
                r'(?:[ \t]*\d{5}(?:-\d{4})?\b)?)',
                re.IGNORECASE,
            ),
            # City, State ZIP. Case-sensitive so [A-Z]{2} really means a
            # two-letter upper-case state code.
            re.compile(r'([A-Za-z][A-Za-z \t]*,[ \t]*[A-Z]{2}[ \t]+\d{5})'),
            # Pakistan addresses
            re.compile(r'(\d+[ \t]+[A-Za-z0-9 \t,.-]+\b(?:Pakistan|PK)\b)', re.IGNORECASE),
        ]
        for compiled in address_patterns:
            # Collapse runs of whitespace and trim the ends of every match.
            cleaned = [' '.join(found.split()) for found in compiled.findall(search_results)]
            cleaned = _unique_in_order(addr for addr in cleaned if addr)
            if cleaned:
                extracted['addresses'] = cleaned
                break

        # --- Business name -------------------------------------------------
        lines = search_results.split('\n')
        query_lower = query.lower()
        business_keywords = ['post office', 'bank', 'hospital', 'school', 'office', 'center', 'department']
        for keyword in business_keywords:
            if keyword in query_lower:
                # Only the first keyword present in the query is considered,
                # and only the first five result lines are inspected.
                for line in lines[:5]:
                    if keyword in line.lower() and len(line.strip()) < 100:
                        extracted['business_name'] = line.strip()
                        break
                break

        # --- Format the response -------------------------------------------
        if any([extracted['phones'], extracted['emails'], extracted['websites'],
                extracted['hours'], extracted['addresses']]):
            response = f"I found information for your search '{query}':\n\n"

            if extracted['business_name']:
                response += f"🏢 **{extracted['business_name']}**\n\n"

            if extracted['phones']:
                response += f"📞 **Phone**: {', '.join(extracted['phones'])}\n"

            if extracted['emails']:
                response += f"📧 **Email**: {', '.join(extracted['emails'])}\n"

            if extracted['addresses']:
                response += f"📍 **Address**: {', '.join(extracted['addresses'][:2])}\n"  # Limit to 2 addresses

            if extracted['websites']:
                response += f"🌐 **Website**: {', '.join(extracted['websites'][:2])}\n"  # Limit to 2 URLs

            if extracted['hours']:
                response += f"🕒 **Hours**: {extracted['hours']}\n"

            # Add the first reasonably long, non-URL line as extra context.
            meaningful_lines = [
                line.strip() for line in lines
                if len(line.strip()) > 20 and not line.strip().startswith('http')
            ]
            if meaningful_lines:
                snippet = meaningful_lines[0][:200]
                # Mark the snippet with an ellipsis only when it was cut.
                if len(meaningful_lines[0]) > 200:
                    snippet += "..."
                response += f"\nℹ️ **Additional Info**: {snippet}\n"

            response += "\nWould you like me to help you with anything specific, like getting directions or finding more details?"

            return response

        # If no specific information extracted, return original results
        return search_results

    except Exception as e:
        # Deliberate best-effort: on any failure hand back the raw results.
        print(f"Error extracting search information: {e}")
        return search_results


# The name starts with "test_" but this is a helper that takes real arguments,
# not a pytest test; opt out of collection so pytest does not try to resolve
# ``search_results`` / ``query`` as fixtures.
test_extract_search_information.__test__ = False
|
||
|
||
# Test with sample search results
|
||
async def main():
    """Run the extractor on two canned search results and print each reply."""
    # Scenario 1: post office search
    post_office_text = "\n".join([
        "",
        "Post Office Fortabbas - Pakistan Post",
        "Contact Information",
        "Phone: +92-68-5555123",
        "Email: fortabbas@pakistanpost.gov.pk",
        "Address: Main Bazaar Road, Fortabbas, Punjab, Pakistan",
        "Hours: Monday to Friday 8:00 AM - 5:00 PM",
        "Services: Mail delivery, postal services, money orders",
        "Website: www.pakistanpost.gov.pk",
        "",
    ])
    post_office_reply = await test_extract_search_information(
        post_office_text,
        "phone number post office Fortabbas",
    )
    print("Test 1 - Post Office Search:")
    print(post_office_reply)
    # Visual separator between the two scenarios.
    print(f"\n{'=' * 50}\n")

    # Scenario 2: business search
    bank_text = "\n".join([
        "",
        "ABC Bank Branch",
        "Contact: (555) 123-4567",
        "Location: 123 Main Street, Anytown, NY 12345",
        "Business Hours: Mon-Fri 9:00 AM - 6:00 PM, Sat 9:00 AM - 2:00 PM",
        "Email: info@abcbank.com",
        "Website: https://www.abcbank.com",
        "Services: Banking, loans, investments",
        "",
    ])
    bank_reply = await test_extract_search_information(
        bank_text,
        "ABC Bank contact information",
    )
    print("Test 2 - Bank Search:")
    print(bank_reply)
|
||
|
||
# Run the demo only when this file is executed as a script, not on import.
if __name__ == "__main__":
    asyncio.run(main())
|