#!/usr/bin/env python3
"""
Test script to verify the information extraction functionality
"""

import asyncio
import re

# Icons are written as escapes so the file's encoding can never garble them.
_ICON_BUSINESS = "\U0001F3E2"  # office building
_ICON_PHONE = "\U0001F4DE"     # telephone receiver
_ICON_EMAIL = "\U0001F4E7"     # e-mail
_ICON_ADDRESS = "\U0001F4CD"   # round pushpin
_ICON_WEBSITE = "\U0001F310"   # globe with meridians
_ICON_HOURS = "\U0001F552"     # clock face
_ICON_INFO = "\u2139\ufe0f"    # information sign + emoji variation selector

# Phone numbers, tried in this order; overlapping hits are reconciled later.
_PHONE_PATTERNS = tuple(re.compile(p) for p in (
    r'(\+\d{1,3}[-\.\s]?\d{1,4}[-\.\s]?\d{1,4}[-\.\s]?\d{1,9})',  # International format
    r'(\(?[0-9]{3}\)?[-\.\s]?[0-9]{3}[-\.\s]?[0-9]{4})',  # US format
    r'(\d{2,4}[-\.\s]?\d{6,8})',  # General format
))

_EMAIL_PATTERN = re.compile(r'([a-zA-Z0-9._%+-]+@[a-zA-Z0-9.-]+\.[a-zA-Z]{2,})')
_URL_PATTERN = re.compile(r'(https?://[^\s<>"]+|www\.[^\s<>"]+)')

# Building blocks for opening hours.  The word boundaries keep "Money" from
# being read as "Mon" and "8:00 amazing" from being read as "8:00 am".
_DAY = r'\b(?:Mon|Tues?|Wed|Thu(?:rs?)?|Fri|Sat|Sun)(?:[a-z]*days?)?\b'
_TIME = r'(?:\d{1,2}:\d{2}(?:\s*(?:AM|PM)\b)?|\d{1,2}\s*(?:AM|PM)\b)'
# A start time optionally followed by "- end", "to end" or a dash variant.
_TIME_RANGE = _TIME + r'(?:\s*(?:[-\u2013\u2014]|to)\s*' + _TIME + r')?'
_DAY_HOURS = _DAY + r'[^.\n]*?' + _TIME_RANGE

# First pattern that matches wins, so the most specific forms come first.
_HOURS_PATTERNS = tuple(re.compile(p, re.IGNORECASE) for p in (
    # "Mon-Fri 9:00 AM - 6:00 PM, Sat 9:00 AM - 2:00 PM"
    '(' + _DAY_HOURS + r'(?:\s*[,;]\s*' + _DAY_HOURS + r')*)',
    r'(\bHours?:?\s*[^.\n]*?' + _TIME_RANGE + ')',
    r'(\bOpen:?\s*[^.\n]*?' + _TIME_RANGE + ')',
    r'(\d{1,2}:\d{2}\s*(?:AM|PM)\s*-\s*\d{1,2}:\d{2}\s*(?:AM|PM))',
    r'(\d{1,2}\s*(?:AM|PM)\s*-\s*\d{1,2}\s*(?:AM|PM))',
))

_STREET_SUFFIX = (
    r'(?:Street|St|Avenue|Ave|Road|Rd|Boulevard|Blvd|Drive|Dr|Lane|Ln|Way|'
    r'Circle|Cir|Court|Ct|Place|Pl)'
)

# Address patterns are confined to a single line ([ \t] instead of \s) so a
# match cannot swallow the lines that follow it.  First pattern with a hit wins.
_ADDRESS_PATTERNS = (
    # Street address, optional ", City, ST" parts and optional ZIP.
    re.compile(
        r'(\d+[ \t]+[A-Za-z \t]+\b' + _STREET_SUFFIX + r'\b[^,\n]*'
        r'(?:,[ \t]*[A-Za-z \t]+)*(?:[ \t]*\d{5}(?:-\d{4})?\b)?)',
        re.IGNORECASE,
    ),
    # City, State ZIP
    re.compile(
        r'([A-Za-z][A-Za-z \t]*,[ \t]*[A-Z]{2}[ \t]+\d{5}(?:-\d{4})?\b)',
        re.IGNORECASE,
    ),
    # Pakistan addresses
    re.compile(r'(\d+[ \t]+[A-Za-z0-9 \t,.-]+\b(?:Pakistan|PK)\b)', re.IGNORECASE),
    # Fallback: an explicitly labelled line such as "Address: Main Bazaar Road".
    re.compile(
        r'^[ \t]*(?:Address|Location)[ \t]*:[ \t]*(\S.*)$',
        re.IGNORECASE | re.MULTILINE,
    ),
)

_BUSINESS_KEYWORDS = (
    'post office', 'bank', 'hospital', 'school', 'office', 'center', 'department',
)


def _unique(items):
    """Return the non-empty items with duplicates removed, first-seen order kept."""
    return list(dict.fromkeys(item for item in items if item))


def _extract_phones(text):
    """Return phone numbers found in *text*, without fragments of longer hits."""
    candidates = []
    for pattern in _PHONE_PATTERNS:
        candidates.extend(pattern.findall(text))
    candidates = _unique(candidates)
    # The looser patterns re-match the tail of a number an earlier pattern
    # already captured in full (e.g. "68-5555123" inside "+92-68-5555123").
    return [
        number for number in candidates
        if not any(number != other and number in other for other in candidates)
    ]


def _extract_websites(text):
    """Return URLs found in *text*, minus trailing sentence punctuation."""
    return _unique(url.rstrip('.,;:') for url in _URL_PATTERN.findall(text))


def _extract_hours(text):
    """Return the first opening-hours phrase found in *text*, or ''."""
    for pattern in _HOURS_PATTERNS:
        match = pattern.search(text)
        if match:
            return match.group(1).strip()
    return ''


def _extract_addresses(text):
    """Return the matches of the first address pattern that finds anything."""
    for pattern in _ADDRESS_PATTERNS:
        found = _unique(hit.strip() for hit in pattern.findall(text))
        if found:
            return found
    return []


def _guess_business_name(text, query):
    """Return a short early line of *text* naming the business type in *query*."""
    lowered_query = query.lower()
    # Only the first keyword present in the query is considered.
    keyword = next((k for k in _BUSINESS_KEYWORDS if k in lowered_query), None)
    if keyword is None:
        return ''
    for line in text.split('\n')[:5]:  # Check first few lines
        candidate = line.strip()
        if keyword in candidate.lower() and len(candidate) < 100:
            return candidate
    return ''


def _format_response(query, text, info):
    """Render the extracted *info* as the user-facing reply."""
    parts = [f"I found information for your search '{query}':\n\n"]

    if info['business_name']:
        parts.append(f"{_ICON_BUSINESS} **{info['business_name']}**\n\n")
    if info['phones']:
        parts.append(f"{_ICON_PHONE} **Phone**: {', '.join(info['phones'])}\n")
    if info['emails']:
        parts.append(f"{_ICON_EMAIL} **Email**: {', '.join(info['emails'])}\n")
    if info['addresses']:
        # Limit to 2 addresses
        parts.append(f"{_ICON_ADDRESS} **Address**: {', '.join(info['addresses'][:2])}\n")
    if info['websites']:
        # Limit to 2 URLs
        parts.append(f"{_ICON_WEBSITE} **Website**: {', '.join(info['websites'][:2])}\n")
    if info['hours']:
        parts.append(f"{_ICON_HOURS} **Hours**: {info['hours']}\n")

    # Add a summary from the first substantial line that is not a bare URL.
    stripped_lines = (line.strip() for line in text.split('\n'))
    summary = next(
        (s for s in stripped_lines if len(s) > 20 and not s.startswith('http')),
        None,
    )
    if summary:
        parts.append(f"\n{_ICON_INFO} **Additional Info**: {summary[:200]}...\n")

    parts.append(
        "\nWould you like me to help you with anything specific, "
        "like getting directions or finding more details?"
    )
    return ''.join(parts)


async def test_extract_search_information(search_results: str, query: str) -> str:
    """Test version of the extract search information function.

    Scans *search_results* for phone numbers, e-mail addresses, URLs, opening
    hours and postal addresses and formats whatever was found as a reply.

    Args:
        search_results: Raw text of the search results.
        query: The user's search query; quoted in the reply and used to guess
            the business name.

    Returns:
        The formatted reply when at least one phone, e-mail, website, hours or
        address was found. Otherwise, or when extraction fails, *search_results*
        is returned unchanged.
    """
    try:
        extracted = {
            'phones': _extract_phones(search_results),
            'emails': _unique(_EMAIL_PATTERN.findall(search_results)),
            'websites': _extract_websites(search_results),
            'hours': _extract_hours(search_results),
            'addresses': _extract_addresses(search_results),
            'business_name': _guess_business_name(search_results, query),
        }

        # A business name alone is not worth a formatted reply.
        contact_fields = ('phones', 'emails', 'websites', 'hours', 'addresses')
        if not any(extracted[field] for field in contact_fields):
            # If no specific information extracted, return original results
            return search_results

        return _format_response(query, search_results, extracted)

    except Exception as e:
        # Best effort: extraction must never break the caller, so fall back to
        # the raw results on any failure (e.g. non-string input).
        print(f"Error extracting search information: {e}")
        return search_results


# The name starts with "test_" but the function takes ordinary arguments, not
# pytest fixtures; this flag tells pytest not to collect it as a test case.
test_extract_search_information.__test__ = False


# Test with sample search results
async def main():
    """Run the extractor on two sample result texts and print the replies."""
    # Test case 1: Post office search
    sample_results_1 = """
        Post Office Fortabbas - Pakistan Post
        Contact Information
        Phone: +92-68-5555123
        Email: fortabbas@pakistanpost.gov.pk
        Address: Main Bazaar Road, Fortabbas, Punjab, Pakistan
        Hours: Monday to Friday 8:00 AM - 5:00 PM
        Services: Mail delivery, postal services, money orders
        Website: www.pakistanpost.gov.pk
    """

    result1 = await test_extract_search_information(
        sample_results_1, "phone number post office Fortabbas"
    )
    print("Test 1 - Post Office Search:")
    print(result1)
    print("\n" + "="*50 + "\n")

    # Test case 2: Business search
    sample_results_2 = """
        ABC Bank Branch
        Contact: (555) 123-4567
        Location: 123 Main Street, Anytown, NY 12345
        Business Hours: Mon-Fri 9:00 AM - 6:00 PM, Sat 9:00 AM - 2:00 PM
        Email: info@abcbank.com
        Website: https://www.abcbank.com
        Services: Banking, loans, investments
    """

    result2 = await test_extract_search_information(
        sample_results_2, "ABC Bank contact information"
    )
    print("Test 2 - Bank Search:")
    print(result2)


if __name__ == "__main__":
    asyncio.run(main())