#!/usr/bin/env python3
"""
Test script for Competitive Intelligence Infrastructure - Phase 2

Command-line driver that exercises the competitive intelligence
orchestrator and the HVACR School competitive scraper, printing a
human-readable report for each selected test.
"""

import argparse
import json
import logging
import os
import sys
from itertools import islice
from pathlib import Path

# Add src to path so the project packages below resolve when the script is
# run directly from its own directory.
sys.path.insert(0, str(Path(__file__).parent / "src"))

from competitive_intelligence.competitive_orchestrator import CompetitiveIntelligenceOrchestrator
from competitive_intelligence.hvacrschool_competitive_scraper import HVACRSchoolCompetitiveScraper


def setup_logging() -> None:
    """Setup basic logging for the test script.

    Configures the root logger at INFO level with a single stream handler.
    """
    logging.basicConfig(
        level=logging.INFO,
        format='%(asctime)s - %(name)s - %(levelname)s - %(message)s',
        handlers=[
            logging.StreamHandler(),
        ],
    )


def _print_run_results(results) -> None:
    """Print operation name, duration and per-competitor outcome of a run.

    ``results`` is indexed with the keys 'operation', 'duration_seconds'
    and 'results'; each per-competitor entry is indexed with 'status' and
    either 'message' (on success) or an optional 'error'.
    """
    print(f"Operation: {results['operation']}")
    print(f"Duration: {results['duration_seconds']:.2f} seconds")

    for competitor, result in results['results'].items():
        if result['status'] == 'success':
            print(f"✓ {competitor}: {result['message']}")
        else:
            print(f"✗ {competitor}: {result.get('error', 'Unknown error')}")


def test_hvacrschool_scraper(data_dir: Path, logs_dir: Path, limit: int = 5):
    """Test HVACR School competitive scraper directly.

    Discovers up to ``limit`` content URLs, prints the first three, then
    scrapes the first discovered URL and prints a short summary of it.

    Args:
        data_dir: Data directory handed to the scraper.
        logs_dir: Logs directory handed to the scraper.
        limit: Value passed to ``discover_content_urls``.

    Returns:
        The URL records returned by ``discover_content_urls``.
    """
    print("\n=== Testing HVACR School Competitive Scraper ===")

    scraper = HVACRSchoolCompetitiveScraper(data_dir, logs_dir)

    print(f"Configured scraper for: {scraper.competitor_name}")
    print(f"Base URL: {scraper.base_url}")
    print(f"Proxy enabled: {scraper.competitive_config.use_proxy}")

    # Test URL discovery
    print(f"\nDiscovering content URLs (limit: {limit})...")
    urls = scraper.discover_content_urls(limit)

    print(f"Discovered {len(urls)} URLs:")
    for i, url_data in enumerate(urls[:3], 1):  # Show first 3
        print(f" {i}. {url_data['url']} (method: {url_data.get('discovery_method', 'unknown')})")

    if len(urls) > 3:
        print(f" ... and {len(urls) - 3} more")

    # Test content scraping
    if urls:
        test_url = urls[0]['url']
        print(f"\nTesting content scraping for: {test_url}")

        content = scraper.scrape_content_item(test_url)
        if content:
            # A title key that is present but None (or empty) must not break
            # the slice below, so fall back to the placeholder in that case.
            title = str(content.get('title') or 'Unknown')
            print("✓ Successfully scraped content:")
            print(f" Title: {title[:60]}...")
            print(f" Word count: {content.get('word_count', 0)}")
            print(f" Extraction method: {content.get('extraction_method', 'unknown')}")
        else:
            print("✗ Failed to scrape content")

    return urls


def test_orchestrator_setup(data_dir: Path, logs_dir: Path):
    """Test competitive intelligence orchestrator setup.

    Runs ``test_competitive_setup`` on a fresh orchestrator and prints the
    overall status plus the configuration (or error) of each competitor.

    Args:
        data_dir: Data directory handed to the orchestrator.
        logs_dir: Logs directory handed to the orchestrator.

    Returns:
        The result object returned by ``test_competitive_setup``.
    """
    print("\n=== Testing Competitive Intelligence Orchestrator ===")

    orchestrator = CompetitiveIntelligenceOrchestrator(data_dir, logs_dir)

    # Test setup
    setup_results = orchestrator.test_competitive_setup()

    print(f"Overall status: {setup_results['overall_status']}")
    print(f"Test timestamp: {setup_results['test_timestamp']}")

    for competitor, results in setup_results['test_results'].items():
        print(f"\n{competitor.upper()} Configuration:")

        if results['status'] != 'success':
            print(f" ✗ Error: {results['error']}")
            continue

        config = results['config']
        print(f" ✓ Base URL: {config['base_url']}")
        print(f" ✓ Directories exist: {config['directories_exist']}")
        print(f" ✓ Proxy configured: {config['proxy_configured']}")
        print(f" ✓ Jina API configured: {config['jina_api_configured']}")

        # The proxy check result is only reported when the key is present.
        if 'proxy_working' in config:
            if config['proxy_working']:
                print(f" ✓ Proxy working: {config.get('proxy_ip', 'Unknown IP')}")
            else:
                print(f" ✗ Proxy issue: {config.get('proxy_error', 'Unknown error')}")

    return setup_results


def run_backlog_test(data_dir: Path, logs_dir: Path, limit: int = 5):
    """Test backlog capture functionality.

    Runs a backlog capture for the 'hvacrschool' competitor, prints the
    outcome, and previews the most recently modified ``*.md`` file in the
    backlog output directory, if any.

    Args:
        data_dir: Data directory handed to the orchestrator; also the root
            under which the backlog output is looked up.
        logs_dir: Logs directory handed to the orchestrator.
        limit: Passed as ``limit_per_competitor``.

    Returns:
        The result object returned by ``run_backlog_capture``.
    """
    print(f"\n=== Testing Backlog Capture (limit: {limit}) ===")

    orchestrator = CompetitiveIntelligenceOrchestrator(data_dir, logs_dir)

    # Run backlog capture
    results = orchestrator.run_backlog_capture(
        competitors=['hvacrschool'],
        limit_per_competitor=limit,
    )

    _print_run_results(results)

    # Check output files
    comp_dir = data_dir / "competitive_intelligence" / "hvacrschool" / "backlog"
    if comp_dir.exists():
        files = list(comp_dir.glob("*.md"))
        if files:
            latest_file = max(files, key=lambda f: f.stat().st_mtime)
            print(f"\nLatest backlog file: {latest_file.name}")
            print(f"File size: {latest_file.stat().st_size} bytes")

            # Show first few lines; islice avoids reading the whole file
            # just to display a ten-line preview.
            try:
                with open(latest_file, 'r', encoding='utf-8') as f:
                    lines = list(islice(f, 10))
            except (OSError, UnicodeError) as e:
                print(f"Error reading file: {e}")
            else:
                print("\nFirst few lines:")
                for line in lines:
                    print(f" {line.rstrip()}")

    return results


def run_incremental_test(data_dir: Path, logs_dir: Path):
    """Test incremental sync functionality.

    Runs an incremental sync for the 'hvacrschool' competitor and prints
    the outcome.

    Args:
        data_dir: Data directory handed to the orchestrator.
        logs_dir: Logs directory handed to the orchestrator.

    Returns:
        The result object returned by ``run_incremental_sync``.
    """
    print("\n=== Testing Incremental Sync ===")

    orchestrator = CompetitiveIntelligenceOrchestrator(data_dir, logs_dir)

    # Run incremental sync
    results = orchestrator.run_incremental_sync(competitors=['hvacrschool'])

    _print_run_results(results)

    return results


def check_status(data_dir: Path, logs_dir: Path):
    """Check competitive intelligence status.

    Prints, for every competitor reported by ``get_competitor_status``,
    either its error or its configuration and capture history.

    Args:
        data_dir: Data directory handed to the orchestrator.
        logs_dir: Logs directory handed to the orchestrator.

    Returns:
        The status mapping returned by ``get_competitor_status``.
    """
    print("\n=== Checking Competitive Intelligence Status ===")

    orchestrator = CompetitiveIntelligenceOrchestrator(data_dir, logs_dir)
    status = orchestrator.get_competitor_status()

    for competitor, comp_status in status.items():
        print(f"\n{competitor.upper()} Status:")

        if 'error' in comp_status:
            print(f" ✗ Error: {comp_status['error']}")
            continue

        print(f" ✓ Scraper configured: {comp_status.get('scraper_configured', False)}")
        print(f" ✓ Base URL: {comp_status.get('base_url', 'Unknown')}")
        print(f" ✓ Proxy enabled: {comp_status.get('proxy_enabled', False)}")

        # History fields are optional; a present-but-empty timestamp is
        # shown as 'Never'.
        if 'last_backlog_capture' in comp_status:
            print(f" • Last backlog capture: {comp_status['last_backlog_capture'] or 'Never'}")

        if 'last_incremental_sync' in comp_status:
            print(f" • Last incremental sync: {comp_status['last_incremental_sync'] or 'Never'}")

        if 'total_items_captured' in comp_status:
            print(f" • Total items captured: {comp_status['total_items_captured']}")

    return status


def main() -> None:
    """Main test function.

    Parses the command line, makes sure the data and logs directories
    exist, and runs the selected test (or every test for ``--test all``).
    """
    parser = argparse.ArgumentParser(description='Test Competitive Intelligence Infrastructure')
    parser.add_argument('--test', choices=[
        'setup', 'scraper', 'backlog', 'incremental', 'status', 'all'
    ], default='setup', help='Type of test to run')
    parser.add_argument('--limit', type=int, default=5,
                        help='Limit number of items for testing (default: 5)')
    parser.add_argument('--data-dir', type=Path,
                        default=Path(__file__).parent / 'data',
                        help='Data directory path')
    parser.add_argument('--logs-dir', type=Path,
                        default=Path(__file__).parent / 'logs',
                        help='Logs directory path')

    args = parser.parse_args()

    # Setup
    setup_logging()

    print("🔍 HKIA Competitive Intelligence Infrastructure Test")
    print("=" * 60)
    print(f"Test type: {args.test}")
    print(f"Data directory: {args.data_dir}")
    print(f"Logs directory: {args.logs_dir}")

    # Ensure directories exist. parents=True lets a nested --data-dir or
    # --logs-dir be created even when its parent directory is missing.
    args.data_dir.mkdir(parents=True, exist_ok=True)
    args.logs_dir.mkdir(parents=True, exist_ok=True)

    # Run tests based on selection; the tuple order is the execution order
    # used for 'all'.
    steps = (
        ('setup', lambda: test_orchestrator_setup(args.data_dir, args.logs_dir)),
        ('scraper', lambda: test_hvacrschool_scraper(args.data_dir, args.logs_dir, args.limit)),
        ('backlog', lambda: run_backlog_test(args.data_dir, args.logs_dir, args.limit)),
        ('incremental', lambda: run_incremental_test(args.data_dir, args.logs_dir)),
        ('status', lambda: check_status(args.data_dir, args.logs_dir)),
    )
    for name, run_step in steps:
        if args.test in (name, 'all'):
            run_step()

    print(f"\n✅ Test completed: {args.test}")


if __name__ == "__main__":
    main()