#!/usr/bin/env python3
"""
HKIA Competitive Intelligence Runner - Phase 2
Production script for running competitive intelligence operations.
"""
import os
import sys
import json
import argparse
import logging
from pathlib import Path
from datetime import datetime

# Add src to Python path so the project package resolves when run as a script
sys.path.insert(0, str(Path(__file__).parent / "src"))

from competitive_intelligence.competitive_orchestrator import CompetitiveIntelligenceOrchestrator
from competitive_intelligence.exceptions import (
    CompetitiveIntelligenceError,
    ConfigurationError,
    QuotaExceededError,
    YouTubeAPIError,
    InstagramError,
    RateLimitError
)


def setup_logging(verbose: bool = False):
    """Setup logging for the competitive intelligence runner.

    Args:
        verbose: When True, log at DEBUG level and keep external-library
            chatter; otherwise log at INFO and silence noisy HTTP/Google
            client loggers.
    """
    level = logging.DEBUG if verbose else logging.INFO
    logging.basicConfig(
        level=level,
        format='%(asctime)s - %(name)s - %(levelname)s - %(message)s',
        handlers=[
            logging.StreamHandler(),
        ]
    )

    # Suppress verbose logs from external libraries
    if not verbose:
        logging.getLogger('googleapiclient.discovery').setLevel(logging.WARNING)
        logging.getLogger('urllib3.connectionpool').setLevel(logging.WARNING)


def run_integration_tests(orchestrator: CompetitiveIntelligenceOrchestrator, platforms: list) -> dict:
    """Run integration tests for specified platforms.

    Args:
        orchestrator: Initialized orchestrator whose ``scrapers`` mapping is
            probed per platform (keys prefixed ``youtube_`` / ``instagram_``).
        platforms: Platform names to exercise (``'youtube'``, ``'instagram'``).

    Returns:
        dict with ``platforms_tested`` and a ``tests`` mapping of
        test-name -> bool/int/str result. Rate-limit and platform errors are
        captured into the result dict rather than raised.
    """
    test_results = {'platforms_tested': platforms, 'tests': {}}

    for platform in platforms:
        print(f"\n๐Ÿงช Testing {platform} integration...")
        try:
            # Test platform status
            if platform == 'youtube':
                # Test YouTube scrapers
                youtube_scrapers = {k: v for k, v in orchestrator.scrapers.items()
                                    if k.startswith('youtube_')}
                test_results['tests'][f'{platform}_scrapers_available'] = len(youtube_scrapers)

                if youtube_scrapers:
                    # Test one YouTube scraper
                    test_scraper_name = list(youtube_scrapers.keys())[0]
                    scraper = youtube_scrapers[test_scraper_name]

                    # Test basic functionality
                    urls = scraper.discover_content_urls(1)
                    test_results['tests'][f'{platform}_discovery'] = len(urls) > 0

                    if urls:
                        content = scraper.scrape_content_item(urls[0]['url'])
                        test_results['tests'][f'{platform}_scraping'] = content is not None

            elif platform == 'instagram':
                # Test Instagram scrapers
                instagram_scrapers = {k: v for k, v in orchestrator.scrapers.items()
                                      if k.startswith('instagram_')}
                test_results['tests'][f'{platform}_scrapers_available'] = len(instagram_scrapers)

                if instagram_scrapers:
                    # Test one Instagram scraper (more carefully due to rate limits)
                    test_scraper_name = list(instagram_scrapers.keys())[0]
                    scraper = instagram_scrapers[test_scraper_name]

                    # Test profile loading only
                    profile = scraper._get_target_profile()
                    test_results['tests'][f'{platform}_profile_access'] = profile is not None

                    # Skip content scraping for Instagram to avoid rate limits
                    test_results['tests'][f'{platform}_discovery'] = 'skipped_rate_limit'
                    test_results['tests'][f'{platform}_scraping'] = 'skipped_rate_limit'

        except (RateLimitError, QuotaExceededError) as e:
            test_results['tests'][f'{platform}_rate_limited'] = str(e)
        except (YouTubeAPIError, InstagramError) as e:
            test_results['tests'][f'{platform}_platform_error'] = str(e)
        except Exception as e:
            test_results['tests'][f'{platform}_error'] = str(e)

    return test_results


def main():
    """Main entry point for competitive intelligence operations."""
    parser = argparse.ArgumentParser(
        description='HKIA Competitive Intelligence Runner - Phase 2',
        formatter_class=argparse.RawDescriptionHelpFormatter,
        epilog="""
Examples:
  # Test setup
  python run_competitive_intelligence.py --operation test

  # Run backlog capture (first time setup)
  python run_competitive_intelligence.py --operation backlog --limit 50

  # Run incremental sync (daily operation)
  python run_competitive_intelligence.py --operation incremental

  # Run full competitive analysis
  python run_competitive_intelligence.py --operation analysis

  # Check status
  python run_competitive_intelligence.py --operation status

  # Target specific competitors
  python run_competitive_intelligence.py --operation incremental --competitors hvacrschool

  # Social Media Operations (YouTube & Instagram) - Enhanced Phase 2
  # Run social media backlog capture with error handling
  python run_competitive_intelligence.py --operation social-backlog --limit 20

  # Run social media incremental sync
  python run_competitive_intelligence.py --operation social-incremental

  # Platform-specific operations with rate limit handling
  python run_competitive_intelligence.py --operation social-backlog --platforms youtube --limit 30
  python run_competitive_intelligence.py --operation social-incremental --platforms instagram

  # Platform analysis with enhanced error reporting
  python run_competitive_intelligence.py --operation platform-analysis --platforms youtube
  python run_competitive_intelligence.py --operation platform-analysis --platforms instagram

  # Enhanced competitor listing with metadata
  python run_competitive_intelligence.py --operation list-competitors

  # Test enhanced integration
  python run_competitive_intelligence.py --operation test-integration --platforms youtube instagram
        """
    )

    parser.add_argument(
        '--operation',
        choices=['test', 'backlog', 'incremental', 'analysis', 'status',
                 'social-backlog', 'social-incremental', 'platform-analysis',
                 'list-competitors', 'test-integration'],
        required=True,
        help='Competitive intelligence operation to run (enhanced Phase 2 support)'
    )

    parser.add_argument(
        '--competitors',
        nargs='+',
        help='Specific competitors to target (default: all configured)'
    )

    parser.add_argument(
        '--limit',
        type=int,
        help='Limit number of items for backlog capture (default: 100)'
    )

    parser.add_argument(
        '--data-dir',
        type=Path,
        help='Data directory path (default: ./data)'
    )

    parser.add_argument(
        '--logs-dir',
        type=Path,
        help='Logs directory path (default: ./logs)'
    )

    parser.add_argument(
        '--verbose',
        action='store_true',
        help='Enable verbose logging'
    )

    parser.add_argument(
        '--platforms',
        nargs='+',
        choices=['youtube', 'instagram'],
        help='Target specific platforms for social media operations'
    )

    parser.add_argument(
        '--output-format',
        choices=['json', 'summary'],
        default='summary',
        help='Output format (default: summary)'
    )

    args = parser.parse_args()

    # Setup logging
    setup_logging(args.verbose)

    # Default directories
    data_dir = args.data_dir or Path("data")
    logs_dir = args.logs_dir or Path("logs")

    # Ensure directories exist.
    # FIX: parents=True so a nested --data-dir/--logs-dir path (e.g.
    # /var/app/ci/data) does not raise FileNotFoundError.
    data_dir.mkdir(parents=True, exist_ok=True)
    logs_dir.mkdir(parents=True, exist_ok=True)

    print("๐Ÿ” HKIA Competitive Intelligence - Phase 2")
    print("=" * 50)
    print(f"Operation: {args.operation}")
    print(f"Data directory: {data_dir}")
    print(f"Logs directory: {logs_dir}")
    if args.competitors:
        print(f"Competitors: {', '.join(args.competitors)}")
    if args.platforms:
        print(f"Platforms: {', '.join(args.platforms)}")
    if args.limit:
        print(f"Limit: {args.limit}")
    print()

    # Initialize competitive intelligence orchestrator with enhanced error handling
    try:
        orchestrator = CompetitiveIntelligenceOrchestrator(data_dir, logs_dir)
    except ConfigurationError as e:
        print(f"โŒ Configuration Error: {e.message}")
        if e.details:
            print(f" Details: {e.details}")
        sys.exit(1)
    except CompetitiveIntelligenceError as e:
        print(f"โŒ Competitive Intelligence Error: {e.message}")
        sys.exit(1)
    except Exception as e:
        print(f"โŒ Unexpected initialization error: {e}")
        logging.exception("Unexpected error during orchestrator initialization")
        sys.exit(1)

    # Execute operation
    start_time = datetime.now()
    results = None

    try:
        if args.operation == 'test':
            print("๐Ÿงช Testing competitive intelligence setup...")
            results = orchestrator.test_competitive_setup()

        elif args.operation == 'backlog':
            limit = args.limit or 100
            print(f"๐Ÿ“ฆ Running backlog capture (limit: {limit})...")
            results = orchestrator.run_backlog_capture(args.competitors, limit)

        elif args.operation == 'incremental':
            print("๐Ÿ”„ Running incremental sync...")
            results = orchestrator.run_incremental_sync(args.competitors)

        elif args.operation == 'analysis':
            print("๐Ÿ“Š Running competitive analysis...")
            results = orchestrator.run_competitive_analysis(args.competitors)

        elif args.operation == 'status':
            print("๐Ÿ“‹ Checking competitive intelligence status...")
            # Status supports at most one competitor; extras are ignored.
            competitor = args.competitors[0] if args.competitors else None
            results = orchestrator.get_competitor_status(competitor)

        elif args.operation == 'social-backlog':
            limit = args.limit or 20  # Smaller default for social media
            print(f"๐Ÿ“ฑ Running social media backlog capture (limit: {limit})...")
            results = orchestrator.run_social_media_backlog(args.platforms, limit)

        elif args.operation == 'social-incremental':
            print("๐Ÿ“ฑ Running social media incremental sync...")
            results = orchestrator.run_social_media_incremental(args.platforms)

        elif args.operation == 'platform-analysis':
            if not args.platforms or len(args.platforms) != 1:
                print("โŒ Platform analysis requires exactly one platform (--platforms youtube or --platforms instagram)")
                sys.exit(1)
            platform = args.platforms[0]
            print(f"๐Ÿ“Š Running {platform} competitive analysis...")
            results = orchestrator.run_platform_analysis(platform)

        elif args.operation == 'list-competitors':
            print("๐Ÿ“ Listing available competitors...")
            results = orchestrator.list_available_competitors()

        elif args.operation == 'test-integration':
            print("๐Ÿงช Testing Phase 2 social media integration...")
            # Run enhanced integration tests
            results = run_integration_tests(orchestrator, args.platforms or ['youtube', 'instagram'])

    except ConfigurationError as e:
        print(f"โŒ Configuration Error: {e.message}")
        if e.details:
            print(f" Details: {e.details}")
        sys.exit(1)
    except QuotaExceededError as e:
        print(f"โŒ API Quota Exceeded: {e.message}")
        print(f" Quota used: {e.quota_used}/{e.quota_limit}")
        if e.reset_time:
            print(f" Reset time: {e.reset_time}")
        sys.exit(1)
    except RateLimitError as e:
        print(f"โŒ Rate Limit Exceeded: {e.message}")
        if e.retry_after:
            print(f" Retry after: {e.retry_after} seconds")
        sys.exit(1)
    except (YouTubeAPIError, InstagramError) as e:
        print(f"โŒ Platform API Error: {e.message}")
        sys.exit(1)
    except CompetitiveIntelligenceError as e:
        print(f"โŒ Competitive Intelligence Error: {e.message}")
        sys.exit(1)
    except Exception as e:
        print(f"โŒ Unexpected operation error: {e}")
        logging.exception("Unexpected error during operation execution")
        sys.exit(1)

    # Calculate duration
    end_time = datetime.now()
    duration = end_time - start_time

    # Output results
    print(f"\nโฑ๏ธ Operation completed in {duration.total_seconds():.2f} seconds")

    if args.output_format == 'json':
        print("\n๐Ÿ“„ Full Results:")
        print(json.dumps(results, indent=2, default=str))
    else:
        print_summary(args.operation, results)

    # Determine exit code
    exit_code = determine_exit_code(args.operation, results)
    sys.exit(exit_code)


def _fmt_count(value):
    """Format a numeric count with thousands separators; pass other values through.

    FIX: metadata lookups fall back to the string 'Unknown', and applying the
    ``:,`` format spec to a string raises ``ValueError: Cannot specify ','
    with 's'``. Guard so missing metadata prints 'Unknown' instead of crashing.
    """
    if isinstance(value, (int, float)) and not isinstance(value, bool):
        return f"{value:,}"
    return str(value)


def print_summary(operation: str, results: dict):
    """Print a human-readable summary of results.

    Args:
        operation: The operation name that produced ``results``; selects the
            summary layout.
        results: The raw result dict returned by the orchestrator (or by
            :func:`run_integration_tests` for 'test-integration').
    """
    print(f"\n๐Ÿ“‹ {operation.title()} Summary:")
    print("-" * 30)

    if operation == 'test':
        overall_status = results.get('overall_status', 'unknown')
        print(f"Overall Status: {'โœ…' if overall_status == 'operational' else 'โŒ'} {overall_status}")

        for competitor, test_result in results.get('test_results', {}).items():
            status = test_result.get('status', 'unknown')
            print(f"\n{competitor.upper()}:")

            if status == 'success':
                config = test_result.get('config', {})
                print(f" โœ… Configuration: OK")
                print(f" ๐ŸŒ Base URL: {config.get('base_url', 'Unknown')}")
                print(f" ๐Ÿ”’ Proxy: {'โœ…' if config.get('proxy_configured') else 'โŒ'}")
                print(f" ๐Ÿค– Jina AI: {'โœ…' if config.get('jina_api_configured') else 'โŒ'}")
                print(f" ๐Ÿ“ Directories: {'โœ…' if config.get('directories_exist') else 'โŒ'}")

                if config.get('proxy_working'):
                    print(f" ๐ŸŒ Proxy IP: {config.get('proxy_ip', 'Unknown')}")
                elif 'proxy_working' in config:
                    print(f" โš ๏ธ Proxy Issue: {config.get('proxy_error', 'Unknown')}")
            else:
                print(f" โŒ Error: {test_result.get('error', 'Unknown')}")

    elif operation in ['backlog', 'incremental', 'social-backlog', 'social-incremental']:
        operation_results = results.get('results', {})

        for competitor, result in operation_results.items():
            status = result.get('status', 'unknown')
            error_type = result.get('error_type', '')

            # Enhanced status icons and messages
            if status == 'success':
                icon = 'โœ…'
                message = result.get('message', 'Completed successfully')
                if 'limit_used' in result:
                    message += f" (limit: {result['limit_used']})"
            elif status == 'rate_limited':
                icon = 'โณ'
                message = f"Rate limited: {result.get('error', 'Unknown')}"
                if result.get('retry_recommended'):
                    message += " (retry recommended)"
            elif status == 'platform_error':
                icon = '๐Ÿ™…'
                message = f"Platform error ({error_type}): {result.get('error', 'Unknown')}"
            else:
                icon = 'โŒ'
                message = f"Error ({error_type}): {result.get('error', 'Unknown')}"

            print(f"{icon} {competitor}: {message}")

        if 'duration_seconds' in results:
            print(f"\nโฑ๏ธ Total Duration: {results['duration_seconds']:.2f} seconds")

        # Show scrapers involved for social media operations
        if operation.startswith('social-') and 'scrapers' in results:
            print(f"๐Ÿ“ฑ Scrapers: {', '.join(results['scrapers'])}")

    elif operation == 'analysis':
        sync_results = results.get('sync_results', {})
        print("๐Ÿ“ฅ Sync Results:")
        for competitor, result in sync_results.get('results', {}).items():
            status = result.get('status', 'unknown')
            icon = 'โœ…' if status == 'success' else 'โŒ'
            print(f" {icon} {competitor}: {result.get('message', result.get('error', 'Unknown'))}")

        analysis_results = results.get('analysis_results', {})
        print(f"\n๐Ÿ“Š Analysis: {analysis_results.get('status', 'Unknown')}")
        if 'message' in analysis_results:
            print(f" โ„น๏ธ {analysis_results['message']}")

    elif operation == 'status':
        for competitor, status_info in results.items():
            if 'error' in status_info:
                print(f"โŒ {competitor}: {status_info['error']}")
            else:
                print(f"\n{competitor.upper()} Status:")
                print(f" ๐Ÿ”ง Configured: {'โœ…' if status_info.get('scraper_configured') else 'โŒ'}")
                print(f" ๐ŸŒ Base URL: {status_info.get('base_url', 'Unknown')}")
                print(f" ๐Ÿ”’ Proxy: {'โœ…' if status_info.get('proxy_enabled') else 'โŒ'}")

                last_backlog = status_info.get('last_backlog_capture')
                last_sync = status_info.get('last_incremental_sync')
                total_items = status_info.get('total_items_captured', 0)

                print(f" ๐Ÿ“ฆ Last Backlog: {last_backlog or 'Never'}")
                print(f" ๐Ÿ”„ Last Sync: {last_sync or 'Never'}")
                print(f" ๐Ÿ“Š Total Items: {total_items}")

    elif operation == 'platform-analysis':
        platform = results.get('platform', 'unknown')
        print(f"๐Ÿ“Š {platform.title()} Analysis Results:")

        for scraper_name, result in results.get('results', {}).items():
            status = result.get('status', 'unknown')
            error_type = result.get('error_type', '')

            # Enhanced status handling
            if status == 'success':
                icon = 'โœ…'
            elif status == 'rate_limited':
                icon = 'โณ'
            elif status == 'platform_error':
                icon = '๐Ÿ™…'
            elif status == 'not_supported':
                icon = 'โ„น๏ธ'
            else:
                icon = 'โŒ'

            print(f"\n{icon} {scraper_name}:")

            if status == 'success' and 'analysis' in result:
                analysis = result['analysis']
                competitor_name = analysis.get('competitor_name', scraper_name)
                # NOTE(review): `or` skips a legitimate 0 video count and falls
                # through to posts — presumed intentional cross-platform fallback.
                total_items = analysis.get('total_recent_videos') or analysis.get('total_recent_posts', 0)

                print(f" ๐Ÿ“ˆ Competitor: {competitor_name}")
                print(f" ๐Ÿ“Š Recent Items: {total_items}")

                # Platform-specific details (counts may be absent -> 'Unknown')
                if platform == 'youtube':
                    if 'channel_metadata' in analysis:
                        metadata = analysis['channel_metadata']
                        print(f" ๐Ÿ‘ฅ Subscribers: {_fmt_count(metadata.get('subscriber_count', 'Unknown'))}")
                        print(f" ๐ŸŽฅ Total Videos: {_fmt_count(metadata.get('video_count', 'Unknown'))}")
                elif platform == 'instagram':
                    if 'profile_metadata' in analysis:
                        metadata = analysis['profile_metadata']
                        print(f" ๐Ÿ‘ฅ Followers: {_fmt_count(metadata.get('followers', 'Unknown'))}")
                        print(f" ๐Ÿ“ธ Total Posts: {_fmt_count(metadata.get('posts_count', 'Unknown'))}")

                # Publishing analysis
                if 'publishing_analysis' in analysis or 'posting_analysis' in analysis:
                    pub_analysis = analysis.get('publishing_analysis') or analysis.get('posting_analysis', {})
                    frequency = pub_analysis.get('average_frequency_per_day') or pub_analysis.get('average_posts_per_day', 0)
                    print(f" ๐Ÿ“… Posts per day: {frequency}")

            elif status in ['error', 'platform_error']:
                error_msg = result.get('error', 'Unknown')
                error_type = result.get('error_type', '')
                if error_type:
                    print(f" โŒ Error ({error_type}): {error_msg}")
                else:
                    print(f" โŒ Error: {error_msg}")

            elif status == 'rate_limited':
                print(f" โณ Rate limited: {result.get('error', 'Unknown')}")
                if result.get('retry_recommended'):
                    print(f" โ„น๏ธ Retry recommended")

            elif status == 'not_supported':
                print(f" โ„น๏ธ Analysis not supported")

    elif operation == 'list-competitors':
        print("๐Ÿ“ Available Competitors by Platform:")
        by_platform = results.get('by_platform', {})
        total = results.get('total_scrapers', 0)

        print(f"\nTotal Scrapers: {total}")

        for platform, competitors in by_platform.items():
            if competitors:
                platform_icon = '๐ŸŽฅ' if platform == 'youtube' else '๐Ÿ“ฑ' if platform == 'instagram' else '๐Ÿ’ป'
                print(f"\n{platform_icon} {platform.upper()}: ({len(competitors)} scrapers)")
                for competitor in competitors:
                    print(f" โ€ข {competitor}")
            else:
                print(f"\n{platform.upper()}: No scrapers available")

    elif operation == 'test-integration':
        print("๐Ÿงช Integration Test Results:")
        platforms_tested = results.get('platforms_tested', [])
        tests = results.get('tests', {})

        print(f"\nPlatforms tested: {', '.join(platforms_tested)}")

        for test_name, test_result in tests.items():
            # bool is checked before int (bool is an int subclass)
            if isinstance(test_result, bool):
                icon = 'โœ…' if test_result else 'โŒ'
                print(f"{icon} {test_name}: {'PASSED' if test_result else 'FAILED'}")
            elif isinstance(test_result, int):
                print(f"๐Ÿ“Š {test_name}: {test_result}")
            elif test_result == 'skipped_rate_limit':
                print(f"โณ {test_name}: Skipped (rate limit protection)")
            else:
                print(f"โ„น๏ธ {test_name}: {test_result}")


def determine_exit_code(operation: str, results: dict) -> int:
    """Determine appropriate exit code based on operation and results with enhanced error categorization.

    Returns:
        0 on success, 1 on hard failure, 2 when the only problem was rate
        limiting (soft failure, safe to retry later).
    """
    if operation == 'test':
        return 0 if results.get('overall_status') == 'operational' else 1

    elif operation in ['backlog', 'incremental', 'social-backlog', 'social-incremental']:
        operation_results = results.get('results', {})
        # Consider rate_limited as soft failure (exit code 2)
        critical_failed = any(r.get('status') in ['error', 'platform_error']
                              for r in operation_results.values())
        rate_limited = any(r.get('status') == 'rate_limited'
                           for r in operation_results.values())

        if critical_failed:
            return 1
        elif rate_limited:
            return 2  # Special exit code for rate limiting
        else:
            return 0

    elif operation == 'platform-analysis':
        platform_results = results.get('results', {})
        critical_failed = any(r.get('status') in ['error', 'platform_error']
                              for r in platform_results.values())
        rate_limited = any(r.get('status') == 'rate_limited'
                           for r in platform_results.values())

        if critical_failed:
            return 1
        elif rate_limited:
            return 2
        else:
            return 0

    elif operation == 'test-integration':
        tests = results.get('tests', {})
        failed_tests = [k for k, v in tests.items() if isinstance(v, bool) and not v]
        return 1 if failed_tests else 0

    elif operation == 'list-competitors':
        return 0  # This operation always succeeds

    elif operation == 'analysis':
        sync_results = results.get('sync_results', {}).get('results', {})
        sync_failed = any(r.get('status') not in ['success', 'rate_limited']
                          for r in sync_results.values())
        return 1 if sync_failed else 0

    elif operation == 'status':
        has_errors = any('error' in status for status in results.values())
        return 1 if has_errors else 0

    return 0


if __name__ == "__main__":
    main()