#!/usr/bin/env python3 """ Enhanced YouTube Competitive Intelligence Scraper Phase 2 implementation with centralized quota management, advanced analysis, and scalable architecture. Extends BaseCompetitiveScraper to scrape competitor YouTube channels with comprehensive competitive intelligence. Python Best Practices Applied: - Comprehensive type hints with Protocol and Generic types - Custom exception classes for specific error handling - Resource management with proper context managers - Thread-safe singleton pattern for quota management - Structured logging with contextual information - Input validation and data sanitization """ import os import time import json import logging import contextlib from typing import Any, Dict, List, Optional, Tuple, Union, cast from datetime import datetime, timedelta from pathlib import Path from collections import defaultdict from googleapiclient.discovery import build from googleapiclient.errors import HttpError import threading from .base_competitive_scraper import BaseCompetitiveScraper, CompetitiveConfig from .exceptions import ( YouTubeAPIError, YouTubeChannelNotFoundError, YouTubeVideoNotFoundError, QuotaExceededError, ConfigurationError, DataValidationError, handle_youtube_api_error ) from .types import ( YouTubeVideoItem, CompetitorAnalysis, QuotaState, PublishingAnalysis, ContentAnalysis, EngagementAnalysis, QualityMetrics, Platform, CompetitivePriority, QualityTier ) class YouTubeQuotaManager: """Centralized YouTube API quota management for all competitive scrapers.""" _instance = None _lock = threading.Lock() def __new__(cls): """Singleton pattern for centralized quota management.""" if cls._instance is None: with cls._lock: if cls._instance is None: cls._instance = super().__new__(cls) cls._instance._initialized = False return cls._instance def __init__(self): """Initialize quota manager.""" if getattr(self, '_initialized', False): return self.daily_quota_limit = int(os.getenv('YOUTUBE_COMPETITIVE_QUOTA_LIMIT', '8000')) self.quota_used = 0 self.quota_reset_time = None self.operation_costs = { 'channels_list': 1, 'playlist_items_list': 1, 'videos_list': 1, 'search_list': 100, 'comments_list': 1, 'channel_sections_list': 1 } self._quota_lock = threading.Lock() self._initialized = True # Load quota state from file if exists self._load_quota_state() def _get_quota_state_file(self) -> Path: """Get path to quota state file.""" data_dir = Path(os.getenv('COMPETITIVE_DATA_DIR', 'data')) state_dir = data_dir / '.state' / 'competitive' state_dir.mkdir(parents=True, exist_ok=True) return state_dir / 'youtube_quota_state.json' def _load_quota_state(self): """Load quota state from persistence file.""" try: quota_file = self._get_quota_state_file() if quota_file.exists(): with open(quota_file, 'r') as f: state = json.load(f) # Check if quota should be reset (new day) last_reset = state.get('quota_reset_time') if last_reset: last_reset_dt = datetime.fromisoformat(last_reset) now = datetime.now(last_reset_dt.tzinfo) # Reset quota if it's a new day (Pacific Time for YouTube quota) if now.date() > last_reset_dt.date(): self.quota_used = 0 self.quota_reset_time = now.isoformat() else: self.quota_used = state.get('quota_used', 0) self.quota_reset_time = last_reset else: self._reset_daily_quota() else: self._reset_daily_quota() except (OSError, json.JSONDecodeError, KeyError, ValueError) as e: # Use logging instead of print for better debugging logging.getLogger(__name__).warning(f"Failed to load YouTube quota state: {e}") self._reset_daily_quota() except Exception as e: logging.getLogger(__name__).error(f"Unexpected error loading quota state: {e}") self._reset_daily_quota() def _save_quota_state(self): """Save quota state to persistence file.""" try: quota_file = self._get_quota_state_file() state = { 'quota_used': self.quota_used, 'quota_reset_time': self.quota_reset_time, 'daily_limit': self.daily_quota_limit, 'last_updated': datetime.now().isoformat() } with open(quota_file, 'w') as f: json.dump(state, f, indent=2) except (OSError, json.JSONEncodeError) as e: logging.getLogger(__name__).warning(f"Failed to save YouTube quota state: {e}") except Exception as e: logging.getLogger(__name__).error(f"Unexpected error saving quota state: {e}") def _reset_daily_quota(self): """Reset daily quota tracking.""" import pytz pst = pytz.timezone('America/Los_Angeles') # YouTube quota resets in Pacific Time self.quota_reset_time = datetime.now(pst).isoformat() self.quota_used = 0 def check_and_reserve_quota(self, operation: str, count: int = 1) -> bool: """Check if quota is available and reserve it.""" with self._quota_lock: cost = self.operation_costs.get(operation, 1) * count if self.quota_used + cost > self.daily_quota_limit: return False self.quota_used += cost self._save_quota_state() return True def get_quota_status(self) -> Dict[str, Any]: """Get current quota usage status.""" return { 'quota_used': self.quota_used, 'quota_remaining': self.daily_quota_limit - self.quota_used, 'quota_limit': self.daily_quota_limit, 'quota_percentage': (self.quota_used / self.daily_quota_limit) * 100, 'quota_reset_time': self.quota_reset_time } def release_quota(self, operation: str, count: int = 1): """Release reserved quota (for failed operations).""" with self._quota_lock: cost = self.operation_costs.get(operation, 1) * count self.quota_used = max(0, self.quota_used - cost) self._save_quota_state() class YouTubeCompetitiveScraper(BaseCompetitiveScraper): """YouTube competitive intelligence scraper using YouTube Data API v3.""" # Enhanced competitor channel configurations with competitive intelligence metadata COMPETITOR_CHANNELS = { 'ac_service_tech': { 'handle': '@acservicetech', 'name': 'AC Service Tech', 'url': 'https://www.youtube.com/@acservicetech', 'category': 'educational_technical', 'content_focus': ['troubleshooting', 'repair_techniques', 'field_service'], 'target_audience': 'hvac_technicians', 'competitive_priority': 'high', 'analysis_focus': ['content_gaps', 'technical_depth', 'engagement_patterns'] }, 'refrigeration_mentor': { 'handle': '@RefrigerationMentor', 'name': 'Refrigeration Mentor', 'url': 'https://www.youtube.com/@RefrigerationMentor', 'category': 'educational_specialized', 'content_focus': ['refrigeration_systems', 'commercial_hvac', 'troubleshooting'], 'target_audience': 'refrigeration_specialists', 'competitive_priority': 'high', 'analysis_focus': ['niche_content', 'commercial_focus', 'technical_authority'] }, 'love2hvac': { 'handle': '@Love2HVAC', 'name': 'Love2HVAC', 'url': 'https://www.youtube.com/@Love2HVAC', 'category': 'educational_general', 'content_focus': ['basic_concepts', 'diy_guidance', 'system_explanations'], 'target_audience': 'homeowners_beginners', 'competitive_priority': 'medium', 'analysis_focus': ['accessibility', 'explanation_style', 'beginner_content'] }, 'hvac_tv': { 'handle': '@HVACTV', 'name': 'HVAC TV', 'url': 'https://www.youtube.com/@HVACTV', 'category': 'industry_news', 'content_focus': ['industry_trends', 'product_reviews', 'business_insights'], 'target_audience': 'hvac_professionals', 'competitive_priority': 'medium', 'analysis_focus': ['industry_coverage', 'product_insights', 'business_content'] } } def __init__(self, data_dir: Path, logs_dir: Path, competitor_key: str): """Initialize enhanced YouTube competitive scraper for specific competitor.""" if competitor_key not in self.COMPETITOR_CHANNELS: raise ConfigurationError( f"Unknown YouTube competitor: {competitor_key}", {'available_competitors': list(self.COMPETITOR_CHANNELS.keys())} ) competitor_info = self.COMPETITOR_CHANNELS[competitor_key] # Create competitive configuration with enhanced settings config = CompetitiveConfig( source_name=f"YouTube_{competitor_info['name'].replace(' ', '')}", brand_name="hkia", data_dir=data_dir, logs_dir=logs_dir, competitor_name=competitor_key, base_url=competitor_info['url'], timezone=os.getenv('TIMEZONE', 'America/Halifax'), use_proxy=False, # YouTube API doesn't require proxy request_delay=1.0, # Reduced for API calls backlog_limit=int(os.getenv('YOUTUBE_COMPETITIVE_BACKLOG_LIMIT', '200')) ) super().__init__(config) # Store competitor details with enhanced metadata self.competitor_key = competitor_key self.competitor_info = competitor_info self.channel_handle = competitor_info['handle'] self.competitive_category = competitor_info['category'] self.content_focus = competitor_info['content_focus'] self.target_audience = competitor_info['target_audience'] self.competitive_priority = competitor_info['competitive_priority'] self.analysis_focus = competitor_info['analysis_focus'] # YouTube API setup self.api_key = os.getenv('YOUTUBE_API_KEY') if not self.api_key: raise ConfigurationError( "YouTube API key not configured", {'env_var': 'YOUTUBE_API_KEY'} ) self.youtube = build('youtube', 'v3', developerKey=self.api_key) # Channel metadata storage self.channel_id = None self.uploads_playlist_id = None self.channel_metadata = {} # Centralized quota management self.quota_manager = YouTubeQuotaManager() # Enhanced state management for competitive intelligence self.competitive_state_cache = {} # Initialize channel info self._get_channel_info() # Log comprehensive initialization details self.logger.info(f"Enhanced YouTube competitive scraper initialized for {competitor_info['name']}") self.logger.info(f"Category: {self.competitive_category}, Priority: {self.competitive_priority}") self.logger.info(f"Content Focus: {', '.join(self.content_focus)}") self.logger.info(f"Analysis Focus: {', '.join(self.analysis_focus)}") # Log quota status quota_status = self.quota_manager.get_quota_status() self.logger.info(f"Shared API quota: {quota_status['quota_used']}/{quota_status['quota_limit']} ({quota_status['quota_percentage']:.1f}%)") def _track_quota(self, operation: str, count: int = 1) -> bool: """Track YouTube API quota usage via centralized manager.""" if self.quota_manager.check_and_reserve_quota(operation, count): quota_status = self.quota_manager.get_quota_status() self.logger.debug(f"Reserved quota for {operation}x{count}. Total: {quota_status['quota_used']}/{quota_status['quota_limit']} ({quota_status['quota_percentage']:.1f}%)") return True else: quota_status = self.quota_manager.get_quota_status() self.logger.warning(f"YouTube API quota limit would be exceeded for {operation}x{count}. Current: {quota_status['quota_used']}/{quota_status['quota_limit']}") return False def _release_quota_on_error(self, operation: str, count: int = 1): """Release quota allocation if operation fails.""" self.quota_manager.release_quota(operation, count) self.logger.debug(f"Released quota for failed {operation}x{count}") def get_quota_status(self) -> Dict[str, Any]: """Get current centralized quota status.""" return self.quota_manager.get_quota_status() def _get_channel_info(self) -> bool: """Get enhanced channel information and uploads playlist ID.""" if self.channel_id and self.uploads_playlist_id: return True try: handle = self.channel_handle.replace('@', '') if not self._track_quota('channels_list'): self.logger.warning(f"Cannot get channel info due to quota limit") return False try: # Use forHandle parameter for YouTube Data API v3 response = self.youtube.channels().list( part='snippet,statistics,contentDetails,brandingSettings', forHandle=handle ).execute() if response.get('items'): channel_data = response['items'][0] self.channel_id = channel_data['id'] self.uploads_playlist_id = channel_data['contentDetails']['relatedPlaylists']['uploads'] # Store enhanced channel metadata for competitive analysis snippet = channel_data['snippet'] stats = channel_data.get('statistics', {}) branding = channel_data.get('brandingSettings', {}) self.channel_metadata = { 'title': snippet['title'], 'description': snippet.get('description', '')[:1000] + ('...' if len(snippet.get('description', '')) > 1000 else ''), 'subscriber_count': int(stats.get('subscriberCount', 0)), 'video_count': int(stats.get('videoCount', 0)), 'view_count': int(stats.get('viewCount', 0)), 'published_at': snippet['publishedAt'], 'channel_id': self.channel_id, 'country': snippet.get('country'), 'default_language': snippet.get('defaultLanguage'), 'keywords': branding.get('channel', {}).get('keywords', ''), 'competitor_metadata': { 'competitive_category': self.competitive_category, 'content_focus': self.content_focus, 'target_audience': self.target_audience, 'competitive_priority': self.competitive_priority, 'analysis_focus': self.analysis_focus }, 'analysis_timestamp': datetime.now(self.tz).isoformat() } # Calculate competitive metrics subscriber_count = self.channel_metadata['subscriber_count'] video_count = self.channel_metadata['video_count'] if video_count > 0: avg_views_per_video = self.channel_metadata['view_count'] / video_count self.channel_metadata['avg_views_per_video'] = int(avg_views_per_video) self.logger.info(f"Enhanced channel data acquired: {self.channel_metadata['title']}") self.logger.info(f"Subscribers: {subscriber_count:,}, Videos: {video_count:,}") self.logger.info(f"Total Views: {self.channel_metadata['view_count']:,}") if 'avg_views_per_video' in self.channel_metadata: self.logger.info(f"Avg Views/Video: {self.channel_metadata['avg_views_per_video']:,}") return True else: self.logger.error(f"No channel found for handle {handle}") self._release_quota_on_error('channels_list') return False except HttpError as api_error: self.logger.error(f"YouTube API error getting channel info: {api_error}") self._release_quota_on_error('channels_list') handle_youtube_api_error(api_error, "getting channel info") return False except (ValueError, KeyError, TypeError) as e: self.logger.error(f"Data parsing error getting channel info: {e}") return False except Exception as e: self.logger.error(f"Unexpected error getting channel info: {e}") return False def discover_content_urls(self, limit: Optional[int] = None) -> List[Dict[str, Any]]: """Enhanced video discovery from competitor's YouTube channel with priority handling.""" if not self._get_channel_info(): self.logger.error("Cannot discover content without channel info") return [] # Adjust discovery based on competitive priority discovery_limit = limit or (150 if self.competitive_priority == 'high' else 100) videos = [] next_page_token = None operations_count = 0 try: self.logger.info(f"Starting enhanced content discovery for {self.competitor_info['name']} (limit: {discovery_limit})") while len(videos) < discovery_limit: if not self._track_quota('playlist_items_list'): self.logger.warning("Quota limit reached, stopping discovery early") break try: # Get videos from uploads playlist with enhanced data batch_size = min(50, discovery_limit - len(videos)) response = self.youtube.playlistItems().list( part='snippet,contentDetails,status', playlistId=self.uploads_playlist_id, maxResults=batch_size, pageToken=next_page_token ).execute() operations_count += 1 for item in response.get('items', []): video_id = item['contentDetails']['videoId'] snippet = item['snippet'] status = item.get('status', {}) # Skip private videos if status.get('privacyStatus') == 'private': continue # Parse publish date for competitive analysis try: published_dt = datetime.fromisoformat(snippet['publishedAt'].replace('Z', '+00:00')) days_since_publish = (datetime.now(published_dt.tzinfo) - published_dt).days except: days_since_publish = None video_data = { 'url': f"https://www.youtube.com/watch?v={video_id}", 'video_id': video_id, 'title': snippet['title'], 'published_at': snippet['publishedAt'], 'description': snippet['description'][:500] + ('...' if len(snippet['description']) > 500 else ''), 'thumbnail_url': snippet['thumbnails'].get('maxres', snippet['thumbnails'].get('high', {})).get('url', ''), 'channel_title': snippet['channelTitle'], 'position': snippet.get('position', 0), 'privacy_status': status.get('privacyStatus', 'public'), # Competitive analysis metadata 'days_since_publish': days_since_publish, 'competitor_key': self.competitor_key, 'competitive_priority': self.competitive_priority, 'content_focus_tags': self._analyze_title_for_focus(snippet['title']), 'discovery_timestamp': datetime.now(self.tz).isoformat() } videos.append(video_data) next_page_token = response.get('nextPageToken') if not next_page_token: self.logger.info(f"Reached end of playlist for {self.competitor_info['name']}") break # Rate limiting between API calls time.sleep(0.5) except HttpError as api_error: self.logger.error(f"YouTube API error in discovery batch {operations_count}: {api_error}") self._release_quota_on_error('playlist_items_list') try: handle_youtube_api_error(api_error, f"discovery batch {operations_count}") except QuotaExceededError: self.logger.warning("API quota exceeded, stopping discovery early") break except YouTubeAPIError: # Continue with next batch after API error continue except (ValueError, KeyError, TypeError) as e: self.logger.error(f"Data processing error in content discovery: {e}") except Exception as e: self.logger.error(f"Unexpected error in enhanced content discovery: {e}") # Log discovery results with competitive context self.logger.info(f"Enhanced discovery complete: {len(videos)} videos from {self.competitor_info['name']}") if videos: recent_videos = [v for v in videos if v.get('days_since_publish', 999) <= 30] self.logger.info(f"Recent content (30 days): {len(recent_videos)} videos") # Analyze content focus distribution focus_distribution = defaultdict(int) for video in videos: for tag in video.get('content_focus_tags', []): focus_distribution[tag] += 1 if focus_distribution: top_focuses = sorted(focus_distribution.items(), key=lambda x: x[1], reverse=True)[:3] self.logger.info(f"Top content focuses: {', '.join([f'{focus}({count})' for focus, count in top_focuses])}") return videos def _analyze_title_for_focus(self, title: str) -> List[str]: """Analyze video title to identify content focus areas.""" title_lower = title.lower() focus_tags = [] # Define focus keywords based on competitive analysis focus_keywords = { 'troubleshooting': ['troubleshoot', 'problem', 'fix', 'repair', 'diagnose', 'issue', 'error'], 'installation': ['install', 'setup', 'mount', 'connect', 'wiring'], 'maintenance': ['maintain', 'service', 'clean', 'replace', 'check'], 'hvac_systems': ['hvac', 'air conditioner', 'furnace', 'heat pump', 'ductwork'], 'refrigeration': ['refrigerat', 'cooling', 'condenser', 'evaporator', 'compressor'], 'commercial': ['commercial', 'industrial', 'building', 'facility'], 'residential': ['home', 'house', 'residential', 'homeowner'], 'training': ['training', 'learn', 'course', 'education', 'tutorial'], 'tools': ['tool', 'equipment', 'meter', 'gauge'], 'safety': ['safety', 'danger', 'hazard', 'protection'] } for focus, keywords in focus_keywords.items(): if any(keyword in title_lower for keyword in keywords): focus_tags.append(focus) # Add competitive-specific focus tags if any(word in title_lower for word in self.content_focus): for focus_area in self.content_focus: if focus_area not in focus_tags: focus_tags.append(focus_area) return focus_tags[:5] # Limit to top 5 focus areas def scrape_content_item(self, url: str) -> Optional[Dict[str, Any]]: """Enhanced video content scraping with competitive intelligence analysis.""" try: # Extract video ID from URL video_id = None if 'watch?v=' in url: video_id = url.split('watch?v=')[1].split('&')[0] elif 'youtu.be/' in url: video_id = url.split('youtu.be/')[1].split('?')[0] if not video_id: raise DataValidationError( "Invalid YouTube URL format", field="url", value=url ) if not self._track_quota('videos_list'): self.logger.warning("Quota limit reached, skipping video scraping") return None try: # Get comprehensive video details with enhanced parts response = self.youtube.videos().list( part='snippet,statistics,contentDetails,status,topicDetails', id=video_id ).execute() if not response.get('items'): self.logger.warning(f"No video data found for ID: {video_id}") self._release_quota_on_error('videos_list') raise YouTubeVideoNotFoundError(video_id) video_data = response['items'][0] snippet = video_data['snippet'] statistics = video_data.get('statistics', {}) content_details = video_data.get('contentDetails', {}) status = video_data.get('status', {}) topic_details = video_data.get('topicDetails', {}) # Parse and calculate enhanced metrics duration = content_details.get('duration', 'PT0S') duration_seconds = self._parse_duration(duration) # Enhanced date processing published_at = snippet['publishedAt'] try: published_date = datetime.fromisoformat(published_at.replace('Z', '+00:00')) formatted_date = published_date.strftime('%Y-%m-%d %H:%M:%S UTC') days_since_publish = (datetime.now(published_date.tzinfo) - published_date).days except: formatted_date = published_at days_since_publish = None # Calculate competitive engagement metrics view_count = int(statistics.get('viewCount', 0)) like_count = int(statistics.get('likeCount', 0)) comment_count = int(statistics.get('commentCount', 0)) engagement_rate = 0 if view_count > 0: engagement_rate = ((like_count + comment_count) / view_count) * 100 # Analyze competitive positioning content_focus_tags = self._analyze_title_for_focus(snippet['title']) description_focus = self._analyze_description_for_competitive_intel(snippet.get('description', '')) # Calculate content quality score quality_metrics = self._calculate_content_quality_score( title=snippet['title'], description=snippet.get('description', ''), duration_seconds=duration_seconds, tags=snippet.get('tags', []), view_count=view_count, engagement_rate=engagement_rate ) scraped_item = { 'id': video_id, 'url': url, 'title': snippet['title'], 'description': snippet['description'], 'author': snippet['channelTitle'], 'publish_date': formatted_date, 'duration': duration_seconds, 'view_count': view_count, 'like_count': like_count, 'comment_count': comment_count, 'engagement_rate': round(engagement_rate, 3), 'privacy_status': status.get('privacyStatus', 'public'), 'thumbnail_url': snippet['thumbnails'].get('maxres', snippet['thumbnails'].get('high', {})).get('url', ''), 'tags': snippet.get('tags', []), 'category_id': snippet.get('categoryId'), 'default_language': snippet.get('defaultLanguage'), 'topic_categories': topic_details.get('topicCategories', []), # Enhanced competitive intelligence metadata 'type': 'youtube_video', 'competitor': self.competitor_key, 'competitive_category': self.competitive_category, 'competitive_priority': self.competitive_priority, 'target_audience': self.target_audience, 'content_focus_tags': content_focus_tags, 'description_analysis': description_focus, 'quality_metrics': quality_metrics, 'days_since_publish': days_since_publish, 'capture_timestamp': datetime.now(self.tz).isoformat(), 'extraction_method': 'youtube_data_api_v3_enhanced', # Comprehensive social metrics for competitive analysis 'social_metrics': { 'views': view_count, 'likes': like_count, 'comments': comment_count, 'engagement_rate': engagement_rate, 'views_per_day': round(view_count / max(days_since_publish, 1), 2) if days_since_publish else 0, 'subscriber_engagement': self._estimate_subscriber_engagement(view_count) }, # Content analysis for competitive intelligence 'word_count': len(snippet['description'].split()), 'title_length': len(snippet['title']), 'tag_count': len(snippet.get('tags', [])), 'content_type': self._classify_content_type(snippet['title'], duration_seconds), # Formatted content for markdown output 'content': self._format_competitive_content(snippet, statistics, quality_metrics, content_focus_tags) } # Rate limiting with reduced delay for API calls time.sleep(0.5) return scraped_item except HttpError as api_error: self.logger.error(f"YouTube API error scraping video {url}: {api_error}") self._release_quota_on_error('videos_list') handle_youtube_api_error(api_error, f"scraping video {video_id}") return None except DataValidationError: # Re-raise validation errors raise except YouTubeVideoNotFoundError: # Re-raise not found errors raise except (ValueError, KeyError, TypeError) as e: self.logger.error(f"Data processing error scraping video {url}: {e}") return None except Exception as e: self.logger.error(f"Unexpected error scraping video {url}: {e}") return None def _parse_duration(self, duration_str: str) -> int: """Parse ISO 8601 duration to seconds.""" try: # Remove PT prefix duration_str = duration_str.replace('PT', '') total_seconds = 0 # Parse hours if 'H' in duration_str: hours, duration_str = duration_str.split('H') total_seconds += int(hours) * 3600 # Parse minutes if 'M' in duration_str: minutes, duration_str = duration_str.split('M') total_seconds += int(minutes) * 60 # Parse seconds if 'S' in duration_str: seconds = duration_str.replace('S', '') total_seconds += int(seconds) return total_seconds except: return 0 def _analyze_description_for_competitive_intel(self, description: str) -> Dict[str, Any]: """Analyze video description for competitive intelligence insights.""" if not description: return {} description_lower = description.lower() analysis = { 'length': len(description), 'word_count': len(description.split()), 'contains_links': 'http' in description_lower, 'contains_timestamps': ':' in description and any(char.isdigit() for char in description), 'contains_contact_info': any(term in description_lower for term in ['email', 'phone', 'contact', '@']), 'contains_cta': any(term in description_lower for term in ['subscribe', 'like', 'follow', 'visit', 'check out']), 'mentions_products': any(term in description_lower for term in ['product', 'equipment', 'tool', 'brand']), 'technical_depth': self._assess_technical_depth(description_lower), 'educational_indicators': self._count_educational_indicators(description_lower) } return analysis def _assess_technical_depth(self, text: str) -> str: """Assess the technical depth of content based on description.""" technical_terms = [ 'refrigerant', 'compressor', 'evaporator', 'condenser', 'superheat', 'subcooling', 'pressure', 'temperature', 'cfm', 'btu', 'tonnage', 'efficiency', 'seer', 'troubleshoot', 'diagnostic', 'multimeter', 'manifold', 'gauge' ] technical_count = sum(1 for term in technical_terms if term in text) if technical_count >= 5: return 'advanced' elif technical_count >= 2: return 'intermediate' else: return 'basic' def _count_educational_indicators(self, text: str) -> int: """Count educational indicators in content.""" educational_terms = [ 'learn', 'understand', 'explain', 'demonstrate', 'show', 'teach', 'step', 'guide', 'tutorial', 'tips', 'basics', 'fundamentals' ] return sum(1 for term in educational_terms if term in text) def _calculate_content_quality_score(self, title: str, description: str, duration_seconds: int, tags: List[str], view_count: int, engagement_rate: float) -> Dict[str, Any]: """Calculate comprehensive content quality score for competitive analysis.""" # Title quality (0-25 points) title_score = min(25, len(title) // 4) # Longer titles generally better for SEO if any(word in title.lower() for word in ['how to', 'guide', 'tips', 'tutorial']): title_score += 5 # Description quality (0-25 points) desc_words = len(description.split()) desc_score = min(25, desc_words // 10) # 250+ words = max score # Duration appropriateness (0-20 points) duration_score = 0 if 300 <= duration_seconds <= 1800: # 5-30 minutes is optimal duration_score = 20 elif 180 <= duration_seconds < 300 or 1800 < duration_seconds <= 3600: duration_score = 15 elif duration_seconds > 60: duration_score = 10 # Tag optimization (0-15 points) tag_score = min(15, len(tags) * 2) # Up to 7-8 tags is optimal # Engagement quality (0-15 points) engagement_score = min(15, engagement_rate * 3) # 5% engagement = max score total_score = title_score + desc_score + duration_score + tag_score + engagement_score return { 'total_score': round(total_score, 1), 'max_score': 100, 'percentage': round((total_score / 100) * 100, 1), 'breakdown': { 'title_score': title_score, 'description_score': desc_score, 'duration_score': duration_score, 'tag_score': tag_score, 'engagement_score': round(engagement_score, 1) }, 'quality_tier': self._get_quality_tier(total_score) } def _get_quality_tier(self, score: float) -> str: """Get quality tier based on total score.""" if score >= 80: return 'excellent' elif score >= 65: return 'good' elif score >= 50: return 'average' elif score >= 35: return 'below_average' else: return 'poor' def _estimate_subscriber_engagement(self, view_count: int) -> str: """Estimate subscriber engagement level based on view count relative to channel size.""" if not self.channel_metadata.get('subscriber_count'): return 'unknown' subscriber_count = self.channel_metadata['subscriber_count'] if subscriber_count == 0: return 'new_channel' engagement_ratio = view_count / subscriber_count if engagement_ratio >= 0.3: return 'excellent' elif engagement_ratio >= 0.15: return 'good' elif engagement_ratio >= 0.05: return 'average' else: return 'low' def _classify_content_type(self, title: str, duration_seconds: int) -> str: """Classify content type based on title and duration.""" title_lower = title.lower() # Quick content if duration_seconds < 180: return 'short_tip' # Tutorial indicators if any(word in title_lower for word in ['how to', 'tutorial', 'guide', 'step by step']): if duration_seconds > 600: return 'comprehensive_tutorial' else: return 'quick_tutorial' # Troubleshooting content if any(word in title_lower for word in ['troubleshoot', 'fix', 'repair', 'problem']): return 'troubleshooting' # Review content if any(word in title_lower for word in ['review', 'unbox', 'test']): return 'product_review' # Educational content if any(word in title_lower for word in ['explain', 'basics', 'fundamentals', 'learn']): return 'educational' # Default based on duration if duration_seconds > 1800: return 'long_form' else: return 'standard' def _format_competitive_content(self, snippet: Dict, statistics: Dict, quality_metrics: Dict, content_focus_tags: List[str]) -> str: """Format content with competitive intelligence focus.""" lines = [] lines.append("**Enhanced Video Analysis:**") lines.append("") lines.append(f"**Description:** {snippet['description'][:500]}{'...' if len(snippet['description']) > 500 else ''}") lines.append("") if snippet.get('tags'): lines.append(f"**Tags:** {', '.join(snippet['tags'][:10])}") lines.append("") lines.append("**Competitive Intelligence:**") lines.append(f"- Content Focus: {', '.join(content_focus_tags) if content_focus_tags else 'General'}") lines.append(f"- Quality Score: {quality_metrics['percentage']}% ({quality_metrics['quality_tier']})") lines.append(f"- Engagement Rate: {statistics.get('viewCount', 0) and statistics.get('likeCount', 0)} likes per {statistics.get('viewCount', 0)} views") lines.append("") return "\n".join(lines) def get_competitor_metadata(self) -> Dict[str, Any]: """Get enhanced metadata about the competitor channel.""" quota_status = self.quota_manager.get_quota_status() return { 'competitor_key': self.competitor_key, 'competitor_name': self.competitor_info['name'], 'channel_handle': self.channel_handle, 'channel_url': self.competitor_info['url'], 'channel_metadata': self.channel_metadata, 'competitive_profile': { 'category': self.competitive_category, 'content_focus': self.content_focus, 'target_audience': self.target_audience, 'competitive_priority': self.competitive_priority, 'analysis_focus': self.analysis_focus }, 'api_quota_status': quota_status, 'scraper_version': '2.0_enhanced', 'last_updated': datetime.now(self.tz).isoformat() } def run_competitor_analysis(self) -> Dict[str, Any]: """Run comprehensive competitive analysis with enhanced intelligence.""" self.logger.info(f"Running enhanced YouTube competitor analysis for {self.competitor_info['name']}") try: # Get comprehensive video sample for analysis analysis_limit = 50 if self.competitive_priority == 'high' else 30 recent_videos = self.discover_content_urls(analysis_limit) if not recent_videos: return {'error': 'No recent videos found', 'competitor': self.competitor_key} self.logger.info(f"Analyzing {len(recent_videos)} videos for competitive intelligence") # Comprehensive competitive analysis analysis = { 'competitor': self.competitor_key, 'competitor_name': self.competitor_info['name'], 'competitive_profile': { 'category': self.competitive_category, 'content_focus': self.content_focus, 'target_audience': self.target_audience, 'competitive_priority': self.competitive_priority, 'analysis_focus': self.analysis_focus }, 'sample_size': len(recent_videos), 'channel_metadata': self.channel_metadata, 'publishing_analysis': self._analyze_publishing_patterns(recent_videos), 'content_analysis': self._analyze_enhanced_content_themes(recent_videos), 'engagement_analysis': self._analyze_engagement_patterns(recent_videos), 'competitive_positioning': self._analyze_competitive_positioning(recent_videos), 'content_gaps': self._identify_potential_content_gaps(recent_videos), 'api_quota_status': self.quota_manager.get_quota_status(), 'analysis_timestamp': datetime.now(self.tz).isoformat() } # Log key insights self._log_competitive_insights(analysis) return analysis except Exception as e: self.logger.error(f"Error in enhanced competitor analysis: {e}") return {'error': str(e), 'competitor': self.competitor_key} def _analyze_publishing_patterns(self, videos: List[Dict[str, Any]]) -> Dict[str, Any]: """Analyze publishing frequency and timing patterns.""" try: if not videos: return {} # Parse publication dates pub_dates = [] for video in videos: try: pub_date = datetime.fromisoformat(video['published_at'].replace('Z', '+00:00')) pub_dates.append(pub_date) except: continue if not pub_dates: return {} # Calculate publishing frequency pub_dates.sort() if len(pub_dates) > 1: date_range = (pub_dates[-1] - pub_dates[0]).days frequency = len(pub_dates) / max(date_range, 1) if date_range > 0 else 0 else: frequency = 0 # Analyze publishing days and times weekdays = [d.weekday() for d in pub_dates] # 0=Monday, 6=Sunday hours = [d.hour for d in pub_dates] return { 'total_videos_analyzed': len(pub_dates), 'date_range_days': date_range if len(pub_dates) > 1 else 0, 'average_frequency_per_day': round(frequency, 2), 'most_common_weekday': max(set(weekdays), key=weekdays.count) if weekdays else None, 'most_common_hour': max(set(hours), key=hours.count) if hours else None, 'latest_video_date': pub_dates[-1].isoformat() if pub_dates else None } except Exception as e: self.logger.error(f"Error analyzing publishing patterns: {e}") return {} def _analyze_enhanced_content_themes(self, videos: List[Dict[str, Any]]) -> Dict[str, Any]: """Enhanced content theme analysis with competitive intelligence.""" try: if not videos: return {} # Collect comprehensive text analysis all_text = [] title_words = [] content_focus_distribution = defaultdict(int) content_types = defaultdict(int) for video in videos: title = video.get('title', '').lower() description = video.get('description', '').lower() all_text.append(title + ' ' + description) title_words.extend(title.split()) # Track content focus tags for tag in video.get('content_focus_tags', []): content_focus_distribution[tag] += 1 # Track content types (would be calculated in scraping) content_type = self._classify_content_type(video.get('title', ''), 600) # Default duration content_types[content_type] += 1 # Enhanced keyword analysis word_freq = {} for word in title_words: # Filter out common words but include HVAC-specific terms if (len(word) > 3 and word not in ['hvac', 'with', 'this', 'that', 'from', 'your', 'they', 'have', 'been', 'will'] and not word.isdigit()): word_freq[word] = word_freq.get(word, 0) + 1 # Get top keywords and focus areas top_keywords = sorted(word_freq.items(), key=lambda x: x[1], reverse=True)[:15] top_content_focuses = sorted(content_focus_distribution.items(), key=lambda x: x[1], reverse=True)[:10] top_content_types = sorted(content_types.items(), key=lambda x: x[1], reverse=True) return { 'total_videos_analyzed': len(videos), 'top_title_keywords': [{'keyword': k, 'frequency': f, 'percentage': round((f/len(videos))*100, 1)} for k, f in top_keywords], 'content_focus_distribution': [{'focus': f, 'count': c, 'percentage': round((c/len(videos))*100, 1)} for f, c in top_content_focuses], 'content_type_distribution': [{'type': t, 'count': c, 'percentage': round((c/len(videos))*100, 1)} for t, c in top_content_types], 'average_title_length': round(sum(len(v.get('title', '')) for v in videos) / len(videos), 1) if videos else 0, 'videos_with_descriptions': sum(1 for v in videos if v.get('description', '').strip()), 'content_diversity_score': len(content_focus_distribution), # Number of different focus areas 'primary_content_focus': top_content_focuses[0][0] if top_content_focuses else 'general', 'content_strategy_insights': self._analyze_content_strategy(top_content_focuses, top_content_types) } except (ValueError, KeyError, TypeError, ZeroDivisionError) as e: self.logger.error(f"Data processing error analyzing content themes: {e}") return {} except Exception as e: self.logger.error(f"Unexpected error analyzing enhanced content themes: {e}") return {} def _analyze_content_strategy(self, content_focuses: List[Tuple], content_types: List[Tuple]) -> Dict[str, str]: """Analyze content strategy based on focus and type distributions.""" insights = {} if content_focuses: primary_focus = content_focuses[0][0] focus_concentration = content_focuses[0][1] / sum(count for _, count in content_focuses) if focus_concentration > 0.5: insights['focus_strategy'] = f"Highly specialized in {primary_focus} ({focus_concentration*100:.1f}% of content)" elif focus_concentration > 0.3: insights['focus_strategy'] = f"Primarily focused on {primary_focus} with some diversification" else: insights['focus_strategy'] = "Diversified content strategy across multiple focus areas" if content_types: primary_type = content_types[0][0] type_concentration = content_types[0][1] / sum(count for _, count in content_types) if type_concentration > 0.6: insights['content_type_strategy'] = f"Heavily focused on {primary_type} content" else: insights['content_type_strategy'] = "Mixed content type strategy" return insights def _analyze_engagement_patterns(self, videos: List[Dict[str, Any]]) -> Dict[str, Any]: """Analyze engagement patterns for competitive intelligence.""" try: if not videos: return {} # Note: This analysis would be more complete with actual engagement data # For now, we'll analyze what we have from the discovery phase recent_videos = [v for v in videos if v.get('days_since_publish', 999) <= 30] older_videos = [v for v in videos if v.get('days_since_publish', 0) > 30] content_focus_engagement = defaultdict(list) for video in videos: for focus in video.get('content_focus_tags', []): content_focus_engagement[focus].append(video) # Calculate average engagement by content focus focus_performance = {} for focus, focus_videos in content_focus_engagement.items(): if len(focus_videos) >= 3: # Only analyze focuses with sufficient data avg_days_old = sum(v.get('days_since_publish', 0) for v in focus_videos) / len(focus_videos) focus_performance[focus] = { 'video_count': len(focus_videos), 'avg_days_since_publish': round(avg_days_old, 1), 'sample_titles': [v.get('title', '')[:50] for v in focus_videos[:3]] } return { 'total_videos_analyzed': len(videos), 'recent_videos_30d': len(recent_videos), 'older_videos': len(older_videos), 'content_focus_performance': focus_performance, 'publishing_consistency': { 'recent_publishing_rate': len(recent_videos) / 30 if recent_videos else 0, 'content_freshness_score': len(recent_videos) / len(videos) if videos else 0 }, 'engagement_insights': self._generate_engagement_insights(recent_videos, content_focus_engagement) } except (ValueError, KeyError, TypeError, ZeroDivisionError) as e: self.logger.error(f"Data processing error analyzing engagement patterns: {e}") return {} except Exception as e: self.logger.error(f"Unexpected error analyzing engagement patterns: {e}") return {} def _generate_engagement_insights(self, recent_videos: List, content_focus_engagement: Dict) -> Dict[str, str]: """Generate insights about engagement patterns.""" insights = {} if recent_videos: recent_rate = len(recent_videos) / 30 if recent_rate >= 1: insights['publishing_frequency'] = f"High activity: ~{recent_rate:.1f} videos per day" elif recent_rate >= 0.2: insights['publishing_frequency'] = f"Regular activity: ~{recent_rate*7:.1f} videos per week" else: insights['publishing_frequency'] = "Infrequent publishing pattern" # Analyze content focus diversity active_focuses = len([f for f, videos in content_focus_engagement.items() if len(videos) >= 2]) if active_focuses >= 5: insights['content_diversity'] = "High content diversity across multiple focus areas" elif active_focuses >= 3: insights['content_diversity'] = "Moderate content diversity" else: insights['content_diversity'] = "Narrow content focus" return insights def _validate_video_data(self, video_data: Dict[str, Any]) -> bool: """Validate video data structure for required fields.""" required_fields = ['id', 'snippet'] return all(field in video_data for field in required_fields) def _sanitize_text_content(self, text: str, max_length: int = 1000) -> str: """Sanitize and truncate text content.""" if not isinstance(text, str): return "" # Remove control characters and excessive whitespace sanitized = ' '.join(text.split()) # Truncate if necessary if len(sanitized) > max_length: sanitized = sanitized[:max_length] + "..." return sanitized @contextlib.contextmanager def _quota_context(self, operation: str, count: int = 1): """Context manager for quota operations with automatic cleanup.""" reserved = False try: if not self._track_quota(operation, count): raise QuotaExceededError( f"Cannot reserve quota for {operation}", quota_used=self.quota_manager.quota_used, quota_limit=self.quota_manager.daily_quota_limit ) reserved = True yield except Exception: if reserved: self._release_quota_on_error(operation, count) raise def cleanup_resources(self) -> None: """Cleanup resources and connections.""" try: # Close any open connections if hasattr(self, 'session') and self.session: self.session.close() # Clear caches self.content_cache.clear() self.competitive_state_cache.clear() self.logger.info(f"Cleaned up YouTube scraper resources for {self.competitor_key}") except Exception as e: self.logger.warning(f"Error during resource cleanup: {e}") def __enter__(self): """Context manager entry.""" return self def __exit__(self, exc_type, exc_val, exc_tb): """Context manager exit with resource cleanup.""" self.cleanup_resources() def _analyze_competitive_positioning(self, videos: List[Dict[str, Any]]) -> Dict[str, Any]: """Analyze competitive positioning relative to HVAC Know It All.""" try: # Analyze content positioning positioning = { 'content_overlap': self._calculate_content_overlap(videos), 'differentiation_factors': self._identify_differentiation_factors(videos), 'competitive_advantages': self._identify_competitive_advantages(videos), 'potential_threats': self._identify_potential_threats(videos), 'market_positioning': self._assess_market_positioning() } return positioning except (ValueError, KeyError, TypeError, ZeroDivisionError) as e: self.logger.error(f"Data processing error analyzing competitive positioning: {e}") return {} except Exception as e: self.logger.error(f"Unexpected error analyzing competitive positioning: {e}") return {} def _calculate_content_overlap(self, videos: List[Dict[str, Any]]) -> Dict[str, Any]: """Calculate content overlap with HVAC Know It All focus areas.""" hkia_focus_areas = ['troubleshooting', 'hvac_systems', 'maintenance', 'training', 'tools'] overlap_count = defaultdict(int) total_videos = len(videos) for video in videos: video_focuses = video.get('content_focus_tags', []) for focus in video_focuses: if focus in hkia_focus_areas: overlap_count[focus] += 1 overlap_percentage = sum(overlap_count.values()) / total_videos * 100 if total_videos > 0 else 0 return { 'total_overlap_percentage': round(overlap_percentage, 1), 'overlapping_focus_areas': dict(overlap_count), 'direct_competition_level': 'high' if overlap_percentage > 60 else 'medium' if overlap_percentage > 30 else 'low' } def _identify_differentiation_factors(self, videos: List[Dict[str, Any]]) -> List[str]: """Identify key differentiation factors.""" factors = [] # Analyze content focuses that might be different all_focuses = [] for video in videos: all_focuses.extend(video.get('content_focus_tags', [])) focus_dist = defaultdict(int) for focus in all_focuses: focus_dist[focus] += 1 # Look for unique or heavily emphasized areas total_focus_instances = sum(focus_dist.values()) for focus, count in focus_dist.items(): percentage = (count / total_focus_instances) * 100 if percentage > 25: # Major focus area if focus in ['commercial', 'refrigeration', 'safety']: factors.append(f"Strong emphasis on {focus} content ({percentage:.1f}%)") elif focus == 'training': factors.append(f"Heavy focus on training/educational content ({percentage:.1f}%)") # Analyze content types if self.competitive_category == 'educational_specialized': factors.append("Specialized educational approach") elif self.competitive_category == 'industry_news': factors.append("Industry news and business insight focus") return factors def _identify_competitive_advantages(self, videos: List[Dict[str, Any]]) -> List[str]: """Identify potential competitive advantages.""" advantages = [] # Channel size advantage if self.channel_metadata.get('subscriber_count', 0) > 50000: advantages.append(f"Large subscriber base ({self.channel_metadata['subscriber_count']:,} subscribers)") # Publishing frequency recent_videos = [v for v in videos if v.get('days_since_publish', 999) <= 30] if len(recent_videos) > 20: advantages.append("High publishing frequency") # Specialization advantage if self.competitive_priority == 'high': advantages.append("High competitive priority in HVAC space") return advantages def _identify_potential_threats(self, videos: List[Dict[str, Any]]) -> List[str]: """Identify potential competitive threats.""" threats = [] # Content quality threats high_quality_videos = sum(1 for v in videos if v.get('content_focus_tags') and len(v['content_focus_tags']) >= 3) if high_quality_videos / len(videos) > 0.7: threats.append("High proportion of well-categorized, focused content") # Rapid content production recent_videos = [v for v in videos if v.get('days_since_publish', 999) <= 7] if len(recent_videos) > 5: threats.append("Very active recent publishing (potential to outpace HKIA)") # Specialization threat if self.target_audience in ['hvac_technicians', 'refrigeration_specialists']: threats.append(f"Direct targeting of {self.target_audience}") return threats def _assess_market_positioning(self) -> Dict[str, str]: """Assess overall market positioning.""" positioning = { 'market_segment': self.target_audience, 'content_strategy': self.competitive_category, 'competitive_stance': self.competitive_priority } if self.competitive_priority == 'high': positioning['threat_level'] = 'Direct competitor - monitor closely' else: positioning['threat_level'] = 'Secondary competitor - periodic monitoring' return positioning def _identify_potential_content_gaps(self, videos: List[Dict[str, Any]]) -> Dict[str, Any]: """Identify potential content gaps that HVAC Know It All could exploit.""" try: # Analyze what content areas are underrepresented all_focuses = [] for video in videos: all_focuses.extend(video.get('content_focus_tags', [])) focus_dist = defaultdict(int) for focus in all_focuses: focus_dist[focus] += 1 # Define comprehensive HVAC content areas comprehensive_areas = [ 'troubleshooting', 'installation', 'maintenance', 'hvac_systems', 'refrigeration', 'commercial', 'residential', 'training', 'tools', 'safety' ] gaps = [] underrepresented = [] total_content = len(videos) for area in comprehensive_areas: area_count = focus_dist.get(area, 0) area_percentage = (area_count / total_content) * 100 if total_content > 0 else 0 if area_count == 0: gaps.append(area) elif area_percentage < 10: # Less than 10% of content underrepresented.append({'area': area, 'percentage': round(area_percentage, 1)}) return { 'complete_gaps': gaps, 'underrepresented_areas': underrepresented, 'opportunity_score': len(gaps) + len(underrepresented), 'hkia_opportunities': self._suggest_hkia_opportunities(gaps, underrepresented) } except (ValueError, KeyError, TypeError) as e: self.logger.error(f"Data processing error identifying content gaps: {e}") return {} except Exception as e: self.logger.error(f"Unexpected error identifying content gaps: {e}") return {} def _suggest_hkia_opportunities(self, gaps: List[str], underrepresented: List[Dict]) -> List[str]: """Suggest opportunities for HVAC Know It All based on competitor gaps.""" opportunities = [] high_value_areas = ['troubleshooting', 'training', 'hvac_systems', 'tools'] for gap in gaps: if gap in high_value_areas: opportunities.append(f"Exploit complete gap in {gap} content") for under in underrepresented: if under['area'] in high_value_areas and under['percentage'] < 5: opportunities.append(f"Dominate underrepresented {under['area']} space ({under['percentage']}% of competitor content)") # Specific opportunities based on competitor type if self.competitive_category == 'educational_specialized' and 'residential' in gaps: opportunities.append("Target residential market gap with beginner-friendly content") if self.competitive_category == 'industry_news' and 'hands_on' in gaps: opportunities.append("Focus on practical, hands-on content to differentiate") return opportunities def _log_competitive_insights(self, analysis: Dict[str, Any]): """Log key competitive insights for monitoring.""" try: insights = [] # Publishing insights if 'publishing_analysis' in analysis: pub_freq = analysis['publishing_analysis'].get('average_frequency_per_day', 0) if pub_freq > 0.5: insights.append(f"High publishing frequency: {pub_freq:.1f} videos/day") # Content focus insights if 'content_analysis' in analysis: primary_focus = analysis['content_analysis'].get('primary_content_focus') if primary_focus: insights.append(f"Primary focus: {primary_focus}") # Competitive positioning if 'competitive_positioning' in analysis: overlap = analysis['competitive_positioning'].get('content_overlap', {}).get('total_overlap_percentage', 0) if overlap > 50: insights.append(f"High content overlap: {overlap}% direct competition") # Content gaps if 'content_gaps' in analysis: opportunity_score = analysis['content_gaps'].get('opportunity_score', 0) if opportunity_score > 5: insights.append(f"High opportunity score: {opportunity_score} content gap areas identified") # Log insights if insights: self.logger.info(f"Key competitive insights for {self.competitor_info['name']}:") for insight in insights: self.logger.info(f" • {insight}") except (ValueError, KeyError, TypeError) as e: self.logger.error(f"Data access error logging competitive insights: {e}") except Exception as e: self.logger.error(f"Unexpected error logging competitive insights: {e}") def _analyze_content_themes(self, videos: List[Dict[str, Any]]) -> Dict[str, Any]: """Legacy content theme analysis method - kept for compatibility.""" # Delegate to enhanced method return self._analyze_enhanced_content_themes(videos) def create_youtube_competitive_scrapers(data_dir: Path, logs_dir: Path) -> Dict[str, YouTubeCompetitiveScraper]: """Enhanced factory function to create all YouTube competitive scrapers with comprehensive error handling.""" import logging logger = logging.getLogger(__name__) scrapers = {} # Initialize centralized quota manager first try: quota_manager = YouTubeQuotaManager() quota_status = quota_manager.get_quota_status() logger.info(f"Initialized YouTube quota manager. Status: {quota_status['quota_used']}/{quota_status['quota_limit']} ({quota_status['quota_percentage']:.1f}%)") except Exception as e: logger.error(f"Failed to initialize YouTube quota manager: {e}") return {} # Create scrapers for each competitor successful_scrapers = [] failed_scrapers = [] for competitor_key in YouTubeCompetitiveScraper.COMPETITOR_CHANNELS: competitor_info = YouTubeCompetitiveScraper.COMPETITOR_CHANNELS[competitor_key] try: logger.info(f"Creating YouTube competitive scraper for {competitor_info['name']}...") scraper = YouTubeCompetitiveScraper(data_dir, logs_dir, competitor_key) scraper_key = f"youtube_{competitor_key}" scrapers[scraper_key] = scraper successful_scrapers.append({ 'key': scraper_key, 'name': competitor_info['name'], 'priority': competitor_info['competitive_priority'], 'category': competitor_info['category'] }) logger.info(f"✓ Successfully created YouTube scraper for {competitor_info['name']}") except Exception as e: error_msg = f"Failed to create YouTube scraper for {competitor_key} ({competitor_info.get('name', 'Unknown')}): {e}" logger.error(error_msg) failed_scrapers.append({ 'key': competitor_key, 'name': competitor_info.get('name', 'Unknown'), 'error': str(e) }) # Log comprehensive initialization results logger.info(f"YouTube competitive scrapers initialization complete:") logger.info(f" ✓ Successfully created: {len(successful_scrapers)} scrapers") if successful_scrapers: for scraper in successful_scrapers: logger.info(f" - {scraper['name']} ({scraper['priority']} priority, {scraper['category']})") if failed_scrapers: logger.warning(f" ✗ Failed to create: {len(failed_scrapers)} scrapers") for failed in failed_scrapers: logger.warning(f" - {failed['name']}: {failed['error']}") # Log quota status after initialization try: final_quota_status = quota_manager.get_quota_status() logger.info(f"Final quota status: {final_quota_status['quota_used']}/{final_quota_status['quota_limit']} ({final_quota_status['quota_percentage']:.1f}%)") except Exception as e: logger.warning(f"Could not get final quota status: {e}") return scrapers def create_single_youtube_competitive_scraper(data_dir: Path, logs_dir: Path, competitor_key: str) -> Optional[YouTubeCompetitiveScraper]: """Create a single YouTube competitive scraper for testing or selective use.""" import logging logger = logging.getLogger(__name__) if competitor_key not in YouTubeCompetitiveScraper.COMPETITOR_CHANNELS: logger.error(f"Unknown competitor key: {competitor_key}. Available: {list(YouTubeCompetitiveScraper.COMPETITOR_CHANNELS.keys())}") return None try: competitor_info = YouTubeCompetitiveScraper.COMPETITOR_CHANNELS[competitor_key] logger.info(f"Creating single YouTube competitive scraper for {competitor_info['name']}...") scraper = YouTubeCompetitiveScraper(data_dir, logs_dir, competitor_key) logger.info(f"✓ Successfully created YouTube competitive scraper for {competitor_info['name']}") logger.info(f" Priority: {competitor_info['competitive_priority']}, Category: {competitor_info['category']}") return scraper except ConfigurationError as e: logger.error(f"Configuration error creating YouTube scraper for {competitor_key}: {e}") return None except Exception as e: logger.error(f"Unexpected error creating YouTube competitive scraper for {competitor_key}: {e}") return None