hvac-kia-content/src/competitive_intelligence/youtube_competitive_scraper.py

#!/usr/bin/env python3
"""
Enhanced YouTube Competitive Intelligence Scraper
Phase 2 implementation with centralized quota management, advanced analysis, and scalable architecture.
Extends BaseCompetitiveScraper to scrape competitor YouTube channels with comprehensive competitive intelligence.

Python Best Practices Applied:
- Comprehensive type hints with Protocol and Generic types
- Custom exception classes for specific error handling
- Resource management with proper context managers
- Thread-safe singleton pattern for quota management
- Structured logging with contextual information
- Input validation and data sanitization
"""

import os
import time
import json
import logging
import contextlib
from typing import Any, Dict, List, Optional, Tuple, Union, cast
from datetime import datetime, timedelta
from pathlib import Path
from collections import defaultdict
from googleapiclient.discovery import build
from googleapiclient.errors import HttpError
import threading

from .base_competitive_scraper import BaseCompetitiveScraper, CompetitiveConfig
from .exceptions import (
    YouTubeAPIError, YouTubeChannelNotFoundError, YouTubeVideoNotFoundError,
    QuotaExceededError, ConfigurationError, DataValidationError,
    handle_youtube_api_error
)
from .types import (
    YouTubeVideoItem, CompetitorAnalysis, QuotaState, PublishingAnalysis,
    ContentAnalysis, EngagementAnalysis, QualityMetrics, Platform,
    CompetitivePriority, QualityTier
)


class YouTubeQuotaManager:
    """Centralized YouTube API quota management for all competitive scrapers."""

    _instance = None
    _lock = threading.Lock()

    def __new__(cls):
        """Singleton pattern for centralized quota management."""
        if cls._instance is None:
            with cls._lock:
                if cls._instance is None:
                    cls._instance = super().__new__(cls)
                    cls._instance._initialized = False
        return cls._instance

    def __init__(self):
        """Initialize quota manager."""
        if getattr(self, '_initialized', False):
            return

        self.daily_quota_limit = int(os.getenv('YOUTUBE_COMPETITIVE_QUOTA_LIMIT', '8000'))
        self.quota_used = 0
        self.quota_reset_time = None
        self.operation_costs = {
            'channels_list': 1,
            'playlist_items_list': 1,
            'videos_list': 1,
            'search_list': 100,
            'comments_list': 1,
            'channel_sections_list': 1
        }
        self._quota_lock = threading.Lock()
        self._initialized = True

        # Load quota state from file if exists
        self._load_quota_state()

    def _get_quota_state_file(self) -> Path:
        """Get path to quota state file."""
        data_dir = Path(os.getenv('COMPETITIVE_DATA_DIR', 'data'))
        state_dir = data_dir / '.state' / 'competitive'
        state_dir.mkdir(parents=True, exist_ok=True)
        return state_dir / 'youtube_quota_state.json'

    def _load_quota_state(self):
        """Load quota state from persistence file."""
        try:
            quota_file = self._get_quota_state_file()
            if quota_file.exists():
                with open(quota_file, 'r') as f:
                    state = json.load(f)

                # Check if quota should be reset (new day)
                last_reset = state.get('quota_reset_time')
                if last_reset:
                    last_reset_dt = datetime.fromisoformat(last_reset)
                    now = datetime.now(last_reset_dt.tzinfo)

                    # Reset quota if it's a new day (Pacific Time for YouTube quota)
                    if now.date() > last_reset_dt.date():
                        self.quota_used = 0
                        self.quota_reset_time = now.isoformat()
                    else:
                        self.quota_used = state.get('quota_used', 0)
                        self.quota_reset_time = last_reset
                else:
                    self._reset_daily_quota()
            else:
                self._reset_daily_quota()

        except (OSError, json.JSONDecodeError, KeyError, ValueError) as e:
            # Use logging instead of print for better debugging
            logging.getLogger(__name__).warning(f"Failed to load YouTube quota state: {e}")
            self._reset_daily_quota()
        except Exception as e:
            logging.getLogger(__name__).error(f"Unexpected error loading quota state: {e}")
            self._reset_daily_quota()

    def _save_quota_state(self):
        """Save quota state to persistence file."""
        try:
            quota_file = self._get_quota_state_file()
            state = {
                'quota_used': self.quota_used,
                'quota_reset_time': self.quota_reset_time,
                'daily_limit': self.daily_quota_limit,
                'last_updated': datetime.now().isoformat()
            }

            with open(quota_file, 'w') as f:
                json.dump(state, f, indent=2)
        except (OSError, json.JSONEncodeError) as e:
            logging.getLogger(__name__).warning(f"Failed to save YouTube quota state: {e}")
        except Exception as e:
            logging.getLogger(__name__).error(f"Unexpected error saving quota state: {e}")

    def _reset_daily_quota(self):
        """Reset daily quota tracking."""
        import pytz
        pst = pytz.timezone('America/Los_Angeles')  # YouTube quota resets in Pacific Time
        self.quota_reset_time = datetime.now(pst).isoformat()
        self.quota_used = 0

    def check_and_reserve_quota(self, operation: str, count: int = 1) -> bool:
        """Check if quota is available and reserve it."""
        with self._quota_lock:
            cost = self.operation_costs.get(operation, 1) * count

            if self.quota_used + cost > self.daily_quota_limit:
                return False

            self.quota_used += cost
            self._save_quota_state()
            return True

    def get_quota_status(self) -> Dict[str, Any]:
        """Get current quota usage status."""
        return {
            'quota_used': self.quota_used,
            'quota_remaining': self.daily_quota_limit - self.quota_used,
            'quota_limit': self.daily_quota_limit,
            'quota_percentage': (self.quota_used / self.daily_quota_limit) * 100,
            'quota_reset_time': self.quota_reset_time
        }

    def release_quota(self, operation: str, count: int = 1):
        """Release reserved quota (for failed operations)."""
        with self._quota_lock:
            cost = self.operation_costs.get(operation, 1) * count
            self.quota_used = max(0, self.quota_used - cost)
            self._save_quota_state()


class YouTubeCompetitiveScraper(BaseCompetitiveScraper):
    """YouTube competitive intelligence scraper using YouTube Data API v3."""

    # Enhanced competitor channel configurations with competitive intelligence metadata
    COMPETITOR_CHANNELS = {
        'ac_service_tech': {
            'handle': '@acservicetech',
            'name': 'AC Service Tech',
            'url': 'https://www.youtube.com/@acservicetech',
            'category': 'educational_technical',
            'content_focus': ['troubleshooting', 'repair_techniques', 'field_service'],
            'target_audience': 'hvac_technicians',
            'competitive_priority': 'high',
            'analysis_focus': ['content_gaps', 'technical_depth', 'engagement_patterns']
        },
        'refrigeration_mentor': {
            'handle': '@RefrigerationMentor',
            'name': 'Refrigeration Mentor',
            'url': 'https://www.youtube.com/@RefrigerationMentor',
            'category': 'educational_specialized',
            'content_focus': ['refrigeration_systems', 'commercial_hvac', 'troubleshooting'],
            'target_audience': 'refrigeration_specialists',
            'competitive_priority': 'high',
            'analysis_focus': ['niche_content', 'commercial_focus', 'technical_authority']
        },
        'love2hvac': {
            'handle': '@Love2HVAC',
            'name': 'Love2HVAC',
            'url': 'https://www.youtube.com/@Love2HVAC',
            'category': 'educational_general',
            'content_focus': ['basic_concepts', 'diy_guidance', 'system_explanations'],
            'target_audience': 'homeowners_beginners',
            'competitive_priority': 'medium',
            'analysis_focus': ['accessibility', 'explanation_style', 'beginner_content']
        },
        'hvac_tv': {
            'handle': '@HVACTV',
            'name': 'HVAC TV',
            'url': 'https://www.youtube.com/@HVACTV',
            'category': 'industry_news',
            'content_focus': ['industry_trends', 'product_reviews', 'business_insights'],
            'target_audience': 'hvac_professionals',
            'competitive_priority': 'medium',
            'analysis_focus': ['industry_coverage', 'product_insights', 'business_content']
        }
    }

    def __init__(self, data_dir: Path, logs_dir: Path, competitor_key: str):
        """Initialize enhanced YouTube competitive scraper for specific competitor."""
        if competitor_key not in self.COMPETITOR_CHANNELS:
            raise ConfigurationError(
                f"Unknown YouTube competitor: {competitor_key}",
                {'available_competitors': list(self.COMPETITOR_CHANNELS.keys())}
            )

        competitor_info = self.COMPETITOR_CHANNELS[competitor_key]

        # Create competitive configuration with enhanced settings
        config = CompetitiveConfig(
            source_name=f"YouTube_{competitor_info['name'].replace(' ', '')}",
            brand_name="hkia",
            data_dir=data_dir,
            logs_dir=logs_dir,
            competitor_name=competitor_key,
            base_url=competitor_info['url'],
            timezone=os.getenv('TIMEZONE', 'America/Halifax'),
            use_proxy=False,  # YouTube API doesn't require proxy
            request_delay=1.0,  # Reduced for API calls
            backlog_limit=int(os.getenv('YOUTUBE_COMPETITIVE_BACKLOG_LIMIT', '200'))
        )

        super().__init__(config)

        # Store competitor details with enhanced metadata
        self.competitor_key = competitor_key
        self.competitor_info = competitor_info
        self.channel_handle = competitor_info['handle']
        self.competitive_category = competitor_info['category']
        self.content_focus = competitor_info['content_focus']
        self.target_audience = competitor_info['target_audience']
        self.competitive_priority = competitor_info['competitive_priority']
        self.analysis_focus = competitor_info['analysis_focus']

        # YouTube API setup
        self.api_key = os.getenv('YOUTUBE_API_KEY')
        if not self.api_key:
            raise ConfigurationError(
                "YouTube API key not configured",
                {'env_var': 'YOUTUBE_API_KEY'}
            )

        self.youtube = build('youtube', 'v3', developerKey=self.api_key)

        # Channel metadata storage
        self.channel_id = None
        self.uploads_playlist_id = None
        self.channel_metadata = {}

        # Centralized quota management
        self.quota_manager = YouTubeQuotaManager()

        # Enhanced state management for competitive intelligence
        self.competitive_state_cache = {}

        # Initialize channel info
        self._get_channel_info()

        # Log comprehensive initialization details
        self.logger.info(f"Enhanced YouTube competitive scraper initialized for {competitor_info['name']}")
        self.logger.info(f"Category: {self.competitive_category}, Priority: {self.competitive_priority}")
        self.logger.info(f"Content Focus: {', '.join(self.content_focus)}")
        self.logger.info(f"Analysis Focus: {', '.join(self.analysis_focus)}")

        # Log quota status
        quota_status = self.quota_manager.get_quota_status()
        self.logger.info(f"Shared API quota: {quota_status['quota_used']}/{quota_status['quota_limit']} ({quota_status['quota_percentage']:.1f}%)")

    def _track_quota(self, operation: str, count: int = 1) -> bool:
        """Track YouTube API quota usage via centralized manager."""
        if self.quota_manager.check_and_reserve_quota(operation, count):
            quota_status = self.quota_manager.get_quota_status()
            self.logger.debug(f"Reserved quota for {operation}x{count}. Total: {quota_status['quota_used']}/{quota_status['quota_limit']} ({quota_status['quota_percentage']:.1f}%)")
            return True
        else:
            quota_status = self.quota_manager.get_quota_status()
            self.logger.warning(f"YouTube API quota limit would be exceeded for {operation}x{count}. Current: {quota_status['quota_used']}/{quota_status['quota_limit']}")
            return False

    def _release_quota_on_error(self, operation: str, count: int = 1):
        """Release quota allocation if operation fails."""
        self.quota_manager.release_quota(operation, count)
        self.logger.debug(f"Released quota for failed {operation}x{count}")

    def get_quota_status(self) -> Dict[str, Any]:
        """Get current centralized quota status."""
        return self.quota_manager.get_quota_status()

    def _get_channel_info(self) -> bool:
        """Get enhanced channel information and uploads playlist ID."""
        if self.channel_id and self.uploads_playlist_id:
            return True

        try:
            handle = self.channel_handle.replace('@', '')

            if not self._track_quota('channels_list'):
                self.logger.warning(f"Cannot get channel info due to quota limit")
                return False

            try:
                # Use forHandle parameter for YouTube Data API v3
                response = self.youtube.channels().list(
                    part='snippet,statistics,contentDetails,brandingSettings',
                    forHandle=handle
                ).execute()

                if response.get('items'):
                    channel_data = response['items'][0]
                    self.channel_id = channel_data['id']
                    self.uploads_playlist_id = channel_data['contentDetails']['relatedPlaylists']['uploads']

                    # Store enhanced channel metadata for competitive analysis
                    snippet = channel_data['snippet']
                    stats = channel_data.get('statistics', {})
                    branding = channel_data.get('brandingSettings', {})

                    self.channel_metadata = {
                        'title': snippet['title'],
                        'description': snippet.get('description', '')[:1000] + ('...' if len(snippet.get('description', '')) > 1000 else ''),
                        'subscriber_count': int(stats.get('subscriberCount', 0)),
                        'video_count': int(stats.get('videoCount', 0)),
                        'view_count': int(stats.get('viewCount', 0)),
                        'published_at': snippet['publishedAt'],
                        'channel_id': self.channel_id,
                        'country': snippet.get('country'),
                        'default_language': snippet.get('defaultLanguage'),
                        'keywords': branding.get('channel', {}).get('keywords', ''),
                        'competitor_metadata': {
                            'competitive_category': self.competitive_category,
                            'content_focus': self.content_focus,
                            'target_audience': self.target_audience,
                            'competitive_priority': self.competitive_priority,
                            'analysis_focus': self.analysis_focus
                        },
                        'analysis_timestamp': datetime.now(self.tz).isoformat()
                    }

                    # Calculate competitive metrics
                    subscriber_count = self.channel_metadata['subscriber_count']
                    video_count = self.channel_metadata['video_count']

                    if video_count > 0:
                        avg_views_per_video = self.channel_metadata['view_count'] / video_count
                        self.channel_metadata['avg_views_per_video'] = int(avg_views_per_video)

                    self.logger.info(f"Enhanced channel data acquired: {self.channel_metadata['title']}")
                    self.logger.info(f"Subscribers: {subscriber_count:,}, Videos: {video_count:,}")
                    self.logger.info(f"Total Views: {self.channel_metadata['view_count']:,}")
                    if 'avg_views_per_video' in self.channel_metadata:
                        self.logger.info(f"Avg Views/Video: {self.channel_metadata['avg_views_per_video']:,}")

                    return True
                else:
                    self.logger.error(f"No channel found for handle {handle}")
                    self._release_quota_on_error('channels_list')
                    return False

            except HttpError as api_error:
                self.logger.error(f"YouTube API error getting channel info: {api_error}")
                self._release_quota_on_error('channels_list')
                handle_youtube_api_error(api_error, "getting channel info")
                return False

        except (ValueError, KeyError, TypeError) as e:
            self.logger.error(f"Data parsing error getting channel info: {e}")
            return False
        except Exception as e:
            self.logger.error(f"Unexpected error getting channel info: {e}")
            return False

    def discover_content_urls(self, limit: Optional[int] = None) -> List[Dict[str, Any]]:
        """Enhanced video discovery from competitor's YouTube channel with priority handling."""
        if not self._get_channel_info():
            self.logger.error("Cannot discover content without channel info")
            return []

        # Adjust discovery based on competitive priority
        discovery_limit = limit or (150 if self.competitive_priority == 'high' else 100)

        videos = []
        next_page_token = None
        operations_count = 0

        try:
            self.logger.info(f"Starting enhanced content discovery for {self.competitor_info['name']} (limit: {discovery_limit})")

            while len(videos) < discovery_limit:
                if not self._track_quota('playlist_items_list'):
                    self.logger.warning("Quota limit reached, stopping discovery early")
                    break

                try:
                    # Get videos from uploads playlist with enhanced data
                    batch_size = min(50, discovery_limit - len(videos))
                    response = self.youtube.playlistItems().list(
                        part='snippet,contentDetails,status',
                        playlistId=self.uploads_playlist_id,
                        maxResults=batch_size,
                        pageToken=next_page_token
                    ).execute()

                    operations_count += 1

                    for item in response.get('items', []):
                        video_id = item['contentDetails']['videoId']
                        snippet = item['snippet']
                        status = item.get('status', {})

                        # Skip private videos
                        if status.get('privacyStatus') == 'private':
                            continue

                        # Parse publish date for competitive analysis
                        try:
                            published_dt = datetime.fromisoformat(snippet['publishedAt'].replace('Z', '+00:00'))
                            days_since_publish = (datetime.now(published_dt.tzinfo) - published_dt).days
                        except:
                            days_since_publish = None

                        video_data = {
                            'url': f"https://www.youtube.com/watch?v={video_id}",
                            'video_id': video_id,
                            'title': snippet['title'],
                            'published_at': snippet['publishedAt'],
                            'description': snippet['description'][:500] + ('...' if len(snippet['description']) > 500 else ''),
                            'thumbnail_url': snippet['thumbnails'].get('maxres', snippet['thumbnails'].get('high', {})).get('url', ''),
                            'channel_title': snippet['channelTitle'],
                            'position': snippet.get('position', 0),
                            'privacy_status': status.get('privacyStatus', 'public'),
                            # Competitive analysis metadata
                            'days_since_publish': days_since_publish,
                            'competitor_key': self.competitor_key,
                            'competitive_priority': self.competitive_priority,
                            'content_focus_tags': self._analyze_title_for_focus(snippet['title']),
                            'discovery_timestamp': datetime.now(self.tz).isoformat()
                        }
                        videos.append(video_data)

                    next_page_token = response.get('nextPageToken')
                    if not next_page_token:
                        self.logger.info(f"Reached end of playlist for {self.competitor_info['name']}")
                        break

                    # Rate limiting between API calls
                    time.sleep(0.5)

                except HttpError as api_error:
                    self.logger.error(f"YouTube API error in discovery batch {operations_count}: {api_error}")
                    self._release_quota_on_error('playlist_items_list')
                    try:
                        handle_youtube_api_error(api_error, f"discovery batch {operations_count}")
                    except QuotaExceededError:
                        self.logger.warning("API quota exceeded, stopping discovery early")
                        break
                    except YouTubeAPIError:
                        # Continue with next batch after API error
                        continue

        except (ValueError, KeyError, TypeError) as e:
            self.logger.error(f"Data processing error in content discovery: {e}")
        except Exception as e:
            self.logger.error(f"Unexpected error in enhanced content discovery: {e}")

        # Log discovery results with competitive context
        self.logger.info(f"Enhanced discovery complete: {len(videos)} videos from {self.competitor_info['name']}")
        if videos:
            recent_videos = [v for v in videos if v.get('days_since_publish', 999) <= 30]
            self.logger.info(f"Recent content (30 days): {len(recent_videos)} videos")

            # Analyze content focus distribution
            focus_distribution = defaultdict(int)
            for video in videos:
                for tag in video.get('content_focus_tags', []):
                    focus_distribution[tag] += 1

            if focus_distribution:
                top_focuses = sorted(focus_distribution.items(), key=lambda x: x[1], reverse=True)[:3]
                self.logger.info(f"Top content focuses: {', '.join([f'{focus}({count})' for focus, count in top_focuses])}")

        return videos

    def _analyze_title_for_focus(self, title: str) -> List[str]:
        """Analyze video title to identify content focus areas."""
        title_lower = title.lower()
        focus_tags = []

        # Define focus keywords based on competitive analysis
        focus_keywords = {
            'troubleshooting': ['troubleshoot', 'problem', 'fix', 'repair', 'diagnose', 'issue', 'error'],
            'installation': ['install', 'setup', 'mount', 'connect', 'wiring'],
            'maintenance': ['maintain', 'service', 'clean', 'replace', 'check'],
            'hvac_systems': ['hvac', 'air conditioner', 'furnace', 'heat pump', 'ductwork'],
            'refrigeration': ['refrigerat', 'cooling', 'condenser', 'evaporator', 'compressor'],
            'commercial': ['commercial', 'industrial', 'building', 'facility'],
            'residential': ['home', 'house', 'residential', 'homeowner'],
            'training': ['training', 'learn', 'course', 'education', 'tutorial'],
            'tools': ['tool', 'equipment', 'meter', 'gauge'],
            'safety': ['safety', 'danger', 'hazard', 'protection']
        }

        for focus, keywords in focus_keywords.items():
            if any(keyword in title_lower for keyword in keywords):
                focus_tags.append(focus)

        # Add competitive-specific focus tags
        if any(word in title_lower for word in self.content_focus):
            for focus_area in self.content_focus:
                if focus_area not in focus_tags:
                    focus_tags.append(focus_area)

        return focus_tags[:5]  # Limit to top 5 focus areas

    def scrape_content_item(self, url: str) -> Optional[Dict[str, Any]]:
        """Enhanced video content scraping with competitive intelligence analysis."""
        try:
            # Extract video ID from URL
            video_id = None
            if 'watch?v=' in url:
                video_id = url.split('watch?v=')[1].split('&')[0]
            elif 'youtu.be/' in url:
                video_id = url.split('youtu.be/')[1].split('?')[0]

            if not video_id:
                raise DataValidationError(
                    "Invalid YouTube URL format",
                    field="url",
                    value=url
                )

            if not self._track_quota('videos_list'):
                self.logger.warning("Quota limit reached, skipping video scraping")
                return None

            try:
                # Get comprehensive video details with enhanced parts
                response = self.youtube.videos().list(
                    part='snippet,statistics,contentDetails,status,topicDetails',
                    id=video_id
                ).execute()

                if not response.get('items'):
                    self.logger.warning(f"No video data found for ID: {video_id}")
                    self._release_quota_on_error('videos_list')
                    raise YouTubeVideoNotFoundError(video_id)

                video_data = response['items'][0]
                snippet = video_data['snippet']
                statistics = video_data.get('statistics', {})
                content_details = video_data.get('contentDetails', {})
                status = video_data.get('status', {})
                topic_details = video_data.get('topicDetails', {})

                # Parse and calculate enhanced metrics
                duration = content_details.get('duration', 'PT0S')
                duration_seconds = self._parse_duration(duration)

                # Enhanced date processing
                published_at = snippet['publishedAt']
                try:
                    published_date = datetime.fromisoformat(published_at.replace('Z', '+00:00'))
                    formatted_date = published_date.strftime('%Y-%m-%d %H:%M:%S UTC')
                    days_since_publish = (datetime.now(published_date.tzinfo) - published_date).days
                except:
                    formatted_date = published_at
                    days_since_publish = None

                # Calculate competitive engagement metrics
                view_count = int(statistics.get('viewCount', 0))
                like_count = int(statistics.get('likeCount', 0))
                comment_count = int(statistics.get('commentCount', 0))

                engagement_rate = 0
                if view_count > 0:
                    engagement_rate = ((like_count + comment_count) / view_count) * 100

                # Analyze competitive positioning
                content_focus_tags = self._analyze_title_for_focus(snippet['title'])
                description_focus = self._analyze_description_for_competitive_intel(snippet.get('description', ''))

                # Calculate content quality score
                quality_metrics = self._calculate_content_quality_score(
                    title=snippet['title'],
                    description=snippet.get('description', ''),
                    duration_seconds=duration_seconds,
                    tags=snippet.get('tags', []),
                    view_count=view_count,
                    engagement_rate=engagement_rate
                )

                scraped_item = {
                    'id': video_id,
                    'url': url,
                    'title': snippet['title'],
                    'description': snippet['description'],
                    'author': snippet['channelTitle'],
                    'publish_date': formatted_date,
                    'duration': duration_seconds,
                    'view_count': view_count,
                    'like_count': like_count,
                    'comment_count': comment_count,
                    'engagement_rate': round(engagement_rate, 3),
                    'privacy_status': status.get('privacyStatus', 'public'),
                    'thumbnail_url': snippet['thumbnails'].get('maxres', snippet['thumbnails'].get('high', {})).get('url', ''),
                    'tags': snippet.get('tags', []),
                    'category_id': snippet.get('categoryId'),
                    'default_language': snippet.get('defaultLanguage'),
                    'topic_categories': topic_details.get('topicCategories', []),

                    # Enhanced competitive intelligence metadata
                    'type': 'youtube_video',
                    'competitor': self.competitor_key,
                    'competitive_category': self.competitive_category,
                    'competitive_priority': self.competitive_priority,
                    'target_audience': self.target_audience,
                    'content_focus_tags': content_focus_tags,
                    'description_analysis': description_focus,
                    'quality_metrics': quality_metrics,
                    'days_since_publish': days_since_publish,
                    'capture_timestamp': datetime.now(self.tz).isoformat(),
                    'extraction_method': 'youtube_data_api_v3_enhanced',

                    # Comprehensive social metrics for competitive analysis
                    'social_metrics': {
                        'views': view_count,
                        'likes': like_count,
                        'comments': comment_count,
                        'engagement_rate': engagement_rate,
                        'views_per_day': round(view_count / max(days_since_publish, 1), 2) if days_since_publish else 0,
                        'subscriber_engagement': self._estimate_subscriber_engagement(view_count)
                    },

                    # Content analysis for competitive intelligence
                    'word_count': len(snippet['description'].split()),
                    'title_length': len(snippet['title']),
                    'tag_count': len(snippet.get('tags', [])),
                    'content_type': self._classify_content_type(snippet['title'], duration_seconds),

                    # Formatted content for markdown output
                    'content': self._format_competitive_content(snippet, statistics, quality_metrics, content_focus_tags)
                }

                # Rate limiting with reduced delay for API calls
                time.sleep(0.5)

                return scraped_item

            except HttpError as api_error:
                self.logger.error(f"YouTube API error scraping video {url}: {api_error}")
                self._release_quota_on_error('videos_list')
                handle_youtube_api_error(api_error, f"scraping video {video_id}")
                return None

        except DataValidationError:
            # Re-raise validation errors
            raise
        except YouTubeVideoNotFoundError:
            # Re-raise not found errors
            raise
        except (ValueError, KeyError, TypeError) as e:
            self.logger.error(f"Data processing error scraping video {url}: {e}")
            return None
        except Exception as e:
            self.logger.error(f"Unexpected error scraping video {url}: {e}")
            return None

    def _parse_duration(self, duration_str: str) -> int:
        """Parse ISO 8601 duration to seconds."""
        try:
            # Remove PT prefix
            duration_str = duration_str.replace('PT', '')

            total_seconds = 0

            # Parse hours
            if 'H' in duration_str:
                hours, duration_str = duration_str.split('H')
                total_seconds += int(hours) * 3600

            # Parse minutes
            if 'M' in duration_str:
                minutes, duration_str = duration_str.split('M')
                total_seconds += int(minutes) * 60

            # Parse seconds
            if 'S' in duration_str:
                seconds = duration_str.replace('S', '')
                total_seconds += int(seconds)

            return total_seconds
        except:
            return 0

    def _analyze_description_for_competitive_intel(self, description: str) -> Dict[str, Any]:
        """Analyze video description for competitive intelligence insights."""
        if not description:
            return {}

        description_lower = description.lower()

        analysis = {
            'length': len(description),
            'word_count': len(description.split()),
            'contains_links': 'http' in description_lower,
            'contains_timestamps': ':' in description and any(char.isdigit() for char in description),
            'contains_contact_info': any(term in description_lower for term in ['email', 'phone', 'contact', '@']),
            'contains_cta': any(term in description_lower for term in ['subscribe', 'like', 'follow', 'visit', 'check out']),
            'mentions_products': any(term in description_lower for term in ['product', 'equipment', 'tool', 'brand']),
            'technical_depth': self._assess_technical_depth(description_lower),
            'educational_indicators': self._count_educational_indicators(description_lower)
        }

        return analysis

    def _assess_technical_depth(self, text: str) -> str:
        """Assess the technical depth of content based on description."""
        technical_terms = [
            'refrigerant', 'compressor', 'evaporator', 'condenser', 'superheat', 'subcooling',
            'pressure', 'temperature', 'cfm', 'btu', 'tonnage', 'efficiency', 'seer',
            'troubleshoot', 'diagnostic', 'multimeter', 'manifold', 'gauge'
        ]

        technical_count = sum(1 for term in technical_terms if term in text)

        if technical_count >= 5:
            return 'advanced'
        elif technical_count >= 2:
            return 'intermediate'
        else:
            return 'basic'

    def _count_educational_indicators(self, text: str) -> int:
        """Count educational indicators in content."""
        educational_terms = [
            'learn', 'understand', 'explain', 'demonstrate', 'show', 'teach',
            'step', 'guide', 'tutorial', 'tips', 'basics', 'fundamentals'
        ]

        return sum(1 for term in educational_terms if term in text)

    def _calculate_content_quality_score(self, title: str, description: str, duration_seconds: int,
                                       tags: List[str], view_count: int, engagement_rate: float) -> Dict[str, Any]:
        """Calculate comprehensive content quality score for competitive analysis."""

        # Title quality (0-25 points)
        title_score = min(25, len(title) // 4)  # Longer titles generally better for SEO
        if any(word in title.lower() for word in ['how to', 'guide', 'tips', 'tutorial']):
            title_score += 5

        # Description quality (0-25 points)
        desc_words = len(description.split())
        desc_score = min(25, desc_words // 10)  # 250+ words = max score

        # Duration appropriateness (0-20 points)
        duration_score = 0
        if 300 <= duration_seconds <= 1800:  # 5-30 minutes is optimal
            duration_score = 20
        elif 180 <= duration_seconds < 300 or 1800 < duration_seconds <= 3600:
            duration_score = 15
        elif duration_seconds > 60:
            duration_score = 10

        # Tag optimization (0-15 points)
        tag_score = min(15, len(tags) * 2)  # Up to 7-8 tags is optimal

        # Engagement quality (0-15 points)
        engagement_score = min(15, engagement_rate * 3)  # 5% engagement = max score

        total_score = title_score + desc_score + duration_score + tag_score + engagement_score

        return {
            'total_score': round(total_score, 1),
            'max_score': 100,
            'percentage': round((total_score / 100) * 100, 1),
            'breakdown': {
                'title_score': title_score,
                'description_score': desc_score,
                'duration_score': duration_score,
                'tag_score': tag_score,
                'engagement_score': round(engagement_score, 1)
            },
            'quality_tier': self._get_quality_tier(total_score)
        }

    def _get_quality_tier(self, score: float) -> str:
        """Get quality tier based on total score."""
        if score >= 80:
            return 'excellent'
        elif score >= 65:
            return 'good'
        elif score >= 50:
            return 'average'
        elif score >= 35:
            return 'below_average'
        else:
            return 'poor'

    def _estimate_subscriber_engagement(self, view_count: int) -> str:
        """Estimate subscriber engagement level based on view count relative to channel size."""
        if not self.channel_metadata.get('subscriber_count'):
            return 'unknown'

        subscriber_count = self.channel_metadata['subscriber_count']
        if subscriber_count == 0:
            return 'new_channel'

        engagement_ratio = view_count / subscriber_count

        if engagement_ratio >= 0.3:
            return 'excellent'
        elif engagement_ratio >= 0.15:
            return 'good'
        elif engagement_ratio >= 0.05:
            return 'average'
        else:
            return 'low'

    def _classify_content_type(self, title: str, duration_seconds: int) -> str:
        """Classify content type based on title and duration."""
        title_lower = title.lower()

        # Quick content
        if duration_seconds < 180:
            return 'short_tip'

        # Tutorial indicators
        if any(word in title_lower for word in ['how to', 'tutorial', 'guide', 'step by step']):
            if duration_seconds > 600:
                return 'comprehensive_tutorial'
            else:
                return 'quick_tutorial'

        # Troubleshooting content
        if any(word in title_lower for word in ['troubleshoot', 'fix', 'repair', 'problem']):
            return 'troubleshooting'

        # Review content
        if any(word in title_lower for word in ['review', 'unbox', 'test']):
            return 'product_review'

        # Educational content
        if any(word in title_lower for word in ['explain', 'basics', 'fundamentals', 'learn']):
            return 'educational'

        # Default based on duration
        if duration_seconds > 1800:
            return 'long_form'
        else:
            return 'standard'

    def _format_competitive_content(self, snippet: Dict, statistics: Dict,
                                  quality_metrics: Dict, content_focus_tags: List[str]) -> str:
        """Format content with competitive intelligence focus."""
        lines = []

        lines.append("**Enhanced Video Analysis:**")
        lines.append("")
        lines.append(f"**Description:** {snippet['description'][:500]}{'...' if len(snippet['description']) > 500 else ''}")
        lines.append("")

        if snippet.get('tags'):
            lines.append(f"**Tags:** {', '.join(snippet['tags'][:10])}")
            lines.append("")

        lines.append("**Competitive Intelligence:**")
        lines.append(f"- Content Focus: {', '.join(content_focus_tags) if content_focus_tags else 'General'}")
        lines.append(f"- Quality Score: {quality_metrics['percentage']}% ({quality_metrics['quality_tier']})")
        lines.append(f"- Engagement Rate: {statistics.get('viewCount', 0) and statistics.get('likeCount', 0)} likes per {statistics.get('viewCount', 0)} views")
        lines.append("")

        return "\n".join(lines)

    def get_competitor_metadata(self) -> Dict[str, Any]:
        """Get enhanced metadata about the competitor channel."""
        quota_status = self.quota_manager.get_quota_status()

        return {
            'competitor_key': self.competitor_key,
            'competitor_name': self.competitor_info['name'],
            'channel_handle': self.channel_handle,
            'channel_url': self.competitor_info['url'],
            'channel_metadata': self.channel_metadata,
            'competitive_profile': {
                'category': self.competitive_category,
                'content_focus': self.content_focus,
                'target_audience': self.target_audience,
                'competitive_priority': self.competitive_priority,
                'analysis_focus': self.analysis_focus
            },
            'api_quota_status': quota_status,
            'scraper_version': '2.0_enhanced',
            'last_updated': datetime.now(self.tz).isoformat()
        }

    def run_competitor_analysis(self) -> Dict[str, Any]:
        """Run comprehensive competitive analysis with enhanced intelligence."""
        self.logger.info(f"Running enhanced YouTube competitor analysis for {self.competitor_info['name']}")

        try:
            # Get comprehensive video sample for analysis
            analysis_limit = 50 if self.competitive_priority == 'high' else 30
            recent_videos = self.discover_content_urls(analysis_limit)

            if not recent_videos:
                return {'error': 'No recent videos found', 'competitor': self.competitor_key}

            self.logger.info(f"Analyzing {len(recent_videos)} videos for competitive intelligence")

            # Comprehensive competitive analysis
            analysis = {
                'competitor': self.competitor_key,
                'competitor_name': self.competitor_info['name'],
                'competitive_profile': {
                    'category': self.competitive_category,
                    'content_focus': self.content_focus,
                    'target_audience': self.target_audience,
                    'competitive_priority': self.competitive_priority,
                    'analysis_focus': self.analysis_focus
                },
                'sample_size': len(recent_videos),
                'channel_metadata': self.channel_metadata,
                'publishing_analysis': self._analyze_publishing_patterns(recent_videos),
                'content_analysis': self._analyze_enhanced_content_themes(recent_videos),
                'engagement_analysis': self._analyze_engagement_patterns(recent_videos),
                'competitive_positioning': self._analyze_competitive_positioning(recent_videos),
                'content_gaps': self._identify_potential_content_gaps(recent_videos),
                'api_quota_status': self.quota_manager.get_quota_status(),
                'analysis_timestamp': datetime.now(self.tz).isoformat()
            }

            # Log key insights
            self._log_competitive_insights(analysis)

            return analysis

        except Exception as e:
            self.logger.error(f"Error in enhanced competitor analysis: {e}")
            return {'error': str(e), 'competitor': self.competitor_key}

    def _analyze_publishing_patterns(self, videos: List[Dict[str, Any]]) -> Dict[str, Any]:
        """Analyze publishing frequency and timing patterns."""
        try:
            if not videos:
                return {}

            # Parse publication dates
            pub_dates = []
            for video in videos:
                try:
                    pub_date = datetime.fromisoformat(video['published_at'].replace('Z', '+00:00'))
                    pub_dates.append(pub_date)
                except:
                    continue

            if not pub_dates:
                return {}

            # Calculate publishing frequency
            pub_dates.sort()
            if len(pub_dates) > 1:
                date_range = (pub_dates[-1] - pub_dates[0]).days
                frequency = len(pub_dates) / max(date_range, 1) if date_range > 0 else 0
            else:
                frequency = 0

            # Analyze publishing days and times
            weekdays = [d.weekday() for d in pub_dates]  # 0=Monday, 6=Sunday
            hours = [d.hour for d in pub_dates]

            return {
                'total_videos_analyzed': len(pub_dates),
                'date_range_days': date_range if len(pub_dates) > 1 else 0,
                'average_frequency_per_day': round(frequency, 2),
                'most_common_weekday': max(set(weekdays), key=weekdays.count) if weekdays else None,
                'most_common_hour': max(set(hours), key=hours.count) if hours else None,
                'latest_video_date': pub_dates[-1].isoformat() if pub_dates else None
            }

        except Exception as e:
            self.logger.error(f"Error analyzing publishing patterns: {e}")
            return {}

    def _analyze_enhanced_content_themes(self, videos: List[Dict[str, Any]]) -> Dict[str, Any]:
        """Enhanced content theme analysis with competitive intelligence."""
        try:
            if not videos:
                return {}

            # Collect comprehensive text analysis
            all_text = []
            title_words = []
            content_focus_distribution = defaultdict(int)
            content_types = defaultdict(int)

            for video in videos:
                title = video.get('title', '').lower()
                description = video.get('description', '').lower()

                all_text.append(title + ' ' + description)
                title_words.extend(title.split())

                # Track content focus tags
                for tag in video.get('content_focus_tags', []):
                    content_focus_distribution[tag] += 1

                # Track content types (would be calculated in scraping)
                content_type = self._classify_content_type(video.get('title', ''), 600)  # Default duration
                content_types[content_type] += 1

            # Enhanced keyword analysis
            word_freq = {}
            for word in title_words:
                # Filter out common words but include HVAC-specific terms
                if (len(word) > 3 and
                    word not in ['hvac', 'with', 'this', 'that', 'from', 'your', 'they', 'have', 'been', 'will'] and
                    not word.isdigit()):
                    word_freq[word] = word_freq.get(word, 0) + 1

            # Get top keywords and focus areas
            top_keywords = sorted(word_freq.items(), key=lambda x: x[1], reverse=True)[:15]
            top_content_focuses = sorted(content_focus_distribution.items(), key=lambda x: x[1], reverse=True)[:10]
            top_content_types = sorted(content_types.items(), key=lambda x: x[1], reverse=True)

            return {
                'total_videos_analyzed': len(videos),
                'top_title_keywords': [{'keyword': k, 'frequency': f, 'percentage': round((f/len(videos))*100, 1)} for k, f in top_keywords],
                'content_focus_distribution': [{'focus': f, 'count': c, 'percentage': round((c/len(videos))*100, 1)} for f, c in top_content_focuses],
                'content_type_distribution': [{'type': t, 'count': c, 'percentage': round((c/len(videos))*100, 1)} for t, c in top_content_types],
                'average_title_length': round(sum(len(v.get('title', '')) for v in videos) / len(videos), 1) if videos else 0,
                'videos_with_descriptions': sum(1 for v in videos if v.get('description', '').strip()),
                'content_diversity_score': len(content_focus_distribution),  # Number of different focus areas
                'primary_content_focus': top_content_focuses[0][0] if top_content_focuses else 'general',
                'content_strategy_insights': self._analyze_content_strategy(top_content_focuses, top_content_types)
            }

        except (ValueError, KeyError, TypeError, ZeroDivisionError) as e:
            self.logger.error(f"Data processing error analyzing content themes: {e}")
            return {}
        except Exception as e:
            self.logger.error(f"Unexpected error analyzing enhanced content themes: {e}")
            return {}

    def _analyze_content_strategy(self, content_focuses: List[Tuple], content_types: List[Tuple]) -> Dict[str, str]:
        """Analyze content strategy based on focus and type distributions."""
        insights = {}

        if content_focuses:
            primary_focus = content_focuses[0][0]
            focus_concentration = content_focuses[0][1] / sum(count for _, count in content_focuses)

            if focus_concentration > 0.5:
                insights['focus_strategy'] = f"Highly specialized in {primary_focus} ({focus_concentration*100:.1f}% of content)"
            elif focus_concentration > 0.3:
                insights['focus_strategy'] = f"Primarily focused on {primary_focus} with some diversification"
            else:
                insights['focus_strategy'] = "Diversified content strategy across multiple focus areas"

        if content_types:
            primary_type = content_types[0][0]
            type_concentration = content_types[0][1] / sum(count for _, count in content_types)

            if type_concentration > 0.6:
                insights['content_type_strategy'] = f"Heavily focused on {primary_type} content"
            else:
                insights['content_type_strategy'] = "Mixed content type strategy"

        return insights

    def _analyze_engagement_patterns(self, videos: List[Dict[str, Any]]) -> Dict[str, Any]:
        """Analyze engagement patterns for competitive intelligence."""
        try:
            if not videos:
                return {}

            # Note: This analysis would be more complete with actual engagement data
            # For now, we'll analyze what we have from the discovery phase

            recent_videos = [v for v in videos if v.get('days_since_publish', 999) <= 30]
            older_videos = [v for v in videos if v.get('days_since_publish', 0) > 30]

            content_focus_engagement = defaultdict(list)
            for video in videos:
                for focus in video.get('content_focus_tags', []):
                    content_focus_engagement[focus].append(video)

            # Calculate average engagement by content focus
            focus_performance = {}
            for focus, focus_videos in content_focus_engagement.items():
                if len(focus_videos) >= 3:  # Only analyze focuses with sufficient data
                    avg_days_old = sum(v.get('days_since_publish', 0) for v in focus_videos) / len(focus_videos)
                    focus_performance[focus] = {
                        'video_count': len(focus_videos),
                        'avg_days_since_publish': round(avg_days_old, 1),
                        'sample_titles': [v.get('title', '')[:50] for v in focus_videos[:3]]
                    }

            return {
                'total_videos_analyzed': len(videos),
                'recent_videos_30d': len(recent_videos),
                'older_videos': len(older_videos),
                'content_focus_performance': focus_performance,
                'publishing_consistency': {
                    'recent_publishing_rate': len(recent_videos) / 30 if recent_videos else 0,
                    'content_freshness_score': len(recent_videos) / len(videos) if videos else 0
                },
                'engagement_insights': self._generate_engagement_insights(recent_videos, content_focus_engagement)
            }

        except (ValueError, KeyError, TypeError, ZeroDivisionError) as e:
            self.logger.error(f"Data processing error analyzing engagement patterns: {e}")
            return {}
        except Exception as e:
            self.logger.error(f"Unexpected error analyzing engagement patterns: {e}")
            return {}

    def _generate_engagement_insights(self, recent_videos: List, content_focus_engagement: Dict) -> Dict[str, str]:
        """Generate insights about engagement patterns."""
        insights = {}

        if recent_videos:
            recent_rate = len(recent_videos) / 30
            if recent_rate >= 1:
                insights['publishing_frequency'] = f"High activity: ~{recent_rate:.1f} videos per day"
            elif recent_rate >= 0.2:
                insights['publishing_frequency'] = f"Regular activity: ~{recent_rate*7:.1f} videos per week"
            else:
                insights['publishing_frequency'] = "Infrequent publishing pattern"

        # Analyze content focus diversity
        active_focuses = len([f for f, videos in content_focus_engagement.items() if len(videos) >= 2])
        if active_focuses >= 5:
            insights['content_diversity'] = "High content diversity across multiple focus areas"
        elif active_focuses >= 3:
            insights['content_diversity'] = "Moderate content diversity"
        else:
            insights['content_diversity'] = "Narrow content focus"

        return insights

    def _validate_video_data(self, video_data: Dict[str, Any]) -> bool:
        """Validate video data structure for required fields."""
        required_fields = ['id', 'snippet']
        return all(field in video_data for field in required_fields)

    def _sanitize_text_content(self, text: str, max_length: int = 1000) -> str:
        """Sanitize and truncate text content."""
        if not isinstance(text, str):
            return ""

        # Remove control characters and excessive whitespace
        sanitized = ' '.join(text.split())

        # Truncate if necessary
        if len(sanitized) > max_length:
            sanitized = sanitized[:max_length] + "..."

        return sanitized

    @contextlib.contextmanager
    def _quota_context(self, operation: str, count: int = 1):
        """Context manager for quota operations with automatic cleanup."""
        reserved = False
        try:
            if not self._track_quota(operation, count):
                raise QuotaExceededError(
                    f"Cannot reserve quota for {operation}",
                    quota_used=self.quota_manager.quota_used,
                    quota_limit=self.quota_manager.daily_quota_limit
                )
            reserved = True
            yield
        except Exception:
            if reserved:
                self._release_quota_on_error(operation, count)
            raise

    def cleanup_resources(self) -> None:
        """Cleanup resources and connections."""
        try:
            # Close any open connections
            if hasattr(self, 'session') and self.session:
                self.session.close()

            # Clear caches
            self.content_cache.clear()
            self.competitive_state_cache.clear()

            self.logger.info(f"Cleaned up YouTube scraper resources for {self.competitor_key}")

        except Exception as e:
            self.logger.warning(f"Error during resource cleanup: {e}")

    def __enter__(self):
        """Context manager entry."""
        return self

    def __exit__(self, exc_type, exc_val, exc_tb):
        """Context manager exit with resource cleanup."""
        self.cleanup_resources()

    def _analyze_competitive_positioning(self, videos: List[Dict[str, Any]]) -> Dict[str, Any]:
        """Analyze competitive positioning relative to HVAC Know It All."""
        try:
            # Analyze content positioning
            positioning = {
                'content_overlap': self._calculate_content_overlap(videos),
                'differentiation_factors': self._identify_differentiation_factors(videos),
                'competitive_advantages': self._identify_competitive_advantages(videos),
                'potential_threats': self._identify_potential_threats(videos),
                'market_positioning': self._assess_market_positioning()
            }

            return positioning

        except (ValueError, KeyError, TypeError, ZeroDivisionError) as e:
            self.logger.error(f"Data processing error analyzing competitive positioning: {e}")
            return {}
        except Exception as e:
            self.logger.error(f"Unexpected error analyzing competitive positioning: {e}")
            return {}

    def _calculate_content_overlap(self, videos: List[Dict[str, Any]]) -> Dict[str, Any]:
        """Calculate content overlap with HVAC Know It All focus areas."""
        hkia_focus_areas = ['troubleshooting', 'hvac_systems', 'maintenance', 'training', 'tools']

        overlap_count = defaultdict(int)
        total_videos = len(videos)

        for video in videos:
            video_focuses = video.get('content_focus_tags', [])
            for focus in video_focuses:
                if focus in hkia_focus_areas:
                    overlap_count[focus] += 1

        overlap_percentage = sum(overlap_count.values()) / total_videos * 100 if total_videos > 0 else 0

        return {
            'total_overlap_percentage': round(overlap_percentage, 1),
            'overlapping_focus_areas': dict(overlap_count),
            'direct_competition_level': 'high' if overlap_percentage > 60 else 'medium' if overlap_percentage > 30 else 'low'
        }

    def _identify_differentiation_factors(self, videos: List[Dict[str, Any]]) -> List[str]:
        """Identify key differentiation factors."""
        factors = []

        # Analyze content focuses that might be different
        all_focuses = []
        for video in videos:
            all_focuses.extend(video.get('content_focus_tags', []))

        focus_dist = defaultdict(int)
        for focus in all_focuses:
            focus_dist[focus] += 1

        # Look for unique or heavily emphasized areas
        total_focus_instances = sum(focus_dist.values())
        for focus, count in focus_dist.items():
            percentage = (count / total_focus_instances) * 100
            if percentage > 25:  # Major focus area
                if focus in ['commercial', 'refrigeration', 'safety']:
                    factors.append(f"Strong emphasis on {focus} content ({percentage:.1f}%)")
                elif focus == 'training':
                    factors.append(f"Heavy focus on training/educational content ({percentage:.1f}%)")

        # Analyze content types
        if self.competitive_category == 'educational_specialized':
            factors.append("Specialized educational approach")
        elif self.competitive_category == 'industry_news':
            factors.append("Industry news and business insight focus")

        return factors

    def _identify_competitive_advantages(self, videos: List[Dict[str, Any]]) -> List[str]:
        """Identify potential competitive advantages."""
        advantages = []

        # Channel size advantage
        if self.channel_metadata.get('subscriber_count', 0) > 50000:
            advantages.append(f"Large subscriber base ({self.channel_metadata['subscriber_count']:,} subscribers)")

        # Publishing frequency
        recent_videos = [v for v in videos if v.get('days_since_publish', 999) <= 30]
        if len(recent_videos) > 20:
            advantages.append("High publishing frequency")

        # Specialization advantage
        if self.competitive_priority == 'high':
            advantages.append("High competitive priority in HVAC space")

        return advantages

    def _identify_potential_threats(self, videos: List[Dict[str, Any]]) -> List[str]:
        """Identify potential competitive threats."""
        threats = []

        # Content quality threats
        high_quality_videos = sum(1 for v in videos if v.get('content_focus_tags') and len(v['content_focus_tags']) >= 3)
        if high_quality_videos / len(videos) > 0.7:
            threats.append("High proportion of well-categorized, focused content")

        # Rapid content production
        recent_videos = [v for v in videos if v.get('days_since_publish', 999) <= 7]
        if len(recent_videos) > 5:
            threats.append("Very active recent publishing (potential to outpace HKIA)")

        # Specialization threat
        if self.target_audience in ['hvac_technicians', 'refrigeration_specialists']:
            threats.append(f"Direct targeting of {self.target_audience}")

        return threats

    def _assess_market_positioning(self) -> Dict[str, str]:
        """Assess overall market positioning."""
        positioning = {
            'market_segment': self.target_audience,
            'content_strategy': self.competitive_category,
            'competitive_stance': self.competitive_priority
        }

        if self.competitive_priority == 'high':
            positioning['threat_level'] = 'Direct competitor - monitor closely'
        else:
            positioning['threat_level'] = 'Secondary competitor - periodic monitoring'

        return positioning

    def _identify_potential_content_gaps(self, videos: List[Dict[str, Any]]) -> Dict[str, Any]:
        """Identify potential content gaps that HVAC Know It All could exploit."""
        try:
            # Analyze what content areas are underrepresented
            all_focuses = []
            for video in videos:
                all_focuses.extend(video.get('content_focus_tags', []))

            focus_dist = defaultdict(int)
            for focus in all_focuses:
                focus_dist[focus] += 1

            # Define comprehensive HVAC content areas
            comprehensive_areas = [
                'troubleshooting', 'installation', 'maintenance', 'hvac_systems',
                'refrigeration', 'commercial', 'residential', 'training', 'tools', 'safety'
            ]

            gaps = []
            underrepresented = []

            total_content = len(videos)

            for area in comprehensive_areas:
                area_count = focus_dist.get(area, 0)
                area_percentage = (area_count / total_content) * 100 if total_content > 0 else 0

                if area_count == 0:
                    gaps.append(area)
                elif area_percentage < 10:  # Less than 10% of content
                    underrepresented.append({'area': area, 'percentage': round(area_percentage, 1)})

            return {
                'complete_gaps': gaps,
                'underrepresented_areas': underrepresented,
                'opportunity_score': len(gaps) + len(underrepresented),
                'hkia_opportunities': self._suggest_hkia_opportunities(gaps, underrepresented)
            }

        except (ValueError, KeyError, TypeError) as e:
            self.logger.error(f"Data processing error identifying content gaps: {e}")
            return {}
        except Exception as e:
            self.logger.error(f"Unexpected error identifying content gaps: {e}")
            return {}

    def _suggest_hkia_opportunities(self, gaps: List[str], underrepresented: List[Dict]) -> List[str]:
        """Suggest opportunities for HVAC Know It All based on competitor gaps."""
        opportunities = []

        high_value_areas = ['troubleshooting', 'training', 'hvac_systems', 'tools']

        for gap in gaps:
            if gap in high_value_areas:
                opportunities.append(f"Exploit complete gap in {gap} content")

        for under in underrepresented:
            if under['area'] in high_value_areas and under['percentage'] < 5:
                opportunities.append(f"Dominate underrepresented {under['area']} space ({under['percentage']}% of competitor content)")

        # Specific opportunities based on competitor type
        if self.competitive_category == 'educational_specialized' and 'residential' in gaps:
            opportunities.append("Target residential market gap with beginner-friendly content")

        if self.competitive_category == 'industry_news' and 'hands_on' in gaps:
            opportunities.append("Focus on practical, hands-on content to differentiate")

        return opportunities

    def _log_competitive_insights(self, analysis: Dict[str, Any]):
        """Log key competitive insights for monitoring."""
        try:
            insights = []

            # Publishing insights
            if 'publishing_analysis' in analysis:
                pub_freq = analysis['publishing_analysis'].get('average_frequency_per_day', 0)
                if pub_freq > 0.5:
                    insights.append(f"High publishing frequency: {pub_freq:.1f} videos/day")

            # Content focus insights
            if 'content_analysis' in analysis:
                primary_focus = analysis['content_analysis'].get('primary_content_focus')
                if primary_focus:
                    insights.append(f"Primary focus: {primary_focus}")

            # Competitive positioning
            if 'competitive_positioning' in analysis:
                overlap = analysis['competitive_positioning'].get('content_overlap', {}).get('total_overlap_percentage', 0)
                if overlap > 50:
                    insights.append(f"High content overlap: {overlap}% direct competition")

            # Content gaps
            if 'content_gaps' in analysis:
                opportunity_score = analysis['content_gaps'].get('opportunity_score', 0)
                if opportunity_score > 5:
                    insights.append(f"High opportunity score: {opportunity_score} content gap areas identified")

            # Log insights
            if insights:
                self.logger.info(f"Key competitive insights for {self.competitor_info['name']}:")
                for insight in insights:
                    self.logger.info(f"  • {insight}")

        except (ValueError, KeyError, TypeError) as e:
            self.logger.error(f"Data access error logging competitive insights: {e}")
        except Exception as e:
            self.logger.error(f"Unexpected error logging competitive insights: {e}")

    def _analyze_content_themes(self, videos: List[Dict[str, Any]]) -> Dict[str, Any]:
        """Legacy content theme analysis method - kept for compatibility."""
        # Delegate to enhanced method
        return self._analyze_enhanced_content_themes(videos)


def create_youtube_competitive_scrapers(data_dir: Path, logs_dir: Path) -> Dict[str, YouTubeCompetitiveScraper]:
    """Enhanced factory function to create all YouTube competitive scrapers with comprehensive error handling."""
    import logging

    logger = logging.getLogger(__name__)
    scrapers = {}

    # Initialize centralized quota manager first
    try:
        quota_manager = YouTubeQuotaManager()
        quota_status = quota_manager.get_quota_status()
        logger.info(f"Initialized YouTube quota manager. Status: {quota_status['quota_used']}/{quota_status['quota_limit']} ({quota_status['quota_percentage']:.1f}%)")
    except Exception as e:
        logger.error(f"Failed to initialize YouTube quota manager: {e}")
        return {}

    # Create scrapers for each competitor
    successful_scrapers = []
    failed_scrapers = []

    for competitor_key in YouTubeCompetitiveScraper.COMPETITOR_CHANNELS:
        competitor_info = YouTubeCompetitiveScraper.COMPETITOR_CHANNELS[competitor_key]

        try:
            logger.info(f"Creating YouTube competitive scraper for {competitor_info['name']}...")

            scraper = YouTubeCompetitiveScraper(data_dir, logs_dir, competitor_key)
            scraper_key = f"youtube_{competitor_key}"
            scrapers[scraper_key] = scraper

            successful_scrapers.append({
                'key': scraper_key,
                'name': competitor_info['name'],
                'priority': competitor_info['competitive_priority'],
                'category': competitor_info['category']
            })

            logger.info(f"✓ Successfully created YouTube scraper for {competitor_info['name']}")

        except Exception as e:
            error_msg = f"Failed to create YouTube scraper for {competitor_key} ({competitor_info.get('name', 'Unknown')}): {e}"
            logger.error(error_msg)

            failed_scrapers.append({
                'key': competitor_key,
                'name': competitor_info.get('name', 'Unknown'),
                'error': str(e)
            })

    # Log comprehensive initialization results
    logger.info(f"YouTube competitive scrapers initialization complete:")
    logger.info(f"  ✓ Successfully created: {len(successful_scrapers)} scrapers")

    if successful_scrapers:
        for scraper in successful_scrapers:
            logger.info(f"    - {scraper['name']} ({scraper['priority']} priority, {scraper['category']})")

    if failed_scrapers:
        logger.warning(f"  ✗ Failed to create: {len(failed_scrapers)} scrapers")
        for failed in failed_scrapers:
            logger.warning(f"    - {failed['name']}: {failed['error']}")

    # Log quota status after initialization
    try:
        final_quota_status = quota_manager.get_quota_status()
        logger.info(f"Final quota status: {final_quota_status['quota_used']}/{final_quota_status['quota_limit']} ({final_quota_status['quota_percentage']:.1f}%)")
    except Exception as e:
        logger.warning(f"Could not get final quota status: {e}")

    return scrapers


def create_single_youtube_competitive_scraper(data_dir: Path, logs_dir: Path, competitor_key: str) -> Optional[YouTubeCompetitiveScraper]:
    """Create a single YouTube competitive scraper for testing or selective use."""
    import logging

    logger = logging.getLogger(__name__)

    if competitor_key not in YouTubeCompetitiveScraper.COMPETITOR_CHANNELS:
        logger.error(f"Unknown competitor key: {competitor_key}. Available: {list(YouTubeCompetitiveScraper.COMPETITOR_CHANNELS.keys())}")
        return None

    try:
        competitor_info = YouTubeCompetitiveScraper.COMPETITOR_CHANNELS[competitor_key]
        logger.info(f"Creating single YouTube competitive scraper for {competitor_info['name']}...")

        scraper = YouTubeCompetitiveScraper(data_dir, logs_dir, competitor_key)

        logger.info(f"✓ Successfully created YouTube competitive scraper for {competitor_info['name']}")
        logger.info(f"  Priority: {competitor_info['competitive_priority']}, Category: {competitor_info['category']}")

        return scraper

    except ConfigurationError as e:
        logger.error(f"Configuration error creating YouTube scraper for {competitor_key}: {e}")
        return None
    except Exception as e:
        logger.error(f"Unexpected error creating YouTube competitive scraper for {competitor_key}: {e}")
        return None