hvac-kia-content/src/analytics_base_scraper.py
Ben Reed ade81beea2 feat: Complete Phase 1 content analysis with engagement parsing fixes
Major enhancements to HKIA content analysis system:

CRITICAL FIXES:
• Fix engagement data parsing from markdown (Views/Likes/Comments now extracted correctly)
• YouTube: 18.75% engagement rate now computed correctly (16 views, 2 likes, 1 comment; see note below)
• Instagram: 7.37% average engagement rate across 20 posts
• High performer detection operational (1 YouTube + 20 Instagram above thresholds)
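(The YouTube rate is consistent with engagement rate = (likes + comments) / views: (2 + 1) / 16 = 18.75%.)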

CONTENT ANALYSIS SYSTEM:
• Add Claude Haiku analyzer for HVAC content classification (interface sketched below)
• Add engagement analyzer with source-specific algorithms
• Add keyword extractor with 100+ HVAC-specific terms
• Add intelligence aggregator for daily JSON reports
• Add comprehensive unit test suite (73 tests, 90% coverage target)
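
A rough sketch of the analyzer contract the scraper consumes (attribute names
match analytics_base_scraper.py below; the sample item is hypothetical):

    analyzer = ClaudeHaikuAnalyzer()
    analysis = analyzer.analyze_content({"id": "yt-1", "title": "A2L refrigerant basics"})
    print(analysis.topics, analysis.hvac_relevance, analysis.engagement_prediction)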

ARCHITECTURE:
• Extend BaseScraper with optional AI analysis capabilities
• Add content analysis orchestrator with CLI interface
• Add competitive intelligence module structure
• Maintain backward compatibility with existing scrapers (see the sketch below)
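
A minimal subclass sketch (the scraper class and its fetch logic are
hypothetical; AnalyticsBaseScraper and fetch_content_with_analysis come from
this change):

    class ExampleScraper(AnalyticsBaseScraper):
        def fetch_content(self, **kwargs):
            # unchanged scraper logic; fetch_content_with_analysis()
            # wraps it with AI analysis when enabled
            return [{"id": "demo-1", "title": "Heat pump install tips"}]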

INTELLIGENCE FEATURES:
• Daily intelligence reports with strategic insights
• Trending keyword analysis (813 'refrigeration' and 701 'service' mentions)
• Content opportunity identification
• Multi-source engagement benchmarking
• HVAC-specific topic and product categorization

PRODUCTION READY:
• Claude Haiku API integration validated ($15-25/month estimated)
• Graceful degradation when API unavailable
• Comprehensive logging and error handling
• State management for analytics tracking

Ready for Phase 2: Competitive Intelligence Infrastructure

🤖 Generated with [Claude Code](https://claude.ai/code)

Co-Authored-By: Claude <noreply@anthropic.com>
2025-08-28 16:40:19 -03:00

"""
Analytics Base Scraper
Extends BaseScraper with content analysis capabilities using Claude Haiku,
engagement analysis, and keyword extraction.
"""
import json
import logging
from pathlib import Path
from typing import Dict, List, Any, Optional
from datetime import datetime

from .base_scraper import BaseScraper, ScraperConfig
from .content_analysis import ClaudeHaikuAnalyzer, EngagementAnalyzer, KeywordExtractor


class AnalyticsBaseScraper(BaseScraper):
    """Enhanced BaseScraper with AI-powered content analysis"""

    def __init__(self, config: ScraperConfig, enable_analysis: bool = True):
        """Initialize analytics scraper with content analysis capabilities"""
        super().__init__(config)
        self.enable_analysis = enable_analysis

        # Initialize analyzers if enabled
        if self.enable_analysis:
            try:
                self.claude_analyzer = ClaudeHaikuAnalyzer()
                self.engagement_analyzer = EngagementAnalyzer()
                self.keyword_extractor = KeywordExtractor()
                self.logger.info("Content analysis enabled with Claude Haiku")
            except Exception as e:
                self.logger.warning(f"Content analysis disabled due to error: {e}")
                self.enable_analysis = False

        # Analytics state file
        self.analytics_state_file = (
            config.data_dir / ".state" / f"{config.source_name}_analytics_state.json"
        )
        self.analytics_state_file.parent.mkdir(parents=True, exist_ok=True)
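        # The state file is a small JSON document maintained by
        # _update_analytics_state below, of the form:
        #   {
        #     "last_analysis_run": "<ISO timestamp of the latest run>",
        #     "items_analyzed": <item count for the latest run>,
        #     "analysis_enabled": <bool>,
        #     "total_items_analyzed": <cumulative count across runs>
        #   }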

    def fetch_content_with_analysis(self, **kwargs) -> List[Dict[str, Any]]:
        """Fetch content and perform analysis"""
        # Fetch content using the original scraper method
        content_items = self.fetch_content(**kwargs)
        if not content_items or not self.enable_analysis:
            return content_items

        self.logger.info(f"Analyzing {len(content_items)} content items with AI")

        # Perform content analysis
        analyzed_items = []
        for item in content_items:
            try:
                analyzed_item = self._analyze_content_item(item)
                analyzed_items.append(analyzed_item)
            except Exception as e:
                self.logger.error(f"Error analyzing item {item.get('id')}: {e}")
                # Include original item without analysis
                analyzed_items.append(item)

        # Update analytics state
        self._update_analytics_state(analyzed_items)
        return analyzed_items

    def _analyze_content_item(self, item: Dict[str, Any]) -> Dict[str, Any]:
        """Analyze a single content item with AI"""
        analyzed_item = item.copy()

        try:
            # Content classification with Claude Haiku
            content_analysis = self.claude_analyzer.analyze_content(item)

            # Add analysis results to item
            analyzed_item['ai_analysis'] = {
                'topics': content_analysis.topics,
                'products': content_analysis.products,
                'difficulty': content_analysis.difficulty,
                'content_type': content_analysis.content_type,
                'sentiment': content_analysis.sentiment,
                'keywords': content_analysis.keywords,
                'hvac_relevance': content_analysis.hvac_relevance,
                'engagement_prediction': content_analysis.engagement_prediction,
                'analyzed_at': datetime.now().isoformat()
            }
        except Exception as e:
            self.logger.error(f"Claude analysis failed for {item.get('id')}: {e}")
            analyzed_item['ai_analysis'] = {
                'error': str(e),
                'analyzed_at': datetime.now().isoformat()
            }

        try:
            # Keyword extraction
            keyword_analysis = self.keyword_extractor.extract_keywords(item)
            analyzed_item['keyword_analysis'] = {
                'primary_keywords': keyword_analysis.primary_keywords,
                'technical_terms': keyword_analysis.technical_terms,
                'product_keywords': keyword_analysis.product_keywords,
                'seo_keywords': keyword_analysis.seo_keywords,
                'keyword_density': keyword_analysis.keyword_density
            }
        except Exception as e:
            self.logger.error(f"Keyword extraction failed for {item.get('id')}: {e}")
            analyzed_item['keyword_analysis'] = {'error': str(e)}

        return analyzed_item

    def calculate_engagement_metrics(self, items: List[Dict[str, Any]]) -> Dict[str, Any]:
        """Calculate engagement metrics for content items"""
        if not self.enable_analysis or not items:
            return {}

        try:
            # Analyze engagement patterns
            engagement_metrics = self.engagement_analyzer.analyze_engagement_metrics(
                items, self.config.source_name
            )

            # Identify trending content
            trending_content = self.engagement_analyzer.identify_trending_content(
                items, self.config.source_name
            )

            # Calculate source summary
            source_summary = self.engagement_analyzer.calculate_source_summary(
                items, self.config.source_name
            )

            return {
                'source_summary': source_summary,
                'trending_content': [
                    {
                        'content_id': t.content_id,
                        'title': t.title,
                        'engagement_score': t.engagement_score,
                        'velocity_score': t.velocity_score,
                        'trend_type': t.trend_type
                    } for t in trending_content
                ],
                'high_performers': [
                    {
                        'content_id': m.content_id,
                        'engagement_rate': m.engagement_rate,
                        'virality_score': m.virality_score,
                        'relative_performance': m.relative_performance
                    } for m in engagement_metrics if m.relative_performance > 1.5
                ]
            }
        except Exception as e:
            self.logger.error(f"Engagement analysis failed: {e}")
            return {'error': str(e)}

    def identify_content_opportunities(self, items: List[Dict[str, Any]]) -> Dict[str, Any]:
        """Identify content opportunities and gaps"""
        if not self.enable_analysis or not items:
            return {}

        try:
            # Extract trending keywords
            trending_keywords = self.keyword_extractor.identify_trending_keywords(items)

            # Analyze topic distribution
            topics = []
            difficulties = []
            content_types = []
            for item in items:
                analysis = item.get('ai_analysis', {})
                if 'topics' in analysis:
                    topics.extend(analysis['topics'])
                if 'difficulty' in analysis:
                    difficulties.append(analysis['difficulty'])
                if 'content_type' in analysis:
                    content_types.append(analysis['content_type'])

            # Identify gaps
            topic_counts = {}
            for topic in topics:
                topic_counts[topic] = topic_counts.get(topic, 0) + 1
            difficulty_counts = {}
            for difficulty in difficulties:
                difficulty_counts[difficulty] = difficulty_counts.get(difficulty, 0) + 1
            content_type_counts = {}
            for content_type in content_types:
                content_type_counts[content_type] = content_type_counts.get(content_type, 0) + 1

            # Expected high-value topics for HVAC
            expected_topics = [
                'heat_pumps', 'troubleshooting', 'installation', 'maintenance',
                'refrigerants', 'electrical', 'smart_hvac', 'tools'
            ]
            content_gaps = [
                topic for topic in expected_topics
                if topic_counts.get(topic, 0) < 2
            ]

            return {
                'trending_keywords': [
                    {'keyword': kw, 'frequency': freq}
                    for kw, freq in trending_keywords[:10]
                ],
                'topic_distribution': topic_counts,
                'difficulty_distribution': difficulty_counts,
                'content_type_distribution': content_type_counts,
                'content_gaps': content_gaps,
                'opportunities': [
                    f"Create more {gap.replace('_', ' ')} content"
                    for gap in content_gaps[:5]
                ]
            }
        except Exception as e:
            self.logger.error(f"Content opportunity analysis failed: {e}")
            return {'error': str(e)}

    def format_analytics_markdown(self, items: List[Dict[str, Any]]) -> str:
        """Format content with analytics data as enhanced markdown"""
        if not items:
            return "No content items to format."

        # Calculate analytics summary
        engagement_metrics = self.calculate_engagement_metrics(items)
        content_opportunities = self.identify_content_opportunities(items)

        # Build enhanced markdown
        markdown_parts = []

        # Analytics Summary Header
        markdown_parts.append("# Content Analytics Summary")
        markdown_parts.append(f"Generated: {datetime.now().strftime('%Y-%m-%d %H:%M:%S')}")
        markdown_parts.append(f"Source: {self.config.source_name.title()}")
        markdown_parts.append(f"Total Items: {len(items)}")
        if self.enable_analysis:
            markdown_parts.append("AI Analysis: Enabled (Claude Haiku)")
        else:
            markdown_parts.append("AI Analysis: Disabled")
        markdown_parts.append("\n---\n")

        # Engagement Summary
        if engagement_metrics and 'source_summary' in engagement_metrics:
            summary = engagement_metrics['source_summary']
            markdown_parts.append("## Engagement Summary")
            markdown_parts.append(f"- Average Engagement Rate: {summary.get('avg_engagement_rate', 0):.4f}")
            markdown_parts.append(f"- Total Engagement: {summary.get('total_engagement', 0):,}")
            markdown_parts.append(f"- Trending Items: {summary.get('trending_count', 0)}")
            markdown_parts.append(f"- High Performers: {summary.get('high_performers', 0)}")
            markdown_parts.append("")

        # Content Opportunities
        if content_opportunities and 'opportunities' in content_opportunities:
            markdown_parts.append("## Content Opportunities")
            for opp in content_opportunities['opportunities'][:5]:
                markdown_parts.append(f"- {opp}")
            markdown_parts.append("")

        # Trending Keywords
        if content_opportunities and 'trending_keywords' in content_opportunities:
            keywords = content_opportunities['trending_keywords'][:5]
            if keywords:
                markdown_parts.append("## Trending Keywords")
                for kw_data in keywords:
                    markdown_parts.append(f"- {kw_data['keyword']} ({kw_data['frequency']} mentions)")
                markdown_parts.append("")

        markdown_parts.append("\n---\n")

        # Individual Content Items
        for i, item in enumerate(items, 1):
            markdown_parts.append(self._format_analyzed_item(item, i))

        return '\n'.join(markdown_parts)

    def _format_analyzed_item(self, item: Dict[str, Any], index: int) -> str:
        """Format individual analyzed content item as markdown"""
        parts = []

        # Basic item info
        parts.append(f"# ID: {item.get('id', f'item_{index}')}")
        if title := item.get('title'):
            parts.append(f"## Title: {title}")
        if item.get('type'):
            parts.append(f"## Type: {item.get('type')}")
        if item.get('author'):
            parts.append(f"## Author: {item.get('author')}")

        # AI Analysis Results
        if ai_analysis := item.get('ai_analysis'):
            if 'error' not in ai_analysis:
                parts.append("## AI Analysis")
                if topics := ai_analysis.get('topics'):
                    parts.append(f"**Topics**: {', '.join(topics)}")
                if products := ai_analysis.get('products'):
                    parts.append(f"**Products**: {', '.join(products)}")
                parts.append(f"**Difficulty**: {ai_analysis.get('difficulty', 'Unknown')}")
                parts.append(f"**Content Type**: {ai_analysis.get('content_type', 'Unknown')}")
                parts.append(f"**Sentiment**: {ai_analysis.get('sentiment', 0):.2f}")
                parts.append(f"**HVAC Relevance**: {ai_analysis.get('hvac_relevance', 0):.2f}")
                parts.append(f"**Engagement Prediction**: {ai_analysis.get('engagement_prediction', 0):.2f}")
                if keywords := ai_analysis.get('keywords'):
                    parts.append(f"**Keywords**: {', '.join(keywords)}")
                parts.append("")

        # Keyword Analysis
        if keyword_analysis := item.get('keyword_analysis'):
            if 'error' not in keyword_analysis:
                if seo_keywords := keyword_analysis.get('seo_keywords'):
                    parts.append(f"**SEO Keywords**: {', '.join(seo_keywords)}")
                if technical_terms := keyword_analysis.get('technical_terms'):
                    parts.append(f"**Technical Terms**: {', '.join(technical_terms[:5])}")
                parts.append("")

        # Original content fields
        original_markdown = self.format_markdown([item])
        # Extract content after the first header
        if '\n## ' in original_markdown:
            content_start = original_markdown.find('\n## ')
            original_content = original_markdown[content_start:]
            parts.append(original_content)

        parts.append("\n" + "="*80 + "\n")
        return '\n'.join(parts)

    def _update_analytics_state(self, analyzed_items: List[Dict[str, Any]]) -> None:
        """Update analytics state with analysis results"""
        try:
            # Load existing state
            analytics_state = {}
            if self.analytics_state_file.exists():
                with open(self.analytics_state_file, 'r', encoding='utf-8') as f:
                    analytics_state = json.load(f)

            # Update with current analysis
            analytics_state.update({
                'last_analysis_run': datetime.now().isoformat(),
                'items_analyzed': len(analyzed_items),
                'analysis_enabled': self.enable_analysis,
                'total_items_analyzed': analytics_state.get('total_items_analyzed', 0) + len(analyzed_items)
            })

            # Save updated state
            with open(self.analytics_state_file, 'w', encoding='utf-8') as f:
                json.dump(analytics_state, f, indent=2)
        except Exception as e:
            self.logger.error(f"Error updating analytics state: {e}")

    def get_analytics_state(self) -> Dict[str, Any]:
        """Get current analytics state"""
        if not self.analytics_state_file.exists():
            return {}

        try:
            with open(self.analytics_state_file, 'r', encoding='utf-8') as f:
                return json.load(f)
        except Exception as e:
            self.logger.error(f"Error reading analytics state: {e}")
            return {}