# hvac-kia-content/tests/test_engagement_analyzer.py

#!/usr/bin/env python3
"""
Comprehensive Unit Tests for Engagement Analyzer

Tests engagement metrics calculation, trending content identification,
virality scoring, and source-specific analysis.
"""

import pytest
from unittest.mock import Mock, patch
from datetime import datetime, timedelta
from pathlib import Path
import sys

# Make the directory two levels above this file importable so that the
# ``src.*`` package imports below resolve when the tests are run directly.
_PROJECT_ROOT = str(Path(__file__).parent.parent)
if _PROJECT_ROOT not in sys.path:
    sys.path.insert(0, _PROJECT_ROOT)

from src.content_analysis.engagement_analyzer import (
    EngagementAnalyzer,
    EngagementMetrics,
    TrendingContent
)


class TestEngagementAnalyzer:
    """Test suite for EngagementAnalyzer.

    Covers the engagement-rate and total-engagement helpers, per-source
    aggregate analysis, trending-content identification, virality and
    velocity scoring, and handling of empty or malformed input.

    Floating-point results are compared with ``pytest.approx`` rather than
    ``==`` so the tests do not depend on the analyzer's rounding choices.
    """

    @pytest.fixture
    def analyzer(self):
        """Create engagement analyzer instance"""
        return EngagementAnalyzer()

    @pytest.fixture
    def sample_youtube_items(self):
        """Sample YouTube content items with engagement data"""
        return [
            {
                'id': 'video1',
                'title': 'HVAC Troubleshooting Guide',
                'source': 'youtube',
                'views': 10000,
                'likes': 500,
                'comments': 50,
                'upload_date': '2025-08-27'
            },
            {
                'id': 'video2',
                'title': 'Heat Pump Installation',
                'source': 'youtube',
                'views': 5000,
                'likes': 200,
                'comments': 20,
                'upload_date': '2025-08-26'
            },
            {
                'id': 'video3',
                'title': 'AC Repair Tips',
                'source': 'youtube',
                'views': 1000,
                'likes': 30,
                'comments': 5,
                'upload_date': '2025-08-25'
            }
        ]

    @pytest.fixture
    def sample_instagram_items(self):
        """Sample Instagram content items"""
        return [
            {
                'id': 'post1',
                'title': 'HVAC tools showcase',
                'source': 'instagram',
                'likes': 150,
                'comments': 25,
                'upload_date': '2025-08-27'
            },
            {
                'id': 'post2',
                'title': 'Before and after AC install',
                'source': 'instagram',
                'likes': 80,
                'comments': 10,
                'upload_date': '2025-08-26'
            }
        ]

    def test_calculate_engagement_rate_youtube(self, analyzer):
        """Test engagement rate calculation for YouTube content"""

        # Normal case: (50 + 10) / 1000
        item = {'views': 1000, 'likes': 50, 'comments': 10}
        rate = analyzer._calculate_engagement_rate(item, 'youtube')
        assert rate == pytest.approx(0.06)

        # Zero views must not divide by zero
        item = {'views': 0, 'likes': 50, 'comments': 10}
        rate = analyzer._calculate_engagement_rate(item, 'youtube')
        assert rate == 0

        # Missing engagement data
        item = {'views': 1000}
        rate = analyzer._calculate_engagement_rate(item, 'youtube')
        assert rate == 0

    def test_calculate_engagement_rate_instagram(self, analyzer):
        """Test engagement rate calculation for Instagram content"""

        # With views, likes and comments (preferred method):
        # (likes + comments) / views = (100 + 20) / 1000
        item = {'views': 1000, 'likes': 100, 'comments': 20}
        rate = analyzer._calculate_engagement_rate(item, 'instagram')
        assert rate == pytest.approx(0.12)

        # With likes and comments but no views (fallback):
        # comments / likes = 20 / 100
        item = {'likes': 100, 'comments': 20}
        rate = analyzer._calculate_engagement_rate(item, 'instagram')
        assert rate == pytest.approx(0.2)

        # With only comments (no likes, no views) there is nothing to
        # divide by, so the rate is expected to be 0.
        item = {'comments': 10}
        rate = analyzer._calculate_engagement_rate(item, 'instagram')
        assert rate == 0.0

    def test_get_total_engagement(self, analyzer):
        """Test total engagement calculation"""

        # YouTube (likes + comments)
        item = {'likes': 50, 'comments': 10}
        total = analyzer._get_total_engagement(item, 'youtube')
        assert total == 60

        # Instagram (likes + comments)
        item = {'likes': 100, 'comments': 25}
        total = analyzer._get_total_engagement(item, 'instagram')
        assert total == 125

        # Missing data
        item = {}
        total = analyzer._get_total_engagement(item, 'youtube')
        assert total == 0

    def test_analyze_source_engagement_youtube(self, analyzer, sample_youtube_items):
        """Test source engagement analysis for YouTube"""

        result = analyzer.analyze_source_engagement(sample_youtube_items, 'youtube')

        # Verify structure
        expected_keys = (
            'total_items',
            'avg_engagement_rate',
            'median_engagement_rate',
            'total_engagement',
            'trending_count',
            'high_performers',
            'trending_content',
        )
        for key in expected_keys:
            assert key in result, f"missing result key: {key}"

        # Verify calculations
        assert result['total_items'] == 3
        assert result['total_engagement'] == 805  # 550 + 220 + 35

        # Engagement rates:
        # video1: (500+50)/10000 = 0.055
        # video2: (200+20)/5000  = 0.044
        # video3: (30+5)/1000    = 0.035
        expected_avg = (0.055 + 0.044 + 0.035) / 3
        assert result['avg_engagement_rate'] == pytest.approx(expected_avg, abs=0.001)

        # Check high performers (threshold 0.05 for YouTube)
        assert result['high_performers'] == 1  # Only video1 above 0.05

    def test_analyze_source_engagement_instagram(self, analyzer, sample_instagram_items):
        """Test source engagement analysis for Instagram"""

        result = analyzer.analyze_source_engagement(sample_instagram_items, 'instagram')

        assert result['total_items'] == 2
        assert result['total_engagement'] == 265  # 175 + 90

        # The fixture has no views, so the comments/likes fallback applies.
        # Compute the expectation from the fixture values instead of
        # hand-rounded constants so the tolerance is not consumed by rounding.
        expected_avg = (25 / 150 + 10 / 80) / 2
        assert result['avg_engagement_rate'] == pytest.approx(expected_avg, abs=0.001)

    def test_identify_trending_content(self, analyzer, sample_youtube_items):
        """Test trending content identification"""

        # NOTE(review): the fixture uses fixed upload dates and the clock is
        # not frozen here; if trending detection depends on recency relative
        # to "now", this test may be time-dependent - confirm against the
        # analyzer implementation.
        trending = analyzer.identify_trending_content(sample_youtube_items, 'youtube')

        # Should identify high-engagement content
        assert len(trending) > 0

        # Check trending content structure on the first entry
        item = trending[0]
        for key in ('content_id', 'source', 'title', 'engagement_score', 'trend_type'):
            assert key in item, f"missing trending key: {key}"

    def test_calculate_virality_score(self, analyzer):
        """Test virality score calculation"""

        # High engagement, recent content
        item = {
            'views': 10000,
            'likes': 800,
            'comments': 200,
            'upload_date': '2025-08-27'
        }
        score = analyzer._calculate_virality_score(item, 'youtube')
        assert score > 0

        # Low engagement content
        item = {
            'views': 100,
            'likes': 5,
            'comments': 1,
            'upload_date': '2025-08-27'
        }
        score = analyzer._calculate_virality_score(item, 'youtube')
        assert score >= 0

    def test_get_engagement_velocity(self, analyzer):
        """Test engagement velocity calculation"""

        # (item, expected views per day) with "now" pinned to 2025-08-28.
        cases = [
            # Recent content: 5000 views / 1 day
            ({'views': 5000, 'upload_date': '2025-08-27'}, 5000),
            # Older content: 1000 views / 3 days
            ({'views': 1000, 'upload_date': '2025-08-25'}, 333.33),
        ]

        for item, expected in cases:
            with patch('src.content_analysis.engagement_analyzer.datetime') as mock_datetime:
                # Freeze the analyzer's clock but keep real date parsing.
                mock_datetime.now.return_value = datetime(2025, 8, 28)
                mock_datetime.strptime = datetime.strptime

                velocity = analyzer._get_engagement_velocity(item)

            # Tolerate either a rounded (333.33) or unrounded (333.333...) result.
            assert velocity == pytest.approx(expected, abs=0.01)

    def test_empty_content_list(self, analyzer):
        """Test handling of empty content lists"""

        result = analyzer.analyze_source_engagement([], 'youtube')

        assert result['total_items'] == 0
        assert result['avg_engagement_rate'] == 0
        assert result['median_engagement_rate'] == 0
        assert result['total_engagement'] == 0
        assert result['trending_count'] == 0
        assert result['high_performers'] == 0
        assert result['trending_content'] == []

    def test_missing_engagement_data(self, analyzer):
        """Test handling of content with missing engagement data"""

        items = [
            {'id': 'test1', 'title': 'Test', 'source': 'youtube'},  # No engagement data
            {'id': 'test2', 'title': 'Test 2', 'source': 'youtube', 'views': 0}  # Zero views
        ]

        result = analyzer.analyze_source_engagement(items, 'youtube')

        assert result['total_items'] == 2
        assert result['avg_engagement_rate'] == 0
        assert result['total_engagement'] == 0

    def test_engagement_thresholds_configuration(self, analyzer):
        """Test engagement threshold configuration for different sources"""

        # Check YouTube thresholds
        youtube_thresholds = analyzer.engagement_thresholds['youtube']
        for key in ('high_engagement_rate', 'viral_threshold', 'view_velocity_threshold'):
            assert key in youtube_thresholds, f"missing YouTube threshold: {key}"

        # Check Instagram thresholds
        instagram_thresholds = analyzer.engagement_thresholds['instagram']
        for key in ('high_engagement_rate', 'viral_threshold'):
            assert key in instagram_thresholds, f"missing Instagram threshold: {key}"

    def test_wordpress_engagement_analysis(self, analyzer):
        """Test WordPress content engagement analysis"""

        items = [
            {
                'id': 'post1',
                'title': 'HVAC Blog Post',
                'source': 'wordpress',
                'comments': 15,
                'upload_date': '2025-08-27'
            }
        ]

        result = analyzer.analyze_source_engagement(items, 'wordpress')
        assert result['total_items'] == 1
        # Only comments are supplied, so total engagement equals the comment count
        assert result['total_engagement'] == 15

    def test_podcast_engagement_analysis(self, analyzer):
        """Test podcast content engagement analysis"""

        items = [
            {
                'id': 'episode1',
                'title': 'HVAC Podcast Episode',
                'source': 'podcast',
                'upload_date': '2025-08-27'
            }
        ]

        result = analyzer.analyze_source_engagement(items, 'podcast')
        assert result['total_items'] == 1
        # No engagement fields are supplied for this item
        assert result['total_engagement'] == 0

    def test_edge_case_numeric_conversions(self, analyzer):
        """Test edge cases in numeric field handling"""

        # String numeric values (including a thousands separator):
        # (50 + 10) / 1000
        item = {'views': '1,000', 'likes': '50', 'comments': '10'}
        rate = analyzer._calculate_engagement_rate(item, 'youtube')
        assert rate == pytest.approx(0.06)

        # None values
        item = {'views': None, 'likes': None, 'comments': None}
        rate = analyzer._calculate_engagement_rate(item, 'youtube')
        assert rate == 0

    def test_trending_content_types(self, analyzer):
        """Test different types of trending content classification"""

        # High engagement, recent = viral
        viral_item = {
            'id': 'viral1',
            'title': 'Viral HVAC Video',
            'views': 100000,
            'likes': 5000,
            'comments': 500,
            'upload_date': '2025-08-27'
        }

        # Steady growth
        steady_item = {
            'id': 'steady1',
            'title': 'Steady HVAC Content',
            'views': 10000,
            'likes': 300,
            'comments': 30,
            'upload_date': '2025-08-25'
        }

        items = [viral_item, steady_item]
        trending = analyzer.identify_trending_content(items, 'youtube')

        # Should identify trending content with proper classification
        assert len(trending) > 0

        # Whether a given item is classified as 'viral' depends on the
        # configured thresholds, so only the set of permitted labels is checked.
        allowed_trend_types = {'viral', 'steady_growth', 'spike'}
        for item in trending:
            assert item['trend_type'] in allowed_trend_types


if __name__ == "__main__":
    # Run this module's tests with coverage for the analyzer and propagate
    # pytest's exit code, so a failing run is visible to the shell / CI
    # instead of always exiting with status 0.
    sys.exit(
        pytest.main([
            __file__,
            "-v",
            "--cov=src.content_analysis.engagement_analyzer",
            "--cov-report=term-missing",
        ])
    )