Major enhancements to HKIA content analysis system: CRITICAL FIXES: • Fix engagement data parsing from markdown (Views/Likes/Comments now extracted correctly) • YouTube: 18.75% engagement rate working (16 views, 2 likes, 1 comment) • Instagram: 7.37% average engagement rate across 20 posts • High performer detection operational (1 YouTube + 20 Instagram above thresholds) CONTENT ANALYSIS SYSTEM: • Add Claude Haiku analyzer for HVAC content classification • Add engagement analyzer with source-specific algorithms • Add keyword extractor with 100+ HVAC-specific terms • Add intelligence aggregator for daily JSON reports • Add comprehensive unit test suite (73 tests, 90% coverage target) ARCHITECTURE: • Extend BaseScraper with optional AI analysis capabilities • Add content analysis orchestrator with CLI interface • Add competitive intelligence module structure • Maintain backward compatibility with existing scrapers INTELLIGENCE FEATURES: • Daily intelligence reports with strategic insights • Trending keyword analysis (813 refrigeration, 701 service mentions) • Content opportunity identification • Multi-source engagement benchmarking • HVAC-specific topic and product categorization PRODUCTION READY: • Claude Haiku API integration validated ($15-25/month estimated) • Graceful degradation when API unavailable • Comprehensive logging and error handling • State management for analytics tracking Ready for Phase 2: Competitive Intelligence Infrastructure 🤖 Generated with [Claude Code](https://claude.ai/code) Co-Authored-By: Claude <noreply@anthropic.com>
500 lines
No EOL
18 KiB
Python
500 lines
No EOL
18 KiB
Python
#!/usr/bin/env python3
|
|
"""
|
|
Comprehensive Unit Tests for Intelligence Aggregator
|
|
|
|
Tests intelligence report generation, markdown parsing,
|
|
content analysis coordination, and strategic insights.
|
|
"""
|
|
|
|
import pytest
|
|
from unittest.mock import Mock, patch, mock_open
|
|
from pathlib import Path
|
|
from datetime import datetime, timedelta
|
|
import json
|
|
import sys
|
|
|
|
# Add src to path for imports
|
|
if str(Path(__file__).parent.parent) not in sys.path:
|
|
sys.path.insert(0, str(Path(__file__).parent.parent))
|
|
|
|
from src.content_analysis.intelligence_aggregator import IntelligenceAggregator
|
|
|
|
|
|
class TestIntelligenceAggregator:
    """Test suite for IntelligenceAggregator"""

    @pytest.fixture
    def temp_data_dir(self, tmp_path):
        """Create temporary data directory structure.

        Mirrors the layout the aggregator expects on disk:
        intelligence/{daily,weekly,monthly} for generated reports and
        markdown_current/ for scraped source content.
        """
        data_dir = tmp_path / "data"
        data_dir.mkdir()

        # Create required subdirectories
        (data_dir / "intelligence" / "daily").mkdir(parents=True)
        (data_dir / "intelligence" / "weekly").mkdir(parents=True)
        (data_dir / "intelligence" / "monthly").mkdir(parents=True)
        (data_dir / "markdown_current").mkdir()

        return data_dir

    @pytest.fixture
    def aggregator(self, temp_data_dir):
        """Create intelligence aggregator instance with temp directory"""
        return IntelligenceAggregator(temp_data_dir)

    @pytest.fixture
    def sample_markdown_content(self):
        """Sample markdown content for testing parsing.

        Contains two items: video1 carries the full metadata set
        (including a pre-computed engagement rate), while video2 carries
        only a subset — parsing must tolerate both shapes.
        """
        return """# ID: video1

## Title: HVAC Installation Guide

## Type: video

## Author: HVAC Know It All

## Link: https://www.youtube.com/watch?v=video1

## Upload Date: 2025-08-27

## Views: 5000

## Likes: 250

## Comments: 30

## Engagement Rate: 5.6%

## Description:
Learn professional HVAC installation techniques in this comprehensive guide.

# ID: video2

## Title: Heat Pump Maintenance

## Type: video

## Views: 3000

## Likes: 150

## Comments: 20

## Description:
Essential heat pump maintenance procedures for optimal performance.
"""

    @pytest.fixture
    def sample_content_items(self):
        """Sample content items for testing analysis.

        Three items across different sources; note the wordpress item
        deliberately has no 'views' key, exercising per-source engagement
        handling.
        """
        return [
            {
                'id': 'item1',
                'title': 'HVAC Installation Guide',
                'source': 'youtube',
                'views': 5000,
                'likes': 250,
                'comments': 30,
                'content': 'Professional HVAC installation techniques, heat pump setup, refrigeration cycle',
                'upload_date': '2025-08-27'
            },
            {
                'id': 'item2',
                'title': 'AC Troubleshooting',
                'source': 'wordpress',
                'likes': 45,
                'comments': 8,
                'content': 'Air conditioning repair, compressor issues, refrigerant leaks',
                'upload_date': '2025-08-26'
            },
            {
                'id': 'item3',
                'title': 'Smart Thermostat Install',
                'source': 'instagram',
                'likes': 120,
                'comments': 15,
                'content': 'Smart thermostat wiring, HVAC controls, energy efficiency',
                'upload_date': '2025-08-25'
            }
        ]
|
|
|
|
def test_initialization(self, temp_data_dir):
|
|
"""Test aggregator initialization and directory creation"""
|
|
|
|
aggregator = IntelligenceAggregator(temp_data_dir)
|
|
|
|
assert aggregator.data_dir == temp_data_dir
|
|
assert aggregator.intelligence_dir == temp_data_dir / "intelligence"
|
|
assert aggregator.intelligence_dir.exists()
|
|
assert (aggregator.intelligence_dir / "daily").exists()
|
|
assert (aggregator.intelligence_dir / "weekly").exists()
|
|
assert (aggregator.intelligence_dir / "monthly").exists()
|
|
|
|
def test_parse_markdown_file(self, aggregator, temp_data_dir, sample_markdown_content):
|
|
"""Test markdown file parsing"""
|
|
|
|
# Create test markdown file
|
|
md_file = temp_data_dir / "markdown_current" / "hkia_youtube_test.md"
|
|
md_file.write_text(sample_markdown_content, encoding='utf-8')
|
|
|
|
items = aggregator._parse_markdown_file(md_file)
|
|
|
|
assert len(items) == 2
|
|
|
|
# Check first item
|
|
item1 = items[0]
|
|
assert item1['id'] == 'video1'
|
|
assert item1['title'] == 'HVAC Installation Guide'
|
|
assert item1['source'] == 'youtube'
|
|
assert item1['views'] == 5000
|
|
assert item1['likes'] == 250
|
|
assert item1['comments'] == 30
|
|
|
|
# Check second item
|
|
item2 = items[1]
|
|
assert item2['id'] == 'video2'
|
|
assert item2['title'] == 'Heat Pump Maintenance'
|
|
assert item2['views'] == 3000
|
|
|
|
def test_parse_content_item(self, aggregator):
|
|
"""Test individual content item parsing"""
|
|
|
|
item_content = """video1
|
|
|
|
## Title: Test Video
|
|
|
|
## Views: 1,500
|
|
|
|
## Likes: 75
|
|
|
|
## Comments: 10
|
|
|
|
## Description:
|
|
Test video description here.
|
|
"""
|
|
|
|
item = aggregator._parse_content_item(item_content, "youtube_test")
|
|
|
|
assert item['id'] == 'video1'
|
|
assert item['title'] == 'Test Video'
|
|
assert item['views'] == 1500 # Comma should be removed
|
|
assert item['likes'] == 75
|
|
assert item['comments'] == 10
|
|
assert item['source'] == 'youtube'
|
|
|
|
def test_extract_numeric_fields(self, aggregator):
|
|
"""Test numeric field extraction and conversion"""
|
|
|
|
item = {
|
|
'views': '10,000',
|
|
'likes': '500',
|
|
'comments': '50',
|
|
'invalid_number': 'abc'
|
|
}
|
|
|
|
aggregator._extract_numeric_fields(item)
|
|
|
|
assert item['views'] == 10000
|
|
assert item['likes'] == 500
|
|
assert item['comments'] == 50
|
|
# Invalid numbers should become 0
|
|
# Note: 'invalid_number' not in numeric_fields list, so unchanged
|
|
|
|
def test_extract_source_from_filename(self, aggregator):
|
|
"""Test source extraction from filenames"""
|
|
|
|
assert aggregator._extract_source_from_filename("hkia_youtube_20250827") == "youtube"
|
|
assert aggregator._extract_source_from_filename("hkia_instagram_test") == "instagram"
|
|
assert aggregator._extract_source_from_filename("hkia_wordpress_latest") == "wordpress"
|
|
assert aggregator._extract_source_from_filename("hkia_mailchimp_feed") == "mailchimp"
|
|
assert aggregator._extract_source_from_filename("hkia_podcast_episode") == "podcast"
|
|
assert aggregator._extract_source_from_filename("hkia_hvacrschool_article") == "hvacrschool"
|
|
assert aggregator._extract_source_from_filename("unknown_source") == "unknown"
|
|
|
|
@patch('src.content_analysis.intelligence_aggregator.IntelligenceAggregator._load_hkia_content')
|
|
@patch('src.content_analysis.intelligence_aggregator.IntelligenceAggregator._analyze_hkia_content')
|
|
def test_generate_daily_intelligence(self, mock_analyze, mock_load, aggregator, sample_content_items):
|
|
"""Test daily intelligence report generation"""
|
|
|
|
# Mock content loading
|
|
mock_load.return_value = sample_content_items
|
|
|
|
# Mock analysis results
|
|
mock_analyze.return_value = {
|
|
'content_classified': 3,
|
|
'topic_distribution': {'hvac_systems': {'count': 2}, 'maintenance': {'count': 1}},
|
|
'engagement_summary': {'youtube': {'total_items': 1}},
|
|
'trending_keywords': [{'keyword': 'hvac', 'frequency': 3}],
|
|
'content_gaps': [],
|
|
'sentiment_overview': {'avg_sentiment': 0.5}
|
|
}
|
|
|
|
# Generate report
|
|
test_date = datetime(2025, 8, 28)
|
|
report = aggregator.generate_daily_intelligence(test_date)
|
|
|
|
# Verify report structure
|
|
assert 'report_date' in report
|
|
assert 'generated_at' in report
|
|
assert 'hkia_analysis' in report
|
|
assert 'competitor_analysis' in report
|
|
assert 'strategic_insights' in report
|
|
assert 'meta' in report
|
|
|
|
assert report['report_date'] == '2025-08-28'
|
|
assert report['meta']['total_hkia_items'] == 3
|
|
|
|
def test_load_hkia_content_no_files(self, aggregator, temp_data_dir):
|
|
"""Test content loading when no markdown files exist"""
|
|
|
|
test_date = datetime(2025, 8, 28)
|
|
content = aggregator._load_hkia_content(test_date)
|
|
|
|
assert content == []
|
|
|
|
def test_load_hkia_content_with_files(self, aggregator, temp_data_dir, sample_markdown_content):
|
|
"""Test content loading with markdown files"""
|
|
|
|
# Create test files
|
|
md_dir = temp_data_dir / "markdown_current"
|
|
(md_dir / "hkia_youtube_20250827.md").write_text(sample_markdown_content)
|
|
(md_dir / "hkia_instagram_20250827.md").write_text("# ID: post1\n\n## Title: Test Post")
|
|
|
|
test_date = datetime(2025, 8, 28)
|
|
content = aggregator._load_hkia_content(test_date)
|
|
|
|
assert len(content) >= 2 # Should load from both files
|
|
|
|
@patch('src.content_analysis.intelligence_aggregator.ClaudeHaikuAnalyzer')
|
|
def test_analyze_hkia_content_with_claude(self, mock_claude_class, aggregator, sample_content_items):
|
|
"""Test HKIA content analysis with Claude analyzer"""
|
|
|
|
# Mock Claude analyzer
|
|
mock_analyzer = Mock()
|
|
mock_analyzer.analyze_content_batch.return_value = [
|
|
{'topics': ['hvac_systems'], 'sentiment': 0.7, 'difficulty': 'intermediate'},
|
|
{'topics': ['maintenance'], 'sentiment': 0.5, 'difficulty': 'beginner'},
|
|
{'topics': ['controls'], 'sentiment': 0.6, 'difficulty': 'advanced'}
|
|
]
|
|
mock_claude_class.return_value = mock_analyzer
|
|
|
|
# Re-initialize aggregator to enable Claude analyzer
|
|
aggregator.claude_analyzer = mock_analyzer
|
|
|
|
result = aggregator._analyze_hkia_content(sample_content_items)
|
|
|
|
assert result['content_classified'] == 3
|
|
assert 'topic_distribution' in result
|
|
assert 'engagement_summary' in result
|
|
assert 'trending_keywords' in result
|
|
|
|
def test_analyze_hkia_content_without_claude(self, aggregator, sample_content_items):
|
|
"""Test HKIA content analysis without Claude analyzer (fallback mode)"""
|
|
|
|
# Ensure no Claude analyzer
|
|
aggregator.claude_analyzer = None
|
|
|
|
result = aggregator._analyze_hkia_content(sample_content_items)
|
|
|
|
assert result['content_classified'] == 0
|
|
assert 'topic_distribution' in result
|
|
assert 'engagement_summary' in result
|
|
assert 'trending_keywords' in result
|
|
|
|
# Should still have engagement analysis and keyword extraction
|
|
assert len(result['engagement_summary']) > 0
|
|
|
|
def test_calculate_topic_distribution(self, aggregator):
|
|
"""Test topic distribution calculation"""
|
|
|
|
analyses = [
|
|
{'topics': ['hvac_systems'], 'sentiment': 0.7},
|
|
{'topics': ['hvac_systems', 'maintenance'], 'sentiment': 0.5},
|
|
{'topics': ['maintenance'], 'sentiment': 0.6}
|
|
]
|
|
|
|
distribution = aggregator._calculate_topic_distribution(analyses)
|
|
|
|
assert 'hvac_systems' in distribution
|
|
assert 'maintenance' in distribution
|
|
assert distribution['hvac_systems']['count'] == 2
|
|
assert distribution['maintenance']['count'] == 2
|
|
assert abs(distribution['hvac_systems']['avg_sentiment'] - 0.6) < 0.1
|
|
|
|
def test_calculate_sentiment_overview(self, aggregator):
|
|
"""Test sentiment overview calculation"""
|
|
|
|
analyses = [
|
|
{'sentiment': 0.7},
|
|
{'sentiment': 0.5},
|
|
{'sentiment': 0.6}
|
|
]
|
|
|
|
overview = aggregator._calculate_sentiment_overview(analyses)
|
|
|
|
assert 'avg_sentiment' in overview
|
|
assert 'sentiment_distribution' in overview
|
|
assert abs(overview['avg_sentiment'] - 0.6) < 0.1
|
|
|
|
def test_identify_content_gaps(self, aggregator):
|
|
"""Test content gap identification"""
|
|
|
|
topic_distribution = {
|
|
'hvac_systems': {'count': 10},
|
|
'maintenance': {'count': 1}, # Low coverage
|
|
'installation': {'count': 8},
|
|
'troubleshooting': {'count': 1} # Low coverage
|
|
}
|
|
|
|
gaps = aggregator._identify_content_gaps(topic_distribution)
|
|
|
|
assert len(gaps) > 0
|
|
assert any('maintenance' in gap for gap in gaps)
|
|
assert any('troubleshooting' in gap for gap in gaps)
|
|
|
|
def test_generate_strategic_insights(self, aggregator):
|
|
"""Test strategic insights generation"""
|
|
|
|
hkia_analysis = {
|
|
'topic_distribution': {
|
|
'maintenance': {'count': 1},
|
|
'installation': {'count': 8}
|
|
},
|
|
'trending_keywords': [{'keyword': 'heat pump', 'frequency': 20}],
|
|
'engagement_summary': {
|
|
'youtube': {'avg_engagement_rate': 0.02}
|
|
},
|
|
'sentiment_overview': {'avg_sentiment': 0.3}
|
|
}
|
|
|
|
competitor_analysis = {}
|
|
|
|
insights = aggregator._generate_strategic_insights(hkia_analysis, competitor_analysis)
|
|
|
|
assert 'content_opportunities' in insights
|
|
assert 'performance_insights' in insights
|
|
assert 'competitive_advantages' in insights
|
|
assert 'areas_for_improvement' in insights
|
|
|
|
# Should identify content opportunities based on trending keywords
|
|
assert len(insights['content_opportunities']) > 0
|
|
|
|
def test_save_intelligence_report(self, aggregator, temp_data_dir):
|
|
"""Test intelligence report saving"""
|
|
|
|
report = {
|
|
'report_date': '2025-08-28',
|
|
'test_data': 'sample'
|
|
}
|
|
|
|
test_date = datetime(2025, 8, 28)
|
|
saved_file = aggregator._save_intelligence_report(report, test_date, 'daily')
|
|
|
|
assert saved_file.exists()
|
|
assert 'hkia_intelligence_2025-08-28.json' in saved_file.name
|
|
|
|
# Verify content
|
|
with open(saved_file, 'r') as f:
|
|
saved_report = json.load(f)
|
|
assert saved_report['report_date'] == '2025-08-28'
|
|
|
|
def test_generate_weekly_intelligence(self, aggregator, temp_data_dir):
|
|
"""Test weekly intelligence generation"""
|
|
|
|
# Create sample daily reports
|
|
daily_dir = temp_data_dir / "intelligence" / "daily"
|
|
|
|
for i in range(7):
|
|
date = datetime(2025, 8, 21) + timedelta(days=i)
|
|
date_str = date.strftime('%Y-%m-%d')
|
|
report = {
|
|
'report_date': date_str,
|
|
'hkia_analysis': {
|
|
'content_classified': 10,
|
|
'trending_keywords': [{'keyword': 'hvac', 'frequency': 5}]
|
|
},
|
|
'meta': {'total_hkia_items': 100}
|
|
}
|
|
|
|
report_file = daily_dir / f"hkia_intelligence_{date_str}.json"
|
|
with open(report_file, 'w') as f:
|
|
json.dump(report, f)
|
|
|
|
# Generate weekly report
|
|
end_date = datetime(2025, 8, 28)
|
|
weekly_report = aggregator.generate_weekly_intelligence(end_date)
|
|
|
|
assert 'period_start' in weekly_report
|
|
assert 'period_end' in weekly_report
|
|
assert 'summary' in weekly_report
|
|
assert 'daily_reports_included' in weekly_report
|
|
|
|
def test_error_handling_file_operations(self, aggregator):
|
|
"""Test error handling in file operations"""
|
|
|
|
# Test parsing non-existent file
|
|
fake_file = Path("/nonexistent/file.md")
|
|
items = aggregator._parse_markdown_file(fake_file)
|
|
assert items == []
|
|
|
|
# Test parsing malformed content
|
|
malformed_content = "This is not properly formatted markdown"
|
|
item = aggregator._parse_content_item(malformed_content, "test")
|
|
assert item is None
|
|
|
|
def test_empty_content_analysis(self, aggregator):
|
|
"""Test analysis with empty content list"""
|
|
|
|
result = aggregator._analyze_hkia_content([])
|
|
|
|
assert result['content_classified'] == 0
|
|
assert result['topic_distribution'] == {}
|
|
assert result['trending_keywords'] == []
|
|
assert result['content_gaps'] == []
|
|
|
|
@patch('builtins.open', side_effect=IOError("File access error"))
|
|
def test_file_access_error_handling(self, mock_open, aggregator, temp_data_dir):
|
|
"""Test handling of file access errors"""
|
|
|
|
test_date = datetime(2025, 8, 28)
|
|
|
|
# Should handle file access errors gracefully
|
|
content = aggregator._load_hkia_content(test_date)
|
|
assert content == []
|
|
|
|
def test_numeric_field_edge_cases(self, aggregator):
|
|
"""Test numeric field extraction edge cases"""
|
|
|
|
item = {
|
|
'views': '', # Empty string
|
|
'likes': 'N/A', # Non-numeric string
|
|
'comments': None, # None value
|
|
'view_count': '1.5K' # Non-standard format
|
|
}
|
|
|
|
aggregator._extract_numeric_fields(item)
|
|
|
|
# All should convert to 0 for invalid formats
|
|
assert item['views'] == 0
|
|
assert item['likes'] == 0
|
|
assert item['comments'] == 0
|
|
assert item['view_count'] == 0
|
|
|
|
def test_intelligence_directory_permissions(self, aggregator, temp_data_dir):
|
|
"""Test intelligence directory creation with proper permissions"""
|
|
|
|
# Remove intelligence directory to test recreation
|
|
intelligence_dir = temp_data_dir / "intelligence"
|
|
if intelligence_dir.exists():
|
|
import shutil
|
|
shutil.rmtree(intelligence_dir)
|
|
|
|
# Re-initialize aggregator
|
|
new_aggregator = IntelligenceAggregator(temp_data_dir)
|
|
|
|
assert new_aggregator.intelligence_dir.exists()
|
|
assert (new_aggregator.intelligence_dir / "daily").exists()
|
|
|
|
|
|
if __name__ == "__main__":
    # Run this suite directly, with coverage reporting scoped to the
    # aggregator module under test.
    pytest_args = [
        __file__,
        "-v",
        "--cov=src.content_analysis.intelligence_aggregator",
        "--cov-report=term-missing",
    ]
    pytest.main(pytest_args)