- Created unit tests for BaseScraper with mocking
- Added integration tests for parallel processing
- Created end-to-end tests with realistic mock data
- Fixed initialization order in BaseScraper (logger before user agent)
- Fixed orchestrator method name (archive_current_file)
- Added tenacity dependency for retry logic
- Validated parallel processing performance and overlap detection
- Confirmed spec-compliant markdown formatting in tests

Tests cover:
- Base scraper functionality (state, markdown, retry logic, media downloads)
- Parallel vs sequential execution timing
- Error isolation between scrapers
- Directory structure creation
- State management across runs
- Full workflow with realistic data

🤖 Generated with [Claude Code](https://claude.ai/code)

Co-Authored-By: Claude <noreply@anthropic.com>
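The retry logic mentioned above is built on tenacity's decorator pattern. A minimal sketch, assuming the scrapers guard their HTTP fetches roughly like this (the class name, backoff values, and exception choice are illustrative, not taken from the actual BaseScraper):

```python
# Illustrative sketch only; the real BaseScraper wiring may differ.
import requests
from tenacity import retry, retry_if_exception_type, stop_after_attempt, wait_exponential


class RetryingFetcher:
    """Hypothetical stand-in showing the retry pattern used by the scrapers."""

    @retry(
        stop=stop_after_attempt(3),                          # give up after three tries
        wait=wait_exponential(multiplier=1, min=2, max=30),  # 2s, 4s, 8s... capped at 30s
        retry=retry_if_exception_type(requests.RequestException),
    )
    def fetch(self, url: str) -> requests.Response:
        response = requests.get(url, timeout=30)
        response.raise_for_status()  # HTTP errors raise, which triggers a retry
        return response
```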
#!/usr/bin/env python3
"""
End-to-end tests with mock data for full workflow validation
"""

import pytest
import json
import time
import tempfile
from pathlib import Path
from unittest.mock import Mock, patch, MagicMock
import requests

# Add project to path
import sys
sys.path.insert(0, str(Path(__file__).parent.parent))

from src.orchestrator import ContentOrchestrator
from src.base_scraper import BaseScraper, ScraperConfig
from src.rss_scraper import BaseRSSScraper


class MockEndToEndScraper(BaseScraper):
    """End-to-end mock scraper with realistic data"""

    def __init__(self, config: ScraperConfig, mock_data: list):
        super().__init__(config)
        self.mock_data = mock_data

    def fetch_content(self):
        return self.mock_data

    def get_incremental_items(self, items, state):
        if not state.get('last_id'):
            return items

        # Find items after last_id
        last_seen = False
        new_items = []
        for item in items:
            if last_seen:
                new_items.append(item)
            elif item['id'] == state['last_id']:
                last_seen = True
        return new_items

    def update_state(self, state, items):
        if items:
            state['last_id'] = items[-1]['id']
        return state
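    # Worked example of the incremental contract above (illustrative values):
    #   items = [{'id': 'a'}, {'id': 'b'}, {'id': 'c'}], state = {'last_id': 'b'}
    #   get_incremental_items -> [{'id': 'c'}]; update_state then sets last_id to 'c'.
    # A state without 'last_id' means a first run, so all items are returned.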


class TestEndToEnd:
    """End-to-end workflow tests"""

    @pytest.fixture
    def temp_config(self):
        """Create temporary config for testing"""
        with tempfile.TemporaryDirectory() as temp_dir:
            temp_path = Path(temp_dir)
            config = ScraperConfig(
                source_name="e2e_test",
                brand_name="hvacknowitall",
                data_dir=temp_path / "data",
                logs_dir=temp_path / "logs",
                timezone="America/Halifax"
            )
            yield config

    @pytest.fixture
    def mock_wordpress_data(self):
        """Mock WordPress blog post data"""
        return [
            {
                'id': 'wp_1001',
                'title': 'Understanding HVAC System Efficiency',
                'url': 'https://hvacknowitall.com/hvac-efficiency/',
                'description': 'A comprehensive guide to improving your HVAC system efficiency and reducing energy costs.',
                'author': 'HVAC Expert',
                'publish_date': '2024-01-15T10:30:00',
                'word_count': 1250,
                'tags': ['efficiency', 'energy-saving', 'maintenance'],
                'categories': ['HVAC Tips', 'Energy Efficiency']
            },
            {
                'id': 'wp_1002',
                'title': 'Common HVAC Problems in Winter',
                'url': 'https://hvacknowitall.com/winter-hvac-problems/',
                'description': 'Identify and troubleshoot the most common HVAC issues during cold weather.',
                'author': 'HVAC Expert',
                'publish_date': '2024-01-20T14:15:00',
                'word_count': 980,
                'tags': ['troubleshooting', 'winter', 'maintenance'],
                'categories': ['Troubleshooting', 'Seasonal Tips']
            }
        ]

    @pytest.fixture
    def mock_youtube_data(self):
        """Mock YouTube video data"""
        return [
            {
                'id': 'yt_abc123',
                'title': 'How to Replace Your Air Filter - DIY HVAC',
                'url': 'https://youtube.com/watch?v=abc123',
                'description': 'Step-by-step guide to replacing your HVAC air filter. Save money and improve air quality!',
                'author': 'HVAC Know It All',
                'type': 'video',
                'views': 15420,
                'likes': 247,
                'comments': 18,
                'shares': 12,
                'duration': '8:45',
                'tags': ['DIY', 'air filter', 'maintenance']
            },
            {
                'id': 'yt_def456',
                'title': 'HVAC Short: Quick Thermostat Tip',
                'url': 'https://youtube.com/shorts/def456',
                'description': 'Quick tip for optimizing your thermostat settings.',
                'author': 'HVAC Know It All',
                'type': 'short',
                'views': 8934,
                'likes': 156,
                'comments': 7,
                'shares': 23,
                'duration': '0:58',
                'tags': ['thermostat', 'tips', 'energy-saving']
            }
        ]

    @pytest.fixture
    def mock_podcast_data(self):
        """Mock podcast episode data"""
        return [
            {
                'id': 'pod_ep101',
                'title': 'Episode 101: Heat Pump vs Furnace Debate',
                'url': 'https://hvacknowitall.com/podcast/ep101/',
                'description': 'We dive deep into the pros and cons of heat pumps versus traditional furnaces.',
                'author': 'HVAC Know It All Podcast',
                'audio_link': 'https://hvacknowitall.com/podcast/ep101.mp3',
                'duration': '45:32',
                'publish_date': '2024-01-18T09:00:00',
                'image': 'https://hvacknowitall.com/podcast/ep101-cover.jpg',
                'tags': ['heat pump', 'furnace', 'comparison']
            }
        ]

    @pytest.fixture
    def orchestrator_with_mock_data(self, temp_config, mock_wordpress_data, mock_youtube_data, mock_podcast_data):
        """Create orchestrator with realistic mock data"""
        orchestrator = ContentOrchestrator(temp_config.data_dir, temp_config.logs_dir)

        # Replace scrapers with mock versions
        orchestrator.scrapers = {
            'wordpress': MockEndToEndScraper(
                ScraperConfig(
                    source_name="wordpress",
                    brand_name=temp_config.brand_name,
                    data_dir=temp_config.data_dir,
                    logs_dir=temp_config.logs_dir,
                    timezone=temp_config.timezone
                ),
                mock_wordpress_data
            ),
            'youtube': MockEndToEndScraper(
                ScraperConfig(
                    source_name="youtube",
                    brand_name=temp_config.brand_name,
                    data_dir=temp_config.data_dir,
                    logs_dir=temp_config.logs_dir,
                    timezone=temp_config.timezone
                ),
                mock_youtube_data
            ),
            'podcast': MockEndToEndScraper(
                ScraperConfig(
                    source_name="podcast",
                    brand_name=temp_config.brand_name,
                    data_dir=temp_config.data_dir,
                    logs_dir=temp_config.logs_dir,
                    timezone=temp_config.timezone
                ),
                mock_podcast_data
            )
        }

        return orchestrator

    def test_full_workflow_execution(self, orchestrator_with_mock_data):
        """Test complete workflow from start to finish"""
        orchestrator = orchestrator_with_mock_data

        # Run full workflow
        orchestrator.run_all_scrapers(parallel=True, max_workers=3)

        # Verify markdown files were created
        markdown_dir = orchestrator.config.data_dir / "markdown_current"
        markdown_files = list(markdown_dir.glob("*.md"))
        assert len(markdown_files) == 3

        # Verify file naming convention
        for file_path in markdown_files:
            filename = file_path.name
            assert filename.startswith("hvacknowitall_")
            assert any(source in filename for source in ['wordpress', 'youtube', 'podcast'])
            assert ".md" in filename
            # Check timestamp format (YYYY-MM-DD-THHMMSS)
            assert len(filename.split('_')) >= 3

    def test_markdown_format_compliance(self, orchestrator_with_mock_data):
        """Test that generated markdown follows specification exactly"""
        orchestrator = orchestrator_with_mock_data
        orchestrator.run_all_scrapers(parallel=True, max_workers=3)

        # Check each markdown file
        markdown_files = list((orchestrator.config.data_dir / "markdown_current").glob("*.md"))

        for file_path in markdown_files:
            content = file_path.read_text(encoding='utf-8')

            # Verify spec format for each item
            assert "# ID:" in content
            assert "## Title:" in content
            assert "## Type:" in content
            assert "## Permalink:" in content
            assert "## Description:" in content
            assert "## Metadata:" in content
            assert "### Comments:" in content
            assert "### Likes:" in content
            assert "### Tags:" in content

            # Verify separator between items
            if content.count("# ID:") > 1:
                assert "--------------" in content

            # Verify specific content based on source
            if "wordpress" in file_path.name:
                assert "Understanding HVAC System Efficiency" in content
                assert "energy-saving" in content
                assert "1250" in content  # word count should be preserved
            elif "youtube" in file_path.name:
                assert "How to Replace Your Air Filter" in content
                assert "15420" in content  # view count
                assert "247" in content  # like count
            elif "podcast" in file_path.name:
                assert "Heat Pump vs Furnace Debate" in content
                assert "45:32" in content  # duration

    def test_directory_structure_creation(self, orchestrator_with_mock_data):
        """Test that proper directory structure is created"""
        orchestrator = orchestrator_with_mock_data
        orchestrator.run_all_scrapers(parallel=True, max_workers=3)

        base_dir = orchestrator.config.data_dir
        logs_dir = orchestrator.config.logs_dir

        # Check main directories
        assert (base_dir / "markdown_current").exists()
        assert (base_dir / "markdown_archives").exists()
        assert (base_dir / "media").exists()
        assert (base_dir / ".state").exists()

        # Check source-specific directories
        sources = ['Wordpress', 'Youtube', 'Podcast']
        for source in sources:
            assert (base_dir / "markdown_archives" / source).exists()
            assert (base_dir / "media" / source).exists()
            assert (logs_dir / source).exists()

    def test_state_persistence_workflow(self, orchestrator_with_mock_data):
        """Test incremental updates with state persistence"""
        orchestrator = orchestrator_with_mock_data

        # First run - should process all items
        orchestrator.run_all_scrapers(parallel=False)

        # Check state files were created
        state_files = list((orchestrator.config.data_dir / ".state").glob("*_state.json"))
        assert len(state_files) == 3

        # Verify state content
        wordpress_state_file = orchestrator.config.data_dir / ".state" / "wordpress_state.json"
        assert wordpress_state_file.exists()

        state_data = json.loads(wordpress_state_file.read_text())
        assert 'last_id' in state_data
        assert 'last_update' in state_data
        assert 'last_item_count' in state_data
        assert state_data['last_id'] == 'wp_1002'  # Last item ID
        assert state_data['last_item_count'] == 2  # Both items processed

        # Add new item to WordPress scraper
        new_item = {
            'id': 'wp_1003',
            'title': 'New HVAC Article',
            'url': 'https://hvacknowitall.com/new-article/',
            'description': 'Brand new article about HVAC.',
            'author': 'HVAC Expert',
            'tags': ['new'],
            'categories': ['News']
        }
        orchestrator.scrapers['wordpress'].mock_data.append(new_item)

        # Archive existing files to simulate next run
        for scraper in orchestrator.scrapers.values():
            scraper.archive_current_file()

        # Second run - should only process new item
        orchestrator.run_all_scrapers(parallel=False)

        # Check that only incremental content was processed
        updated_state = json.loads(wordpress_state_file.read_text())
        assert updated_state['last_id'] == 'wp_1003'
        assert updated_state['last_item_count'] == 1  # Only new item

        # Verify new markdown contains only new item
        new_markdown_files = list((orchestrator.config.data_dir / "markdown_current").glob("*wordpress*.md"))
        assert len(new_markdown_files) == 1

        new_content = new_markdown_files[0].read_text()
        assert "New HVAC Article" in new_content
        assert "Understanding HVAC System Efficiency" not in new_content  # Old content not repeated

    @patch('src.orchestrator.ContentOrchestrator.sync_to_nas')
    def test_nas_sync_integration(self, mock_sync, orchestrator_with_mock_data):
        """Test NAS sync is called with correct parameters"""
        orchestrator = orchestrator_with_mock_data
        orchestrator.run_all_scrapers(parallel=True, max_workers=3)

        # Verify sync was called
        mock_sync.assert_called_once()

    def test_error_recovery_workflow(self, orchestrator_with_mock_data):
        """Test that workflow continues when one source fails"""
        orchestrator = orchestrator_with_mock_data

        # Make YouTube scraper fail
        def failing_run():
            raise Exception("YouTube API error")

        orchestrator.scrapers['youtube'].run = failing_run

        # Run workflow - should not crash
        orchestrator.run_all_scrapers(parallel=True, max_workers=3)

        # Verify other sources still completed
        markdown_files = list((orchestrator.config.data_dir / "markdown_current").glob("*.md"))

        # Should have 2 files (WordPress and Podcast), not 3
        assert len(markdown_files) == 2

        source_names = [f.name for f in markdown_files]
        assert any('wordpress' in name for name in source_names)
        assert any('podcast' in name for name in source_names)
        assert not any('youtube' in name for name in source_names)

    def test_logging_integration(self, orchestrator_with_mock_data):
        """Test that comprehensive logging works throughout workflow"""
        orchestrator = orchestrator_with_mock_data
        orchestrator.run_all_scrapers(parallel=True, max_workers=3)

        # Check log files exist for each source
        sources = ['wordpress', 'youtube', 'podcast']
        for source in sources:
            log_file = orchestrator.config.logs_dir / source.title() / f"{source}.log"
            assert log_file.exists()

            log_content = log_file.read_text()
            assert f"Starting {source} scraper" in log_content
            assert "Successfully processed" in log_content
            assert "Saved markdown to" in log_content

    @patch('requests.Session.request')
    def test_media_download_workflow(self, mock_request, orchestrator_with_mock_data):
        """Test media downloading integration"""
        # Mock successful media download
        mock_response = Mock()
        mock_response.status_code = 200
        mock_response.iter_content.return_value = [b'fake image data']
        mock_request.return_value = mock_response

        # Add media URLs to mock data
        orchestrator = orchestrator_with_mock_data
        podcast_scraper = orchestrator.scrapers['podcast']
        podcast_scraper.mock_data[0]['image'] = 'https://example.com/podcast-cover.jpg'

        # Wrap the scraper's run() so it also exercises download_media
        original_run = podcast_scraper.run

        def run_with_media():
            original_run()
            # Simulate media download
            for item in podcast_scraper.mock_data:
                if 'image' in item:
                    podcast_scraper.download_media(item['image'], item['id'], 'image')

        podcast_scraper.run = run_with_media

        orchestrator.run_all_scrapers(parallel=False)

        # Verify media directory and file
        media_dir = orchestrator.config.data_dir / "media" / "Podcast"
        assert media_dir.exists()

        # Check if media file was created
        media_files = list(media_dir.glob("*"))
        if media_files:  # Media download attempted
            assert len(media_files) > 0

    def test_archive_workflow(self, orchestrator_with_mock_data):
        """Test that archiving works correctly in full workflow"""
        orchestrator = orchestrator_with_mock_data

        # Create some existing files to archive
        current_dir = orchestrator.config.data_dir / "markdown_current"
        current_dir.mkdir(parents=True, exist_ok=True)

        old_file = current_dir / "hvacknowitall_wordpress_2024-01-01-T120000.md"
        old_file.write_text("Old content")

        # Run workflow
        orchestrator.run_all_scrapers(parallel=False)

        # Check that old file was archived
        archive_dir = orchestrator.config.data_dir / "markdown_archives" / "Wordpress"
        archived_files = list(archive_dir.glob("*.md"))

        # Should have archived the old file
        assert any("2024-01-01" in f.name for f in archived_files)

        # Current directory should have new files
        current_files = list(current_dir.glob("*.md"))
        assert len(current_files) == 3  # One per source
        assert not any("2024-01-01" in f.name for f in current_files)


if __name__ == '__main__':
    pytest.main([__file__, '-v'])