Major improvements:
- Add CumulativeMarkdownManager for intelligent content merging
- Implement YouTube Data API v3 integration with caption support
- Add MailChimp API integration with content cleaning
- Create single source-of-truth files that grow with updates
- Smart merging: updates existing entries with better data (see the sketch below)
- Properly combines backlog + incremental daily updates

Features:
- 179/444 YouTube videos now have captions (40.3%)
- MailChimp content cleaned of headers/footers
- All sources consolidated to single files
- Archive management with timestamped versions
- Test suite and documentation included

🤖 Generated with [Claude Code](https://claude.ai/code)

Co-Authored-By: Claude <noreply@anthropic.com>
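The merge behavior called out above can be pictured with a minimal sketch, assuming entries are keyed by item ID. `merge_by_id` below is a hypothetical helper for illustration only, not the actual CumulativeMarkdownManager internals:

```python
# Illustrative only: a hypothetical merge_by_id, not the real
# CumulativeMarkdownManager implementation.
def merge_by_id(existing: dict, incoming: list) -> dict:
    """Update entries with matching IDs in place; append unseen IDs."""
    for item in incoming:
        existing[item['id']] = {**existing.get(item['id'], {}), **item}
    return existing

current = {'video_1': {'id': 'video_1', 'views': 1000}}
merge_by_id(current, [
    {'id': 'video_1', 'views': 5000},  # existing entry: updated with better data
    {'id': 'video_8', 'views': 8000},  # new entry: appended
])
assert current['video_1']['views'] == 5000 and 'video_8' in current
```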
#!/usr/bin/env python3
"""
Test the cumulative markdown functionality.

Demonstrates how backlog + incremental updates work together.
"""

import logging
import sys
from pathlib import Path

# Make the local src package importable when this script is run directly.
sys.path.insert(0, str(Path(__file__).parent))

from src.base_scraper import ScraperConfig
from src.cumulative_markdown_manager import CumulativeMarkdownManager

# Set up logging
logging.basicConfig(
    level=logging.INFO,
    format='%(asctime)s - %(name)s - %(levelname)s - %(message)s'
)
logger = logging.getLogger('cumulative_test')


def create_mock_items(start_id: int, count: int, prefix: str = ""):
    """Create mock content items for testing."""
    items = []
    for i in range(count):
        item_id = f"video_{start_id + i}"
        items.append({
            'id': item_id,
            'title': f"{prefix}Video Title {start_id + i}",
            'views': 1000 * (start_id + i),
            'likes': 100 * (start_id + i),
            'description': f"Description for video {start_id + i}",
            'publish_date': '2024-01-15'
        })
    return items
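
# For reference, create_mock_items(1, 2, "Backlog ") produces items like:
#   {'id': 'video_1', 'title': 'Backlog Video Title 1', 'views': 1000,
#    'likes': 100, 'description': 'Description for video 1',
#    'publish_date': '2024-01-15'}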


def format_mock_markdown(items):
    """Format mock items as markdown."""
    sections = []
    for item in items:
        section = [
            f"# ID: {item['id']}",
            "",
            f"## Title: {item['title']}",
            "",
            f"## Views: {item['views']:,}",
            "",
            f"## Likes: {item['likes']:,}",
            "",
            "## Description:",
            item['description'],
            "",
            f"## Publish Date: {item['publish_date']}",
            "",
            "-" * 50
        ]
        sections.append('\n'.join(section))

    return '\n\n'.join(sections)
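
# A single formatted section looks roughly like this (blank separator lines
# omitted for brevity):
#   # ID: video_1
#   ## Title: Backlog Video Title 1
#   ## Views: 1,000
#   ## Likes: 100
#   ## Description:
#   Description for video 1
#   ## Publish Date: 2024-01-15
#   --------------------------------------------------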


def test_cumulative_workflow():
    """Test the complete cumulative workflow."""
    logger.info("=" * 60)
    logger.info("TESTING CUMULATIVE MARKDOWN WORKFLOW")
    logger.info("=" * 60)

    # Set up test config
    config = ScraperConfig(
        source_name='TestSource',
        brand_name='testbrand',
        data_dir=Path('test_data'),
        logs_dir=Path('test_logs'),
        timezone='America/Halifax'
    )

    # Clean up any existing test files
    test_pattern = "testbrand_TestSource_*.md"
    for old_file in Path('test_data/markdown_current').glob(test_pattern):
        old_file.unlink()
        logger.info(f"Cleaned up old test file: {old_file.name}")

    # Initialize manager
    manager = CumulativeMarkdownManager(config, logger)

    # STEP 1: Initial backlog capture
    logger.info("\n" + "=" * 40)
    logger.info("STEP 1: BACKLOG CAPTURE (Day 1)")
    logger.info("=" * 40)

    backlog_items = create_mock_items(1, 5, "Backlog ")
    logger.info(f"Created {len(backlog_items)} backlog items")

    file1 = manager.save_cumulative(backlog_items, format_mock_markdown)
    logger.info(f"Saved backlog to: {file1.name}")

    stats = manager.get_statistics(file1)
    logger.info(f"Stats after backlog: {stats}")

    # STEP 2: First incremental update (new items)
    logger.info("\n" + "=" * 40)
    logger.info("STEP 2: INCREMENTAL UPDATE - New Items (Day 2)")
    logger.info("=" * 40)

    new_items = create_mock_items(6, 2, "New ")
    logger.info(f"Created {len(new_items)} new items")

    file2 = manager.save_cumulative(new_items, format_mock_markdown)
    logger.info(f"Saved incremental to: {file2.name}")

    stats = manager.get_statistics(file2)
    logger.info(f"Stats after first incremental: {stats}")

    # Verify content: 5 backlog + 2 new items should merge into 7 sections
    content = file2.read_text(encoding='utf-8')
    id_count = content.count('# ID:')
    logger.info(f"Total sections in file: {id_count}")

    # STEP 3: Second incremental with updates
    logger.info("\n" + "=" * 40)
    logger.info("STEP 3: INCREMENTAL UPDATE - With Updates (Day 3)")
    logger.info("=" * 40)

    # Create items with updates (higher view counts) and new items
    updated_items = [
        {
            'id': 'video_1',  # Update existing
            'title': 'Backlog Video Title 1',
            'views': 5000,  # Increased from 1000
            'likes': 500,  # Increased from 100
            'description': 'Updated description with more details and captions',
            'publish_date': '2024-01-15',
            'caption': 'This video now has captions!'  # New field
        },
        {
            'id': 'video_8',  # New item
            'title': 'Brand New Video 8',
            'views': 8000,
            'likes': 800,
            'description': 'Newest video just published',
            'publish_date': '2024-01-18'
        }
    ]

    # Format with caption support
    def format_with_captions(items):
        sections = []
        for item in items:
            section = [
                f"# ID: {item['id']}",
                "",
                f"## Title: {item['title']}",
                "",
                f"## Views: {item['views']:,}",
                "",
                f"## Likes: {item['likes']:,}",
                "",
                "## Description:",
                item['description'],
                ""
            ]

            if 'caption' in item:
                section.extend([
                    "## Caption Status:",
                    item['caption'],
                    ""
                ])

            section.extend([
                f"## Publish Date: {item['publish_date']}",
                "",
                "-" * 50
            ])

            sections.append('\n'.join(section))

        return '\n\n'.join(sections)

    logger.info("Created 1 update + 1 new item")

    file3 = manager.save_cumulative(updated_items, format_with_captions)
    logger.info(f"Saved second incremental to: {file3.name}")

    stats = manager.get_statistics(file3)
    logger.info(f"Stats after second incremental: {stats}")

    # Verify final content: video_1 merged in place, video_8 appended,
    # so the file should now hold 8 sections, one of them with a caption
    final_content = file3.read_text(encoding='utf-8')
    final_id_count = final_content.count('# ID:')
    caption_count = final_content.count('## Caption Status:')

    logger.info(f"Final total sections: {final_id_count}")
    logger.info(f"Sections with captions: {caption_count}")

    # Check if video_1 was updated
    if 'This video now has captions!' in final_content:
        logger.info("✅ Successfully updated video_1 with captions")
    else:
        logger.error("❌ Failed to update video_1")

    # Check if video_8 was added
    if 'video_8' in final_content:
        logger.info("✅ Successfully added new video_8")
    else:
        logger.error("❌ Failed to add video_8")

    # List archive files
    logger.info("\n" + "=" * 40)
    logger.info("ARCHIVED FILES:")
    logger.info("=" * 40)

    archives = []
    archive_dir = Path('test_data/markdown_archives/TestSource')
    if archive_dir.exists():
        archives = list(archive_dir.glob("*.md"))
        for archive in sorted(archives):
            logger.info(f" - {archive.name}")

    logger.info("\n" + "=" * 60)
    logger.info("TEST COMPLETE!")
    logger.info("=" * 60)
    logger.info("Summary:")
    logger.info(" - Started with 5 backlog items")
    logger.info(" - Added 2 new items in first incremental")
    logger.info(" - Updated 1 item + added 1 item in second incremental")
    logger.info(f" - Final file has {final_id_count} total items")
    logger.info(f" - {caption_count} items have captions")
    logger.info(f" - {len(archives)} versions archived")


if __name__ == "__main__":
    test_cumulative_workflow()