#!/usr/bin/env python3
"""
Test the cumulative markdown functionality.

Demonstrates how backlog + incremental updates work together:
a full backlog capture (Day 1), an incremental run that adds new
items (Day 2), and an incremental run that both updates an existing
item and adds a new one (Day 3), all merged into one cumulative
markdown file with prior versions archived.
"""
import sys
from pathlib import Path

# Make the local `src` package importable when run as a script.
sys.path.insert(0, str(Path(__file__).parent))

from src.cumulative_markdown_manager import CumulativeMarkdownManager
from src.base_scraper import ScraperConfig
import logging
from datetime import datetime  # NOTE(review): unused here — confirm before removing
import pytz  # NOTE(review): unused here — confirm before removing

# Set up logging
logging.basicConfig(
    level=logging.INFO,
    format='%(asctime)s - %(name)s - %(levelname)s - %(message)s'
)
logger = logging.getLogger('cumulative_test')


def create_mock_items(start_id: int, count: int, prefix: str = ""):
    """Create mock content items for testing.

    Args:
        start_id: Numeric id of the first item; later items increment it.
        count: Number of items to generate.
        prefix: Optional prefix prepended to each generated title.

    Returns:
        list[dict]: Items with keys id, title, views, likes,
        description, and publish_date. Views/likes scale with the id
        so each item is distinguishable in the rendered markdown.
    """
    items = []
    for offset in range(count):
        n = start_id + offset
        items.append({
            'id': f"video_{n}",
            'title': f"{prefix}Video Title {n}",
            'views': 1000 * n,
            'likes': 100 * n,
            'description': f"Description for video {n}",
            'publish_date': '2024-01-15'
        })
    return items


def format_mock_markdown(items):
    """Format mock items as markdown sections.

    Each item becomes a "# ID:" section with title, comma-grouped
    views/likes, description, and publish date, terminated by a
    50-dash rule. Items carrying an optional 'caption' key get an
    extra "## Caption Status:" section before the publish date
    (items without the key render exactly as before).
    """
    sections = []
    for item in items:
        section = [
            f"# ID: {item['id']}",
            "",
            f"## Title: {item['title']}",
            "",
            f"## Views: {item['views']:,}",
            "",
            f"## Likes: {item['likes']:,}",
            "",
            "## Description:",
            item['description'],
            "",
        ]
        if 'caption' in item:
            section.extend([
                "## Caption Status:",
                item['caption'],
                "",
            ])
        section.extend([
            f"## Publish Date: {item['publish_date']}",
            "",
            "-" * 50,
        ])
        sections.append('\n'.join(section))
    return '\n\n'.join(sections)


def test_cumulative_workflow():
    """Run the complete cumulative workflow end to end.

    Exercises three saves through CumulativeMarkdownManager (backlog,
    new-items incremental, update+new incremental), then verifies the
    final file contents and lists the archived versions.
    """
    logger.info("=" * 60)
    logger.info("TESTING CUMULATIVE MARKDOWN WORKFLOW")
    logger.info("=" * 60)

    # Setup test config
    config = ScraperConfig(
        source_name='TestSource',
        brand_name='testbrand',
        data_dir=Path('test_data'),
        logs_dir=Path('test_logs'),
        timezone='America/Halifax'
    )

    # Clean up any existing test files so each run starts fresh.
    # (glob on a nonexistent directory yields nothing, so this is safe.)
    test_pattern = "testbrand_TestSource_*.md"
    for old_file in Path('test_data/markdown_current').glob(test_pattern):
        old_file.unlink()
        logger.info(f"Cleaned up old test file: {old_file.name}")

    # Initialize manager
    manager = CumulativeMarkdownManager(config, logger)

    # STEP 1: Initial backlog capture
    logger.info("\n" + "=" * 40)
    logger.info("STEP 1: BACKLOG CAPTURE (Day 1)")
    logger.info("=" * 40)

    backlog_items = create_mock_items(1, 5, "Backlog ")
    logger.info(f"Created {len(backlog_items)} backlog items")

    file1 = manager.save_cumulative(backlog_items, format_mock_markdown)
    logger.info(f"Saved backlog to: {file1.name}")

    stats = manager.get_statistics(file1)
    logger.info(f"Stats after backlog: {stats}")

    # STEP 2: First incremental update (new items)
    logger.info("\n" + "=" * 40)
    logger.info("STEP 2: INCREMENTAL UPDATE - New Items (Day 2)")
    logger.info("=" * 40)

    new_items = create_mock_items(6, 2, "New ")
    logger.info(f"Created {len(new_items)} new items")

    file2 = manager.save_cumulative(new_items, format_mock_markdown)
    logger.info(f"Saved incremental to: {file2.name}")

    stats = manager.get_statistics(file2)
    logger.info(f"Stats after first incremental: {stats}")

    # Verify content: every merged item contributes one "# ID:" header.
    content = file2.read_text(encoding='utf-8')
    id_count = content.count('# ID:')
    logger.info(f"Total sections in file: {id_count}")

    # STEP 3: Second incremental with updates
    logger.info("\n" + "=" * 40)
    logger.info("STEP 3: INCREMENTAL UPDATE - With Updates (Day 3)")
    logger.info("=" * 40)

    # Create items with updates (higher view counts) and new items.
    updated_items = [
        {
            'id': 'video_1',  # Update existing
            'title': 'Backlog Video Title 1',
            'views': 5000,  # Increased from 1000
            'likes': 500,   # Increased from 100
            'description': 'Updated description with more details and captions',
            'publish_date': '2024-01-15',
            'caption': 'This video now has captions!'  # New field
        },
        {
            'id': 'video_8',  # New item
            'title': 'Brand New Video 8',
            'views': 8000,
            'likes': 800,
            'description': 'Newest video just published',
            'publish_date': '2024-01-18'
        }
    ]

    logger.info("Created 1 update + 1 new item")
    # format_mock_markdown already renders the optional 'caption' field,
    # so no separate caption-aware formatter is needed.
    file3 = manager.save_cumulative(updated_items, format_mock_markdown)
    logger.info(f"Saved second incremental to: {file3.name}")

    stats = manager.get_statistics(file3)
    logger.info(f"Stats after second incremental: {stats}")

    # Verify final content
    final_content = file3.read_text(encoding='utf-8')
    final_id_count = final_content.count('# ID:')
    caption_count = final_content.count('## Caption Status:')
    logger.info(f"Final total sections: {final_id_count}")
    logger.info(f"Sections with captions: {caption_count}")

    # Check if video_1 was updated (its unique caption text must appear)
    if 'This video now has captions!' in final_content:
        logger.info("✅ Successfully updated video_1 with captions")
    else:
        logger.error("❌ Failed to update video_1")

    # Check if video_8 was added
    if 'video_8' in final_content:
        logger.info("✅ Successfully added new video_8")
    else:
        logger.error("❌ Failed to add video_8")

    # List archive files
    logger.info("\n" + "=" * 40)
    logger.info("ARCHIVED FILES:")
    logger.info("=" * 40)

    archive_dir = Path('test_data/markdown_archives/TestSource')
    archives = []  # defined unconditionally so the summary below is always safe
    if archive_dir.exists():
        archives = list(archive_dir.glob("*.md"))
        for archive in sorted(archives):
            logger.info(f"  - {archive.name}")

    logger.info("\n" + "=" * 60)
    logger.info("TEST COMPLETE!")
    logger.info("=" * 60)
    logger.info("Summary:")
    logger.info(" - Started with 5 backlog items")
    logger.info(" - Added 2 new items in first incremental")
    logger.info(" - Updated 1 item + added 1 item in second incremental")
    logger.info(f" - Final file has {final_id_count} total items")
    logger.info(f" - {caption_count} items have captions")
    logger.info(f" - {len(archives)} versions archived")


if __name__ == "__main__":
    test_cumulative_workflow()