hvac-kia-content/test_cumulative_mode.py
Ben Reed 8ceb858026 Implement cumulative markdown system and API integrations
Major improvements:
- Add CumulativeMarkdownManager for intelligent content merging
- Implement YouTube Data API v3 integration with caption support
- Add MailChimp API integration with content cleaning
- Create single source-of-truth files that grow with updates
- Smart merging: updates existing entries with better data (see the sketch after this commit message)
- Properly combines backlog + incremental daily updates

Features:
- 179/444 YouTube videos now have captions (40.3%)
- MailChimp content cleaned of headers/footers
- All sources consolidated to single files
- Archive management with timestamped versions
- Test suite and documentation included

🤖 Generated with [Claude Code](https://claude.ai/code)

Co-Authored-By: Claude <noreply@anthropic.com>
2025-08-19 10:53:40 -03:00
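
The merge logic and archive step of CumulativeMarkdownManager are not shown in this file; a minimal sketch of the behaviour the commit message describes (merge by item ID, prefer fresher non-empty data, keep timestamped archives) might look like the following. merge_items, archive_current, and the prefer-non-empty rule are illustrative assumptions, not the manager's actual code.

import shutil
from datetime import datetime
from pathlib import Path

def merge_items(existing: dict[str, dict], incoming: list[dict]) -> dict[str, dict]:
    """Merge incoming items into the existing map, keyed by item ID."""
    for item in incoming:
        current = existing.get(item['id'])
        if current is None:
            existing[item['id']] = item  # brand-new entry
        else:
            # Update the existing entry, preferring non-empty incoming values
            current.update({k: v for k, v in item.items() if v not in (None, '')})
    return existing

def archive_current(current_file: Path, archive_dir: Path) -> Path:
    """Copy the current cumulative file into the archive with a timestamp suffix."""
    archive_dir.mkdir(parents=True, exist_ok=True)
    stamp = datetime.now().strftime('%Y%m%d_%H%M%S')
    dest = archive_dir / f"{current_file.stem}_{stamp}{current_file.suffix}"
    shutil.copy2(current_file, dest)
    return dest

Keying on ID makes reruns idempotent: re-scraping the same item updates its entry in place instead of appending a duplicate.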


#!/usr/bin/env python3
"""
Test the cumulative markdown functionality.
Demonstrates how backlog + incremental updates work together.
"""
import logging
import sys
from pathlib import Path

# Make the project root importable so the src package resolves
sys.path.insert(0, str(Path(__file__).parent))

from src.cumulative_markdown_manager import CumulativeMarkdownManager
from src.base_scraper import ScraperConfig

# Set up logging
logging.basicConfig(
    level=logging.INFO,
    format='%(asctime)s - %(name)s - %(levelname)s - %(message)s'
)
logger = logging.getLogger('cumulative_test')


def create_mock_items(start_id: int, count: int, prefix: str = ""):
    """Create mock content items for testing."""
    items = []
    for i in range(count):
        item_id = f"video_{start_id + i}"
        items.append({
            'id': item_id,
            'title': f"{prefix}Video Title {start_id + i}",
            'views': 1000 * (start_id + i),
            'likes': 100 * (start_id + i),
            'description': f"Description for video {start_id + i}",
            'publish_date': '2024-01-15'
        })
    return items


def format_mock_markdown(items):
    """Format mock items as markdown."""
    sections = []
    for item in items:
        section = [
            f"# ID: {item['id']}",
            "",
            f"## Title: {item['title']}",
            "",
            f"## Views: {item['views']:,}",
            "",
            f"## Likes: {item['likes']:,}",
            "",
            "## Description:",
            item['description'],
            "",
            f"## Publish Date: {item['publish_date']}",
            "",
            "-" * 50
        ]
        sections.append('\n'.join(section))
    return '\n\n'.join(sections)


def test_cumulative_workflow():
    """Test the complete cumulative workflow."""
    logger.info("=" * 60)
    logger.info("TESTING CUMULATIVE MARKDOWN WORKFLOW")
    logger.info("=" * 60)

    # Setup test config
    config = ScraperConfig(
        source_name='TestSource',
        brand_name='testbrand',
        data_dir=Path('test_data'),
        logs_dir=Path('test_logs'),
        timezone='America/Halifax'
    )

    # Clean up any existing test files
    test_pattern = "testbrand_TestSource_*.md"
    for old_file in Path('test_data/markdown_current').glob(test_pattern):
        old_file.unlink()
        logger.info(f"Cleaned up old test file: {old_file.name}")

    # Initialize manager
    manager = CumulativeMarkdownManager(config, logger)
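
    # NOTE: save_cumulative(items, formatter) is assumed here to merge the
    # given items (keyed by 'id') into the current cumulative file, archive
    # the prior version, and return the Path of the updated file, and
    # get_statistics(path) is assumed to return a summary dict; see
    # CumulativeMarkdownManager for the actual contract.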

    # STEP 1: Initial backlog capture
    logger.info("\n" + "=" * 40)
    logger.info("STEP 1: BACKLOG CAPTURE (Day 1)")
    logger.info("=" * 40)

    backlog_items = create_mock_items(1, 5, "Backlog ")
    logger.info(f"Created {len(backlog_items)} backlog items")

    file1 = manager.save_cumulative(backlog_items, format_mock_markdown)
    logger.info(f"Saved backlog to: {file1.name}")

    stats = manager.get_statistics(file1)
    logger.info(f"Stats after backlog: {stats}")

    # STEP 2: First incremental update (new items)
    logger.info("\n" + "=" * 40)
    logger.info("STEP 2: INCREMENTAL UPDATE - New Items (Day 2)")
    logger.info("=" * 40)

    new_items = create_mock_items(6, 2, "New ")
    logger.info(f"Created {len(new_items)} new items")

    file2 = manager.save_cumulative(new_items, format_mock_markdown)
    logger.info(f"Saved incremental to: {file2.name}")

    stats = manager.get_statistics(file2)
    logger.info(f"Stats after first incremental: {stats}")

    # Verify content
    content = file2.read_text(encoding='utf-8')
    id_count = content.count('# ID:')
    logger.info(f"Total sections in file: {id_count}")

    # STEP 3: Second incremental with updates
    logger.info("\n" + "=" * 40)
    logger.info("STEP 3: INCREMENTAL UPDATE - With Updates (Day 3)")
    logger.info("=" * 40)

    # Create items with updates (higher view counts) and new items
    updated_items = [
        {
            'id': 'video_1',  # Update existing
            'title': 'Backlog Video Title 1',
            'views': 5000,  # Increased from 1000
            'likes': 500,  # Increased from 100
            'description': 'Updated description with more details and captions',
            'publish_date': '2024-01-15',
            'caption': 'This video now has captions!'  # New field
        },
        {
            'id': 'video_8',  # New item
            'title': 'Brand New Video 8',
            'views': 8000,
            'likes': 800,
            'description': 'Newest video just published',
            'publish_date': '2024-01-18'
        }
    ]

    # Format with caption support
    def format_with_captions(items):
        sections = []
        for item in items:
            section = [
                f"# ID: {item['id']}",
                "",
                f"## Title: {item['title']}",
                "",
                f"## Views: {item['views']:,}",
                "",
                f"## Likes: {item['likes']:,}",
                "",
                "## Description:",
                item['description'],
                ""
            ]
            if 'caption' in item:
                section.extend([
                    "## Caption Status:",
                    item['caption'],
                    ""
                ])
            section.extend([
                f"## Publish Date: {item['publish_date']}",
                "",
                "-" * 50
            ])
            sections.append('\n'.join(section))
        return '\n\n'.join(sections)
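
    # NOTE: it is assumed that save_cumulative re-renders merged entries with
    # whichever formatter is passed on this call, so the caption-aware
    # formatter above takes effect for updated items; the manager's actual
    # re-render behaviour is not shown in this file.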
logger.info(f"Created 1 update + 1 new item")
file3 = manager.save_cumulative(updated_items, format_with_captions)
logger.info(f"Saved second incremental to: {file3.name}")
stats = manager.get_statistics(file3)
logger.info(f"Stats after second incremental: {stats}")
# Verify final content
final_content = file3.read_text(encoding='utf-8')
final_id_count = final_content.count('# ID:')
caption_count = final_content.count('## Caption Status:')
logger.info(f"Final total sections: {final_id_count}")
logger.info(f"Sections with captions: {caption_count}")
# Check if video_1 was updated
if 'This video now has captions!' in final_content:
logger.info("✅ Successfully updated video_1 with captions")
else:
logger.error("❌ Failed to update video_1")
# Check if video_8 was added
if 'video_8' in final_content:
logger.info("✅ Successfully added new video_8")
else:
logger.error("❌ Failed to add video_8")

    # List archive files
    logger.info("\n" + "=" * 40)
    logger.info("ARCHIVED FILES:")
    logger.info("=" * 40)

    archives = []  # default so the summary below is safe if nothing was archived
    archive_dir = Path('test_data/markdown_archives/TestSource')
    if archive_dir.exists():
        archives = list(archive_dir.glob("*.md"))
        for archive in sorted(archives):
            logger.info(f" - {archive.name}")

    logger.info("\n" + "=" * 60)
    logger.info("TEST COMPLETE!")
    logger.info("=" * 60)
    logger.info("Summary:")
    logger.info(" - Started with 5 backlog items")
    logger.info(" - Added 2 new items in first incremental")
    logger.info(" - Updated 1 item + added 1 item in second incremental")
    logger.info(f" - Final file has {final_id_count} total items")
    logger.info(f" - {caption_count} items have captions")
    logger.info(f" - {len(archives)} versions archived")


if __name__ == "__main__":
    test_cumulative_workflow()