Major improvements:
- Add CumulativeMarkdownManager for intelligent content merging
- Implement YouTube Data API v3 integration with caption support
- Add MailChimp API integration with content cleaning
- Create single source-of-truth files that grow with updates
- Smart merging: updates existing entries with better data (see the sketch below)
- Properly combines backlog + incremental daily updates

Features:
- 179/444 YouTube videos now have captions (40.3%)
- MailChimp content cleaned of headers/footers
- All sources consolidated to single files
- Archive management with timestamped versions
- Test suite and documentation included

🤖 Generated with [Claude Code](https://claude.ai/code)

Co-Authored-By: Claude <noreply@anthropic.com>
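The merge behavior called out above can be pictured with a minimal sketch, assuming entries are keyed by item ID. `merge_by_id` below is a hypothetical helper for illustration only, not the actual CumulativeMarkdownManager internals:

```python
# Illustrative only: a hypothetical merge_by_id, not the real
# CumulativeMarkdownManager implementation.
def merge_by_id(existing: dict, incoming: list) -> dict:
    """Update entries with matching IDs in place; append unseen IDs."""
    for item in incoming:
        existing[item['id']] = {**existing.get(item['id'], {}), **item}
    return existing

current = {'video_1': {'id': 'video_1', 'views': 1000}}
merge_by_id(current, [
    {'id': 'video_1', 'views': 5000},  # existing entry: updated with better data
    {'id': 'video_8', 'views': 8000},  # new entry: appended
])
assert current['video_1']['views'] == 5000 and 'video_8' in current
```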
#!/usr/bin/env python3
"""
Test the cumulative markdown functionality.

Demonstrates how backlog + incremental updates work together.
"""

import logging
import sys
from pathlib import Path

# Make the local src package importable when this script is run directly.
sys.path.insert(0, str(Path(__file__).parent))

from src.base_scraper import ScraperConfig
from src.cumulative_markdown_manager import CumulativeMarkdownManager

# Set up logging
logging.basicConfig(
    level=logging.INFO,
    format='%(asctime)s - %(name)s - %(levelname)s - %(message)s'
)
logger = logging.getLogger('cumulative_test')


def create_mock_items(start_id: int, count: int, prefix: str = ""):
    """Create mock content items for testing."""
    items = []
    for i in range(count):
        item_id = f"video_{start_id + i}"
        items.append({
            'id': item_id,
            'title': f"{prefix}Video Title {start_id + i}",
            'views': 1000 * (start_id + i),
            'likes': 100 * (start_id + i),
            'description': f"Description for video {start_id + i}",
            'publish_date': '2024-01-15'
        })
    return items
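
# For reference, create_mock_items(1, 2, "Backlog ") produces items like:
#   {'id': 'video_1', 'title': 'Backlog Video Title 1', 'views': 1000,
#    'likes': 100, 'description': 'Description for video 1',
#    'publish_date': '2024-01-15'}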


def format_mock_markdown(items):
    """Format mock items as markdown."""
    sections = []
    for item in items:
        section = [
            f"# ID: {item['id']}",
            "",
            f"## Title: {item['title']}",
            "",
            f"## Views: {item['views']:,}",
            "",
            f"## Likes: {item['likes']:,}",
            "",
            "## Description:",
            item['description'],
            "",
            f"## Publish Date: {item['publish_date']}",
            "",
            "-" * 50
        ]
        sections.append('\n'.join(section))

    return '\n\n'.join(sections)
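
# A single formatted section looks roughly like this (blank separator lines
# omitted for brevity):
#   # ID: video_1
#   ## Title: Backlog Video Title 1
#   ## Views: 1,000
#   ## Likes: 100
#   ## Description:
#   Description for video 1
#   ## Publish Date: 2024-01-15
#   --------------------------------------------------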


def test_cumulative_workflow():
    """Test the complete cumulative workflow."""
    logger.info("=" * 60)
    logger.info("TESTING CUMULATIVE MARKDOWN WORKFLOW")
    logger.info("=" * 60)

    # Set up test config
    config = ScraperConfig(
        source_name='TestSource',
        brand_name='testbrand',
        data_dir=Path('test_data'),
        logs_dir=Path('test_logs'),
        timezone='America/Halifax'
    )

    # Clean up any existing test files
    test_pattern = "testbrand_TestSource_*.md"
    for old_file in Path('test_data/markdown_current').glob(test_pattern):
        old_file.unlink()
        logger.info(f"Cleaned up old test file: {old_file.name}")

    # Initialize manager
    manager = CumulativeMarkdownManager(config, logger)

    # STEP 1: Initial backlog capture
    logger.info("\n" + "=" * 40)
    logger.info("STEP 1: BACKLOG CAPTURE (Day 1)")
    logger.info("=" * 40)

    backlog_items = create_mock_items(1, 5, "Backlog ")
    logger.info(f"Created {len(backlog_items)} backlog items")

    file1 = manager.save_cumulative(backlog_items, format_mock_markdown)
    logger.info(f"Saved backlog to: {file1.name}")

    stats = manager.get_statistics(file1)
    logger.info(f"Stats after backlog: {stats}")

    # STEP 2: First incremental update (new items)
    logger.info("\n" + "=" * 40)
    logger.info("STEP 2: INCREMENTAL UPDATE - New Items (Day 2)")
    logger.info("=" * 40)

    new_items = create_mock_items(6, 2, "New ")
    logger.info(f"Created {len(new_items)} new items")

    file2 = manager.save_cumulative(new_items, format_mock_markdown)
    logger.info(f"Saved incremental to: {file2.name}")

    stats = manager.get_statistics(file2)
    logger.info(f"Stats after first incremental: {stats}")

    # Verify content: 5 backlog + 2 new items should merge into 7 sections
    content = file2.read_text(encoding='utf-8')
    id_count = content.count('# ID:')
    logger.info(f"Total sections in file: {id_count}")

    # STEP 3: Second incremental with updates
    logger.info("\n" + "=" * 40)
    logger.info("STEP 3: INCREMENTAL UPDATE - With Updates (Day 3)")
    logger.info("=" * 40)

    # Create items with updates (higher view counts) and new items
    updated_items = [
        {
            'id': 'video_1',  # Update existing
            'title': 'Backlog Video Title 1',
            'views': 5000,  # Increased from 1000
            'likes': 500,  # Increased from 100
            'description': 'Updated description with more details and captions',
            'publish_date': '2024-01-15',
            'caption': 'This video now has captions!'  # New field
        },
        {
            'id': 'video_8',  # New item
            'title': 'Brand New Video 8',
            'views': 8000,
            'likes': 800,
            'description': 'Newest video just published',
            'publish_date': '2024-01-18'
        }
    ]

    # Format with caption support
    def format_with_captions(items):
        sections = []
        for item in items:
            section = [
                f"# ID: {item['id']}",
                "",
                f"## Title: {item['title']}",
                "",
                f"## Views: {item['views']:,}",
                "",
                f"## Likes: {item['likes']:,}",
                "",
                "## Description:",
                item['description'],
                ""
            ]

            if 'caption' in item:
                section.extend([
                    "## Caption Status:",
                    item['caption'],
                    ""
                ])

            section.extend([
                f"## Publish Date: {item['publish_date']}",
                "",
                "-" * 50
            ])

            sections.append('\n'.join(section))

        return '\n\n'.join(sections)

    logger.info("Created 1 update + 1 new item")

    file3 = manager.save_cumulative(updated_items, format_with_captions)
    logger.info(f"Saved second incremental to: {file3.name}")

    stats = manager.get_statistics(file3)
    logger.info(f"Stats after second incremental: {stats}")

    # Verify final content: video_1 merged in place, video_8 appended,
    # so the file should now hold 8 sections, one of them with a caption
    final_content = file3.read_text(encoding='utf-8')
    final_id_count = final_content.count('# ID:')
    caption_count = final_content.count('## Caption Status:')

    logger.info(f"Final total sections: {final_id_count}")
    logger.info(f"Sections with captions: {caption_count}")

    # Check if video_1 was updated
    if 'This video now has captions!' in final_content:
        logger.info("✅ Successfully updated video_1 with captions")
    else:
        logger.error("❌ Failed to update video_1")

    # Check if video_8 was added
    if 'video_8' in final_content:
        logger.info("✅ Successfully added new video_8")
    else:
        logger.error("❌ Failed to add video_8")

    # List archive files
    logger.info("\n" + "=" * 40)
    logger.info("ARCHIVED FILES:")
    logger.info("=" * 40)

    archives = []
    archive_dir = Path('test_data/markdown_archives/TestSource')
    if archive_dir.exists():
        archives = list(archive_dir.glob("*.md"))
        for archive in sorted(archives):
            logger.info(f" - {archive.name}")

    logger.info("\n" + "=" * 60)
    logger.info("TEST COMPLETE!")
    logger.info("=" * 60)
    logger.info("Summary:")
    logger.info(" - Started with 5 backlog items")
    logger.info(" - Added 2 new items in first incremental")
    logger.info(" - Updated 1 item + added 1 item in second incremental")
    logger.info(f" - Final file has {final_id_count} total items")
    logger.info(f" - {caption_count} items have captions")
    logger.info(f" - {len(archives)} versions archived")


if __name__ == "__main__":
    test_cumulative_workflow()