# hvac-kia-content/capture_tiktok_backlog.py

#!/usr/bin/env python3
"""
Capture TikTok backlog with captions
"""
from src.base_scraper import ScraperConfig
from src.tiktok_scraper_advanced import TikTokScraperAdvanced
from pathlib import Path
import time

# Announce the run, followed by a 60-character rule.
print('Starting TikTok backlog capture with captions...')
print('=' * 60)

# Settings for the scraper: source/brand labels, data and log directories,
# and the timezone name.
scraper_settings = {
    'source_name': 'tiktok',
    'brand_name': 'hvacknowitall',
    'data_dir': Path('test_data/backlog_with_captions'),
    'logs_dir': Path('test_logs/backlog_with_captions'),
    'timezone': 'America/Halifax',
}
config = ScraperConfig(**scraper_settings)

scraper = TikTokScraperAdvanced(config)

# Clear any existing state file so the full backlog is captured.
state_file = scraper.state_file
if state_file.exists():
    state_file.unlink()
    print('Cleared state for full backlog capture')

print('Fetching videos with captions for first 5 videos...')
print('Note: This will take approximately 2-3 minutes')

# Time the fetch with a monotonic clock. time.time() follows the system wall
# clock, which can be adjusted mid-run and skew (or even negate) the duration.
start = time.perf_counter()

# Fetch 35 videos, with captions for the first 5.
items = scraper.fetch_content(
    max_posts=35,
    fetch_captions=True,
    max_caption_fetches=5,  # Get captions for 5 videos
)

elapsed = time.perf_counter() - start
print(f'\n✅ Fetched {len(items)} videos in {elapsed:.1f} seconds')

# Tally the videos that carry a real caption: a truthy value that is not the
# placeholder text below.
no_caption_msg = '(No caption available - fetch individual video for details)'
captions = [video.get('caption') for video in items]
with_captions = sum(1 for text in captions if text and text != no_caption_msg)
print(f'✅ Videos with captions: {with_captions}/{len(items)}')

# Render the fetched items to markdown and write the result to disk as UTF-8,
# creating the output directory first if it does not exist yet.
output_file = Path('test_data/backlog_with_captions/tiktok_full.md')
markdown = scraper.format_markdown(items)
output_file.parent.mkdir(parents=True, exist_ok=True)
with output_file.open('w', encoding='utf-8') as handle:
    handle.write(markdown)
print(f'✅ Saved to {output_file}')

# Show statistics
# `or 0` covers a 'views' key that is present but None: dict.get() only falls
# back to its default when the key is missing, and sum() cannot add None.
total_views = sum(item.get('views') or 0 for item in items)

# Likes/comments are counted by truthiness, so a value of 0 (or a missing
# key) is reported as "no data" for that video.
likes_count = sum(1 for item in items if item.get('likes'))
comments_count = sum(1 for item in items if item.get('comments'))

print('\n📊 Statistics:')
print(f'  Total videos: {len(items)}')
print(f'  Total views: {total_views:,}')
print(f'  Videos with captions: {with_captions}')
print(f'  Videos with likes data: {likes_count}')
print(f'  Videos with comments data: {comments_count}')

# Show a sample of the captions that were actually retrieved.
print('\n📝 Sample captions retrieved:')
print('-' * 60)

max_samples = 5     # stop after this many captioned videos have been shown
preview_length = 80  # longer captions are truncated and suffixed with '...'

count = 0
# The printed number is the video's 1-based position in `items`, not a running
# sample counter, so gaps appear where videos without captions were skipped.
for position, item in enumerate(items, start=1):
    caption = item.get('caption', '')
    if not caption or caption == no_caption_msg:
        continue

    if len(caption) > preview_length:
        caption_preview = caption[:preview_length] + '...'
    else:
        caption_preview = caption

    # `or 0` guards against keys that are present but None; the ',' format
    # spec raises TypeError on None.
    views = item.get('views') or 0
    likes = item.get('likes') or 0
    print(f'{position}. Views: {views:,} | Likes: {likes:,}')
    print(f'   Caption: {caption_preview}')

    count += 1
    if count >= max_samples:
        break

print('\n✅ Backlog capture complete!')