#!/usr/bin/env python3
"""Capture the TikTok backlog, fetching captions for the first few videos.

When run as a script this clears any saved scraper state, fetches up to
``MAX_POSTS`` videos (requesting captions for the first
``MAX_CAPTION_FETCHES`` of them), writes the formatted markdown to
``DATA_DIR / 'tiktok_full.md'`` and prints summary statistics followed by a
short sample of the captions that were retrieved.
"""
import time
from pathlib import Path

from src.base_scraper import ScraperConfig
from src.tiktok_scraper_advanced import TikTokScraperAdvanced

DATA_DIR = Path('test_data/backlog_with_captions')
LOGS_DIR = Path('test_logs/backlog_with_captions')

MAX_POSTS = 35               # videos requested from the scraper
MAX_CAPTION_FETCHES = 5      # videos for which captions are requested
MAX_SAMPLE_CAPTIONS = 5      # captions echoed in the final sample listing
PREVIEW_LENGTH = 80          # characters of a caption shown before '...'

# A caption equal to this text is not counted as a real caption.
# NOTE(review): presumably the placeholder the scraper stores when it did not
# fetch the individual video - confirm against TikTokScraperAdvanced.
NO_CAPTION_MSG = '(No caption available - fetch individual video for details)'


def _has_caption(item: dict) -> bool:
    """Return True if *item* has a non-empty caption other than the placeholder."""
    caption = item.get('caption')
    return bool(caption) and caption != NO_CAPTION_MSG


def _metric(item: dict, key: str) -> int:
    """Return ``item[key]``, treating a missing, ``None`` or empty value as 0."""
    # ``item.get(key, 0)`` alone would still yield None for an explicit None
    # value, which breaks both ``sum()`` and the ``:,`` format specifier.
    return item.get(key) or 0


def _preview(caption: str) -> str:
    """Return *caption* cut to ``PREVIEW_LENGTH`` characters plus '...' if longer."""
    if len(caption) > PREVIEW_LENGTH:
        return caption[:PREVIEW_LENGTH] + '...'
    return caption


def main() -> None:
    """Run the backlog capture, save the markdown and print a report."""
    print('Starting TikTok backlog capture with captions...')
    print('=' * 60)

    config = ScraperConfig(
        source_name='tiktok',
        brand_name='hvacknowitall',
        data_dir=DATA_DIR,
        logs_dir=LOGS_DIR,
        timezone='America/Halifax',
    )
    scraper = TikTokScraperAdvanced(config)

    # Remove the saved state file so the whole backlog is fetched again.
    if scraper.state_file.exists():
        scraper.state_file.unlink()
        print('Cleared state for full backlog capture')

    print(f'Fetching videos with captions for first {MAX_CAPTION_FETCHES} videos...')
    print('Note: This will take approximately 2-3 minutes')

    # perf_counter is monotonic, so the reported duration cannot be skewed by
    # system clock adjustments during the multi-minute fetch.
    start = time.perf_counter()
    items = scraper.fetch_content(
        max_posts=MAX_POSTS,
        fetch_captions=True,
        max_caption_fetches=MAX_CAPTION_FETCHES,
    )
    elapsed = time.perf_counter() - start

    print(f'\n✅ Fetched {len(items)} videos in {elapsed:.1f} seconds')

    # Count how many items carry a real caption.
    with_captions = sum(1 for item in items if _has_caption(item))
    print(f'✅ Videos with captions: {with_captions}/{len(items)}')

    # Save markdown
    markdown = scraper.format_markdown(items)
    output_file = DATA_DIR / 'tiktok_full.md'
    output_file.parent.mkdir(parents=True, exist_ok=True)
    output_file.write_text(markdown, encoding='utf-8')
    print(f'✅ Saved to {output_file}')

    # Show statistics
    total_views = sum(_metric(item, 'views') for item in items)
    with_likes = sum(1 for item in items if item.get('likes'))
    with_comments = sum(1 for item in items if item.get('comments'))
    print('\n📊 Statistics:')
    print(f' Total videos: {len(items)}')
    print(f' Total views: {total_views:,}')
    print(f' Videos with captions: {with_captions}')
    print(f' Videos with likes data: {with_likes}')
    print(f' Videos with comments data: {with_comments}')

    # Show a sample of captions, numbered by each video's position in `items`.
    print('\n📝 Sample captions retrieved:')
    print('-' * 60)
    shown = 0
    for position, item in enumerate(items, start=1):
        if not _has_caption(item):
            continue
        views = _metric(item, 'views')
        likes = _metric(item, 'likes')
        print(f'{position}. Views: {views:,} | Likes: {likes:,}')
        print(f' Caption: {_preview(item["caption"])}')
        shown += 1
        if shown >= MAX_SAMPLE_CAPTIONS:
            break

    print('\n✅ Backlog capture complete!')


if __name__ == '__main__':
    main()