Production Readiness Improvements:
- Fixed scheduling to match spec (8 AM & 12 PM ADT instead of 6 AM/6 PM)
- Enabled NAS synchronization in production runner with error handling
- Fixed file naming convention to spec format (hvacknowitall_combined_YYYY-MM-DD-THHMMSS.md)
- Made systemd services portable (removed hardcoded user/paths)
- Added environment variable validation on startup
- Moved DISPLAY/XAUTHORITY to .env configuration

Systemd Improvements:
- Created template service file (@.service) for any user
- Changed all paths to /opt/hvac-kia-content
- Updated installation script for portable deployment
- Fixed service dependencies and resource limits

Documentation:
- Created comprehensive PRODUCTION_TODO.md with 25 tasks
- Added PRODUCTION_GUIDE.md with deployment instructions
- Documented spec compliance gaps (65% complete)

Remaining work includes retry logic, connection pooling, media downloads, and a pytest test suite as documented in PRODUCTION_TODO.md

🤖 Generated with [Claude Code](https://claude.ai/code)

Co-Authored-By: Claude <noreply@anthropic.com>
79 lines
No EOL
2.6 KiB
Python
Executable file
79 lines
No EOL
2.6 KiB
Python
Executable file
#!/usr/bin/env python3
"""
Capture TikTok backlog with captions

Run-once script: everything below executes at module level. It

1. builds a ScraperConfig pointing at the ``backlog_with_captions`` test
   directories,
2. deletes the scraper's state file (if present) so the run starts clean,
3. calls ``fetch_content`` asking for MAX_POSTS posts with captions for at
   most MAX_CAPTION_FETCHES of them,
4. writes the result of ``format_markdown`` to OUTPUT_FILE, and
5. prints summary statistics and a few sample captions.
"""

from pathlib import Path
import time

from src.base_scraper import ScraperConfig
from src.tiktok_scraper_advanced import TikTokScraperAdvanced

# Tunables, named once so the progress messages and the calls cannot drift.
MAX_POSTS = 35
MAX_CAPTION_FETCHES = 5
SAMPLE_LIMIT = 5          # how many captions to echo at the end
PREVIEW_LENGTH = 80       # characters of a caption shown before truncating

DATA_DIR = Path('test_data/backlog_with_captions')
LOGS_DIR = Path('test_logs/backlog_with_captions')
OUTPUT_FILE = DATA_DIR / 'tiktok_full.md'

# Placeholder text that this script treats as "no caption".
# NOTE(review): presumably what the scraper emits when a caption was not
# fetched - confirm against TikTokScraperAdvanced.
NO_CAPTION_MSG = '(No caption available - fetch individual video for details)'


def _has_caption(item):
    """Return True when *item* has a non-empty caption that is not the placeholder."""
    caption = item.get('caption')
    return bool(caption) and caption != NO_CAPTION_MSG


def _metric(item, key):
    """Return ``item[key]``, treating a missing, None or empty value as 0.

    ``item.get(key, 0)`` alone still yields None when the key is present with
    a None value, which makes ``sum()`` and the ``,`` format spec raise
    TypeError.
    """
    return item.get(key) or 0


print('Starting TikTok backlog capture with captions...')
print('='*60)

config = ScraperConfig(
    source_name='tiktok',
    brand_name='hvacknowitall',
    data_dir=DATA_DIR,
    logs_dir=LOGS_DIR,
    timezone='America/Halifax',
)

scraper = TikTokScraperAdvanced(config)

# Clear state for full backlog
if scraper.state_file.exists():
    scraper.state_file.unlink()
    print('Cleared state for full backlog capture')

print(f'Fetching videos with captions for first {MAX_CAPTION_FETCHES} videos...')
print('Note: This will take approximately 2-3 minutes')

# perf_counter is monotonic, so the reported duration cannot be skewed by a
# wall-clock adjustment during the (minutes-long) fetch.
start = time.perf_counter()

# Fetch MAX_POSTS videos, with captions for the first MAX_CAPTION_FETCHES
items = scraper.fetch_content(
    max_posts=MAX_POSTS,
    fetch_captions=True,
    max_caption_fetches=MAX_CAPTION_FETCHES,
)

elapsed = time.perf_counter() - start
print(f'\n✅ Fetched {len(items)} videos in {elapsed:.1f} seconds')

# Count how many have captions
with_captions = sum(1 for item in items if _has_caption(item))
print(f'✅ Videos with captions: {with_captions}/{len(items)}')

# Save markdown
markdown = scraper.format_markdown(items)
output_file = OUTPUT_FILE
output_file.parent.mkdir(parents=True, exist_ok=True)
output_file.write_text(markdown, encoding='utf-8')
print(f'✅ Saved to {output_file}')

# Show statistics
total_views = sum(_metric(item, 'views') for item in items)
print('\n📊 Statistics:')
print(f' Total videos: {len(items)}')
print(f' Total views: {total_views:,}')
print(f' Videos with captions: {with_captions}')
print(f' Videos with likes data: {sum(1 for item in items if item.get("likes"))}')
print(f' Videos with comments data: {sum(1 for item in items if item.get("comments"))}')

# Show sample of captions
print('\n📝 Sample captions retrieved:')
print('-'*60)
shown = 0
# The printed number is the item's 1-based position in the full result list,
# not a running count of the samples shown.
for position, item in enumerate(items, start=1):
    if not _has_caption(item):
        continue

    caption = item['caption']
    if len(caption) > PREVIEW_LENGTH:
        caption_preview = caption[:PREVIEW_LENGTH] + '...'
    else:
        caption_preview = caption

    views = _metric(item, 'views')
    likes = _metric(item, 'likes')
    print(f'{position}. Views: {views:,} | Likes: {likes:,}')
    print(f' Caption: {caption_preview}')

    shown += 1
    if shown >= SAMPLE_LIMIT:
        break

print('\n✅ Backlog capture complete!')