hvac-kia-content/run_production_cumulative.py
Ben Reed 2edc359b5e feat: Implement comprehensive image downloading and cumulative markdown system
Major Updates:
- Added image downloading for Instagram, YouTube, and Podcast scrapers
- Implemented cumulative markdown system for maintaining single source-of-truth files
- Deployed production services with automatic NAS sync for images
- Standardized file naming conventions per project specification

New Features:
- Instagram: Downloads all post images, carousel images, and video thumbnails
- YouTube: Downloads video thumbnails (highest quality available)
- Podcast: Downloads episode artwork/thumbnails
- Consistent image naming: {source}_{item_id}_{type}.{ext}
- Cumulative markdown updates to prevent file proliferation
- Automatic media sync to NAS at /mnt/nas/hvacknowitall/media/

Production Deployment:
- New systemd services: hvac-content-images-8am and hvac-content-images-12pm
- Runs twice daily at 8 AM and 12 PM Atlantic time
- Comprehensive rsync for both markdown and media files

File Structure Compliance:
- Renamed Instagram backlog to spec-compliant format
- Archived legacy directory structures
- Ensured all new files follow <brandName>_<source>_<dateTime>.md format

Testing:
- Successfully captured Instagram posts 1-1000 with images
- Launched next batch (posts 1001-2000) currently in progress
- Verified thumbnail downloads for YouTube and Podcast content

🤖 Generated with [Claude Code](https://claude.ai/code)

Co-Authored-By: Claude <noreply@anthropic.com>
2025-08-19 12:54:21 -03:00

238 lines
No EOL
7.6 KiB
Python
Raw Blame History

This file contains invisible Unicode characters

This file contains invisible Unicode characters that are indistinguishable to humans but may be processed differently by a computer. If you think that this is intentional, you can safely ignore this warning. Use the Escape button to reveal them.

This file contains Unicode characters that might be confused with other characters. If you think that this is intentional, you can safely ignore this warning. Use the Escape button to reveal them.

#!/usr/bin/env python3
"""
Production script with cumulative markdown and image downloads.
Uses cumulative updates for all sources.
"""
import sys
from pathlib import Path
sys.path.insert(0, str(Path(__file__).parent))
from src.youtube_api_scraper_with_thumbnails import YouTubeAPIScraperWithThumbnails
from src.mailchimp_api_scraper_v2 import MailChimpAPIScraper
from src.instagram_scraper_cumulative import InstagramScraperCumulative
from src.rss_scraper_with_images import RSSScraperPodcastWithImages
from src.wordpress_scraper import WordPressScraper
from src.tiktok_scraper_advanced import TikTokScraperAdvanced
from src.base_scraper import ScraperConfig
from src.cumulative_markdown_manager import CumulativeMarkdownManager
from datetime import datetime
import pytz
import time
import logging
import subprocess
import os
# Set up logging to both a file and the console.
# Create the logs directory first -- logging.FileHandler raises
# FileNotFoundError when the parent directory does not exist, which would
# kill the whole production run before any scraper starts.
Path('logs').mkdir(parents=True, exist_ok=True)
logging.basicConfig(
    level=logging.INFO,
    format='%(asctime)s - %(name)s - %(levelname)s - %(message)s',
    handlers=[
        logging.FileHandler('logs/production_cumulative.log'),
        logging.StreamHandler()
    ]
)
logger = logging.getLogger('production_cumulative')
def get_atlantic_timestamp() -> str:
    """Return the current Atlantic (America/Halifax) time as a filename-safe string.

    Format is ``YYYY-MM-DDTHHMMSS`` -- colons are omitted so the value can be
    embedded directly in file names per the
    ``<brandName>_<source>_<dateTime>.md`` naming convention.
    """
    atlantic = pytz.timezone('America/Halifax')
    now = datetime.now(atlantic)
    return now.strftime('%Y-%m-%dT%H%M%S')
def run_instagram_incremental():
    """Fetch up to 50 new Instagram posts and merge them into the cumulative markdown.

    Returns a ``(success, item_count, output_file)`` tuple; ``(False, 0, None)``
    when Instagram is not configured or the scraper raises.
    """
    logger.info("=" * 60)
    logger.info("INSTAGRAM INCREMENTAL UPDATE (CUMULATIVE)")
    logger.info("=" * 60)

    # Skip entirely when credentials are absent.
    if not os.getenv('INSTAGRAM_USERNAME'):
        logger.warning("Instagram not configured")
        return False, 0, None

    # NOTE(review): brand_name 'hvacnkowitall' looks like a transposition of
    # 'hvacknowitall' (compare the NAS path) -- confirm before renaming, since
    # existing cumulative files already carry this spelling.
    config = ScraperConfig(
        source_name='Instagram',
        brand_name='hvacnkowitall',
        data_dir=Path('data'),
        logs_dir=Path('logs'),
        timezone='America/Halifax',
    )

    try:
        # Look for up to 50 new posts per run.
        return InstagramScraperCumulative(config).run_incremental(max_posts=50)
    except Exception as e:
        logger.error(f"Instagram error: {e}")
        return False, 0, None
def run_youtube_incremental():
    """Fetch up to 20 new YouTube videos (with thumbnails) and update the cumulative file.

    Returns a ``(success, item_count, output_file)`` tuple; ``(False, 0, None)``
    when nothing new was found or the scraper raised.
    """
    logger.info("=" * 60)
    logger.info("YOUTUBE INCREMENTAL UPDATE")
    logger.info("=" * 60)

    config = ScraperConfig(
        source_name='YouTube',
        brand_name='hvacnkowitall',
        data_dir=Path('data'),
        logs_dir=Path('logs'),
        timezone='America/Halifax',
    )

    try:
        scraper = YouTubeAPIScraperWithThumbnails(config)
        videos = scraper.fetch_content(max_posts=20)  # check the 20 newest videos
        if not videos:
            logger.info("No new YouTube videos")
            return False, 0, None

        output_file = CumulativeMarkdownManager(config).update_cumulative_file(
            videos, 'YouTube'
        )
        with_thumbs = len([v for v in videos if v.get('local_thumbnail')])
        logger.info(f"✅ YouTube: {len(videos)} videos, {with_thumbs} thumbnails")
        return True, len(videos), output_file
    except Exception as e:
        logger.error(f"YouTube error: {e}")
        return False, 0, None
def run_podcast_incremental():
    """Fetch up to 10 new podcast episodes (with artwork) and update the cumulative file.

    Returns a ``(success, item_count, output_file)`` tuple; ``(False, 0, None)``
    when the podcast feed is unconfigured, nothing new was found, or the
    scraper raised.
    """
    logger.info("=" * 60)
    logger.info("PODCAST INCREMENTAL UPDATE")
    logger.info("=" * 60)

    # Skip entirely when no RSS feed is configured.
    if not os.getenv('PODCAST_RSS_URL'):
        logger.warning("Podcast not configured")
        return False, 0, None

    config = ScraperConfig(
        source_name='Podcast',
        brand_name='hvacnkowitall',
        data_dir=Path('data'),
        logs_dir=Path('logs'),
        timezone='America/Halifax',
    )

    try:
        scraper = RSSScraperPodcastWithImages(config)
        items = scraper.fetch_content(max_items=10)  # check the 10 newest episodes
        if not items:
            logger.info("No new podcast episodes")
            return False, 0, None

        output_file = CumulativeMarkdownManager(config).update_cumulative_file(
            items, 'Podcast'
        )
        with_thumbs = len([ep for ep in items if ep.get('local_thumbnail')])
        logger.info(f"✅ Podcast: {len(items)} episodes, {with_thumbs} thumbnails")
        return True, len(items), output_file
    except Exception as e:
        logger.error(f"Podcast error: {e}")
        return False, 0, None
def sync_to_nas_with_images():
    """Mirror markdown and image files to the NAS via rsync.

    Copies:
      * ``data/markdown_current/*.md`` -> ``/mnt/nas/hvacknowitall/markdown_current/``
      * ``data/media/**/*.{jpg,jpeg,png,gif}`` -> ``/mnt/nas/hvacknowitall/media/``

    Best-effort by design: rsync failures are logged as warnings and any
    unexpected exception is caught and logged so a sync problem never aborts
    the production run.
    """
    logger.info("\n" + "=" * 60)
    logger.info("SYNCING TO NAS - MARKDOWN AND IMAGES")
    logger.info("=" * 60)

    nas_base = Path('/mnt/nas/hvacknowitall')

    try:
        # --- Markdown files ---
        local_current = Path('data/markdown_current')
        nas_current = nas_base / 'markdown_current'
        if local_current.exists() and any(local_current.glob('*.md')):
            nas_current.mkdir(parents=True, exist_ok=True)
            cmd = ['rsync', '-av', '--include=*.md', '--exclude=*',
                   str(local_current) + '/', str(nas_current) + '/']
            result = subprocess.run(cmd, capture_output=True, text=True)
            if result.returncode == 0:
                logger.info("✅ Markdown files synced to NAS")
            else:
                logger.warning(f"Markdown sync warning: {result.stderr}")

        # --- Media files ---
        local_media = Path('data/media')
        nas_media = nas_base / 'media'
        if local_media.exists():
            nas_media.mkdir(parents=True, exist_ok=True)
            cmd = ['rsync', '-av',
                   '--include=*/',  # descend into subdirectories
                   '--include=*.jpg', '--include=*.jpeg',
                   '--include=*.png', '--include=*.gif',
                   '--exclude=*',
                   str(local_media) + '/', str(nas_media) + '/']
            result = subprocess.run(cmd, capture_output=True, text=True)
            if result.returncode == 0:
                logger.info("✅ Media files synced to NAS")
            else:
                # Bug fix: this branch previously swallowed rsync failures
                # silently; report them like the markdown branch does.
                logger.warning(f"Media sync warning: {result.stderr}")
    except Exception as e:
        logger.error(f"Failed to sync to NAS: {e}")
def main():
    """Run all incremental scrapes, sync results to the NAS, and log a summary."""
    logger.info("=" * 70)
    logger.info("HVAC KNOW IT ALL - CUMULATIVE PRODUCTION")
    logger.info("With Image Downloads and Cumulative Markdown")
    logger.info("=" * 70)

    atlantic_tz = pytz.timezone('America/Halifax')
    start_time = datetime.now(atlantic_tz)
    logger.info(f"Started at: {start_time.isoformat()}")

    # Run each source in turn, pausing briefly between sources to be polite
    # to the upstream APIs.
    runners = [
        ('Instagram', run_instagram_incremental),
        ('YouTube', run_youtube_incremental),
        ('Podcast', run_podcast_incremental),
    ]
    results = {}
    for position, (source, runner) in enumerate(runners):
        if position:
            time.sleep(2)
        success, count, file = runner()
        results[source] = {'success': success, 'count': count, 'file': file}

    # MailChimp already has cumulative support; WordPress and TikTok can be
    # added to the runner list as needed.

    # Sync to NAS
    sync_to_nas_with_images()

    # Summary
    logger.info("\n" + "=" * 60)
    logger.info("PRODUCTION SUMMARY")
    logger.info("=" * 60)
    for source, result in results.items():
        if result['success']:
            logger.info(f"{source}: {result['count']} items")
        else:
            logger.info(f" {source}: No new items")
    logger.info("=" * 60)
# Entry point: run the full production pipeline when executed as a script.
if __name__ == "__main__":
    main()