hvac-kia-content/run_production_with_images.py
Ben Reed 2edc359b5e feat: Implement comprehensive image downloading and cumulative markdown system
Major Updates:
- Added image downloading for Instagram, YouTube, and Podcast scrapers
- Implemented cumulative markdown system for maintaining single source-of-truth files (sketched after this list)
- Deployed production services with automatic NAS sync for images
- Standardized file naming conventions per project specification
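
A rough sketch of the idea behind the cumulative files. CumulativeMarkdownManager's real logic is not shown in this commit; the item keys and the dedupe-by-id rule below are assumptions for illustration:

    from pathlib import Path

    def update_cumulative_file(path: Path, items: list[dict]) -> Path:
        """Append only items whose id is not already in the file (assumed dedupe rule)."""
        existing = path.read_text(encoding='utf-8') if path.exists() else ''
        fresh = [f"## {it['id']}\n\n{it['content']}\n" for it in items
                 if it['id'] not in existing]
        if fresh:
            path.write_text(existing + '\n'.join(fresh), encoding='utf-8')
        return path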

New Features:
- Instagram: Downloads all post images, carousel images, and video thumbnails
- YouTube: Downloads video thumbnails (highest quality available)
- Podcast: Downloads episode artwork/thumbnails
- Consistent image naming: {source}_{item_id}_{type}.{ext} (illustrated in the sketch after this list)
- Cumulative markdown updates to prevent file proliferation
- Automatic media sync to NAS at /mnt/nas/hvacknowitall/media/
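
A minimal sketch of how that naming rule could be applied; the helper function and its signature are illustrative assumptions, not the scrapers' actual API:

    from pathlib import Path

    def build_image_filename(source: str, item_id: str, kind: str, url: str) -> str:
        """Apply the {source}_{item_id}_{type}.{ext} convention to a media URL."""
        # Derive the extension from the URL path, ignoring any query string;
        # fall back to jpg when the URL carries none (hypothetical default)
        ext = Path(url.split('?')[0]).suffix.lstrip('.') or 'jpg'
        return f"{source}_{item_id}_{kind}.{ext}"

    # build_image_filename('instagram', '12345', 'thumbnail', 'https://example.com/p/photo.jpg')
    # -> 'instagram_12345_thumbnail.jpg'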

Production Deployment:
- New systemd services: hvac-content-images-8am and hvac-content-images-12pm (a sketch follows this list)
- Runs twice daily at 8 AM and 12 PM Atlantic time
- Comprehensive rsync for both markdown and media files
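
The unit files themselves are not shown in this commit; as a rough, assumed sketch, the 8 AM timer could look like this (only the service name comes from the commit):

    # hvac-content-images-8am.timer (illustrative, not the deployed unit)
    [Unit]
    Description=HVAC content scrape with images (8 AM Atlantic)

    [Timer]
    # calendar expressions may carry a trailing timezone on recent systemd versions
    OnCalendar=*-*-* 08:00:00 America/Halifax
    Persistent=true

    [Install]
    WantedBy=timers.target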

File Structure Compliance:
- Renamed Instagram backlog to spec-compliant format
- Archived legacy directory structures
- Ensured all new files follow <brandName>_<source>_<dateTime>.md format
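
For reference, a minimal sketch of that filename rule, reusing the same Atlantic-time format string as the script's get_atlantic_timestamp() helper below (the standalone function here is illustrative):

    from datetime import datetime
    import pytz

    def markdown_filename(brand: str, source: str) -> str:
        """Build a <brandName>_<source>_<dateTime>.md name in Atlantic time."""
        stamp = datetime.now(pytz.timezone('America/Halifax')).strftime('%Y-%m-%dT%H%M%S')
        return f"{brand}_{source}_{stamp}.md"

    # markdown_filename('hvacknowitall', 'YouTube')
    # -> e.g. 'hvacknowitall_YouTube_2025-08-19T125421.md'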

Testing:
- Successfully captured Instagram posts 1-1000 with images
- Launched next batch (posts 1001-2000) currently in progress
- Verified thumbnail downloads for YouTube and Podcast content

🤖 Generated with [Claude Code](https://claude.ai/code)

Co-Authored-By: Claude <noreply@anthropic.com>
2025-08-19 12:54:21 -03:00

#!/usr/bin/env python3
"""
Production script with comprehensive image downloading for all sources.
Downloads thumbnails and images from Instagram, YouTube, and Podcasts.
"""
import sys
from pathlib import Path

# Make the src package importable regardless of the current working directory
sys.path.insert(0, str(Path(__file__).parent))

from src.youtube_api_scraper_with_thumbnails import YouTubeAPIScraperWithThumbnails
from src.instagram_scraper_with_images import InstagramScraperWithImages
from src.rss_scraper_with_images import RSSScraperPodcastWithImages
from src.base_scraper import ScraperConfig
from src.cumulative_markdown_manager import CumulativeMarkdownManager

from datetime import datetime
import pytz
import time
import logging
import subprocess
import os
# Set up logging (create the logs directory first so the FileHandler can open its file)
Path('logs').mkdir(exist_ok=True)
logging.basicConfig(
    level=logging.INFO,
    format='%(asctime)s - %(name)s - %(levelname)s - %(message)s',
    handlers=[
        logging.FileHandler('logs/production_with_images.log'),
        logging.StreamHandler()
    ]
)
logger = logging.getLogger('production_with_images')


def get_atlantic_timestamp() -> str:
    """Get current timestamp in Atlantic timezone for file naming."""
    tz = pytz.timezone('America/Halifax')
    return datetime.now(tz).strftime('%Y-%m-%dT%H%M%S')


def run_youtube_with_thumbnails():
    """Run YouTube API scraper with thumbnail downloads."""
    logger.info("=" * 60)
    logger.info("YOUTUBE API SCRAPER WITH THUMBNAILS")
    logger.info("=" * 60)

    config = ScraperConfig(
        source_name='YouTube',
        brand_name='hvacknowitall',
        data_dir=Path('data'),
        logs_dir=Path('logs'),
        timezone='America/Halifax'
    )
    try:
        scraper = YouTubeAPIScraperWithThumbnails(config)

        # Fetch videos with thumbnails
        logger.info("Fetching YouTube videos and downloading thumbnails...")
        videos = scraper.fetch_content(max_posts=100)  # Limit for testing

        if videos:
            # Merge the new videos into the cumulative markdown file
            manager = CumulativeMarkdownManager(config)
            output_file = manager.update_cumulative_file(videos, 'YouTube')
            logger.info(f"✅ YouTube completed: {len(videos)} videos")
            logger.info(f"   Output: {output_file}")

            # Count downloaded thumbnails
            thumb_count = sum(1 for v in videos if v.get('local_thumbnail'))
            logger.info(f"   Thumbnails downloaded: {thumb_count}")
            return True, len(videos), output_file
        else:
            logger.warning("No YouTube videos fetched")
            return False, 0, None
    except Exception:
        logger.exception("YouTube scraper error")
        return False, 0, None


def run_instagram_with_images():
    """Run Instagram scraper with image downloads."""
    logger.info("=" * 60)
    logger.info("INSTAGRAM SCRAPER WITH IMAGES")
    logger.info("=" * 60)

    if not os.getenv('INSTAGRAM_USERNAME'):
        logger.warning("Instagram not configured (INSTAGRAM_USERNAME missing)")
        return False, 0, None

    config = ScraperConfig(
        source_name='Instagram',
        brand_name='hvacknowitall',
        data_dir=Path('data'),
        logs_dir=Path('logs'),
        timezone='America/Halifax'
    )
    try:
        scraper = InstagramScraperWithImages(config)

        # Fetch posts with images (limited for testing)
        logger.info("Fetching Instagram posts and downloading images...")
        items = scraper.fetch_content(max_posts=20)  # Start with 20 for testing

        if items:
            # Merge the new posts into the cumulative markdown file
            manager = CumulativeMarkdownManager(config)
            output_file = manager.update_cumulative_file(items, 'Instagram')
            logger.info(f"✅ Instagram completed: {len(items)} posts")
            logger.info(f"   Output: {output_file}")

            # Count downloaded images (a carousel post can contribute several)
            img_count = sum(len(item.get('local_images', [])) for item in items)
            logger.info(f"   Images downloaded: {img_count}")
            return True, len(items), output_file
        else:
            logger.warning("No Instagram posts fetched")
            return False, 0, None
    except Exception:
        logger.exception("Instagram scraper error")
        return False, 0, None


def run_podcast_with_thumbnails():
    """Run Podcast RSS scraper with thumbnail downloads."""
    logger.info("=" * 60)
    logger.info("PODCAST RSS SCRAPER WITH THUMBNAILS")
    logger.info("=" * 60)

    if not os.getenv('PODCAST_RSS_URL'):
        logger.warning("Podcast not configured (PODCAST_RSS_URL missing)")
        return False, 0, None

    config = ScraperConfig(
        source_name='Podcast',
        brand_name='hvacknowitall',
        data_dir=Path('data'),
        logs_dir=Path('logs'),
        timezone='America/Halifax'
    )
    try:
        scraper = RSSScraperPodcastWithImages(config)

        # Fetch episodes with thumbnails
        logger.info("Fetching podcast episodes and downloading thumbnails...")
        items = scraper.fetch_content(max_items=50)  # Limit for testing

        if items:
            # Merge the new episodes into the cumulative markdown file
            manager = CumulativeMarkdownManager(config)
            output_file = manager.update_cumulative_file(items, 'Podcast')
            logger.info(f"✅ Podcast completed: {len(items)} episodes")
            logger.info(f"   Output: {output_file}")

            # Count downloaded thumbnails
            thumb_count = sum(1 for item in items if item.get('local_thumbnail'))
            logger.info(f"   Thumbnails downloaded: {thumb_count}")
            return True, len(items), output_file
        else:
            logger.warning("No podcast episodes fetched")
            return False, 0, None
    except Exception:
        logger.exception("Podcast scraper error")
        return False, 0, None


def sync_to_nas_with_images():
    """Sync markdown files AND images to NAS."""
    logger.info("\n" + "=" * 60)
    logger.info("SYNCING TO NAS - MARKDOWN AND IMAGES")
    logger.info("=" * 60)

    nas_base = Path('/mnt/nas/hvacknowitall')
    try:
        # Sync current markdown files
        local_current = Path('data/markdown_current')
        nas_current = nas_base / 'markdown_current'
        if local_current.exists() and any(local_current.glob('*.md')):
            nas_current.mkdir(parents=True, exist_ok=True)
            cmd = ['rsync', '-av', '--include=*.md', '--exclude=*',
                   str(local_current) + '/', str(nas_current) + '/']
            result = subprocess.run(cmd, capture_output=True, text=True)
            if result.returncode == 0:
                logger.info(f"✅ Markdown files synced to NAS: {nas_current}")
                md_count = len(list(nas_current.glob('*.md')))
                logger.info(f"   Total markdown files: {md_count}")
            else:
                logger.warning(f"Markdown sync warning: {result.stderr}")

        # Sync media files: the include/exclude pair tells rsync to recurse into
        # every directory but copy only image files (jpg, jpeg, png, gif)
        local_media = Path('data/media')
        nas_media = nas_base / 'media'
        if local_media.exists():
            nas_media.mkdir(parents=True, exist_ok=True)
            cmd = ['rsync', '-av',
                   '--include=*/',
                   '--include=*.jpg', '--include=*.jpeg',
                   '--include=*.png', '--include=*.gif',
                   '--exclude=*',
                   str(local_media) + '/', str(nas_media) + '/']
            result = subprocess.run(cmd, capture_output=True, text=True)
            if result.returncode == 0:
                logger.info(f"✅ Media files synced to NAS: {nas_media}")
                # Count images per source directory
                for source_dir in nas_media.glob('*'):
                    if source_dir.is_dir():
                        img_count = sum(len(list(source_dir.glob(f'*.{ext}')))
                                        for ext in ('jpg', 'jpeg', 'png', 'gif'))
                        if img_count > 0:
                            logger.info(f"   {source_dir.name}: {img_count} images")
            else:
                logger.warning(f"Media sync warning: {result.stderr}")

        # Sync per-source markdown archives
        for source in ['YouTube', 'MailChimp', 'Instagram', 'Podcast', 'WordPress', 'TikTok']:
            local_archive = Path(f'data/markdown_archives/{source}')
            nas_archive = nas_base / f'markdown_archives/{source}'
            if local_archive.exists() and any(local_archive.glob('*.md')):
                nas_archive.mkdir(parents=True, exist_ok=True)
                cmd = ['rsync', '-av', '--include=*.md', '--exclude=*',
                       str(local_archive) + '/', str(nas_archive) + '/']
                result = subprocess.run(cmd, capture_output=True, text=True)
                if result.returncode == 0:
                    logger.info(f"{source} archives synced to NAS")
    except Exception:
        logger.exception("Failed to sync to NAS")


def main():
    """Main production run with image downloads."""
    logger.info("=" * 70)
    logger.info("HVAC KNOW IT ALL - PRODUCTION WITH IMAGE DOWNLOADS")
    logger.info("Downloads all thumbnails and images (no videos)")
    logger.info("=" * 70)

    atlantic_tz = pytz.timezone('America/Halifax')
    start_time = datetime.now(atlantic_tz)
    logger.info(f"Started at: {start_time.isoformat()}")

    # Track results per source
    results = {
        'YouTube': {'success': False, 'count': 0, 'file': None},
        'Instagram': {'success': False, 'count': 0, 'file': None},
        'Podcast': {'success': False, 'count': 0, 'file': None}
    }

    # Run YouTube with thumbnails
    success, count, output_file = run_youtube_with_thumbnails()
    results['YouTube'] = {'success': success, 'count': count, 'file': output_file}

    # Wait a bit between scrapers
    time.sleep(2)

    # Run Instagram with images
    success, count, output_file = run_instagram_with_images()
    results['Instagram'] = {'success': success, 'count': count, 'file': output_file}

    # Wait a bit between scrapers
    time.sleep(2)

    # Run Podcast with thumbnails
    success, count, output_file = run_podcast_with_thumbnails()
    results['Podcast'] = {'success': success, 'count': count, 'file': output_file}

    # Sync markdown and images to NAS
    sync_to_nas_with_images()

    # Summary
    end_time = datetime.now(atlantic_tz)
    duration = (end_time - start_time).total_seconds()
    logger.info("\n" + "=" * 60)
    logger.info("PRODUCTION RUN SUMMARY")
    logger.info("=" * 60)
    for source, result in results.items():
        if result['success']:
            logger.info(f"{source}: {result['count']} items")
            if result['file']:
                logger.info(f"   File: {result['file']}")
        else:
            logger.info(f"{source}: Failed")

    # Count total images downloaded locally
    media_dir = Path('data/media')
    total_images = 0
    if media_dir.exists():
        for source_dir in media_dir.glob('*'):
            if source_dir.is_dir():
                total_images += sum(len(list(source_dir.glob(f'*.{ext}')))
                                    for ext in ('jpg', 'jpeg', 'png', 'gif'))

    logger.info(f"\nTotal images downloaded: {total_images}")
    logger.info(f"Duration: {duration:.1f} seconds")
    logger.info("=" * 60)


if __name__ == "__main__":
    main()