#!/usr/bin/env python3
"""
Production script with comprehensive image downloading for all sources.

Downloads thumbnails and images from Instagram, YouTube, and Podcasts,
writes cumulative markdown files, and syncs markdown + media to the NAS.
"""
import sys
from pathlib import Path

# Allow `src.*` imports regardless of the current working directory.
sys.path.insert(0, str(Path(__file__).parent))

from src.youtube_api_scraper_with_thumbnails import YouTubeAPIScraperWithThumbnails
from src.mailchimp_api_scraper_v2 import MailChimpAPIScraper
from src.instagram_scraper_with_images import InstagramScraperWithImages
from src.rss_scraper_with_images import RSSScraperPodcastWithImages
from src.wordpress_scraper import WordPressScraper
from src.tiktok_scraper_advanced import TikTokScraperAdvanced
from src.base_scraper import ScraperConfig
from src.cumulative_markdown_manager import CumulativeMarkdownManager
from datetime import datetime
import pytz
import time
import logging
import subprocess
import os

# Create the log directory up front: logging.FileHandler raises
# FileNotFoundError if 'logs/' does not exist yet.
Path('logs').mkdir(parents=True, exist_ok=True)

# Set up logging
logging.basicConfig(
    level=logging.INFO,
    format='%(asctime)s - %(name)s - %(levelname)s - %(message)s',
    handlers=[
        logging.FileHandler('logs/production_with_images.log'),
        logging.StreamHandler()
    ]
)
logger = logging.getLogger('production_with_images')

# Image suffixes handled by the media sync and the download counters.
IMAGE_PATTERNS = ('*.jpg', '*.jpeg', '*.png', '*.gif')


def get_atlantic_timestamp() -> str:
    """Get current timestamp in Atlantic timezone for file naming."""
    tz = pytz.timezone('America/Halifax')
    return datetime.now(tz).strftime('%Y-%m-%dT%H%M%S')


def _make_config(source_name: str) -> ScraperConfig:
    """Build the standard ScraperConfig shared by every source."""
    return ScraperConfig(
        source_name=source_name,
        brand_name='hkia',
        data_dir=Path('data'),
        logs_dir=Path('logs'),
        timezone='America/Halifax'
    )


def _count_images(directory: Path) -> int:
    """Count image files (jpg/jpeg/png/gif) directly inside *directory*."""
    return sum(len(list(directory.glob(pattern))) for pattern in IMAGE_PATTERNS)


def _rsync(src: Path, dest: Path, includes) -> subprocess.CompletedProcess:
    """Copy files matching *includes* from src/ to dest/ via rsync.

    Creates *dest* if needed.  A trailing ``--exclude=*`` means anything
    not matched by an include pattern is skipped.
    """
    dest.mkdir(parents=True, exist_ok=True)
    cmd = ['rsync', '-av']
    cmd.extend(f'--include={pattern}' for pattern in includes)
    cmd.extend(['--exclude=*', str(src) + '/', str(dest) + '/'])
    return subprocess.run(cmd, capture_output=True, text=True)


def run_youtube_with_thumbnails():
    """Run YouTube API scraper with thumbnail downloads.

    Returns:
        (success, item_count, output_file); output_file is None on failure.
    """
    logger.info("=" * 60)
    logger.info("YOUTUBE API SCRAPER WITH THUMBNAILS")
    logger.info("=" * 60)

    config = _make_config('YouTube')

    try:
        scraper = YouTubeAPIScraperWithThumbnails(config)

        # Fetch videos with thumbnails
        logger.info("Fetching YouTube videos and downloading thumbnails...")
        videos = scraper.fetch_content(max_posts=100)  # Limit for testing

        if videos:
            # Process cumulative markdown
            manager = CumulativeMarkdownManager(config)
            output_file = manager.update_cumulative_file(videos, 'YouTube')

            logger.info(f"✅ YouTube completed: {len(videos)} videos")
            logger.info(f"   Output: {output_file}")

            # Count downloaded thumbnails
            thumb_count = sum(1 for v in videos if v.get('local_thumbnail'))
            logger.info(f"   Thumbnails downloaded: {thumb_count}")

            return True, len(videos), output_file

        logger.warning("No YouTube videos fetched")
        return False, 0, None

    except Exception as e:
        # logger.exception also records the traceback in the log file
        # (the original traceback.print_exc() reached only stderr).
        logger.exception(f"YouTube scraper error: {e}")
        return False, 0, None


def run_instagram_with_images():
    """Run Instagram scraper with image downloads.

    Skips (returning a failure tuple) when INSTAGRAM_USERNAME is unset.

    Returns:
        (success, item_count, output_file); output_file is None on failure.
    """
    logger.info("=" * 60)
    logger.info("INSTAGRAM SCRAPER WITH IMAGES")
    logger.info("=" * 60)

    if not os.getenv('INSTAGRAM_USERNAME'):
        logger.warning("Instagram not configured (INSTAGRAM_USERNAME missing)")
        return False, 0, None

    config = _make_config('Instagram')

    try:
        scraper = InstagramScraperWithImages(config)

        # Fetch posts with images (limited for testing)
        logger.info("Fetching Instagram posts and downloading images...")
        items = scraper.fetch_content(max_posts=20)  # Start with 20 for testing

        if items:
            # Process cumulative markdown
            manager = CumulativeMarkdownManager(config)
            output_file = manager.update_cumulative_file(items, 'Instagram')

            logger.info(f"✅ Instagram completed: {len(items)} posts")
            logger.info(f"   Output: {output_file}")

            # Count downloaded images (each post may carry several)
            img_count = sum(len(item.get('local_images', [])) for item in items)
            logger.info(f"   Images downloaded: {img_count}")

            return True, len(items), output_file

        logger.warning("No Instagram posts fetched")
        return False, 0, None

    except Exception as e:
        logger.exception(f"Instagram scraper error: {e}")
        return False, 0, None


def run_podcast_with_thumbnails():
    """Run Podcast RSS scraper with thumbnail downloads.

    Skips (returning a failure tuple) when PODCAST_RSS_URL is unset.

    Returns:
        (success, item_count, output_file); output_file is None on failure.
    """
    logger.info("=" * 60)
    logger.info("PODCAST RSS SCRAPER WITH THUMBNAILS")
    logger.info("=" * 60)

    if not os.getenv('PODCAST_RSS_URL'):
        logger.warning("Podcast not configured (PODCAST_RSS_URL missing)")
        return False, 0, None

    config = _make_config('Podcast')

    try:
        scraper = RSSScraperPodcastWithImages(config)

        # Fetch episodes with thumbnails
        logger.info("Fetching podcast episodes and downloading thumbnails...")
        items = scraper.fetch_content(max_items=50)  # Limit for testing

        if items:
            # Process cumulative markdown
            manager = CumulativeMarkdownManager(config)
            output_file = manager.update_cumulative_file(items, 'Podcast')

            logger.info(f"✅ Podcast completed: {len(items)} episodes")
            logger.info(f"   Output: {output_file}")

            # Count downloaded thumbnails
            thumb_count = sum(1 for item in items if item.get('local_thumbnail'))
            logger.info(f"   Thumbnails downloaded: {thumb_count}")

            return True, len(items), output_file

        logger.warning("No podcast episodes fetched")
        return False, 0, None

    except Exception as e:
        logger.exception(f"Podcast scraper error: {e}")
        return False, 0, None


def sync_to_nas_with_images():
    """Sync markdown files AND images to NAS."""
    logger.info("\n" + "=" * 60)
    logger.info("SYNCING TO NAS - MARKDOWN AND IMAGES")
    logger.info("=" * 60)

    nas_base = Path('/mnt/nas/hkia')

    try:
        # Sync current markdown files
        local_current = Path('data/markdown_current')
        nas_current = nas_base / 'markdown_current'
        if local_current.exists() and any(local_current.glob('*.md')):
            result = _rsync(local_current, nas_current, ['*.md'])
            if result.returncode == 0:
                logger.info(f"✅ Markdown files synced to NAS: {nas_current}")
                md_count = len(list(nas_current.glob('*.md')))
                logger.info(f"   Total markdown files: {md_count}")
            else:
                logger.warning(f"Markdown sync warning: {result.stderr}")

        # Sync media files (jpg, jpeg, png, gif — no videos).  '*/' must be
        # included so rsync recurses into per-source subdirectories.
        local_media = Path('data/media')
        nas_media = nas_base / 'media'
        if local_media.exists():
            result = _rsync(local_media, nas_media, ['*/'] + list(IMAGE_PATTERNS))
            if result.returncode == 0:
                logger.info(f"✅ Media files synced to NAS: {nas_media}")
                # Count images per source
                for source_dir in nas_media.glob('*'):
                    if source_dir.is_dir():
                        img_count = _count_images(source_dir)
                        if img_count > 0:
                            logger.info(f"   {source_dir.name}: {img_count} images")
            else:
                logger.warning(f"Media sync warning: {result.stderr}")

        # Sync per-source markdown archives
        for source in ['YouTube', 'MailChimp', 'Instagram', 'Podcast',
                       'WordPress', 'TikTok']:
            local_archive = Path(f'data/markdown_archives/{source}')
            nas_archive = nas_base / f'markdown_archives/{source}'
            if local_archive.exists() and any(local_archive.glob('*.md')):
                result = _rsync(local_archive, nas_archive, ['*.md'])
                if result.returncode == 0:
                    logger.info(f"✅ {source} archives synced to NAS")

    except Exception as e:
        logger.error(f"Failed to sync to NAS: {e}")


def main():
    """Main production run with image downloads."""
    logger.info("=" * 70)
    logger.info("HKIA - PRODUCTION WITH IMAGE DOWNLOADS")
    logger.info("Downloads all thumbnails and images (no videos)")
    logger.info("=" * 70)

    atlantic_tz = pytz.timezone('America/Halifax')
    start_time = datetime.now(atlantic_tz)
    logger.info(f"Started at: {start_time.isoformat()}")

    # Run each scraper in turn, pausing briefly between them.
    runners = [
        ('YouTube', run_youtube_with_thumbnails),
        ('Instagram', run_instagram_with_images),
        ('Podcast', run_podcast_with_thumbnails),
    ]
    results = {}
    for i, (source, runner) in enumerate(runners):
        if i:
            time.sleep(2)  # brief pause between scrapers
        success, count, output_file = runner()
        results[source] = {'success': success, 'count': count, 'file': output_file}

    # Sync to NAS including images
    sync_to_nas_with_images()

    # Summary
    end_time = datetime.now(atlantic_tz)
    duration = (end_time - start_time).total_seconds()

    logger.info("\n" + "=" * 60)
    logger.info("PRODUCTION RUN SUMMARY")
    logger.info("=" * 60)

    for source, result in results.items():
        if result['success']:
            logger.info(f"✅ {source}: {result['count']} items")
            if result['file']:
                logger.info(f"   File: {result['file']}")
        else:
            logger.info(f"❌ {source}: Failed")

    # Count total images downloaded locally
    media_dir = Path('data/media')
    total_images = 0
    if media_dir.exists():
        total_images = sum(
            _count_images(source_dir)
            for source_dir in media_dir.glob('*')
            if source_dir.is_dir()
        )

    logger.info(f"\nTotal images downloaded: {total_images}")
    logger.info(f"Duration: {duration:.1f} seconds")
    logger.info("=" * 60)


if __name__ == "__main__":
    main()