#!/usr/bin/env python3
"""
Production script with cumulative markdown and image downloads.
Uses cumulative updates for all sources.
"""
import sys
from pathlib import Path

# Make the script's own directory importable so `src.*` resolves when run directly.
sys.path.insert(0, str(Path(__file__).parent))

from src.youtube_api_scraper_with_thumbnails import YouTubeAPIScraperWithThumbnails
from src.mailchimp_api_scraper_v2 import MailChimpAPIScraper
from src.instagram_scraper_cumulative import InstagramScraperCumulative
from src.rss_scraper_with_images import RSSScraperPodcastWithImages
from src.wordpress_scraper import WordPressScraper
from src.tiktok_scraper_advanced import TikTokScraperAdvanced
from src.base_scraper import ScraperConfig
from src.cumulative_markdown_manager import CumulativeMarkdownManager

from datetime import datetime
import pytz
import time
import logging
import subprocess
import os

# Ensure the log directory exists BEFORE attaching a FileHandler:
# logging.FileHandler raises FileNotFoundError on a fresh checkout otherwise.
os.makedirs('logs', exist_ok=True)

# Set up logging
logging.basicConfig(
    level=logging.INFO,
    format='%(asctime)s - %(name)s - %(levelname)s - %(message)s',
    handlers=[
        logging.FileHandler('logs/production_cumulative.log'),
        logging.StreamHandler()
    ]
)
logger = logging.getLogger('production_cumulative')


def get_atlantic_timestamp() -> str:
    """Get current timestamp in Atlantic timezone for file naming."""
    tz = pytz.timezone('America/Halifax')
    return datetime.now(tz).strftime('%Y-%m-%dT%H%M%S')


def _make_config(source_name: str) -> ScraperConfig:
    """Build the standard hkia ScraperConfig for the given source name.

    All sources share the same brand, directories, and timezone; only the
    source name differs, so centralize the construction here.
    """
    return ScraperConfig(
        source_name=source_name,
        brand_name='hkia',
        data_dir=Path('data'),
        logs_dir=Path('logs'),
        timezone='America/Halifax'
    )


def run_instagram_incremental():
    """Run Instagram incremental update with cumulative markdown.

    Returns:
        (success, item_count, output_file) — (False, 0, None) when Instagram
        is not configured or the scraper raises.
    """
    logger.info("=" * 60)
    logger.info("INSTAGRAM INCREMENTAL UPDATE (CUMULATIVE)")
    logger.info("=" * 60)

    if not os.getenv('INSTAGRAM_USERNAME'):
        logger.warning("Instagram not configured")
        return False, 0, None

    config = _make_config('Instagram')

    try:
        scraper = InstagramScraperCumulative(config)
        return scraper.run_incremental(max_posts=50)  # Check for 50 new posts
    except Exception as e:
        logger.error(f"Instagram error: {e}")
        return False, 0, None


def run_youtube_incremental():
    """Run YouTube incremental update with thumbnails.

    Returns:
        (success, item_count, output_file) — (False, 0, None) when there are
        no new videos or the scraper raises.
    """
    logger.info("=" * 60)
    logger.info("YOUTUBE INCREMENTAL UPDATE")
    logger.info("=" * 60)

    config = _make_config('YouTube')

    try:
        scraper = YouTubeAPIScraperWithThumbnails(config)
        videos = scraper.fetch_content(max_posts=20)  # Check for 20 new videos

        if videos:
            manager = CumulativeMarkdownManager(config)
            output_file = manager.update_cumulative_file(videos, 'YouTube')
            thumb_count = sum(1 for v in videos if v.get('local_thumbnail'))
            logger.info(f"✅ YouTube: {len(videos)} videos, {thumb_count} thumbnails")
            return True, len(videos), output_file
        else:
            logger.info("No new YouTube videos")
            return False, 0, None
    except Exception as e:
        logger.error(f"YouTube error: {e}")
        return False, 0, None


def run_podcast_incremental():
    """Run Podcast incremental update with thumbnails.

    Returns:
        (success, item_count, output_file) — (False, 0, None) when the podcast
        feed is not configured, there are no new episodes, or the scraper raises.
    """
    logger.info("=" * 60)
    logger.info("PODCAST INCREMENTAL UPDATE")
    logger.info("=" * 60)

    if not os.getenv('PODCAST_RSS_URL'):
        logger.warning("Podcast not configured")
        return False, 0, None

    config = _make_config('Podcast')

    try:
        scraper = RSSScraperPodcastWithImages(config)
        items = scraper.fetch_content(max_items=10)  # Check for 10 new episodes

        if items:
            manager = CumulativeMarkdownManager(config)
            output_file = manager.update_cumulative_file(items, 'Podcast')
            thumb_count = sum(1 for item in items if item.get('local_thumbnail'))
            logger.info(f"✅ Podcast: {len(items)} episodes, {thumb_count} thumbnails")
            return True, len(items), output_file
        else:
            logger.info("No new podcast episodes")
            return False, 0, None
    except Exception as e:
        logger.error(f"Podcast error: {e}")
        return False, 0, None


def sync_to_nas_with_images():
    """Sync markdown files AND images to NAS.

    Uses rsync include/exclude filters so only *.md files (markdown) and
    common image extensions (media) are copied. Any failure is logged but
    never raised — a NAS outage must not abort the scrape run.
    """
    logger.info("\n" + "=" * 60)
    logger.info("SYNCING TO NAS - MARKDOWN AND IMAGES")
    logger.info("=" * 60)

    nas_base = Path('/mnt/nas/hkia')

    try:
        # Sync markdown files
        local_current = Path('data/markdown_current')
        nas_current = nas_base / 'markdown_current'

        if local_current.exists() and any(local_current.glob('*.md')):
            nas_current.mkdir(parents=True, exist_ok=True)
            cmd = ['rsync', '-av', '--include=*.md', '--exclude=*',
                   str(local_current) + '/', str(nas_current) + '/']
            result = subprocess.run(cmd, capture_output=True, text=True)
            if result.returncode == 0:
                logger.info("✅ Markdown files synced to NAS")
            else:
                logger.warning(f"Markdown sync warning: {result.stderr}")

        # Sync media files
        local_media = Path('data/media')
        nas_media = nas_base / 'media'

        if local_media.exists():
            nas_media.mkdir(parents=True, exist_ok=True)
            # --include=*/ is required so rsync descends into subdirectories
            # before the trailing --exclude=* filter applies.
            cmd = ['rsync', '-av', '--include=*/',
                   '--include=*.jpg', '--include=*.jpeg',
                   '--include=*.png', '--include=*.gif',
                   '--exclude=*',
                   str(local_media) + '/', str(nas_media) + '/']
            result = subprocess.run(cmd, capture_output=True, text=True)
            if result.returncode == 0:
                logger.info("✅ Media files synced to NAS")
            else:
                # Previously this failure branch was silent; log it for parity
                # with the markdown sync above.
                logger.warning(f"Media sync warning: {result.stderr}")
    except Exception as e:
        logger.error(f"Failed to sync to NAS: {e}")


def main():
    """Main production run with cumulative updates and images."""
    logger.info("=" * 70)
    logger.info("HKIA - CUMULATIVE PRODUCTION")
    logger.info("With Image Downloads and Cumulative Markdown")
    logger.info("=" * 70)

    atlantic_tz = pytz.timezone('America/Halifax')
    start_time = datetime.now(atlantic_tz)
    logger.info(f"Started at: {start_time.isoformat()}")

    # Track results
    results = {}

    # Run incremental updates; short pauses between sources to be polite to APIs.
    success, count, file = run_instagram_incremental()
    results['Instagram'] = {'success': success, 'count': count, 'file': file}
    time.sleep(2)

    success, count, file = run_youtube_incremental()
    results['YouTube'] = {'success': success, 'count': count, 'file': file}
    time.sleep(2)

    success, count, file = run_podcast_incremental()
    results['Podcast'] = {'success': success, 'count': count, 'file': file}

    # Also run MailChimp (already has cumulative support)
    # ... (add MailChimp, WordPress, TikTok as needed)

    # Sync to NAS
    sync_to_nas_with_images()

    # Summary
    logger.info("\n" + "=" * 60)
    logger.info("PRODUCTION SUMMARY")
    logger.info("=" * 60)

    for source, result in results.items():
        if result['success']:
            logger.info(f"✅ {source}: {result['count']} items")
        else:
            logger.info(f"ℹ️ {source}: No new items")

    logger.info("=" * 60)


if __name__ == "__main__":
    main()