Major Updates:
- Added image downloading for Instagram, YouTube, and Podcast scrapers
- Implemented cumulative markdown system for maintaining single source-of-truth files
- Deployed production services with automatic NAS sync for images
- Standardized file naming conventions per project specification
New Features:
- Instagram: Downloads all post images, carousel images, and video thumbnails
- YouTube: Downloads video thumbnails (highest quality available)
- Podcast: Downloads episode artwork/thumbnails
- Consistent image naming: {source}_{item_id}_{type}.{ext} (see the sketch after this list)
- Cumulative markdown updates to prevent file proliferation
- Automatic media sync to NAS at /mnt/nas/hvacknowitall/media/
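
A minimal sketch of the image-naming convention, assuming a hypothetical `build_image_filename` helper (the real logic lives inside the scraper classes; lowercasing the source is an illustrative choice, not a confirmed detail):

```python
from pathlib import Path

def build_image_filename(source: str, item_id: str, img_type: str, ext: str) -> str:
    """Hypothetical helper illustrating {source}_{item_id}_{type}.{ext} naming."""
    # Lowercased source and the type suffix format are illustrative choices,
    # not confirmed details of the production scrapers.
    return f"{source.lower()}_{item_id}_{img_type}.{ext.lstrip('.')}"

# Example: the second carousel image of an Instagram post
path = Path('data/media/Instagram') / build_image_filename('Instagram', 'C8xYz123', 'carousel_02', 'jpg')
print(path)  # data/media/Instagram/instagram_C8xYz123_carousel_02.jpg
```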
Production Deployment:
- New systemd services: hvac-content-images-8am and hvac-content-images-12pm
- Runs twice daily at 8 AM and 12 PM Atlantic time
- Comprehensive rsync for both markdown and media files
File Structure Compliance:
- Renamed Instagram backlog to spec-compliant format
- Archived legacy directory structures
- Ensured all new files follow <brandName>_<source>_<dateTime>.md format (see the sketch below)
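
A sketch of how a spec-compliant markdown filename can be derived, reusing the Atlantic-timezone timestamp format from the production script below (the helper name is illustrative):

```python
from datetime import datetime
import pytz

def build_markdown_filename(brand_name: str, source: str) -> str:
    """Illustrative helper for <brandName>_<source>_<dateTime>.md naming."""
    tz = pytz.timezone('America/Halifax')
    date_time = datetime.now(tz).strftime('%Y-%m-%dT%H%M%S')  # matches get_atlantic_timestamp() below
    return f"{brand_name}_{source}_{date_time}.md"

print(build_markdown_filename('hvacknowitall', 'Instagram'))
# e.g. hvacknowitall_Instagram_2025-01-15T080000.md
```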
Testing:
- Successfully captured Instagram posts 1-1000 with images
- Launched next batch (posts 1001-2000) currently in progress
- Verified thumbnail downloads for YouTube and Podcast content
🤖 Generated with [Claude Code](https://claude.ai/code)
Co-Authored-By: Claude <noreply@anthropic.com>
#!/usr/bin/env python3
"""
Production script with comprehensive image downloading for all sources.
Downloads thumbnails and images from Instagram, YouTube, and Podcasts.
"""

import logging
import os
import subprocess
import sys
import time
from datetime import datetime
from pathlib import Path

import pytz

# Make the src package importable when running this script directly
sys.path.insert(0, str(Path(__file__).parent))

from src.youtube_api_scraper_with_thumbnails import YouTubeAPIScraperWithThumbnails
from src.instagram_scraper_with_images import InstagramScraperWithImages
from src.rss_scraper_with_images import RSSScraperPodcastWithImages
from src.base_scraper import ScraperConfig
from src.cumulative_markdown_manager import CumulativeMarkdownManager

# Image extensions downloaded, counted, and synced by this script
IMAGE_EXTS = ('*.jpg', '*.jpeg', '*.png', '*.gif')

# Set up logging; create the logs directory first so the FileHandler can open its file
Path('logs').mkdir(exist_ok=True)
logging.basicConfig(
    level=logging.INFO,
    format='%(asctime)s - %(name)s - %(levelname)s - %(message)s',
    handlers=[
        logging.FileHandler('logs/production_with_images.log'),
        logging.StreamHandler()
    ]
)
logger = logging.getLogger('production_with_images')


def get_atlantic_timestamp() -> str:
    """Get current timestamp in Atlantic timezone for file naming."""
    tz = pytz.timezone('America/Halifax')
    return datetime.now(tz).strftime('%Y-%m-%dT%H%M%S')


def run_youtube_with_thumbnails():
    """Run YouTube API scraper with thumbnail downloads."""
    logger.info("=" * 60)
    logger.info("YOUTUBE API SCRAPER WITH THUMBNAILS")
    logger.info("=" * 60)

    config = ScraperConfig(
        source_name='YouTube',
        brand_name='hvacknowitall',
        data_dir=Path('data'),
        logs_dir=Path('logs'),
        timezone='America/Halifax'
    )

    try:
        scraper = YouTubeAPIScraperWithThumbnails(config)

        # Fetch videos with thumbnails
        logger.info("Fetching YouTube videos and downloading thumbnails...")
        videos = scraper.fetch_content(max_posts=100)  # Limit for testing

        if videos:
            # Fold the new videos into the cumulative markdown file
            manager = CumulativeMarkdownManager(config)
            output_file = manager.update_cumulative_file(videos, 'YouTube')

            logger.info(f"✅ YouTube completed: {len(videos)} videos")
            logger.info(f"   Output: {output_file}")

            # Count downloaded thumbnails
            thumb_count = sum(1 for v in videos if v.get('local_thumbnail'))
            logger.info(f"   Thumbnails downloaded: {thumb_count}")

            return True, len(videos), output_file
        else:
            logger.warning("No YouTube videos fetched")
            return False, 0, None

    except Exception as e:
        # logger.exception logs the full traceback through the configured handlers
        logger.exception(f"YouTube scraper error: {e}")
        return False, 0, None


def run_instagram_with_images():
    """Run Instagram scraper with image downloads."""
    logger.info("=" * 60)
    logger.info("INSTAGRAM SCRAPER WITH IMAGES")
    logger.info("=" * 60)

    if not os.getenv('INSTAGRAM_USERNAME'):
        logger.warning("Instagram not configured (INSTAGRAM_USERNAME missing)")
        return False, 0, None

    config = ScraperConfig(
        source_name='Instagram',
        brand_name='hvacknowitall',
        data_dir=Path('data'),
        logs_dir=Path('logs'),
        timezone='America/Halifax'
    )

    try:
        scraper = InstagramScraperWithImages(config)

        # Fetch posts with images (limited for testing)
        logger.info("Fetching Instagram posts and downloading images...")
        items = scraper.fetch_content(max_posts=20)  # Start with 20 for testing

        if items:
            # Fold the new posts into the cumulative markdown file
            manager = CumulativeMarkdownManager(config)
            output_file = manager.update_cumulative_file(items, 'Instagram')

            logger.info(f"✅ Instagram completed: {len(items)} posts")
            logger.info(f"   Output: {output_file}")

            # Count downloaded images (a post may have several, e.g. carousels)
            img_count = sum(len(item.get('local_images', [])) for item in items)
            logger.info(f"   Images downloaded: {img_count}")

            return True, len(items), output_file
        else:
            logger.warning("No Instagram posts fetched")
            return False, 0, None

    except Exception as e:
        logger.exception(f"Instagram scraper error: {e}")
        return False, 0, None


def run_podcast_with_thumbnails():
    """Run Podcast RSS scraper with thumbnail downloads."""
    logger.info("=" * 60)
    logger.info("PODCAST RSS SCRAPER WITH THUMBNAILS")
    logger.info("=" * 60)

    if not os.getenv('PODCAST_RSS_URL'):
        logger.warning("Podcast not configured (PODCAST_RSS_URL missing)")
        return False, 0, None

    config = ScraperConfig(
        source_name='Podcast',
        brand_name='hvacknowitall',
        data_dir=Path('data'),
        logs_dir=Path('logs'),
        timezone='America/Halifax'
    )

    try:
        scraper = RSSScraperPodcastWithImages(config)

        # Fetch episodes with thumbnails
        logger.info("Fetching podcast episodes and downloading thumbnails...")
        items = scraper.fetch_content(max_items=50)  # Limit for testing

        if items:
            # Fold the new episodes into the cumulative markdown file
            manager = CumulativeMarkdownManager(config)
            output_file = manager.update_cumulative_file(items, 'Podcast')

            logger.info(f"✅ Podcast completed: {len(items)} episodes")
            logger.info(f"   Output: {output_file}")

            # Count downloaded thumbnails
            thumb_count = sum(1 for item in items if item.get('local_thumbnail'))
            logger.info(f"   Thumbnails downloaded: {thumb_count}")

            return True, len(items), output_file
        else:
            logger.warning("No podcast episodes fetched")
            return False, 0, None

    except Exception as e:
        logger.exception(f"Podcast scraper error: {e}")
        return False, 0, None


def sync_to_nas_with_images():
    """Sync markdown files AND images to NAS."""
    logger.info("\n" + "=" * 60)
    logger.info("SYNCING TO NAS - MARKDOWN AND IMAGES")
    logger.info("=" * 60)

    nas_base = Path('/mnt/nas/hvacknowitall')

    try:
        # Sync markdown files
        local_current = Path('data/markdown_current')
        nas_current = nas_base / 'markdown_current'

        if local_current.exists() and any(local_current.glob('*.md')):
            nas_current.mkdir(parents=True, exist_ok=True)

            # Copy only .md files; the trailing '--exclude=*' drops everything else
            cmd = ['rsync', '-av', '--include=*.md', '--exclude=*',
                   str(local_current) + '/', str(nas_current) + '/']
            result = subprocess.run(cmd, capture_output=True, text=True)

            if result.returncode == 0:
                logger.info(f"✅ Markdown files synced to NAS: {nas_current}")
                md_count = len(list(nas_current.glob('*.md')))
                logger.info(f"   Total markdown files: {md_count}")
            else:
                logger.warning(f"Markdown sync warning: {result.stderr}")

        # Sync media files
        local_media = Path('data/media')
        nas_media = nas_base / 'media'

        if local_media.exists():
            nas_media.mkdir(parents=True, exist_ok=True)

            # Recurse into subdirectories ('--include=*/') but copy only image files
            cmd = ['rsync', '-av',
                   '--include=*/',
                   '--include=*.jpg', '--include=*.jpeg',
                   '--include=*.png', '--include=*.gif',
                   '--exclude=*',
                   str(local_media) + '/', str(nas_media) + '/']
            result = subprocess.run(cmd, capture_output=True, text=True)

            if result.returncode == 0:
                logger.info(f"✅ Media files synced to NAS: {nas_media}")

                # Count images per source
                for source_dir in nas_media.glob('*'):
                    if source_dir.is_dir():
                        img_count = sum(len(list(source_dir.glob(ext))) for ext in IMAGE_EXTS)
                        if img_count > 0:
                            logger.info(f"   {source_dir.name}: {img_count} images")
            else:
                logger.warning(f"Media sync warning: {result.stderr}")

        # Sync archives
        for source in ['YouTube', 'MailChimp', 'Instagram', 'Podcast', 'WordPress', 'TikTok']:
            local_archive = Path(f'data/markdown_archives/{source}')
            nas_archive = nas_base / f'markdown_archives/{source}'

            if local_archive.exists() and any(local_archive.glob('*.md')):
                nas_archive.mkdir(parents=True, exist_ok=True)

                cmd = ['rsync', '-av', '--include=*.md', '--exclude=*',
                       str(local_archive) + '/', str(nas_archive) + '/']
                result = subprocess.run(cmd, capture_output=True, text=True)

                if result.returncode == 0:
                    logger.info(f"✅ {source} archives synced to NAS")

    except Exception as e:
        logger.error(f"Failed to sync to NAS: {e}")


def main():
    """Main production run with image downloads."""
    logger.info("=" * 70)
    logger.info("HVAC KNOW IT ALL - PRODUCTION WITH IMAGE DOWNLOADS")
    logger.info("Downloads all thumbnails and images (no videos)")
    logger.info("=" * 70)

    atlantic_tz = pytz.timezone('America/Halifax')
    start_time = datetime.now(atlantic_tz)
    logger.info(f"Started at: {start_time.isoformat()}")

    # Track results
    results = {
        'YouTube': {'success': False, 'count': 0, 'file': None},
        'Instagram': {'success': False, 'count': 0, 'file': None},
        'Podcast': {'success': False, 'count': 0, 'file': None}
    }

    # Run YouTube with thumbnails
    success, count, output_file = run_youtube_with_thumbnails()
    results['YouTube'] = {'success': success, 'count': count, 'file': output_file}

    # Pause briefly between scrapers
    time.sleep(2)

    # Run Instagram with images
    success, count, output_file = run_instagram_with_images()
    results['Instagram'] = {'success': success, 'count': count, 'file': output_file}

    time.sleep(2)

    # Run Podcast with thumbnails
    success, count, output_file = run_podcast_with_thumbnails()
    results['Podcast'] = {'success': success, 'count': count, 'file': output_file}

    # Sync to NAS including images
    sync_to_nas_with_images()

    # Summary
    end_time = datetime.now(atlantic_tz)
    duration = (end_time - start_time).total_seconds()

    logger.info("\n" + "=" * 60)
    logger.info("PRODUCTION RUN SUMMARY")
    logger.info("=" * 60)

    for source, result in results.items():
        if result['success']:
            logger.info(f"✅ {source}: {result['count']} items")
            if result['file']:
                logger.info(f"   File: {result['file']}")
        else:
            logger.info(f"❌ {source}: Failed")

    # Count total images downloaded locally
    media_dir = Path('data/media')
    total_images = 0
    if media_dir.exists():
        for source_dir in media_dir.glob('*'):
            if source_dir.is_dir():
                total_images += sum(len(list(source_dir.glob(ext))) for ext in IMAGE_EXTS)

    logger.info(f"\nTotal images downloaded: {total_images}")
    logger.info(f"Duration: {duration:.1f} seconds")
    logger.info("=" * 60)


if __name__ == "__main__":
    main()