# hvac-kia-content/fetch_more_youtube.py

#!/usr/bin/env python3
"""
Fetch additional YouTube videos to reach 1000 total
"""

import sys
from pathlib import Path
sys.path.insert(0, str(Path(__file__).parent))

from src.base_scraper import ScraperConfig
from src.youtube_scraper import YouTubeScraper
from datetime import datetime
import logging
import time

# Configure root logging: each record is written to a log file and also
# emitted through a default StreamHandler.
logging.basicConfig(
    format="%(asctime)s - %(name)s - %(levelname)s - %(message)s",
    level=logging.INFO,
    handlers=[logging.FileHandler("youtube_1000.log"), logging.StreamHandler()],
)
logger = logging.getLogger(__name__)

def main() -> bool:
    """Fetch up to 1000 channel videos and save them as one markdown file.

    Steps, in order: build a ``ScraperConfig`` pointing at the backlog
    directories, delete the scraper's state file if it exists, fetch the
    videos, write the output of ``scraper.format_markdown`` to
    ``<data_dir>/markdown_current/``, save a fresh state dict through
    ``scraper.save_state``, and log summary statistics.

    Returns:
        True when videos were fetched and saved. False when the fetch
        returned nothing or when an exception was raised inside the
        fetch/save section (it is logged with its traceback and not
        re-raised). Errors raised while building the config or the scraper
        are not caught here and propagate to the caller.
    """
    logger.info("🎥 Fetching additional YouTube videos to reach 1000 total")
    # NOTE(review): this message suggests an incremental fetch of 800 videos,
    # but the code below clears the state and requests up to 1000 videos.
    # Confirm which behavior is intended before changing the wording.
    logger.info("Already have 200 videos, fetching 800 more...")
    logger.info("=" * 60)

    # Single source for the fetch limit and the completeness check below, so
    # the two cannot drift apart.
    target_total = 1000

    # Create config for backlog
    config = ScraperConfig(
        source_name="youtube",
        brand_name="hvacknowitall",
        data_dir=Path("data_production_backlog"),
        logs_dir=Path("logs_production_backlog"),
        timezone="America/Halifax"
    )

    # Initialize scraper
    scraper = YouTubeScraper(config)

    # Clear state to fetch all videos from beginning.
    # NOTE(review): the old state is not restored if the fetch below fails.
    if scraper.state_file.exists():
        scraper.state_file.unlink()
        logger.info("Cleared state for full backlog capture")

    # Fetch 1000 videos (or all available if less)
    logger.info("Starting YouTube fetch - targeting 1000 videos total...")
    # Monotonic clock: the elapsed time cannot be skewed by wall-clock
    # adjustments during a long-running fetch.
    start_time = time.monotonic()

    try:
        videos = scraper.fetch_channel_videos(max_videos=target_total)

        if not videos:
            logger.error("No videos fetched")
            return False

        # From here on ``videos`` is known to be non-empty, so indexing
        # videos[0] / videos[-1] is safe without further checks.
        logger.info(f"✅ Fetched {len(videos)} videos")

        # Generate markdown
        markdown = scraper.format_markdown(videos)

        # Save with new timestamp
        timestamp = datetime.now().strftime("%Y%m%d_%H%M%S")
        filename = f"hvacknowitall_youtube_1000_backlog_{timestamp}.md"

        # Save to markdown directory
        output_dir = config.data_dir / "markdown_current"
        output_dir.mkdir(parents=True, exist_ok=True)
        output_file = output_dir / filename

        output_file.write_text(markdown, encoding='utf-8')
        logger.info(f"📄 Saved to: {output_file}")

        # Update state. The last list element is recorded as the oldest
        # video -- assumes the scraper returns videos newest-first; TODO confirm.
        last_video = videos[-1]
        new_state = {
            'last_update': datetime.now().isoformat(),
            'last_item_count': len(videos),
            'backlog_captured': True,
            'total_videos': len(videos),
            'last_video_id': last_video.get('id'),
            'oldest_video_date': last_video.get('upload_date', ''),
        }

        scraper.save_state(new_state)

        # Statistics
        duration = time.monotonic() - start_time
        logger.info("\n" + "=" * 60)
        logger.info("📊 YOUTUBE CAPTURE COMPLETE")
        logger.info(f"Total videos: {len(videos)}")
        logger.info(f"Duration: {duration:.1f} seconds")
        # Skip the rate when no measurable time elapsed: dividing by zero here
        # would otherwise report a successful, already-saved run as failed.
        if duration > 0:
            logger.info(f"Rate: {len(videos)/duration:.1f} videos/second")

        # Show date range
        newest_date = videos[0].get('upload_date', 'Unknown')
        oldest_date = last_video.get('upload_date', 'Unknown')
        logger.info(f"Date range: {oldest_date} to {newest_date}")

        # Check if we got all available videos
        if len(videos) < target_total:
            logger.info(f"⚠️ Channel has {len(videos)} total videos (less than 1000 requested)")
        else:
            logger.info("✅ Successfully fetched 1000 videos!")

        return True

    except Exception as e:
        # logger.exception logs at ERROR level and appends the traceback,
        # which the plain message alone would lose.
        logger.exception(f"Error fetching videos: {e}")
        return False

if __name__ == "__main__":
    # Exit codes: 0 = success, 1 = main() reported failure or the user
    # interrupted the run, 2 = an unexpected exception escaped main().
    try:
        exit_code = 0 if main() else 1
    except KeyboardInterrupt:
        logger.info("\nCapture interrupted by user")
        exit_code = 1
    except Exception as e:
        logger.critical(f"Capture failed: {e}")
        exit_code = 2
    sys.exit(exit_code)