# hvac-kia-content/test_sources_simple.py

#!/usr/bin/env python3
"""
Simple test to check if each source can connect and fetch data.
"""

import os
import sys
from pathlib import Path
from dotenv import load_dotenv

# Put this script's own directory (the one that contains ``src/``, not ``src``
# itself) at the front of the import path so the ``src.*`` imports below resolve.
sys.path.insert(0, str(Path(__file__).parent))

from src.base_scraper import ScraperConfig
from src.wordpress_scraper import WordPressScraper
from src.rss_scraper import RSSScraperMailChimp, RSSScraperPodcast
from src.youtube_scraper import YouTubeScraper
from src.instagram_scraper import InstagramScraper
from src.tiktok_scraper import TikTokScraper


def test_source(scraper_class, name, limit=3):
    """Run a live smoke test against one content source and print the outcome.

    Builds a ``ScraperConfig`` for the source, instantiates ``scraper_class``
    with it, fetches a few items using the fetch method selected by ``name``
    and prints a short preview of the first item.

    Args:
        scraper_class: Callable that takes the ``ScraperConfig`` and returns
            the scraper object to exercise.
        name: Display name of the source. ``"YouTube"`` and ``"Instagram"``
            select their dedicated fetch methods; any other name uses
            ``fetch_content()``. The lower-cased name becomes the config's
            ``source_name``.
        limit: Number of items to request (YouTube, Instagram posts) or to
            keep from the fetched result (all other sources). Instagram may
            return one extra item because a single story is appended.

    Returns:
        ``True`` when at least one item was fetched; ``False`` when nothing
        came back or any exception was raised (its traceback is printed).
    """
    print(f"\n{'='*50}")
    print(f"Testing {name}")
    print('='*50)

    config = ScraperConfig(
        source_name=name.lower(),
        brand_name="hvacknowitall",
        data_dir=Path("test_data"),
        logs_dir=Path("test_logs"),
        timezone="America/Halifax"
    )

    try:
        scraper = scraper_class(config)

        # Pick the fetch method that matches the source.
        if name == "YouTube":
            items = scraper.fetch_channel_videos(max_videos=limit)
        elif name == "Instagram":
            # Treat an empty/None result from either call as "no items" so a
            # missing stories feed cannot turn fetched posts into an error.
            posts = scraper.fetch_posts(max_posts=limit) or []
            stories = (scraper.fetch_stories() or [])[:1]  # Just try 1 story
            items = [*posts, *stories]
        else:
            # WordPress, RSS and TikTok all go through fetch_content()
            # (for TikTok it wraps the async implementation).
            items = (scraper.fetch_content() or [])[:limit]

        if not items:
            print("❌ FAILED: No items fetched")
            return False

        print(f"✅ SUCCESS: Fetched {len(items)} items")

        # Show the key fields of the first item, truncated to 100 characters.
        first = items[0]
        print("\nFirst item preview:")
        for key in ('title', 'description', 'caption', 'author', 'channel',
                    'date', 'publish_date', 'link', 'url'):
            if key in first:
                value = str(first[key])[:100]
                if value:
                    print(f"  {key}: {value}")

        return True

    except Exception as e:
        # Top-level boundary of a diagnostic script: report and keep going so
        # the remaining sources are still tested.
        print(f"❌ ERROR: {e}")
        import traceback
        traceback.print_exc()
        return False


# The ``test_`` prefix (in a ``test_*.py`` file) makes pytest collect this
# helper as a test case and then fail because its parameters are not fixtures.
# Opt it out of collection; direct calls are unaffected.
test_source.__test__ = False


def main():
    """Smoke-test every source that is enabled through its environment variable.

    Calls ``load_dotenv()`` first, then walks the known sources in a fixed
    order. A source is tested with ``test_source`` only when its environment
    variable is set to a non-empty value. Ends by printing one status line
    per tested source and a passed/total count.
    """
    load_dotenv()

    banner = "#" * 50
    print("\n" + banner)
    print("# TESTING ALL SOURCES - Simple Connection Test")
    print(banner)

    # (environment variable, scraper class, display name, extra keyword args)
    sources = (
        ('WORDPRESS_API_URL', WordPressScraper, "WordPress", {}),
        ('MAILCHIMP_RSS_URL', RSSScraperMailChimp, "MailChimp", {}),
        ('PODCAST_RSS_URL', RSSScraperPodcast, "Podcast", {}),
        ('YOUTUBE_CHANNEL_URL', YouTubeScraper, "YouTube", {}),
        ('INSTAGRAM_USERNAME', InstagramScraper, "Instagram", {}),
        ('TIKTOK_USERNAME', TikTokScraper, "TikTok", {'limit': 2}),
    )

    outcomes = {}
    for env_var, scraper_cls, label, extra in sources:
        if not os.getenv(env_var):
            continue  # source not configured, leave it out of the summary
        if label == "TikTok":
            print("\n⚠️  TikTok requires Playwright browser automation")
            print("   This may take longer and could be blocked")
        outcomes[label] = test_source(scraper_cls, label, **extra)

    # Summary
    rule = "=" * 50
    print("\n" + rule)
    print("SUMMARY")
    print(rule)

    for label, ok in outcomes.items():
        print(f"{'✅' if ok else '❌'} {label}")

    passed = sum(1 for ok in outcomes.values() if ok)
    print(f"\nTotal: {passed}/{len(outcomes)} sources working")


# Run the checks only when this file is executed as a script, not on import.
if __name__ == "__main__":
    main()