#!/usr/bin/env python3
"""
Simple test to check if each source can connect and fetch data.

Each source is only exercised when its enabling environment variable is set
(after loading a ``.env`` file). The process exit status is 0 when every
tested source returned at least one item and 1 otherwise.
"""

import os
import sys
import traceback
from pathlib import Path

from dotenv import load_dotenv

# Add src to path
sys.path.insert(0, str(Path(__file__).parent))

from src.base_scraper import ScraperConfig
from src.wordpress_scraper import WordPressScraper
from src.rss_scraper import RSSScraperMailChimp, RSSScraperPodcast
from src.youtube_scraper import YouTubeScraper
from src.instagram_scraper import InstagramScraper
from src.tiktok_scraper import TikTokScraper

# Fields shown (when present and non-empty) in the first-item preview.
_PREVIEW_KEYS = (
    'title', 'description', 'caption', 'author', 'channel',
    'date', 'publish_date', 'link', 'url',
)

# Maximum number of characters printed per preview field.
_PREVIEW_WIDTH = 100

# (environment variable that enables the source, display name,
#  scraper class, item limit passed to test_source)
_SOURCES = (
    ('WORDPRESS_API_URL', "WordPress", WordPressScraper, 3),
    ('MAILCHIMP_RSS_URL', "MailChimp", RSSScraperMailChimp, 3),
    ('PODCAST_RSS_URL', "Podcast", RSSScraperPodcast, 3),
    ('YOUTUBE_CHANNEL_URL', "YouTube", YouTubeScraper, 3),
    ('INSTAGRAM_USERNAME', "Instagram", InstagramScraper, 3),
    ('TIKTOK_USERNAME', "TikTok", TikTokScraper, 2),
)


def _fetch_items(scraper, name, limit):
    """Fetch items from *scraper* using the method that matches *name*.

    A falsy result (``None`` or empty) from any scraper call is treated as
    "no items" rather than raising, so every source is handled the same way.
    """
    if name == "YouTube":
        return scraper.fetch_channel_videos(max_videos=limit) or []

    if name == "Instagram":
        posts = scraper.fetch_posts(max_posts=limit) or []
        # Just try 1 story; guard against a None result before slicing.
        stories = (scraper.fetch_stories() or [])[:1]
        return list(posts) + list(stories)

    # WordPress, RSS and TikTok scrapers all go through fetch_content().
    return (scraper.fetch_content() or [])[:limit]


def _print_preview(item):
    """Print the key fields of *item*, each truncated to _PREVIEW_WIDTH chars."""
    print("\nFirst item preview:")
    for key in _PREVIEW_KEYS:
        if key in item:
            value = str(item[key])[:_PREVIEW_WIDTH]
            if value:
                print(f" {key}: {value}")


def test_source(scraper_class, name, limit=3):
    """Test if a source can fetch data.

    Args:
        scraper_class: Scraper class; it is instantiated with a ScraperConfig.
        name: Display name of the source. It selects the fetch method and,
            lower-cased, is used as the config's ``source_name``.
        limit: Maximum number of items requested from (or kept for) the source.

    Returns:
        True if at least one item was fetched; False if nothing was fetched
        or any exception was raised while creating or running the scraper.
    """
    print(f"\n{'='*50}")
    print(f"Testing {name}")
    print('='*50)

    config = ScraperConfig(
        source_name=name.lower(),
        brand_name="hvacknowitall",
        data_dir=Path("test_data"),
        logs_dir=Path("test_logs"),
        timezone="America/Halifax"
    )

    # Broad catch is deliberate: this is the top-level boundary of a
    # connectivity check, and any failure should be reported, not propagated.
    try:
        scraper = scraper_class(config)
        items = _fetch_items(scraper, name, limit)

        if not items:
            print("❌ FAILED: No items fetched")
            return False

        print(f"✅ SUCCESS: Fetched {len(items)} items")
        _print_preview(items[0])
        return True
    except Exception as e:
        print(f"❌ ERROR: {e}")
        traceback.print_exc()
        return False


def main():
    """Run the connection test for every configured source and print a summary.

    Returns:
        Process exit status: 0 if every tested source succeeded (or none was
        configured), 1 if at least one tested source failed.
    """
    # Load environment
    load_dotenv()

    print("\n" + "#"*50)
    print("# TESTING ALL SOURCES - Simple Connection Test")
    print("#"*50)

    results = {}

    # Test each source whose enabling environment variable is set
    for env_var, name, scraper_class, limit in _SOURCES:
        if not os.getenv(env_var):
            continue
        if name == "TikTok":
            print("\n⚠️ TikTok requires Playwright browser automation")
            print(" This may take longer and could be blocked")
        results[name] = test_source(scraper_class, name, limit=limit)

    # Summary
    print("\n" + "="*50)
    print("SUMMARY")
    print("="*50)

    if not results:
        print("⚠️ No sources configured (no source environment variables set)")

    for source, success in results.items():
        status = "✅" if success else "❌"
        print(f"{status} {source}")

    total = len(results)
    passed = sum(1 for success in results.values() if success)
    print(f"\nTotal: {passed}/{total} sources working")

    return 0 if passed == total else 1


if __name__ == "__main__":
    sys.exit(main())