#!/usr/bin/env python3
"""
Test script to verify image downloading functionality.
Tests each scraper with a small number of items.

Each ``test_*`` function returns ``True`` when content was fetched,
``False`` when nothing was fetched or an error occurred, and ``None``
when the source is not configured and the test was skipped. The script
exits with status 1 if any test failed, otherwise 0.
"""

import os
import sys
import traceback
from datetime import datetime  # not referenced below; kept from the original import list
from pathlib import Path
from typing import Optional

# Make the sibling ``src`` package importable regardless of the caller's cwd.
sys.path.insert(0, str(Path(__file__).parent))

import pytz  # not referenced below; kept from the original import list
from dotenv import load_dotenv

from src.youtube_api_scraper_with_thumbnails import YouTubeAPIScraperWithThumbnails
from src.instagram_scraper_with_images import InstagramScraperWithImages
from src.rss_scraper_with_images import RSSScraperPodcastWithImages
from src.base_scraper import ScraperConfig

# Load environment
load_dotenv()

# Settings shared by every scraper configuration built in this script.
BRAND_NAME = 'hvacnkowitall'
DATA_DIR = Path('test_data/images')
LOGS_DIR = Path('test_logs')
TIMEZONE = 'America/Halifax'

# Glob patterns counted as images when summarising the media directory.
IMAGE_PATTERNS = ('*.jpg', '*.jpeg', '*.png', '*.gif')


def _print_banner(title, width=60):
    """Print *title* framed by two rules of ``=``, preceded by a blank line."""
    print("\n" + "=" * width)
    print(title)
    print("=" * width)


def _make_config(source_name):
    """Build the ScraperConfig used by the tests for *source_name*."""
    return ScraperConfig(
        source_name=source_name,
        brand_name=BRAND_NAME,
        data_dir=DATA_DIR,
        logs_dir=LOGS_DIR,
        timezone=TIMEZONE,
    )


def _save_markdown(scraper, items, filename):
    """Render *items* with ``scraper.format_markdown`` and write them under DATA_DIR.

    Returns the path of the written file.
    """
    markdown = scraper.format_markdown(items)
    output_file = DATA_DIR / filename
    output_file.parent.mkdir(parents=True, exist_ok=True)
    output_file.write_text(markdown, encoding='utf-8')
    return output_file


def _short_title(item):
    """Return the first 50 characters of the item's title, or of 'Unknown'."""
    # ``or`` also covers a title that is present but None/empty.
    return (item.get('title') or 'Unknown')[:50]


def _report_thumbnail(item):
    """Print whether the item's ``local_thumbnail`` was downloaded and exists."""
    title = _short_title(item)
    local_thumbnail = item.get('local_thumbnail')
    if not local_thumbnail:
        print(f" ✗ {title}... - no thumbnail downloaded")
        return

    thumb_path = Path(local_thumbnail)
    if not thumb_path.exists():
        print(f" ✗ {title}... - thumbnail file missing")
        return

    size_kb = thumb_path.stat().st_size / 1024
    print(f" ✓ {title}...")
    print(f" Thumbnail: {thumb_path.name} ({size_kb:.1f} KB)")


def _report_error(exc):
    """Print the standard error line followed by the active traceback."""
    print(f"❌ Error: {exc}")
    traceback.print_exc()


def test_youtube_thumbnails() -> Optional[bool]:
    """Test YouTube thumbnail downloads.

    Returns True if at least one video was fetched, False if none were
    fetched or an exception was raised.
    """
    _print_banner("TESTING YOUTUBE THUMBNAIL DOWNLOADS")
    config = _make_config('YouTube_Test')

    try:
        scraper = YouTubeAPIScraperWithThumbnails(config)
        print("Fetching 3 YouTube videos with thumbnails...")
        videos = scraper.fetch_content(max_posts=3)

        if not videos:
            print("❌ No videos fetched")
            return False

        print(f"✅ Fetched {len(videos)} videos")

        # Check thumbnails
        for video in videos:
            _report_thumbnail(video)

        # Save sample markdown
        output_file = _save_markdown(scraper, videos, 'youtube_test.md')
        print(f"\nMarkdown saved to: {output_file}")
        return True
    except Exception as e:  # top-level test boundary: report and fail
        _report_error(e)
        return False


def test_instagram_images() -> Optional[bool]:
    """Test Instagram image downloads.

    Returns None (skipped) when INSTAGRAM_USERNAME is not set, True if at
    least one post was fetched, False if none were fetched or an exception
    was raised.
    """
    _print_banner("TESTING INSTAGRAM IMAGE DOWNLOADS")

    if not os.getenv('INSTAGRAM_USERNAME'):
        print("⚠️ Instagram not configured - skipping")
        # None (not False) so the summary can tell a skip from a failure.
        return None

    config = _make_config('Instagram_Test')

    try:
        scraper = InstagramScraperWithImages(config)
        print("Fetching 3 Instagram posts with images...")
        items = scraper.fetch_content(max_posts=3)

        if not items:
            print("❌ No posts fetched")
            return False

        print(f"✅ Fetched {len(items)} posts")

        # Check images
        total_images = 0
        for item in items:
            post_id = item.get('id', 'unknown')
            images = item.get('local_images', [])
            total_images += len(images)

            if images:
                print(f" ✓ Post {post_id}: {len(images)} image(s)")
                for img_path in images:
                    path = Path(img_path)
                    if path.exists():
                        size_kb = path.stat().st_size / 1024
                        print(f" - {path.name} ({size_kb:.1f} KB)")
                    else:
                        # Previously a listed-but-absent file was ignored silently.
                        print(f" - {path.name} (file missing)")
            elif item.get('is_video'):
                print(f" ℹ Post {post_id}: Video post (thumbnail only)")
            else:
                print(f" ✗ Post {post_id}: No images downloaded")

        print(f"\nTotal images downloaded: {total_images}")

        # Save sample markdown
        output_file = _save_markdown(scraper, items, 'instagram_test.md')
        print(f"Markdown saved to: {output_file}")
        return True
    except Exception as e:  # top-level test boundary: report and fail
        _report_error(e)
        return False


def test_podcast_thumbnails() -> Optional[bool]:
    """Test Podcast thumbnail downloads.

    Returns None (skipped) when PODCAST_RSS_URL is not set, True if at
    least one episode was fetched, False if none were fetched or an
    exception was raised.
    """
    _print_banner("TESTING PODCAST THUMBNAIL DOWNLOADS")

    if not os.getenv('PODCAST_RSS_URL'):
        print("⚠️ Podcast not configured - skipping")
        # None (not False) so the summary can tell a skip from a failure.
        return None

    config = _make_config('Podcast_Test')

    try:
        scraper = RSSScraperPodcastWithImages(config)
        print("Fetching 3 podcast episodes with thumbnails...")
        items = scraper.fetch_content(max_items=3)

        if not items:
            print("❌ No episodes fetched")
            return False

        print(f"✅ Fetched {len(items)} episodes")

        # Check thumbnails
        for item in items:
            _report_thumbnail(item)

        # Save sample markdown
        output_file = _save_markdown(scraper, items, 'podcast_test.md')
        print(f"\nMarkdown saved to: {output_file}")
        return True
    except Exception as e:  # top-level test boundary: report and fail
        _report_error(e)
        return False


def check_media_directories():
    """Check media directory structure.

    Lists each sub-directory of ``test_data/images/media`` that contains
    image files, with the image count, total size, and the first three
    file names.
    """
    _print_banner("MEDIA DIRECTORY STRUCTURE")

    test_media = DATA_DIR / 'media'
    if not test_media.exists():
        print("No test media directory found")
        return

    print(f"Media directory: {test_media}")

    for source_dir in sorted(test_media.glob('*')):
        if not source_dir.is_dir():
            continue

        images = [
            img
            for pattern in IMAGE_PATTERNS
            for img in source_dir.glob(pattern)
        ]
        if not images:
            continue

        total_size = sum(img.stat().st_size for img in images) / (1024 * 1024)  # MB
        print(f" {source_dir.name}/: {len(images)} images ({total_size:.1f} MB)")

        # Show first 3 images
        for img in images[:3]:
            size_kb = img.stat().st_size / 1024
            print(f" - {img.name} ({size_kb:.1f} KB)")

        if len(images) > 3:
            print(f" ... and {len(images) - 3} more")


def _status_label(outcome):
    """Map a test outcome (True / False / None) to its summary label."""
    if outcome is None:
        return "⏭️ SKIPPED"
    return "✅ PASSED" if outcome else "❌ FAILED"


def main() -> int:
    """Run all tests.

    Returns the process exit status: 1 if any test failed, otherwise 0.
    Skipped tests are reported separately and do not count as failures.
    """
    print("=" * 70)
    print("TESTING IMAGE DOWNLOAD FUNCTIONALITY")
    print("=" * 70)
    print("This will test downloading thumbnails and images from all sources")
    print("(YouTube thumbnails, Instagram images, Podcast thumbnails)")
    print()

    results = {}

    # Test YouTube
    results['YouTube'] = test_youtube_thumbnails()

    # Test Instagram
    results['Instagram'] = test_instagram_images()

    # Test Podcast
    results['Podcast'] = test_podcast_thumbnails()

    # Check media directories
    check_media_directories()

    # Summary
    _print_banner("TEST SUMMARY")

    for source, outcome in results.items():
        print(f"{source:15} {_status_label(outcome)}")

    total = len(results)
    passed = sum(1 for outcome in results.values() if outcome)
    skipped = sum(1 for outcome in results.values() if outcome is None)
    failed = total - passed - skipped

    print(f"\nTotal: {passed}/{total} passed")
    if skipped:
        print(f"Skipped: {skipped}")

    if failed:
        print("\n⚠️ Some tests failed. Check the errors above.")
        return 1

    if skipped:
        print("\n⚠️ No failures, but some sources were skipped (not configured).")
    else:
        print("\n✅ All tests passed! Ready for production.")
    return 0


if __name__ == "__main__":
    # Propagate failures to the shell / CI via the exit status.
    sys.exit(main())