Major Changes:
- Updated all code references from hvacknowitall/hvacnkowitall to hkia
- Renamed all existing markdown files to use hkia_ prefix
- Updated configuration files, scrapers, and production scripts
- Modified systemd service descriptions to use HKIA
- Changed NAS sync path to /mnt/nas/hkia

Files Updated:
- 20+ source files updated with new naming convention
- 34 markdown files renamed to hkia_* format
- All ScraperConfig brand_name parameters now use 'hkia'
- Documentation updated to reflect new naming

Rationale:
- Shorter, cleaner filenames
- Consistent branding across all outputs
- Easier to type and reference
- Maintains same functionality with improved naming

Next Steps:
- Deploy updated services to production
- Update any external references to old naming
- Monitor scrapers to ensure proper operation

🤖 Generated with [Claude Code](https://claude.ai/code)

Co-Authored-By: Claude <noreply@anthropic.com>
280 lines
No EOL
9.3 KiB
Python
280 lines
No EOL
9.3 KiB
Python
#!/usr/bin/env python3
"""
Test script to verify image downloading functionality.
Tests each scraper with a small number of items.
"""

import os
import sys
from datetime import datetime
from pathlib import Path

import pytz
from dotenv import load_dotenv

# Put this script's directory first on sys.path before the `src` package
# imports below are resolved.
sys.path.insert(0, str(Path(__file__).parent))

from src.youtube_api_scraper_with_thumbnails import YouTubeAPIScraperWithThumbnails
from src.instagram_scraper_with_images import InstagramScraperWithImages
from src.rss_scraper_with_images import RSSScraperPodcastWithImages
from src.base_scraper import ScraperConfig

# Load environment
load_dotenv()
|
||
|
||
|
||
def test_youtube_thumbnails():
    """Test YouTube thumbnail downloads.

    Fetches up to 3 videos through ``YouTubeAPIScraperWithThumbnails``,
    prints whether each video's ``local_thumbnail`` file exists on disk, and
    writes the formatted markdown to ``test_data/images/youtube_test.md``.

    Returns:
        bool: True when at least one video was fetched and the markdown was
        written; False when nothing was fetched or an exception was raised
        while scraping, checking or writing.
    """
    print("\n" + "=" * 60)
    print("TESTING YOUTUBE THUMBNAIL DOWNLOADS")
    print("=" * 60)

    # NOTE(review): brand_name still carries the legacy 'hvacnkowitall'
    # spelling - confirm whether it should be 'hkia'.
    config = ScraperConfig(
        source_name='YouTube_Test',
        brand_name='hvacnkowitall',
        data_dir=Path('test_data/images'),
        logs_dir=Path('test_logs'),
        timezone='America/Halifax'
    )

    try:
        scraper = YouTubeAPIScraperWithThumbnails(config)
        print("Fetching 3 YouTube videos with thumbnails...")

        videos = scraper.fetch_content(max_posts=3)

        if not videos:
            print("❌ No videos fetched")
            return False

        print(f"✅ Fetched {len(videos)} videos")

        # Check thumbnails
        for video in videos:
            # A missing or empty title must not abort the whole check, so
            # fall back to a placeholder (same approach as the podcast test).
            title = str(video.get('title') or 'Unknown')[:50]

            if not video.get('local_thumbnail'):
                print(f" ✗ {title}... - no thumbnail downloaded")
                continue

            thumb_path = Path(video['local_thumbnail'])
            if thumb_path.exists():
                size_kb = thumb_path.stat().st_size / 1024
                print(f" ✓ {title}...")
                print(f" Thumbnail: {thumb_path.name} ({size_kb:.1f} KB)")
            else:
                print(f" ✗ {title}... - thumbnail file missing")

        # Save sample markdown
        markdown = scraper.format_markdown(videos)
        output_file = Path('test_data/images/youtube_test.md')
        output_file.parent.mkdir(parents=True, exist_ok=True)
        output_file.write_text(markdown, encoding='utf-8')
        print(f"\nMarkdown saved to: {output_file}")

        return True

    except Exception as e:
        # Top-level boundary of this check: report the failure with a full
        # traceback and signal it through the return value.
        print(f"❌ Error: {e}")
        import traceback
        traceback.print_exc()
        return False
|
||
|
||
|
||
def test_instagram_images():
    """Test Instagram image downloads.

    Skips (returning False) when the ``INSTAGRAM_USERNAME`` environment
    variable is not set. Otherwise fetches up to 3 posts through
    ``InstagramScraperWithImages``, prints the status of every path listed in
    each post's ``local_images``, and writes the formatted markdown to
    ``test_data/images/instagram_test.md``.

    Returns:
        bool: True when at least one post was fetched and the markdown was
        written; False when the test was skipped, nothing was fetched, or an
        exception was raised.
    """
    print("\n" + "=" * 60)
    print("TESTING INSTAGRAM IMAGE DOWNLOADS")
    print("=" * 60)

    if not os.getenv('INSTAGRAM_USERNAME'):
        print("⚠️ Instagram not configured - skipping")
        return False

    # NOTE(review): brand_name still carries the legacy 'hvacnkowitall'
    # spelling - confirm whether it should be 'hkia'.
    config = ScraperConfig(
        source_name='Instagram_Test',
        brand_name='hvacnkowitall',
        data_dir=Path('test_data/images'),
        logs_dir=Path('test_logs'),
        timezone='America/Halifax'
    )

    try:
        scraper = InstagramScraperWithImages(config)
        print("Fetching 3 Instagram posts with images...")

        items = scraper.fetch_content(max_posts=3)

        if not items:
            print("❌ No posts fetched")
            return False

        print(f"✅ Fetched {len(items)} posts")

        # Check images
        total_images = 0
        for item in items:
            post_id = item.get('id', 'Unknown')
            # Treat an absent or None 'local_images' entry as "no images".
            images = item.get('local_images') or []
            total_images += len(images)

            if images:
                print(f" ✓ Post {post_id}: {len(images)} image(s)")
                for img_path in images:
                    path = Path(img_path)
                    if path.exists():
                        size_kb = path.stat().st_size / 1024
                        print(f" - {path.name} ({size_kb:.1f} KB)")
                    else:
                        # Report listed-but-absent files instead of silently
                        # ignoring them, as the thumbnail tests do.
                        print(f" ✗ {path.name} - image file missing")
            elif item.get('is_video'):
                print(f" ℹ Post {post_id}: Video post (thumbnail only)")
            else:
                print(f" ✗ Post {post_id}: No images downloaded")

        print(f"\nTotal images downloaded: {total_images}")

        # Save sample markdown
        markdown = scraper.format_markdown(items)
        output_file = Path('test_data/images/instagram_test.md')
        output_file.parent.mkdir(parents=True, exist_ok=True)
        output_file.write_text(markdown, encoding='utf-8')
        print(f"Markdown saved to: {output_file}")

        return True

    except Exception as e:
        # Top-level boundary of this check: report the failure with a full
        # traceback and signal it through the return value.
        print(f"❌ Error: {e}")
        import traceback
        traceback.print_exc()
        return False
|
||
|
||
|
||
def test_podcast_thumbnails():
    """Test Podcast thumbnail downloads.

    Skips (returning False) when the ``PODCAST_RSS_URL`` environment variable
    is not set. Otherwise fetches up to 3 episodes through
    ``RSSScraperPodcastWithImages``, prints whether each episode's
    ``local_thumbnail`` file exists on disk, and writes the formatted
    markdown to ``test_data/images/podcast_test.md``.

    Returns:
        bool: True when at least one episode was fetched and the markdown
        was written; False when the test was skipped, nothing was fetched,
        or an exception was raised.
    """
    print("\n" + "=" * 60)
    print("TESTING PODCAST THUMBNAIL DOWNLOADS")
    print("=" * 60)

    if not os.getenv('PODCAST_RSS_URL'):
        print("⚠️ Podcast not configured - skipping")
        return False

    # NOTE(review): brand_name still carries the legacy 'hvacnkowitall'
    # spelling - confirm whether it should be 'hkia'.
    config = ScraperConfig(
        source_name='Podcast_Test',
        brand_name='hvacnkowitall',
        data_dir=Path('test_data/images'),
        logs_dir=Path('test_logs'),
        timezone='America/Halifax'
    )

    try:
        scraper = RSSScraperPodcastWithImages(config)
        print("Fetching 3 podcast episodes with thumbnails...")

        items = scraper.fetch_content(max_items=3)

        if not items:
            print("❌ No episodes fetched")
            return False

        print(f"✅ Fetched {len(items)} episodes")

        # Check thumbnails
        for item in items:
            # `or` also covers a title that is present but None/empty.
            title = str(item.get('title') or 'Unknown')[:50]

            if not item.get('local_thumbnail'):
                print(f" ✗ {title}... - no thumbnail downloaded")
                continue

            thumb_path = Path(item['local_thumbnail'])
            if thumb_path.exists():
                size_kb = thumb_path.stat().st_size / 1024
                print(f" ✓ {title}...")
                print(f" Thumbnail: {thumb_path.name} ({size_kb:.1f} KB)")
            else:
                print(f" ✗ {title}... - thumbnail file missing")

        # Save sample markdown
        markdown = scraper.format_markdown(items)
        output_file = Path('test_data/images/podcast_test.md')
        output_file.parent.mkdir(parents=True, exist_ok=True)
        output_file.write_text(markdown, encoding='utf-8')
        print(f"\nMarkdown saved to: {output_file}")

        return True

    except Exception as e:
        # Top-level boundary of this check: report the failure with a full
        # traceback and signal it through the return value.
        print(f"❌ Error: {e}")
        import traceback
        traceback.print_exc()
        return False
|
||
|
||
|
||
def check_media_directories():
    """Check media directory structure.

    Prints a summary of ``test_data/images/media``: for every sub-directory
    holding at least one image (``.jpg``, ``.jpeg``, ``.png`` or ``.gif``,
    matched case-insensitively) the image count and combined size, followed
    by up to three example files in sorted order. Prints a notice instead
    when the media directory does not exist.

    Returns:
        None
    """
    print("\n" + "=" * 60)
    print("MEDIA DIRECTORY STRUCTURE")
    print("=" * 60)

    image_extensions = ('.jpg', '.jpeg', '.png', '.gif')
    preview_count = 3

    test_media = Path('test_data/images/media')
    if not test_media.exists():
        print("No test media directory found")
        return

    print(f"Media directory: {test_media}")

    for source_dir in sorted(test_media.glob('*')):
        if not source_dir.is_dir():
            continue

        # Single directory pass; sorting makes the preview deterministic.
        images = sorted(
            entry for entry in source_dir.iterdir()
            if entry.name.lower().endswith(image_extensions)
        )
        if not images:
            continue

        # Stat every file once and reuse the sizes for total and preview.
        sizes = [img.stat().st_size for img in images]
        total_size = sum(sizes) / (1024 * 1024)  # MB
        print(f" {source_dir.name}/: {len(images)} images ({total_size:.1f} MB)")

        # Show first 3 images
        for img, size in zip(images[:preview_count], sizes):
            print(f" - {img.name} ({size / 1024:.1f} KB)")
        if len(images) > preview_count:
            print(f" ... and {len(images) - preview_count} more")
|
||
|
||
|
||
def main():
    """Run all tests.

    Runs the YouTube, Instagram and podcast download checks in that order,
    prints the media directory summary, then prints a pass/fail table.

    Returns:
        int: 0 when every check returned a truthy result, 1 otherwise, so
        the value can be used directly as the process exit status.
    """
    print("=" * 70)
    print("TESTING IMAGE DOWNLOAD FUNCTIONALITY")
    print("=" * 70)
    print("This will test downloading thumbnails and images from all sources")
    print("(YouTube thumbnails, Instagram images, Podcast thumbnails)")
    print()

    # Dict literals evaluate in order, so the checks run YouTube ->
    # Instagram -> Podcast and the summary lists them the same way.
    results = {
        'YouTube': test_youtube_thumbnails(),
        'Instagram': test_instagram_images(),
        'Podcast': test_podcast_thumbnails(),
    }

    # Check media directories
    check_media_directories()

    # Summary
    print("\n" + "=" * 60)
    print("TEST SUMMARY")
    print("=" * 60)

    for source, success in results.items():
        status = "✅ PASSED" if success else "❌ FAILED"
        print(f"{source:15} {status}")

    passed = sum(1 for success in results.values() if success)
    total = len(results)
    print(f"\nTotal: {passed}/{total} passed")

    if passed == total:
        print("\n✅ All tests passed! Ready for production.")
        return 0

    print("\n⚠️ Some tests failed. Check the errors above.")
    return 1


if __name__ == "__main__":
    # Propagate the outcome so shells and schedulers can detect failures.
    sys.exit(main())