- Add optional individual video page fetching for complete captions
- Implement profile scrolling to discover more videos (27+ vs 18)
- Add configurable rate limiting and anti-detection delays
- Fix RSS scrapers to support max_items parameter for backlog fetching
- Add fetch_captions parameter with max_caption_fetches limit
- Include additional metadata extraction (likes, comments, shares, duration)

🤖 Generated with [Claude Code](https://claude.ai/code)

Co-Authored-By: Claude <noreply@anthropic.com>
#!/usr/bin/env python3
"""
Real-world testing script for all scrapers.
Tests both recent posts and backlog fetching with actual data.
"""
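
# Example invocations (script filename assumed; the flags are defined in main()):
#   python test_scrapers.py                            # recent posts, 3 items per source
#   python test_scrapers.py --items 10 --type backlog
#   python test_scrapers.py --source tiktok --type both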

import argparse
import os
import sys
import time
from pathlib import Path

from dotenv import load_dotenv

# Add src to path
sys.path.insert(0, str(Path(__file__).parent))

from src.base_scraper import ScraperConfig
from src.wordpress_scraper import WordPressScraper
from src.rss_scraper import RSSScraperMailChimp, RSSScraperPodcast
from src.youtube_scraper import YouTubeScraper
from src.instagram_scraper import InstagramScraper
from src.tiktok_scraper_advanced import TikTokScraperAdvanced


def test_scraper(scraper_class, scraper_name, max_items=3, test_type="recent"):
    """Test a single scraper with real data."""
    print(f"\n{'='*60}")
    print(f"Testing {scraper_name} - {test_type} ({max_items} items)")
    print('=' * 60)

    # Create test directories
    test_data_dir = Path(f"test_data/{test_type}")
    test_logs_dir = Path(f"test_logs/{test_type}")

    config = ScraperConfig(
        source_name=scraper_name.lower().replace(" ", "_"),
        brand_name="hvacknowitall",
        data_dir=test_data_dir,
        logs_dir=test_logs_dir,
        timezone="America/Halifax"
    )
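
    # Run artifacts land under test_data/<test_type>/ (the markdown output
    # written below) and test_logs/<test_type>/ via the logs_dir setting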

    try:
        # Initialize scraper
        scraper = scraper_class(config)
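
        # state_file is assumed to be maintained by the base scraper class,
        # recording previously seen items so routine runs fetch only new ones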
        # For backlog testing, clear state to fetch all items
        if test_type == "backlog":
            if scraper.state_file.exists():
                scraper.state_file.unlink()
                print(f"Cleared state for {scraper_name} backlog testing")

        # Fetch content with limit
        print(f"Fetching content from {scraper_name}...")
        start_time = time.time()
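
        # Each scraper exposes a slightly different fetch signature
        # (matching the calls below):
        #   YouTube       -> fetch_channel_videos(max_videos=...)
        #   Instagram     -> fetch_content(max_posts=...)
        #   TikTok        -> fetch_content(max_posts=..., fetch_captions=...,
        #                                  max_caption_fetches=...)
        #   RSS/WordPress -> fetch_content(max_items=...)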
        if scraper_name in ["YouTube", "Instagram", "TikTok"]:
            if scraper_name == "YouTube":
                items = scraper.fetch_channel_videos(max_videos=max_items)
            elif scraper_name == "Instagram":
                items = scraper.fetch_content(max_posts=max_items)
            elif scraper_name == "TikTok":
                # Optionally fetch captions (only in backlog mode, and only
                # for small runs, to keep test time down)
                fetch_captions = (test_type == "backlog" and max_items <= 5)
                if fetch_captions:
                    print(f"  Note: Fetching captions for up to {min(max_items, 3)} videos...")
                items = scraper.fetch_content(
                    max_posts=max_items,
                    fetch_captions=fetch_captions,
                    max_caption_fetches=min(max_items, 3)  # Limit to 3 for testing
                )
        else:
            # RSS and WordPress scrapers all accept max_items
            items = scraper.fetch_content(max_items=max_items)

        elapsed = time.time() - start_time

        if not items:
            print(f"❌ No items fetched from {scraper_name}")
            return False

        print(f"✅ Fetched {len(items)} items in {elapsed:.2f} seconds")

        # Format as markdown
        markdown = scraper.format_markdown(items)

        # Save to test file
        output_file = test_data_dir / f"{scraper_name.lower()}_{test_type}_test.md"
        output_file.parent.mkdir(parents=True, exist_ok=True)

        with open(output_file, 'w', encoding='utf-8') as f:
            f.write(markdown)

        print(f"✅ Saved to {output_file}")

        # Display summary
        print(f"\nSummary for {scraper_name}:")
        print(f"  - Items fetched: {len(items)}")
        print(f"  - Time taken: {elapsed:.2f}s")
        print(f"  - Output size: {len(markdown)} characters")

        # Preview the first item, picking fields by what this scraper returns
        # (items is guaranteed non-empty by the early return above)
        first_item = items[0]
        print("\nFirst item preview:")

        if 'title' in first_item:
            title = first_item.get('title', 'N/A')
            # WordPress nests the title as {'rendered': ...}
            if isinstance(title, dict):
                title = title.get('rendered', 'N/A')
            print(f"  Title: {str(title)[:80]}")
        if 'description' in first_item:
            desc = first_item.get('description', 'N/A')
            if desc:
                print(f"  Description: {desc[:80]}...")
        if 'caption' in first_item:
            caption = first_item.get('caption', 'N/A')
            if caption:
                print(f"  Caption: {caption[:80]}...")
        if 'author' in first_item:
            print(f"  Author: {first_item.get('author', 'N/A')}")
        if 'channel' in first_item:
            print(f"  Channel: {first_item.get('channel', 'N/A')}")
        if 'publish_date' in first_item:
            print(f"  Date: {first_item.get('publish_date', 'N/A')}")
        elif 'date' in first_item:
            print(f"  Date: {first_item.get('date', 'N/A')}")
        if 'link' in first_item:
            print(f"  Link: {first_item.get('link', 'N/A')[:80]}")
        elif 'url' in first_item:
            print(f"  URL: {first_item.get('url', 'N/A')[:80]}")

        return True

    except Exception as e:
        print(f"❌ Error testing {scraper_name}: {e}")
        import traceback
        traceback.print_exc()
        return False


def run_all_tests(max_items=3, test_type="recent"):
    """Run tests for all configured scrapers."""
    print(f"\n{'#'*60}")
    print(f"# Running {test_type} tests with {max_items} items per source")
    print(f"{'#'*60}")
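
    # Maps scraper name -> pass/fail; unconfigured sources are skipped and
    # never appear in the final summary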
    results = {}

    # Test WordPress
    if os.getenv('WORDPRESS_API_URL'):
        print("\n🔧 Testing WordPress Scraper")
        results['WordPress'] = test_scraper(WordPressScraper, "WordPress", max_items, test_type)
    else:
        print("\n⚠️ WordPress not configured (WORDPRESS_API_URL missing)")

    # Test MailChimp RSS
    if os.getenv('MAILCHIMP_RSS_URL'):
        print("\n🔧 Testing MailChimp RSS Scraper")
        results['MailChimp'] = test_scraper(RSSScraperMailChimp, "MailChimp", max_items, test_type)
    else:
        print("\n⚠️ MailChimp RSS not configured (MAILCHIMP_RSS_URL missing)")

    # Test Podcast RSS
    if os.getenv('PODCAST_RSS_URL'):
        print("\n🔧 Testing Podcast RSS Scraper")
        results['Podcast'] = test_scraper(RSSScraperPodcast, "Podcast", max_items, test_type)
    else:
        print("\n⚠️ Podcast RSS not configured (PODCAST_RSS_URL missing)")

    # Test YouTube
    if os.getenv('YOUTUBE_CHANNEL_URL'):
        print("\n🔧 Testing YouTube Scraper")
        results['YouTube'] = test_scraper(YouTubeScraper, "YouTube", max_items, test_type)
    else:
        print("\n⚠️ YouTube not configured (YOUTUBE_CHANNEL_URL missing)")

    # Test Instagram
    if os.getenv('INSTAGRAM_USERNAME'):
        print("\n🔧 Testing Instagram Scraper")
        print("⚠️ Note: Instagram may require manual login or rate limiting")
        results['Instagram'] = test_scraper(InstagramScraper, "Instagram", max_items, test_type)
    else:
        print("\n⚠️ Instagram not configured (INSTAGRAM_USERNAME missing)")

    # Test TikTok
    if os.getenv('TIKTOK_USERNAME'):
        print("\n🔧 Testing TikTok Scraper (Advanced with Headed Browser)")
        print("⚠️ Note: TikTok will open a browser window on DISPLAY=:0")
        results['TikTok'] = test_scraper(TikTokScraperAdvanced, "TikTok", max_items, test_type)
    else:
        print("\n⚠️ TikTok not configured (TIKTOK_USERNAME missing)")

    # Print summary
    print(f"\n{'='*60}")
    print(f"TEST SUMMARY - {test_type} ({max_items} items)")
    print('=' * 60)

    for scraper, success in results.items():
        status = "✅ PASSED" if success else "❌ FAILED"
        print(f"{scraper:15} {status}")

    total = len(results)
    passed = sum(1 for s in results.values() if s)
    print(f"\nTotal: {passed}/{total} passed")

    return all(results.values())


def main():
    """Main entry point."""
    parser = argparse.ArgumentParser(description="Test scrapers with real data")
    parser.add_argument('--items', type=int, default=3,
                        help='Number of items to fetch per source (default: 3)')
    parser.add_argument('--type', choices=['recent', 'backlog', 'both'], default='recent',
                        help='Test type: recent posts, backlog, or both (default: recent)')
    parser.add_argument('--source', type=str, default=None,
                        help='Test a specific source only (wordpress, mailchimp, podcast, youtube, instagram, tiktok)')

    args = parser.parse_args()

    # Load environment variables
    load_dotenv()
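    # .env is expected to provide the per-source settings checked in
    # run_all_tests(): WORDPRESS_API_URL, MAILCHIMP_RSS_URL, PODCAST_RSS_URL,
    # YOUTUBE_CHANNEL_URL, INSTAGRAM_USERNAME, TIKTOK_USERNAME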

    # Determine which tests to run
    if args.type == 'both':
        test_types = ['recent', 'backlog']
    else:
        test_types = [args.type]

    all_passed = True

    for test_type in test_types:
        if args.source:
            # Test specific source
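            # source_map pairs each --source value with (scraper class,
            # display name); the display name drives logging and output paths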
            source_map = {
                'wordpress': (WordPressScraper, "WordPress"),
                'mailchimp': (RSSScraperMailChimp, "MailChimp"),
                'podcast': (RSSScraperPodcast, "Podcast"),
                'youtube': (YouTubeScraper, "YouTube"),
                'instagram': (InstagramScraper, "Instagram"),
                'tiktok': (TikTokScraperAdvanced, "TikTok")
            }

            if args.source.lower() in source_map:
                scraper_class, scraper_name = source_map[args.source.lower()]
                success = test_scraper(scraper_class, scraper_name, args.items, test_type)
                all_passed = all_passed and success
            else:
                print(f"Unknown source: {args.source}")
                all_passed = False
        else:
            # Test all sources
            success = run_all_tests(args.items, test_type)
            all_passed = all_passed and success

    # Exit with appropriate code
    sys.exit(0 if all_passed else 1)


if __name__ == "__main__":
    main()