- Add optional individual video page fetching for complete captions
- Implement profile scrolling to discover more videos (27+ vs 18)
- Add configurable rate limiting and anti-detection delays
- Fix RSS scrapers to support max_items parameter for backlog fetching
- Add fetch_captions parameter with max_caption_fetches limit
- Include additional metadata extraction (likes, comments, shares, duration)

🤖 Generated with [Claude Code](https://claude.ai/code)

Co-Authored-By: Claude <noreply@anthropic.com>
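For reference, the new TikTok options surface as keyword arguments to `fetch_content`, as exercised by the test script below. A minimal sketch of the intended call shape — parameter and field names are taken from the script, while the `ScraperConfig` values and counts are illustrative placeholders:

```python
from pathlib import Path

from src.base_scraper import ScraperConfig
from src.tiktok_scraper_advanced import TikTokScraperAdvanced

# Illustrative config; field names mirror the test script, values are placeholders.
config = ScraperConfig(
    source_name="tiktok",
    brand_name="hvacknowitall",
    data_dir=Path("data"),
    logs_dir=Path("logs"),
    timezone="America/Halifax",
)

scraper = TikTokScraperAdvanced(config)

# fetch_captions enables the optional per-video page fetch for complete captions;
# max_caption_fetches caps how many videos incur that slower extra request.
items = scraper.fetch_content(
    max_posts=10,
    fetch_captions=True,
    max_caption_fetches=3,
)
```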
268 lines · No EOL · 10 KiB · Python · Executable file
#!/usr/bin/env python3
"""
Real-world testing script for all scrapers.
Tests both recent posts and backlog fetching with actual data.
"""

import os
import sys
import json
import time
from pathlib import Path
from datetime import datetime
import argparse
from dotenv import load_dotenv

# Add src to path
sys.path.insert(0, str(Path(__file__).parent))

from src.base_scraper import ScraperConfig
from src.wordpress_scraper import WordPressScraper
from src.rss_scraper import RSSScraperMailChimp, RSSScraperPodcast
from src.youtube_scraper import YouTubeScraper
from src.instagram_scraper import InstagramScraper
from src.tiktok_scraper_advanced import TikTokScraperAdvanced


def test_scraper(scraper_class, scraper_name, max_items=3, test_type="recent"):
    """Test a single scraper with real data."""
    print(f"\n{'='*60}")
    print(f"Testing {scraper_name} - {test_type} ({max_items} items)")
    print('='*60)

    # Create test directories
    test_data_dir = Path(f"test_data/{test_type}")
    test_logs_dir = Path(f"test_logs/{test_type}")

    config = ScraperConfig(
        source_name=scraper_name.lower().replace(" ", "_"),
        brand_name="hvacknowitall",
        data_dir=test_data_dir,
        logs_dir=test_logs_dir,
        timezone="America/Halifax"
    )

    try:
        # Initialize scraper
        scraper = scraper_class(config)

        # For backlog testing, clear state to fetch all items
        if test_type == "backlog":
            if scraper.state_file.exists():
                scraper.state_file.unlink()
                print(f"Cleared state for {scraper_name} backlog testing")

        # Fetch content with limit
        print(f"Fetching content from {scraper_name}...")
        start_time = time.time()

        # For scrapers that support max_items parameter
        if scraper_name in ["YouTube", "Instagram", "TikTok"]:
            if scraper_name == "YouTube":
                items = scraper.fetch_channel_videos(max_videos=max_items)
            elif scraper_name == "Instagram":
                items = scraper.fetch_content(max_posts=max_items)
            elif scraper_name == "TikTok":
                # For TikTok, optionally fetch captions (only in backlog mode for testing)
                fetch_captions = (test_type == "backlog" and max_items <= 5)
                if fetch_captions:
                    print(f"  Note: Fetching captions for up to {min(max_items, 3)} videos...")
                items = scraper.fetch_content(
                    max_posts=max_items,
                    fetch_captions=fetch_captions,
                    max_caption_fetches=min(max_items, 3)  # Limit to 3 for testing
                )
        else:
            # For RSS and WordPress scrapers - all now support max_items
            items = scraper.fetch_content(max_items=max_items)

        elapsed = time.time() - start_time

        if not items:
            print(f"❌ No items fetched from {scraper_name}")
            return False

        print(f"✅ Fetched {len(items)} items in {elapsed:.2f} seconds")

        # Format as markdown
        markdown = scraper.format_markdown(items)

        # Save to test file
        output_file = test_data_dir / f"{scraper_name.lower()}_{test_type}_test.md"
        output_file.parent.mkdir(parents=True, exist_ok=True)

        with open(output_file, 'w', encoding='utf-8') as f:
            f.write(markdown)

        print(f"✅ Saved to {output_file}")

        # Display summary
        print(f"\nSummary for {scraper_name}:")
        print(f"  - Items fetched: {len(items)}")
        print(f"  - Time taken: {elapsed:.2f}s")
        print(f"  - Output size: {len(markdown)} characters")

        # Display first item details
        if items:
            first_item = items[0]
            print("\nFirst item preview:")

            # Display relevant fields based on scraper type
            if 'title' in first_item:
                title = first_item.get('title', 'N/A')
                # Handle WordPress nested title structure
                if isinstance(title, dict):
                    title = title.get('rendered', 'N/A')
                print(f"  Title: {str(title)[:80]}")
            if 'description' in first_item:
                desc = first_item.get('description', 'N/A')
                if desc:
                    print(f"  Description: {desc[:80]}...")
            if 'caption' in first_item:
                caption = first_item.get('caption', 'N/A')
                if caption:
                    print(f"  Caption: {caption[:80]}...")
            if 'author' in first_item:
                print(f"  Author: {first_item.get('author', 'N/A')}")
            if 'channel' in first_item:
                print(f"  Channel: {first_item.get('channel', 'N/A')}")
            if 'publish_date' in first_item:
                print(f"  Date: {first_item.get('publish_date', 'N/A')}")
            elif 'date' in first_item:
                print(f"  Date: {first_item.get('date', 'N/A')}")
            if 'link' in first_item:
                print(f"  Link: {first_item.get('link', 'N/A')[:80]}")
            elif 'url' in first_item:
                print(f"  URL: {first_item.get('url', 'N/A')[:80]}")

        return True

    except Exception as e:
        print(f"❌ Error testing {scraper_name}: {e}")
        import traceback
        traceback.print_exc()
        return False


def run_all_tests(max_items=3, test_type="recent"):
    """Run tests for all configured scrapers."""
    print(f"\n{'#'*60}")
    print(f"# Running {test_type} tests with {max_items} items per source")
    print(f"{'#'*60}")

    results = {}

    # Test WordPress
    if os.getenv('WORDPRESS_API_URL'):
        print("\n🔧 Testing WordPress Scraper")
        results['WordPress'] = test_scraper(WordPressScraper, "WordPress", max_items, test_type)
    else:
        print("\n⚠️  WordPress not configured (WORDPRESS_API_URL missing)")

    # Test MailChimp RSS
    if os.getenv('MAILCHIMP_RSS_URL'):
        print("\n🔧 Testing MailChimp RSS Scraper")
        results['MailChimp'] = test_scraper(RSSScraperMailChimp, "MailChimp", max_items, test_type)
    else:
        print("\n⚠️  MailChimp RSS not configured (MAILCHIMP_RSS_URL missing)")

    # Test Podcast RSS
    if os.getenv('PODCAST_RSS_URL'):
        print("\n🔧 Testing Podcast RSS Scraper")
        results['Podcast'] = test_scraper(RSSScraperPodcast, "Podcast", max_items, test_type)
    else:
        print("\n⚠️  Podcast RSS not configured (PODCAST_RSS_URL missing)")

    # Test YouTube
    if os.getenv('YOUTUBE_CHANNEL_URL'):
        print("\n🔧 Testing YouTube Scraper")
        results['YouTube'] = test_scraper(YouTubeScraper, "YouTube", max_items, test_type)
    else:
        print("\n⚠️  YouTube not configured (YOUTUBE_CHANNEL_URL missing)")

    # Test Instagram
    if os.getenv('INSTAGRAM_USERNAME'):
        print("\n🔧 Testing Instagram Scraper")
        print("⚠️  Note: Instagram may require manual login or rate limiting")
        results['Instagram'] = test_scraper(InstagramScraper, "Instagram", max_items, test_type)
    else:
        print("\n⚠️  Instagram not configured (INSTAGRAM_USERNAME missing)")

    # Test TikTok
    if os.getenv('TIKTOK_USERNAME'):
        print("\n🔧 Testing TikTok Scraper (Advanced with Headed Browser)")
        print("⚠️  Note: TikTok will open a browser window on DISPLAY=:0")
        results['TikTok'] = test_scraper(TikTokScraperAdvanced, "TikTok", max_items, test_type)
    else:
        print("\n⚠️  TikTok not configured (TIKTOK_USERNAME missing)")

    # Print summary
    print(f"\n{'='*60}")
    print(f"TEST SUMMARY - {test_type} ({max_items} items)")
    print('='*60)

    for scraper, success in results.items():
        status = "✅ PASSED" if success else "❌ FAILED"
        print(f"{scraper:15} {status}")

    total = len(results)
    passed = sum(1 for s in results.values() if s)
    print(f"\nTotal: {passed}/{total} passed")

    return all(results.values())


def main():
    """Main entry point."""
    parser = argparse.ArgumentParser(description="Test scrapers with real data")
    parser.add_argument('--items', type=int, default=3,
                        help='Number of items to fetch per source (default: 3)')
    parser.add_argument('--type', choices=['recent', 'backlog', 'both'], default='recent',
                        help='Test type: recent posts, backlog, or both (default: recent)')
    parser.add_argument('--source', type=str, default=None,
                        help='Test specific source only (wordpress, mailchimp, podcast, youtube, instagram, tiktok)')

    args = parser.parse_args()

    # Load environment variables
    load_dotenv()

    # Determine which tests to run
    if args.type == 'both':
        test_types = ['recent', 'backlog']
    else:
        test_types = [args.type]

    all_passed = True

    for test_type in test_types:
        if args.source:
            # Test specific source
            source_map = {
                'wordpress': (WordPressScraper, "WordPress"),
                'mailchimp': (RSSScraperMailChimp, "MailChimp"),
                'podcast': (RSSScraperPodcast, "Podcast"),
                'youtube': (YouTubeScraper, "YouTube"),
                'instagram': (InstagramScraper, "Instagram"),
                'tiktok': (TikTokScraperAdvanced, "TikTok")
            }

            if args.source.lower() in source_map:
                scraper_class, scraper_name = source_map[args.source.lower()]
                success = test_scraper(scraper_class, scraper_name, args.items, test_type)
                all_passed = all_passed and success
            else:
                print(f"Unknown source: {args.source}")
                all_passed = False
        else:
            # Test all sources
            success = run_all_tests(args.items, test_type)
            all_passed = all_passed and success

    # Exit with appropriate code
    sys.exit(0 if all_passed else 1)


if __name__ == "__main__":
    main()
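For reference, per the argparse definitions above: `--items N` sets how many items to fetch per source (default 3), `--type recent|backlog|both` picks the test mode (default recent), and `--source` restricts the run to one scraper (wordpress, mailchimp, podcast, youtube, instagram, or tiktok). The process exits 0 only when every selected test passes.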