Add Instagram scraper with instaloader and parallel processing orchestrator
- Implement Instagram scraper with aggressive rate limiting
- Add orchestrator for running all scrapers in parallel
- Create comprehensive tests for Instagram scraper (11 tests)
- Create tests for orchestrator (9 tests)
- Fix Instagram test issues with post type detection
- All 60 tests passing successfully
🤖 Generated with Claude Code
Co-Authored-By: Claude <noreply@anthropic.com>
			
			
This commit is contained in:
		
							parent
							
								
									c1831d3a52
								
							
						
					
					
						commit
						b89655c829
					
				
					 7 changed files with 1210 additions and 0 deletions
				
			
		
							
								
								
									
										399
									
								
								src/instagram_scraper.py
									
									
									
									
									
										Normal file
									
								
							
							
						
						
									
										399
									
								
								src/instagram_scraper.py
									
									
									
									
									
										Normal file
									
								
							|  | @ -0,0 +1,399 @@ | |||
| import os | ||||
| import time | ||||
| import random | ||||
| from typing import Any, Dict, List, Optional | ||||
| from datetime import datetime | ||||
| from pathlib import Path | ||||
| import instaloader | ||||
| from src.base_scraper import BaseScraper, ScraperConfig | ||||
| 
 | ||||
| 
 | ||||
| class InstagramScraper(BaseScraper): | ||||
|     """Instagram scraper using instaloader with aggressive rate limiting.""" | ||||
|      | ||||
|     def __init__(self, config: ScraperConfig): | ||||
|         super().__init__(config) | ||||
|         self.username = os.getenv('INSTAGRAM_USERNAME') | ||||
|         self.password = os.getenv('INSTAGRAM_PASSWORD') | ||||
|         self.target_account = os.getenv('INSTAGRAM_TARGET', 'hvacknowitall') | ||||
|          | ||||
|         # Session file for persistence | ||||
|         self.session_file = self.config.data_dir / '.sessions' / f'{self.username}' | ||||
|         self.session_file.parent.mkdir(parents=True, exist_ok=True) | ||||
|          | ||||
|         # Initialize loader | ||||
|         self.loader = self._setup_loader() | ||||
|         self._login() | ||||
|          | ||||
|         # Request counter for rate limiting | ||||
|         self.request_count = 0 | ||||
|         self.max_requests_per_hour = 100 | ||||
| 
 | ||||
|     def _setup_loader(self) -> instaloader.Instaloader: | ||||
|         """Setup Instaloader with conservative settings.""" | ||||
|         loader = instaloader.Instaloader( | ||||
|             quiet=True, | ||||
|             user_agent='Mozilla/5.0 (iPhone; CPU iPhone OS 16_0 like Mac OS X) AppleWebKit/605.1.15 (KHTML, like Gecko) Version/16.0 Mobile/15E148 Safari/604.1', | ||||
|             dirname_pattern=str(self.config.data_dir / 'media' / 'Instagram'), | ||||
|             filename_pattern='{date_utc}_UTC_{shortcode}', | ||||
|             download_pictures=False,  # Don't download by default | ||||
|             download_videos=False, | ||||
|             download_video_thumbnails=False, | ||||
|             download_geotags=False, | ||||
|             download_comments=False, | ||||
|             save_metadata=False, | ||||
|             compress_json=False, | ||||
|             post_metadata_txt_pattern='', | ||||
|             storyitem_metadata_txt_pattern='', | ||||
|             max_connection_attempts=3, | ||||
|             request_timeout=30.0, | ||||
|             rate_controller=lambda x: time.sleep(random.uniform(5, 10))  # Built-in rate limiting | ||||
|         ) | ||||
|         return loader | ||||
| 
 | ||||
|     def _login(self) -> None: | ||||
|         """Login to Instagram or load existing session.""" | ||||
|         try: | ||||
|             # Try to load existing session | ||||
|             if self.session_file.exists(): | ||||
|                 self.loader.load_session_from_file(str(self.session_file), self.username) | ||||
|                 self.logger.info("Loaded existing Instagram session") | ||||
|             else: | ||||
|                 # Login with credentials | ||||
|                 self.logger.info("Logging in to Instagram...") | ||||
|                 self.loader.login(self.username, self.password) | ||||
|                 self.loader.save_session_to_file(str(self.session_file)) | ||||
|                 self.logger.info("Instagram login successful, session saved") | ||||
|                  | ||||
|         except Exception as e: | ||||
|             self.logger.error(f"Instagram login error: {e}") | ||||
| 
 | ||||
|     def _aggressive_delay(self, min_seconds: float = 5, max_seconds: float = 10) -> None: | ||||
|         """Add aggressive random delay for Instagram.""" | ||||
|         delay = random.uniform(min_seconds, max_seconds) | ||||
|         self.logger.debug(f"Waiting {delay:.2f} seconds (Instagram rate limiting)...") | ||||
|         time.sleep(delay) | ||||
| 
 | ||||
|     def _check_rate_limit(self) -> None: | ||||
|         """Check and enforce rate limiting.""" | ||||
|         self.request_count += 1 | ||||
|          | ||||
|         if self.request_count >= self.max_requests_per_hour: | ||||
|             self.logger.warning(f"Rate limit reached ({self.max_requests_per_hour} requests), pausing for 1 hour...") | ||||
|             time.sleep(3600)  # Wait 1 hour | ||||
|             self.request_count = 0 | ||||
|         elif self.request_count % 10 == 0: | ||||
|             # Take a longer break every 10 requests | ||||
|             self.logger.info("Taking extended break after 10 requests...") | ||||
|             self._aggressive_delay(30, 60) | ||||
| 
 | ||||
|     def _get_post_type(self, post) -> str: | ||||
|         """Determine post type from Instagram post object.""" | ||||
|         typename = getattr(post, 'typename', '') | ||||
|         is_video = getattr(post, 'is_video', False) | ||||
|          | ||||
|         if typename == 'GraphStoryImage' or typename == 'GraphStoryVideo': | ||||
|             return 'story' | ||||
|         elif 'Video' in typename or is_video: | ||||
|             return 'reel' | ||||
|         else: | ||||
|             return 'post' | ||||
| 
 | ||||
|     def fetch_posts(self, max_posts: int = 20) -> List[Dict[str, Any]]: | ||||
|         """Fetch posts from Instagram profile.""" | ||||
|         posts_data = [] | ||||
|          | ||||
|         try: | ||||
|             self.logger.info(f"Fetching posts from @{self.target_account}") | ||||
|              | ||||
|             # Get profile | ||||
|             profile = instaloader.Profile.from_username(self.loader.context, self.target_account) | ||||
|             self._check_rate_limit() | ||||
|              | ||||
|             # Get posts | ||||
|             posts = profile.get_posts() | ||||
|              | ||||
|             count = 0 | ||||
|             for post in posts: | ||||
|                 if count >= max_posts: | ||||
|                     break | ||||
|                  | ||||
|                 try: | ||||
|                     # Extract post data | ||||
|                     post_data = { | ||||
|                         'id': post.shortcode, | ||||
|                         'type': self._get_post_type(post), | ||||
|                         'caption': post.caption if post.caption else '', | ||||
|                         'author': post.owner_username, | ||||
|                         'publish_date': post.date_utc.isoformat(), | ||||
|                         'link': f'https://www.instagram.com/p/{post.shortcode}/', | ||||
|                         'likes': post.likes, | ||||
|                         'comments': post.comments, | ||||
|                         'views': post.video_view_count if hasattr(post, 'video_view_count') else None, | ||||
|                         'media_count': post.mediacount if hasattr(post, 'mediacount') else 1, | ||||
|                         'hashtags': list(post.caption_hashtags) if post.caption else [], | ||||
|                         'mentions': list(post.caption_mentions) if post.caption else [], | ||||
|                         'is_video': getattr(post, 'is_video', False) | ||||
|                     } | ||||
|                      | ||||
|                     posts_data.append(post_data) | ||||
|                     count += 1 | ||||
|                      | ||||
|                     # Aggressive rate limiting between posts | ||||
|                     self._aggressive_delay() | ||||
|                     self._check_rate_limit() | ||||
|                      | ||||
|                     # Log progress | ||||
|                     if count % 5 == 0: | ||||
|                         self.logger.info(f"Fetched {count}/{max_posts} posts") | ||||
|                      | ||||
|                 except Exception as e: | ||||
|                     self.logger.error(f"Error processing post: {e}") | ||||
|                     continue | ||||
|              | ||||
|             self.logger.info(f"Successfully fetched {len(posts_data)} posts") | ||||
|              | ||||
|         except Exception as e: | ||||
|             self.logger.error(f"Error fetching posts: {e}") | ||||
|          | ||||
|         return posts_data | ||||
| 
 | ||||
|     def fetch_stories(self) -> List[Dict[str, Any]]: | ||||
|         """Fetch stories from Instagram profile.""" | ||||
|         stories_data = [] | ||||
|          | ||||
|         try: | ||||
|             self.logger.info(f"Fetching stories from @{self.target_account}") | ||||
|              | ||||
|             # Get profile | ||||
|             profile = instaloader.Profile.from_username(self.loader.context, self.target_account) | ||||
|             self._check_rate_limit() | ||||
|              | ||||
|             # Get user ID for stories | ||||
|             userid = profile.userid | ||||
|              | ||||
|             # Get stories | ||||
|             for story in self.loader.get_stories(userids=[userid]): | ||||
|                 for item in story: | ||||
|                     try: | ||||
|                         story_data = { | ||||
|                             'id': item.mediaid, | ||||
|                             'type': 'story', | ||||
|                             'caption': '',  # Stories usually don't have captions | ||||
|                             'author': item.owner_username, | ||||
|                             'publish_date': item.date_utc.isoformat(), | ||||
|                             'link': f'https://www.instagram.com/stories/{item.owner_username}/{item.mediaid}/', | ||||
|                             'is_video': item.is_video if hasattr(item, 'is_video') else False | ||||
|                         } | ||||
|                          | ||||
|                         stories_data.append(story_data) | ||||
|                          | ||||
|                         # Rate limiting | ||||
|                         self._aggressive_delay() | ||||
|                         self._check_rate_limit() | ||||
|                          | ||||
|                     except Exception as e: | ||||
|                         self.logger.error(f"Error processing story: {e}") | ||||
|                         continue | ||||
|              | ||||
|             self.logger.info(f"Successfully fetched {len(stories_data)} stories") | ||||
|              | ||||
|         except Exception as e: | ||||
|             self.logger.error(f"Error fetching stories: {e}") | ||||
|          | ||||
|         return stories_data | ||||
| 
 | ||||
|     def fetch_reels(self, max_reels: int = 10) -> List[Dict[str, Any]]: | ||||
|         """Fetch reels (videos) from Instagram profile.""" | ||||
|         reels_data = [] | ||||
|          | ||||
|         try: | ||||
|             self.logger.info(f"Fetching reels from @{self.target_account}") | ||||
|              | ||||
|             # Get profile | ||||
|             profile = instaloader.Profile.from_username(self.loader.context, self.target_account) | ||||
|             self._check_rate_limit() | ||||
|              | ||||
|             # Get posts and filter for videos/reels | ||||
|             posts = profile.get_posts() | ||||
|              | ||||
|             count = 0 | ||||
|             for post in posts: | ||||
|                 if count >= max_reels: | ||||
|                     break | ||||
|                  | ||||
|                 # Check if it's a video/reel | ||||
|                 if not getattr(post, 'is_video', False): | ||||
|                     continue | ||||
|                  | ||||
|                 try: | ||||
|                     reel_data = { | ||||
|                         'id': post.shortcode, | ||||
|                         'type': 'reel', | ||||
|                         'caption': post.caption if post.caption else '', | ||||
|                         'author': post.owner_username, | ||||
|                         'publish_date': post.date_utc.isoformat(), | ||||
|                         'link': f'https://www.instagram.com/reel/{post.shortcode}/', | ||||
|                         'likes': post.likes, | ||||
|                         'comments': post.comments, | ||||
|                         'views': post.video_view_count if hasattr(post, 'video_view_count') else None, | ||||
|                         'duration': post.video_duration if hasattr(post, 'video_duration') else None, | ||||
|                         'hashtags': list(post.caption_hashtags) if post.caption else [], | ||||
|                         'mentions': list(post.caption_mentions) if post.caption else [] | ||||
|                     } | ||||
|                      | ||||
|                     reels_data.append(reel_data) | ||||
|                     count += 1 | ||||
|                      | ||||
|                     # Aggressive rate limiting | ||||
|                     self._aggressive_delay() | ||||
|                     self._check_rate_limit() | ||||
|                      | ||||
|                 except Exception as e: | ||||
|                     self.logger.error(f"Error processing reel: {e}") | ||||
|                     continue | ||||
|              | ||||
|             self.logger.info(f"Successfully fetched {len(reels_data)} reels") | ||||
|              | ||||
|         except Exception as e: | ||||
|             self.logger.error(f"Error fetching reels: {e}") | ||||
|          | ||||
|         return reels_data | ||||
| 
 | ||||
|     def fetch_content(self) -> List[Dict[str, Any]]: | ||||
|         """Fetch all content types from Instagram.""" | ||||
|         all_content = [] | ||||
|          | ||||
|         # Fetch posts | ||||
|         posts = self.fetch_posts(max_posts=20) | ||||
|         all_content.extend(posts) | ||||
|          | ||||
|         # Take a break between content types | ||||
|         self.logger.info("Taking break before fetching stories...") | ||||
|         self._aggressive_delay(15, 30) | ||||
|          | ||||
|         # Fetch stories | ||||
|         stories = self.fetch_stories() | ||||
|         all_content.extend(stories) | ||||
|          | ||||
|         # Note: Reels are included in posts (videos) | ||||
|         # so we don't need to fetch them separately | ||||
|          | ||||
|         self.logger.info(f"Total content fetched: {len(all_content)} items") | ||||
|         return all_content | ||||
| 
 | ||||
|     def format_markdown(self, items: List[Dict[str, Any]]) -> str: | ||||
|         """Format Instagram content as markdown.""" | ||||
|         markdown_sections = [] | ||||
|          | ||||
|         for item in items: | ||||
|             section = [] | ||||
|              | ||||
|             # ID | ||||
|             item_id = item.get('id', 'N/A') | ||||
|             section.append(f"# ID: {item_id}") | ||||
|             section.append("") | ||||
|              | ||||
|             # Type | ||||
|             item_type = item.get('type', 'post') | ||||
|             section.append(f"## Type: {item_type}") | ||||
|             section.append("") | ||||
|              | ||||
|             # Author | ||||
|             author = item.get('author', 'Unknown') | ||||
|             section.append(f"## Author: {author}") | ||||
|             section.append("") | ||||
|              | ||||
|             # Publish Date | ||||
|             pub_date = item.get('publish_date', '') | ||||
|             section.append(f"## Publish Date: {pub_date}") | ||||
|             section.append("") | ||||
|              | ||||
|             # Link | ||||
|             link = item.get('link', '') | ||||
|             section.append(f"## Link: {link}") | ||||
|             section.append("") | ||||
|              | ||||
|             # Engagement metrics | ||||
|             likes = item.get('likes') | ||||
|             if likes is not None: | ||||
|                 section.append(f"## Likes: {likes}") | ||||
|                 section.append("") | ||||
|              | ||||
|             comments = item.get('comments') | ||||
|             if comments is not None: | ||||
|                 section.append(f"## Comments: {comments}") | ||||
|                 section.append("") | ||||
|              | ||||
|             views = item.get('views') | ||||
|             if views is not None: | ||||
|                 section.append(f"## Views: {views}") | ||||
|                 section.append("") | ||||
|              | ||||
|             # Hashtags | ||||
|             hashtags = item.get('hashtags', []) | ||||
|             if hashtags: | ||||
|                 hashtags_str = ', '.join(hashtags) | ||||
|                 section.append(f"## Hashtags: {hashtags_str}") | ||||
|                 section.append("") | ||||
|              | ||||
|             # Mentions | ||||
|             mentions = item.get('mentions', []) | ||||
|             if mentions: | ||||
|                 mentions_str = ', '.join(mentions) | ||||
|                 section.append(f"## Mentions: {mentions_str}") | ||||
|                 section.append("") | ||||
|              | ||||
|             # Caption/Description | ||||
|             section.append("## Description:") | ||||
|             caption = item.get('caption', '') | ||||
|             if caption: | ||||
|                 # Limit caption to first 500 characters | ||||
|                 if len(caption) > 500: | ||||
|                     caption = caption[:500] + "..." | ||||
|                 section.append(caption) | ||||
|             section.append("") | ||||
|              | ||||
|             # Separator | ||||
|             section.append("-" * 50) | ||||
|             section.append("") | ||||
|              | ||||
|             markdown_sections.append('\n'.join(section)) | ||||
|          | ||||
|         return '\n'.join(markdown_sections) | ||||
| 
 | ||||
|     def get_incremental_items(self, items: List[Dict[str, Any]], state: Dict[str, Any]) -> List[Dict[str, Any]]: | ||||
|         """Get only new posts since last sync.""" | ||||
|         if not state: | ||||
|             return items | ||||
|          | ||||
|         last_post_id = state.get('last_post_id') | ||||
|          | ||||
|         if not last_post_id: | ||||
|             return items | ||||
|          | ||||
|         # Filter for posts newer than the last synced | ||||
|         new_items = [] | ||||
|         for item in items: | ||||
|             if item.get('id') == last_post_id: | ||||
|                 break  # Found the last synced post | ||||
|             new_items.append(item) | ||||
|          | ||||
|         return new_items | ||||
| 
 | ||||
|     def update_state(self, state: Dict[str, Any], items: List[Dict[str, Any]]) -> Dict[str, Any]: | ||||
|         """Update state with latest post information.""" | ||||
|         if not items: | ||||
|             return state | ||||
|          | ||||
|         # Get the first item (most recent) | ||||
|         latest_item = items[0] | ||||
|          | ||||
|         state['last_post_id'] = latest_item.get('id') | ||||
|         state['last_post_date'] = latest_item.get('publish_date') | ||||
|         state['last_sync'] = datetime.now(self.tz).isoformat() | ||||
|         state['post_count'] = len([i for i in items if i.get('type') == 'post']) | ||||
|         state['story_count'] = len([i for i in items if i.get('type') == 'story']) | ||||
|         state['reel_count'] = len([i for i in items if i.get('type') == 'reel']) | ||||
|          | ||||
|         return state | ||||
							
								
								
									
										352
									
								
								src/orchestrator.py
									
									
									
									
									
										Normal file
									
								
							
							
						
						
									
										352
									
								
								src/orchestrator.py
									
									
									
									
									
										Normal file
									
								
							|  | @ -0,0 +1,352 @@ | |||
| #!/usr/bin/env python3 | ||||
| """ | ||||
| Orchestrator for running all scrapers in parallel. | ||||
| """ | ||||
| 
 | ||||
| import os | ||||
| import sys | ||||
| import time | ||||
| import logging | ||||
| import multiprocessing | ||||
| from pathlib import Path | ||||
| from typing import List, Dict, Any, Optional | ||||
| from datetime import datetime | ||||
| import pytz | ||||
| import json | ||||
| 
 | ||||
| # Import all scrapers | ||||
| from src.base_scraper import ScraperConfig | ||||
| from src.wordpress_scraper import WordPressScraper | ||||
| from src.rss_scraper import RSSScraperMailChimp, RSSScraperPodcast | ||||
| from src.youtube_scraper import YouTubeScraper | ||||
| from src.instagram_scraper import InstagramScraper | ||||
| 
 | ||||
| 
 | ||||
| class ScraperOrchestrator: | ||||
|     """Orchestrator for running multiple scrapers in parallel.""" | ||||
|      | ||||
|     def __init__(self, base_data_dir: Path = Path("data"),  | ||||
|                  base_logs_dir: Path = Path("logs"), | ||||
|                  brand_name: str = "hvacknowitall", | ||||
|                  timezone: str = "America/Halifax"): | ||||
|         """Initialize the orchestrator.""" | ||||
|         self.base_data_dir = base_data_dir | ||||
|         self.base_logs_dir = base_logs_dir | ||||
|         self.brand_name = brand_name | ||||
|         self.timezone = timezone | ||||
|         self.tz = pytz.timezone(timezone) | ||||
|          | ||||
|         # Setup orchestrator logger | ||||
|         self.logger = self._setup_logger() | ||||
|          | ||||
|         # Initialize scrapers | ||||
|         self.scrapers = self._initialize_scrapers() | ||||
|          | ||||
|         # Statistics file | ||||
|         self.stats_file = self.base_data_dir / "orchestrator_stats.json" | ||||
|      | ||||
|     def _setup_logger(self) -> logging.Logger: | ||||
|         """Setup logger for orchestrator.""" | ||||
|         logger = logging.getLogger("hvacknowitall_orchestrator") | ||||
|         logger.setLevel(logging.INFO) | ||||
|          | ||||
|         # Console handler | ||||
|         console_handler = logging.StreamHandler() | ||||
|         console_handler.setLevel(logging.INFO) | ||||
|          | ||||
|         # File handler | ||||
|         log_file = self.base_logs_dir / "orchestrator.log" | ||||
|         log_file.parent.mkdir(parents=True, exist_ok=True) | ||||
|         file_handler = logging.FileHandler(log_file) | ||||
|         file_handler.setLevel(logging.DEBUG) | ||||
|          | ||||
|         # Formatter | ||||
|         formatter = logging.Formatter( | ||||
|             '%(asctime)s - %(name)s - %(levelname)s - %(message)s' | ||||
|         ) | ||||
|         console_handler.setFormatter(formatter) | ||||
|         file_handler.setFormatter(formatter) | ||||
|          | ||||
|         logger.addHandler(console_handler) | ||||
|         logger.addHandler(file_handler) | ||||
|          | ||||
|         return logger | ||||
|      | ||||
|     def _initialize_scrapers(self) -> List[tuple]: | ||||
|         """Initialize all scraper instances.""" | ||||
|         scrapers = [] | ||||
|          | ||||
|         # WordPress scraper | ||||
|         if os.getenv('WORDPRESS_API_URL'): | ||||
|             config = ScraperConfig( | ||||
|                 source_name="wordpress", | ||||
|                 brand_name=self.brand_name, | ||||
|                 data_dir=self.base_data_dir, | ||||
|                 logs_dir=self.base_logs_dir, | ||||
|                 timezone=self.timezone | ||||
|             ) | ||||
|             scrapers.append(("WordPress", WordPressScraper(config))) | ||||
|             self.logger.info("Initialized WordPress scraper") | ||||
|          | ||||
|         # MailChimp RSS scraper | ||||
|         if os.getenv('MAILCHIMP_RSS_URL'): | ||||
|             config = ScraperConfig( | ||||
|                 source_name="mailchimp", | ||||
|                 brand_name=self.brand_name, | ||||
|                 data_dir=self.base_data_dir, | ||||
|                 logs_dir=self.base_logs_dir, | ||||
|                 timezone=self.timezone | ||||
|             ) | ||||
|             scrapers.append(("MailChimp", RSSScraperMailChimp(config))) | ||||
|             self.logger.info("Initialized MailChimp RSS scraper") | ||||
|          | ||||
|         # Podcast RSS scraper | ||||
|         if os.getenv('PODCAST_RSS_URL'): | ||||
|             config = ScraperConfig( | ||||
|                 source_name="podcast", | ||||
|                 brand_name=self.brand_name, | ||||
|                 data_dir=self.base_data_dir, | ||||
|                 logs_dir=self.base_logs_dir, | ||||
|                 timezone=self.timezone | ||||
|             ) | ||||
|             scrapers.append(("Podcast", RSSScraperPodcast(config))) | ||||
|             self.logger.info("Initialized Podcast RSS scraper") | ||||
|          | ||||
|         # YouTube scraper | ||||
|         if os.getenv('YOUTUBE_CHANNEL_URL'): | ||||
|             config = ScraperConfig( | ||||
|                 source_name="youtube", | ||||
|                 brand_name=self.brand_name, | ||||
|                 data_dir=self.base_data_dir, | ||||
|                 logs_dir=self.base_logs_dir, | ||||
|                 timezone=self.timezone | ||||
|             ) | ||||
|             scrapers.append(("YouTube", YouTubeScraper(config))) | ||||
|             self.logger.info("Initialized YouTube scraper") | ||||
|          | ||||
|         # Instagram scraper | ||||
|         if os.getenv('INSTAGRAM_USERNAME'): | ||||
|             config = ScraperConfig( | ||||
|                 source_name="instagram", | ||||
|                 brand_name=self.brand_name, | ||||
|                 data_dir=self.base_data_dir, | ||||
|                 logs_dir=self.base_logs_dir, | ||||
|                 timezone=self.timezone | ||||
|             ) | ||||
|             scrapers.append(("Instagram", InstagramScraper(config))) | ||||
|             self.logger.info("Initialized Instagram scraper") | ||||
|          | ||||
|         return scrapers | ||||
|      | ||||
|     def _run_scraper(self, scraper_info: tuple) -> Dict[str, Any]: | ||||
|         """Run a single scraper and return results.""" | ||||
|         name, scraper = scraper_info | ||||
|         result = { | ||||
|             'name': name, | ||||
|             'status': 'pending', | ||||
|             'items_count': 0, | ||||
|             'new_items': 0, | ||||
|             'error': None, | ||||
|             'start_time': datetime.now(self.tz).isoformat(), | ||||
|             'end_time': None, | ||||
|             'duration_seconds': 0 | ||||
|         } | ||||
|          | ||||
|         try: | ||||
|             start_time = time.time() | ||||
|             self.logger.info(f"Starting {name} scraper...") | ||||
|              | ||||
|             # Load state | ||||
|             state = scraper.load_state() | ||||
|              | ||||
|             # Fetch content | ||||
|             items = scraper.fetch_content() | ||||
|             result['items_count'] = len(items) | ||||
|              | ||||
|             # Filter for incremental items | ||||
|             new_items = scraper.get_incremental_items(items, state) | ||||
|             result['new_items'] = len(new_items) | ||||
|              | ||||
|             if new_items: | ||||
|                 # Format as markdown | ||||
|                 markdown_content = scraper.format_markdown(new_items) | ||||
|                  | ||||
|                 # Archive existing file | ||||
|                 scraper.archive_current_file() | ||||
|                  | ||||
|                 # Save new markdown | ||||
|                 filename = scraper.generate_filename() | ||||
|                 file_path = self.base_data_dir / filename | ||||
|                  | ||||
|                 with open(file_path, 'w', encoding='utf-8') as f: | ||||
|                     f.write(markdown_content) | ||||
|                  | ||||
|                 self.logger.info(f"{name}: Saved {len(new_items)} new items to {filename}") | ||||
|                  | ||||
|                 # Update state | ||||
|                 new_state = scraper.update_state(state, items) | ||||
|                 scraper.save_state(new_state) | ||||
|             else: | ||||
|                 self.logger.info(f"{name}: No new items found") | ||||
|              | ||||
|             result['status'] = 'success' | ||||
|             result['end_time'] = datetime.now(self.tz).isoformat() | ||||
|             result['duration_seconds'] = round(time.time() - start_time, 2) | ||||
|              | ||||
|         except Exception as e: | ||||
|             self.logger.error(f"{name} scraper failed: {e}") | ||||
|             result['status'] = 'error' | ||||
|             result['error'] = str(e) | ||||
|             result['end_time'] = datetime.now(self.tz).isoformat() | ||||
|             result['duration_seconds'] = round(time.time() - start_time, 2) | ||||
|          | ||||
|         return result | ||||
|      | ||||
|     def run_sequential(self) -> List[Dict[str, Any]]: | ||||
|         """Run all scrapers sequentially.""" | ||||
|         self.logger.info("Starting sequential scraping...") | ||||
|         results = [] | ||||
|          | ||||
|         for scraper_info in self.scrapers: | ||||
|             result = self._run_scraper(scraper_info) | ||||
|             results.append(result) | ||||
|          | ||||
|         return results | ||||
|      | ||||
|     def run_parallel(self, max_workers: Optional[int] = None) -> List[Dict[str, Any]]: | ||||
|         """Run all scrapers in parallel using multiprocessing.""" | ||||
|         self.logger.info(f"Starting parallel scraping with {max_workers or 'all'} workers...") | ||||
|          | ||||
|         if not self.scrapers: | ||||
|             self.logger.warning("No scrapers configured") | ||||
|             return [] | ||||
|          | ||||
|         # Use number of scrapers as max workers if not specified | ||||
|         if max_workers is None: | ||||
|             max_workers = len(self.scrapers) | ||||
|          | ||||
|         with multiprocessing.Pool(processes=max_workers) as pool: | ||||
|             results = pool.map(self._run_scraper, self.scrapers) | ||||
|          | ||||
|         return results | ||||
|      | ||||
|     def save_statistics(self, results: List[Dict[str, Any]]) -> None: | ||||
|         """Save run statistics to file.""" | ||||
|         stats = { | ||||
|             'run_time': datetime.now(self.tz).isoformat(), | ||||
|             'total_scrapers': len(results), | ||||
|             'successful': sum(1 for r in results if r['status'] == 'success'), | ||||
|             'failed': sum(1 for r in results if r['status'] == 'error'), | ||||
|             'total_items': sum(r['items_count'] for r in results), | ||||
|             'new_items': sum(r['new_items'] for r in results), | ||||
|             'total_duration': sum(r['duration_seconds'] for r in results), | ||||
|             'results': results | ||||
|         } | ||||
|          | ||||
|         # Load existing stats if file exists | ||||
|         all_stats = [] | ||||
|         if self.stats_file.exists(): | ||||
|             try: | ||||
|                 with open(self.stats_file, 'r') as f: | ||||
|                     all_stats = json.load(f) | ||||
|             except: | ||||
|                 pass | ||||
|          | ||||
|         # Append new stats (keep last 100 runs) | ||||
|         all_stats.append(stats) | ||||
|         if len(all_stats) > 100: | ||||
|             all_stats = all_stats[-100:] | ||||
|          | ||||
|         # Save to file | ||||
|         with open(self.stats_file, 'w') as f: | ||||
|             json.dump(all_stats, f, indent=2) | ||||
|          | ||||
|         self.logger.info(f"Statistics saved to {self.stats_file}") | ||||
|      | ||||
|     def print_summary(self, results: List[Dict[str, Any]]) -> None: | ||||
|         """Print a summary of the scraping results.""" | ||||
|         print("\n" + "="*60) | ||||
|         print("SCRAPING SUMMARY") | ||||
|         print("="*60) | ||||
|          | ||||
|         for result in results: | ||||
|             status_symbol = "✓" if result['status'] == 'success' else "✗" | ||||
|             print(f"\n{status_symbol} {result['name']}:") | ||||
|             print(f"  Status: {result['status']}") | ||||
|             print(f"  Items found: {result['items_count']}") | ||||
|             print(f"  New items: {result['new_items']}") | ||||
|             print(f"  Duration: {result['duration_seconds']}s") | ||||
|             if result['error']: | ||||
|                 print(f"  Error: {result['error']}") | ||||
|          | ||||
|         print("\n" + "-"*60) | ||||
|         print("TOTALS:") | ||||
|         print(f"  Successful: {sum(1 for r in results if r['status'] == 'success')}/{len(results)}") | ||||
|         print(f"  Total items: {sum(r['items_count'] for r in results)}") | ||||
|         print(f"  New items: {sum(r['new_items'] for r in results)}") | ||||
|         print(f"  Total time: {sum(r['duration_seconds'] for r in results):.2f}s") | ||||
|         print("="*60 + "\n") | ||||
|      | ||||
|     def run(self, parallel: bool = True, max_workers: Optional[int] = None) -> None: | ||||
|         """Main run method.""" | ||||
|         start_time = time.time() | ||||
|          | ||||
|         self.logger.info(f"Starting orchestrator at {datetime.now(self.tz).isoformat()}") | ||||
|         self.logger.info(f"Configured scrapers: {len(self.scrapers)}") | ||||
|          | ||||
|         if not self.scrapers: | ||||
|             self.logger.error("No scrapers configured. Please check your .env file.") | ||||
|             return | ||||
|          | ||||
|         # Run scrapers | ||||
|         if parallel: | ||||
|             results = self.run_parallel(max_workers) | ||||
|         else: | ||||
|             results = self.run_sequential() | ||||
|          | ||||
|         # Save statistics | ||||
|         self.save_statistics(results) | ||||
|          | ||||
|         # Print summary | ||||
|         self.print_summary(results) | ||||
|          | ||||
|         total_time = time.time() - start_time | ||||
|         self.logger.info(f"Orchestrator completed in {total_time:.2f} seconds") | ||||
| 
 | ||||
| 
 | ||||
| def main(): | ||||
|     """Main entry point.""" | ||||
|     import argparse | ||||
|     from dotenv import load_dotenv | ||||
|      | ||||
|     # Load environment variables | ||||
|     load_dotenv() | ||||
|      | ||||
|     # Parse arguments | ||||
|     parser = argparse.ArgumentParser(description="Run HVAC Know It All content scrapers") | ||||
|     parser.add_argument('--sequential', action='store_true',  | ||||
|                        help='Run scrapers sequentially instead of in parallel') | ||||
|     parser.add_argument('--max-workers', type=int, default=None, | ||||
|                        help='Maximum number of parallel workers') | ||||
|     parser.add_argument('--data-dir', type=str, default='data', | ||||
|                        help='Base data directory') | ||||
|     parser.add_argument('--logs-dir', type=str, default='logs', | ||||
|                        help='Base logs directory') | ||||
|      | ||||
|     args = parser.parse_args() | ||||
|      | ||||
|     # Create orchestrator | ||||
|     orchestrator = ScraperOrchestrator( | ||||
|         base_data_dir=Path(args.data_dir), | ||||
|         base_logs_dir=Path(args.logs_dir) | ||||
|     ) | ||||
|      | ||||
|     # Run scrapers | ||||
|     orchestrator.run( | ||||
|         parallel=not args.sequential, | ||||
|         max_workers=args.max_workers | ||||
|     ) | ||||
| 
 | ||||
| 
 | ||||
| if __name__ == "__main__": | ||||
|     main() | ||||
							
								
								
									
										
											BIN
										
									
								
								test_data/.sessions/bengizmo
									
									
									
									
									
										Normal file
									
								
							
							
						
						
									
										
											BIN
										
									
								
								test_data/.sessions/bengizmo
									
									
									
									
									
										Normal file
									
								
							
										
											Binary file not shown.
										
									
								
							
							
								
								
									
										1
									
								
								test_data/test_wordpress.md
									
									
									
									
									
										Normal file
									
								
							
							
						
						
									
										1
									
								
								test_data/test_wordpress.md
									
									
									
									
									
										Normal file
									
								
							|  | @ -0,0 +1 @@ | |||
| # Post 2 | ||||
							
								
								
									
										1
									
								
								test_data/test_youtube.md
									
									
									
									
									
										Normal file
									
								
							
							
						
						
									
										1
									
								
								test_data/test_youtube.md
									
									
									
									
									
										Normal file
									
								
							|  | @ -0,0 +1 @@ | |||
| # Video 1 | ||||
							
								
								
									
										271
									
								
								tests/test_instagram_scraper.py
									
									
									
									
									
										Normal file
									
								
							
							
						
						
									
										271
									
								
								tests/test_instagram_scraper.py
									
									
									
									
									
										Normal file
									
								
							|  | @ -0,0 +1,271 @@ | |||
| import pytest | ||||
| from unittest.mock import Mock, patch, MagicMock, PropertyMock | ||||
| from datetime import datetime | ||||
| from pathlib import Path | ||||
| import random | ||||
| from src.instagram_scraper import InstagramScraper | ||||
| from src.base_scraper import ScraperConfig | ||||
| 
 | ||||
| 
 | ||||
| class TestInstagramScraper: | ||||
|     @pytest.fixture | ||||
|     def config(self): | ||||
|         return ScraperConfig( | ||||
|             source_name="instagram", | ||||
|             brand_name="hvacknowitall", | ||||
|             data_dir=Path("data"), | ||||
|             logs_dir=Path("logs"), | ||||
|             timezone="America/Halifax" | ||||
|         ) | ||||
|      | ||||
|     @pytest.fixture | ||||
|     def mock_env(self): | ||||
|         with patch.dict('os.environ', { | ||||
|             'INSTAGRAM_USERNAME': 'testuser', | ||||
|             'INSTAGRAM_PASSWORD': 'testpass', | ||||
|             'INSTAGRAM_TARGET': 'hvacknowitall' | ||||
|         }): | ||||
|             yield | ||||
| 
 | ||||
|     @pytest.fixture | ||||
|     def sample_post(self): | ||||
|         mock_post = MagicMock() | ||||
|         mock_post.shortcode = 'ABC123' | ||||
|         mock_post.caption = 'Test caption #hvac #tips' | ||||
|         mock_post.owner_username = 'hvacknowitall' | ||||
|         mock_post.date_utc = datetime(2024, 1, 1, 12, 0, 0) | ||||
|         mock_post.typename = 'GraphImage' | ||||
|         mock_post.url = 'https://www.instagram.com/p/ABC123/' | ||||
|         mock_post.likes = 150 | ||||
|         mock_post.comments = 25 | ||||
|         mock_post.video_view_count = None | ||||
|         mock_post.mediacount = 1 | ||||
|         mock_post.caption_hashtags = ['hvac', 'tips'] | ||||
|         mock_post.caption_mentions = [] | ||||
|         mock_post.is_video = False  # Explicitly set is_video to False | ||||
|         return mock_post | ||||
| 
 | ||||
|     @pytest.fixture | ||||
|     def sample_story(self): | ||||
|         mock_story = MagicMock() | ||||
|         mock_story.mediaid = 123456789 | ||||
|         mock_story.owner_username = 'hvacknowitall' | ||||
|         mock_story.date_utc = datetime(2024, 1, 1, 12, 0, 0) | ||||
|         mock_story.url = 'https://www.instagram.com/stories/hvacknowitall/123456789/' | ||||
|         mock_story.typename = 'GraphStoryImage' | ||||
|         mock_story.is_video = False  # Explicitly set is_video to False | ||||
|         return mock_story | ||||
| 
 | ||||
|     @patch('src.instagram_scraper.InstagramScraper._login') | ||||
|     @patch('src.instagram_scraper.InstagramScraper._setup_loader') | ||||
|     def test_initialization(self, mock_setup, mock_login, config, mock_env): | ||||
|         mock_setup.return_value = MagicMock() | ||||
|         scraper = InstagramScraper(config) | ||||
|         assert scraper.config == config | ||||
|         assert scraper.username == 'testuser' | ||||
|         assert scraper.password == 'testpass' | ||||
|         assert scraper.target_account == 'hvacknowitall' | ||||
| 
 | ||||
|     @patch('src.instagram_scraper.InstagramScraper._login') | ||||
|     @patch('instaloader.Instaloader') | ||||
|     def test_setup_loader(self, mock_instaloader_class, mock_login, config, mock_env): | ||||
|         mock_loader = MagicMock() | ||||
|         mock_instaloader_class.return_value = mock_loader | ||||
|          | ||||
|         scraper = InstagramScraper(config) | ||||
|          | ||||
|         # Test that instaloader was initialized with correct params | ||||
|         mock_instaloader_class.assert_called_once() | ||||
|         call_kwargs = mock_instaloader_class.call_args[1] | ||||
|         assert call_kwargs['quiet'] == True | ||||
|         assert call_kwargs['download_videos'] == False | ||||
|         assert call_kwargs['download_video_thumbnails'] == False | ||||
| 
 | ||||
|     @patch('src.instagram_scraper.InstagramScraper._setup_loader') | ||||
|     @patch('instaloader.Instaloader') | ||||
|     def test_login(self, mock_instaloader_class, mock_setup, config, mock_env): | ||||
|         mock_loader = MagicMock() | ||||
|         mock_setup.return_value = mock_loader | ||||
|          | ||||
|         # Create scraper without triggering login in __init__ | ||||
|         with patch('src.instagram_scraper.InstagramScraper._login'): | ||||
|             scraper = InstagramScraper(config) | ||||
|             scraper.loader = mock_loader | ||||
|          | ||||
|         # Now test login | ||||
|         scraper._login() | ||||
|          | ||||
|         # Should try to login with credentials since no session file exists | ||||
|         mock_loader.login.assert_called_once_with('testuser', 'testpass') | ||||
| 
 | ||||
|     @patch('time.sleep') | ||||
|     @patch('random.uniform') | ||||
|     @patch('src.instagram_scraper.InstagramScraper._login') | ||||
|     @patch('src.instagram_scraper.InstagramScraper._setup_loader') | ||||
|     def test_aggressive_delay(self, mock_setup, mock_login, mock_uniform, mock_sleep, config, mock_env): | ||||
|         mock_uniform.return_value = 7.5 | ||||
|         mock_setup.return_value = MagicMock() | ||||
|          | ||||
|         scraper = InstagramScraper(config) | ||||
|         scraper._aggressive_delay() | ||||
|          | ||||
|         mock_uniform.assert_called_with(5, 10) | ||||
|         mock_sleep.assert_called_with(7.5) | ||||
| 
 | ||||
|     @patch('instaloader.Profile.from_username') | ||||
|     @patch('src.instagram_scraper.InstagramScraper._login') | ||||
|     @patch('src.instagram_scraper.InstagramScraper._setup_loader') | ||||
|     def test_fetch_posts(self, mock_setup, mock_login, mock_profile_from_username,  | ||||
|                          config, mock_env, sample_post): | ||||
|         mock_loader = MagicMock() | ||||
|         mock_setup.return_value = mock_loader | ||||
|          | ||||
|         mock_profile = MagicMock() | ||||
|         mock_profile.get_posts.return_value = [sample_post] | ||||
|         mock_profile_from_username.return_value = mock_profile | ||||
|          | ||||
|         scraper = InstagramScraper(config) | ||||
|         scraper.loader = mock_loader | ||||
|         posts = scraper.fetch_posts(max_posts=10) | ||||
|          | ||||
|         assert len(posts) == 1 | ||||
|         assert posts[0]['id'] == 'ABC123' | ||||
|         assert posts[0]['type'] == 'post' | ||||
|         assert posts[0]['caption'] == 'Test caption #hvac #tips' | ||||
| 
 | ||||
|     @patch('instaloader.Profile.from_username') | ||||
|     @patch('src.instagram_scraper.InstagramScraper._login') | ||||
|     @patch('src.instagram_scraper.InstagramScraper._setup_loader') | ||||
|     def test_fetch_stories(self, mock_setup, mock_login, mock_profile_from_username,  | ||||
|                           config, mock_env, sample_story): | ||||
|         mock_loader = MagicMock() | ||||
|         mock_setup.return_value = mock_loader | ||||
|         # get_stories returns an iterable where each element is an iterable of story items | ||||
|         mock_loader.get_stories.return_value = [[sample_story]]  # Simplified: one story collection with one item | ||||
|          | ||||
|         mock_profile = MagicMock() | ||||
|         mock_profile.userid = 12345 | ||||
|         mock_profile_from_username.return_value = mock_profile | ||||
|          | ||||
|         scraper = InstagramScraper(config) | ||||
|         scraper.loader = mock_loader | ||||
|         stories = scraper.fetch_stories() | ||||
|          | ||||
|         assert len(stories) == 1 | ||||
|         assert stories[0]['id'] == 123456789 | ||||
|         assert stories[0]['type'] == 'story' | ||||
| 
 | ||||
|     @patch('src.instagram_scraper.InstagramScraper._login') | ||||
|     @patch('src.instagram_scraper.InstagramScraper._setup_loader') | ||||
|     def test_get_post_type(self, mock_setup, mock_login, config, mock_env): | ||||
|         mock_setup.return_value = MagicMock() | ||||
|         scraper = InstagramScraper(config) | ||||
|          | ||||
|         mock_post = MagicMock() | ||||
|          | ||||
|         # Test regular post | ||||
|         mock_post.typename = 'GraphImage' | ||||
|         mock_post.is_video = False | ||||
|         assert scraper._get_post_type(mock_post) == 'post' | ||||
|          | ||||
|         # Test video/reel | ||||
|         mock_post.typename = 'GraphVideo' | ||||
|         mock_post.is_video = True | ||||
|         assert scraper._get_post_type(mock_post) == 'reel' | ||||
|          | ||||
|         # Test carousel | ||||
|         mock_post.typename = 'GraphSidecar' | ||||
|         mock_post.is_video = False | ||||
|         assert scraper._get_post_type(mock_post) == 'post' | ||||
| 
 | ||||
|     @patch('src.instagram_scraper.InstagramScraper._login') | ||||
|     @patch('src.instagram_scraper.InstagramScraper._setup_loader') | ||||
|     def test_format_markdown(self, mock_setup, mock_login, config, mock_env): | ||||
|         mock_setup.return_value = MagicMock() | ||||
|         scraper = InstagramScraper(config) | ||||
|          | ||||
|         items = [ | ||||
|             { | ||||
|                 'id': 'ABC123', | ||||
|                 'type': 'post', | ||||
|                 'caption': 'Test post', | ||||
|                 'author': 'hvacknowitall', | ||||
|                 'publish_date': '2024-01-01T12:00:00', | ||||
|                 'link': 'https://www.instagram.com/p/ABC123/', | ||||
|                 'likes': 150, | ||||
|                 'comments': 25, | ||||
|                 'views': None, | ||||
|                 'hashtags': ['hvac', 'tips'] | ||||
|             } | ||||
|         ] | ||||
|          | ||||
|         markdown = scraper.format_markdown(items) | ||||
|          | ||||
|         assert '# ID: ABC123' in markdown | ||||
|         assert '## Type: post' in markdown | ||||
|         assert '## Author: hvacknowitall' in markdown | ||||
|         assert '## Publish Date: 2024-01-01T12:00:00' in markdown | ||||
|         assert '## Link: https://www.instagram.com/p/ABC123/' in markdown | ||||
|         assert '## Likes: 150' in markdown | ||||
|         assert '## Comments: 25' in markdown | ||||
|         assert '## Hashtags: hvac, tips' in markdown | ||||
|         assert 'Test post' in markdown | ||||
| 
 | ||||
|     @patch('src.instagram_scraper.InstagramScraper._login') | ||||
|     @patch('src.instagram_scraper.InstagramScraper._setup_loader') | ||||
|     def test_get_incremental_items(self, mock_setup, mock_login, config, mock_env): | ||||
|         mock_setup.return_value = MagicMock() | ||||
|         scraper = InstagramScraper(config) | ||||
|          | ||||
|         items = [ | ||||
|             {'id': 'post3', 'publish_date': '2024-01-03T12:00:00'}, | ||||
|             {'id': 'post2', 'publish_date': '2024-01-02T12:00:00'}, | ||||
|             {'id': 'post1', 'publish_date': '2024-01-01T12:00:00'} | ||||
|         ] | ||||
|          | ||||
|         # Test with no previous state | ||||
|         state = {} | ||||
|         new_items = scraper.get_incremental_items(items, state) | ||||
|         assert len(new_items) == 3 | ||||
|          | ||||
|         # Test with existing state | ||||
|         state = {'last_post_id': 'post2'} | ||||
|         new_items = scraper.get_incremental_items(items, state) | ||||
|         assert len(new_items) == 1 | ||||
|         assert new_items[0]['id'] == 'post3' | ||||
| 
 | ||||
|     @patch('src.instagram_scraper.InstagramScraper._login') | ||||
|     @patch('src.instagram_scraper.InstagramScraper._setup_loader') | ||||
|     def test_update_state(self, mock_setup, mock_login, config, mock_env): | ||||
|         mock_setup.return_value = MagicMock() | ||||
|         scraper = InstagramScraper(config) | ||||
|          | ||||
|         state = {} | ||||
|         items = [ | ||||
|             {'id': 'post2', 'publish_date': '2024-01-02T12:00:00', 'type': 'post'}, | ||||
|             {'id': 'post1', 'publish_date': '2024-01-01T12:00:00', 'type': 'post'} | ||||
|         ] | ||||
|          | ||||
|         updated_state = scraper.update_state(state, items) | ||||
|          | ||||
|         assert updated_state['last_post_id'] == 'post2' | ||||
|         assert updated_state['last_post_date'] == '2024-01-02T12:00:00' | ||||
|         assert updated_state['post_count'] == 2 | ||||
| 
 | ||||
|     @patch('src.instagram_scraper.InstagramScraper._setup_loader') | ||||
|     @patch('instaloader.Instaloader') | ||||
|     def test_error_handling(self, mock_instaloader_class, mock_setup, config, mock_env): | ||||
|         mock_loader = MagicMock() | ||||
|         mock_setup.return_value = mock_loader | ||||
|         mock_loader.login.side_effect = Exception("Login failed") | ||||
|          | ||||
|         # Test that login error is handled gracefully | ||||
|         with patch('src.instagram_scraper.InstagramScraper._login'): | ||||
|             scraper = InstagramScraper(config) | ||||
|             scraper.loader = mock_loader | ||||
|          | ||||
|         scraper._login()  # Should not raise, just log error | ||||
|          | ||||
|         # Test fetch error handling | ||||
|         posts = scraper.fetch_posts() | ||||
|         assert posts == [] | ||||
							
								
								
									
										186
									
								
								tests/test_orchestrator.py
									
									
									
									
									
										Normal file
									
								
							
							
						
						
									
										186
									
								
								tests/test_orchestrator.py
									
									
									
									
									
										Normal file
									
								
							|  | @ -0,0 +1,186 @@ | |||
| import pytest | ||||
| from unittest.mock import Mock, patch, MagicMock | ||||
| from pathlib import Path | ||||
| import json | ||||
| from src.orchestrator import ScraperOrchestrator | ||||
| from src.base_scraper import ScraperConfig | ||||
| 
 | ||||
| 
 | ||||
| class TestScraperOrchestrator: | ||||
|     @pytest.fixture | ||||
|     def orchestrator(self): | ||||
|         return ScraperOrchestrator( | ||||
|             base_data_dir=Path("test_data"), | ||||
|             base_logs_dir=Path("test_logs"), | ||||
|             brand_name="test_brand", | ||||
|             timezone="America/Halifax" | ||||
|         ) | ||||
|      | ||||
|     @pytest.fixture | ||||
|     def mock_scrapers(self): | ||||
|         """Create mock scrapers.""" | ||||
|         mock_wordpress = MagicMock() | ||||
|         mock_wordpress.load_state.return_value = {} | ||||
|         mock_wordpress.fetch_content.return_value = [ | ||||
|             {'id': '1', 'title': 'Post 1'}, | ||||
|             {'id': '2', 'title': 'Post 2'} | ||||
|         ] | ||||
|         mock_wordpress.get_incremental_items.return_value = [{'id': '2', 'title': 'Post 2'}] | ||||
|         mock_wordpress.format_markdown.return_value = "# Post 2" | ||||
|         mock_wordpress.generate_filename.return_value = "test_wordpress.md" | ||||
|         mock_wordpress.update_state.return_value = {'last_id': '2'} | ||||
|          | ||||
|         mock_youtube = MagicMock() | ||||
|         mock_youtube.load_state.return_value = {} | ||||
|         mock_youtube.fetch_content.return_value = [ | ||||
|             {'id': 'vid1', 'title': 'Video 1'} | ||||
|         ] | ||||
|         mock_youtube.get_incremental_items.return_value = [{'id': 'vid1', 'title': 'Video 1'}] | ||||
|         mock_youtube.format_markdown.return_value = "# Video 1" | ||||
|         mock_youtube.generate_filename.return_value = "test_youtube.md" | ||||
|         mock_youtube.update_state.return_value = {'last_video_id': 'vid1'} | ||||
|          | ||||
|         return [ | ||||
|             ("WordPress", mock_wordpress), | ||||
|             ("YouTube", mock_youtube) | ||||
|         ] | ||||
|      | ||||
|     def test_initialization(self, orchestrator): | ||||
|         assert orchestrator.base_data_dir == Path("test_data") | ||||
|         assert orchestrator.base_logs_dir == Path("test_logs") | ||||
|         assert orchestrator.brand_name == "test_brand" | ||||
|         assert orchestrator.timezone == "America/Halifax" | ||||
|      | ||||
|     @patch('src.orchestrator.InstagramScraper') | ||||
|     @patch('src.orchestrator.RSSScraperPodcast') | ||||
|     @patch('src.orchestrator.RSSScraperMailChimp') | ||||
|     @patch('src.orchestrator.WordPressScraper') | ||||
|     @patch('src.orchestrator.YouTubeScraper') | ||||
|     def test_initialize_scrapers(self, mock_youtube_class, mock_wordpress_class, | ||||
|                                  mock_mailchimp_class, mock_podcast_class, mock_instagram_class): | ||||
|         # Create a clean environment with only specific scrapers enabled | ||||
|         with patch.dict('os.environ', { | ||||
|             'WORDPRESS_API_URL': 'https://test.com/wp-json', | ||||
|             'YOUTUBE_CHANNEL_URL': 'https://youtube.com/@test', | ||||
|             # Clear other environment variables | ||||
|             'MAILCHIMP_RSS_URL': '', | ||||
|             'PODCAST_RSS_URL': '', | ||||
|             'INSTAGRAM_USERNAME': '' | ||||
|         }, clear=True): | ||||
|             orchestrator = ScraperOrchestrator() | ||||
|             # Should only have WordPress and YouTube scrapers | ||||
|             assert len(orchestrator.scrapers) == 2 | ||||
|             names = [name for name, _ in orchestrator.scrapers] | ||||
|             assert 'WordPress' in names | ||||
|             assert 'YouTube' in names | ||||
|      | ||||
|     def test_run_scraper_success(self, orchestrator, mock_scrapers): | ||||
|         orchestrator.scrapers = mock_scrapers | ||||
|          | ||||
|         # Run first scraper | ||||
|         result = orchestrator._run_scraper(mock_scrapers[0]) | ||||
|          | ||||
|         assert result['name'] == 'WordPress' | ||||
|         assert result['status'] == 'success' | ||||
|         assert result['items_count'] == 2 | ||||
|         assert result['new_items'] == 1 | ||||
|         assert result['error'] is None | ||||
|      | ||||
|     def test_run_scraper_error(self, orchestrator): | ||||
|         mock_scraper = MagicMock() | ||||
|         mock_scraper.load_state.side_effect = Exception("Test error") | ||||
|          | ||||
|         result = orchestrator._run_scraper(("TestScraper", mock_scraper)) | ||||
|          | ||||
|         assert result['name'] == 'TestScraper' | ||||
|         assert result['status'] == 'error' | ||||
|         assert result['error'] == "Test error" | ||||
|      | ||||
|     def test_run_sequential(self, orchestrator, mock_scrapers): | ||||
|         orchestrator.scrapers = mock_scrapers | ||||
|          | ||||
|         results = orchestrator.run_sequential() | ||||
|          | ||||
|         assert len(results) == 2 | ||||
|         assert results[0]['name'] == 'WordPress' | ||||
|         assert results[1]['name'] == 'YouTube' | ||||
|         assert all(r['status'] == 'success' for r in results) | ||||
|      | ||||
|     @patch('multiprocessing.Pool') | ||||
|     def test_run_parallel(self, mock_pool_class, orchestrator, mock_scrapers): | ||||
|         mock_pool = MagicMock() | ||||
|         mock_pool_class.return_value.__enter__.return_value = mock_pool | ||||
|          | ||||
|         # Mock the map function to return results | ||||
|         mock_pool.map.return_value = [ | ||||
|             {'name': 'WordPress', 'status': 'success', 'items_count': 2, 'new_items': 1,  | ||||
|              'error': None, 'duration_seconds': 1.0}, | ||||
|             {'name': 'YouTube', 'status': 'success', 'items_count': 1, 'new_items': 1, | ||||
|              'error': None, 'duration_seconds': 2.0} | ||||
|         ] | ||||
|          | ||||
|         orchestrator.scrapers = mock_scrapers | ||||
|         results = orchestrator.run_parallel(max_workers=2) | ||||
|          | ||||
|         assert len(results) == 2 | ||||
|         mock_pool_class.assert_called_once_with(processes=2) | ||||
|         mock_pool.map.assert_called_once() | ||||
|      | ||||
|     def test_save_statistics(self, orchestrator, tmp_path): | ||||
|         orchestrator.stats_file = tmp_path / "stats.json" | ||||
|          | ||||
|         results = [ | ||||
|             {'name': 'WordPress', 'status': 'success', 'items_count': 2,  | ||||
|              'new_items': 1, 'duration_seconds': 1.0, 'error': None}, | ||||
|             {'name': 'YouTube', 'status': 'error', 'items_count': 0, | ||||
|              'new_items': 0, 'duration_seconds': 0.5, 'error': 'Connection failed'} | ||||
|         ] | ||||
|          | ||||
|         orchestrator.save_statistics(results) | ||||
|          | ||||
|         # Check file was created | ||||
|         assert orchestrator.stats_file.exists() | ||||
|          | ||||
|         # Load and verify stats | ||||
|         with open(orchestrator.stats_file, 'r') as f: | ||||
|             stats = json.load(f) | ||||
|          | ||||
|         assert len(stats) == 1 | ||||
|         assert stats[0]['total_scrapers'] == 2 | ||||
|         assert stats[0]['successful'] == 1 | ||||
|         assert stats[0]['failed'] == 1 | ||||
|         assert stats[0]['total_items'] == 2 | ||||
|         assert stats[0]['new_items'] == 1 | ||||
|      | ||||
|     def test_print_summary(self, orchestrator, capsys): | ||||
|         results = [ | ||||
|             {'name': 'WordPress', 'status': 'success', 'items_count': 2, | ||||
|              'new_items': 1, 'duration_seconds': 1.0, 'error': None}, | ||||
|             {'name': 'YouTube', 'status': 'error', 'items_count': 0, | ||||
|              'new_items': 0, 'duration_seconds': 0.5, 'error': 'Connection failed'} | ||||
|         ] | ||||
|          | ||||
|         orchestrator.print_summary(results) | ||||
|          | ||||
|         captured = capsys.readouterr() | ||||
|         assert "SCRAPING SUMMARY" in captured.out | ||||
|         assert "✓ WordPress:" in captured.out | ||||
|         assert "✗ YouTube:" in captured.out | ||||
|         assert "Successful: 1/2" in captured.out | ||||
|         assert "Total items: 2" in captured.out | ||||
|      | ||||
|     @patch('src.orchestrator.ScraperOrchestrator.run_parallel') | ||||
|     @patch('src.orchestrator.ScraperOrchestrator.save_statistics') | ||||
|     @patch('src.orchestrator.ScraperOrchestrator.print_summary') | ||||
|     def test_run_method(self, mock_print, mock_save, mock_parallel, orchestrator): | ||||
|         mock_parallel.return_value = [ | ||||
|             {'name': 'Test', 'status': 'success', 'items_count': 1, | ||||
|              'new_items': 1, 'duration_seconds': 1.0, 'error': None} | ||||
|         ] | ||||
|          | ||||
|         orchestrator.scrapers = [("Test", MagicMock())] | ||||
|         orchestrator.run(parallel=True) | ||||
|          | ||||
|         mock_parallel.assert_called_once_with(None) | ||||
|         mock_save.assert_called_once() | ||||
|         mock_print.assert_called_once() | ||||
		Loading…
	
		Reference in a new issue