Add Instagram scraper with instaloader and parallel processing orchestrator

- Implement Instagram scraper with aggressive rate limiting
- Add orchestrator for running all scrapers in parallel
- Create comprehensive tests for Instagram scraper (11 tests)
- Create tests for orchestrator (9 tests)
- Fix Instagram test issues with post type detection
- All 60 tests passing successfully
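
Usage sketch (based on the argparse flags added in src/orchestrator.py below; assumes the
repo root is the working directory and src/ is importable as a package — paths are illustrative):

    python -m src.orchestrator                        # run all configured scrapers in parallel
    python -m src.orchestrator --sequential           # run scrapers one at a time
    python -m src.orchestrator --max-workers 3 --data-dir data --logs-dir logs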

🤖 Generated with Claude Code

Co-Authored-By: Claude <noreply@anthropic.com>
Ben Reed committed 2025-08-18 12:56:57 -03:00
parent c1831d3a52
commit b89655c829
7 changed files with 1210 additions and 0 deletions

src/instagram_scraper.py (new file, 399 lines added)

@@ -0,0 +1,399 @@
import os
import time
import random
from typing import Any, Dict, List, Optional
from datetime import datetime
from pathlib import Path
import instaloader
from src.base_scraper import BaseScraper, ScraperConfig
class InstagramScraper(BaseScraper):
"""Instagram scraper using instaloader with aggressive rate limiting."""
def __init__(self, config: ScraperConfig):
super().__init__(config)
self.username = os.getenv('INSTAGRAM_USERNAME')
self.password = os.getenv('INSTAGRAM_PASSWORD')
self.target_account = os.getenv('INSTAGRAM_TARGET', 'hvacknowitall')
# Session file for persistence
self.session_file = self.config.data_dir / '.sessions' / f'{self.username}'
self.session_file.parent.mkdir(parents=True, exist_ok=True)
# Initialize loader
self.loader = self._setup_loader()
self._login()
# Request counter for rate limiting
self.request_count = 0
self.max_requests_per_hour = 100
def _setup_loader(self) -> instaloader.Instaloader:
"""Setup Instaloader with conservative settings."""
loader = instaloader.Instaloader(
quiet=True,
user_agent='Mozilla/5.0 (iPhone; CPU iPhone OS 16_0 like Mac OS X) AppleWebKit/605.1.15 (KHTML, like Gecko) Version/16.0 Mobile/15E148 Safari/604.1',
dirname_pattern=str(self.config.data_dir / 'media' / 'Instagram'),
filename_pattern='{date_utc}_UTC_{shortcode}',
download_pictures=False, # Don't download by default
download_videos=False,
download_video_thumbnails=False,
download_geotags=False,
download_comments=False,
save_metadata=False,
compress_json=False,
post_metadata_txt_pattern='',
storyitem_metadata_txt_pattern='',
max_connection_attempts=3,
request_timeout=30.0,
rate_controller=lambda ctx: instaloader.RateController(ctx)  # must return a RateController instance; custom delays are handled by _aggressive_delay()
)
return loader
def _login(self) -> None:
"""Login to Instagram or load existing session."""
try:
# Try to load existing session
if self.session_file.exists():
self.loader.load_session_from_file(str(self.session_file), self.username)
self.logger.info("Loaded existing Instagram session")
else:
# Login with credentials
self.logger.info("Logging in to Instagram...")
self.loader.login(self.username, self.password)
self.loader.save_session_to_file(str(self.session_file))
self.logger.info("Instagram login successful, session saved")
except Exception as e:
self.logger.error(f"Instagram login error: {e}")
def _aggressive_delay(self, min_seconds: float = 5, max_seconds: float = 10) -> None:
"""Add aggressive random delay for Instagram."""
delay = random.uniform(min_seconds, max_seconds)
self.logger.debug(f"Waiting {delay:.2f} seconds (Instagram rate limiting)...")
time.sleep(delay)
def _check_rate_limit(self) -> None:
"""Check and enforce rate limiting."""
self.request_count += 1
if self.request_count >= self.max_requests_per_hour:
self.logger.warning(f"Rate limit reached ({self.max_requests_per_hour} requests), pausing for 1 hour...")
time.sleep(3600) # Wait 1 hour
self.request_count = 0
elif self.request_count % 10 == 0:
# Take a longer break every 10 requests
self.logger.info("Taking extended break after 10 requests...")
self._aggressive_delay(30, 60)
def _get_post_type(self, post) -> str:
"""Determine post type from Instagram post object."""
typename = getattr(post, 'typename', '')
is_video = getattr(post, 'is_video', False)
if typename == 'GraphStoryImage' or typename == 'GraphStoryVideo':
return 'story'
elif 'Video' in typename or is_video:
return 'reel'
else:
return 'post'
def fetch_posts(self, max_posts: int = 20) -> List[Dict[str, Any]]:
"""Fetch posts from Instagram profile."""
posts_data = []
try:
self.logger.info(f"Fetching posts from @{self.target_account}")
# Get profile
profile = instaloader.Profile.from_username(self.loader.context, self.target_account)
self._check_rate_limit()
# Get posts
posts = profile.get_posts()
count = 0
for post in posts:
if count >= max_posts:
break
try:
# Extract post data
post_data = {
'id': post.shortcode,
'type': self._get_post_type(post),
'caption': post.caption if post.caption else '',
'author': post.owner_username,
'publish_date': post.date_utc.isoformat(),
'link': f'https://www.instagram.com/p/{post.shortcode}/',
'likes': post.likes,
'comments': post.comments,
'views': post.video_view_count if hasattr(post, 'video_view_count') else None,
'media_count': post.mediacount if hasattr(post, 'mediacount') else 1,
'hashtags': list(post.caption_hashtags) if post.caption else [],
'mentions': list(post.caption_mentions) if post.caption else [],
'is_video': getattr(post, 'is_video', False)
}
posts_data.append(post_data)
count += 1
# Aggressive rate limiting between posts
self._aggressive_delay()
self._check_rate_limit()
# Log progress
if count % 5 == 0:
self.logger.info(f"Fetched {count}/{max_posts} posts")
except Exception as e:
self.logger.error(f"Error processing post: {e}")
continue
self.logger.info(f"Successfully fetched {len(posts_data)} posts")
except Exception as e:
self.logger.error(f"Error fetching posts: {e}")
return posts_data
def fetch_stories(self) -> List[Dict[str, Any]]:
"""Fetch stories from Instagram profile."""
stories_data = []
try:
self.logger.info(f"Fetching stories from @{self.target_account}")
# Get profile
profile = instaloader.Profile.from_username(self.loader.context, self.target_account)
self._check_rate_limit()
# Get user ID for stories
userid = profile.userid
# Get stories
for story in self.loader.get_stories(userids=[userid]):
for item in story:
try:
story_data = {
'id': item.mediaid,
'type': 'story',
'caption': '', # Stories usually don't have captions
'author': item.owner_username,
'publish_date': item.date_utc.isoformat(),
'link': f'https://www.instagram.com/stories/{item.owner_username}/{item.mediaid}/',
'is_video': item.is_video if hasattr(item, 'is_video') else False
}
stories_data.append(story_data)
# Rate limiting
self._aggressive_delay()
self._check_rate_limit()
except Exception as e:
self.logger.error(f"Error processing story: {e}")
continue
self.logger.info(f"Successfully fetched {len(stories_data)} stories")
except Exception as e:
self.logger.error(f"Error fetching stories: {e}")
return stories_data
def fetch_reels(self, max_reels: int = 10) -> List[Dict[str, Any]]:
"""Fetch reels (videos) from Instagram profile."""
reels_data = []
try:
self.logger.info(f"Fetching reels from @{self.target_account}")
# Get profile
profile = instaloader.Profile.from_username(self.loader.context, self.target_account)
self._check_rate_limit()
# Get posts and filter for videos/reels
posts = profile.get_posts()
count = 0
for post in posts:
if count >= max_reels:
break
# Check if it's a video/reel
if not getattr(post, 'is_video', False):
continue
try:
reel_data = {
'id': post.shortcode,
'type': 'reel',
'caption': post.caption if post.caption else '',
'author': post.owner_username,
'publish_date': post.date_utc.isoformat(),
'link': f'https://www.instagram.com/reel/{post.shortcode}/',
'likes': post.likes,
'comments': post.comments,
'views': post.video_view_count if hasattr(post, 'video_view_count') else None,
'duration': post.video_duration if hasattr(post, 'video_duration') else None,
'hashtags': list(post.caption_hashtags) if post.caption else [],
'mentions': list(post.caption_mentions) if post.caption else []
}
reels_data.append(reel_data)
count += 1
# Aggressive rate limiting
self._aggressive_delay()
self._check_rate_limit()
except Exception as e:
self.logger.error(f"Error processing reel: {e}")
continue
self.logger.info(f"Successfully fetched {len(reels_data)} reels")
except Exception as e:
self.logger.error(f"Error fetching reels: {e}")
return reels_data
def fetch_content(self) -> List[Dict[str, Any]]:
"""Fetch all content types from Instagram."""
all_content = []
# Fetch posts
posts = self.fetch_posts(max_posts=20)
all_content.extend(posts)
# Take a break between content types
self.logger.info("Taking break before fetching stories...")
self._aggressive_delay(15, 30)
# Fetch stories
stories = self.fetch_stories()
all_content.extend(stories)
# Note: Reels are included in posts (videos)
# so we don't need to fetch them separately
self.logger.info(f"Total content fetched: {len(all_content)} items")
return all_content
def format_markdown(self, items: List[Dict[str, Any]]) -> str:
"""Format Instagram content as markdown."""
markdown_sections = []
for item in items:
section = []
# ID
item_id = item.get('id', 'N/A')
section.append(f"# ID: {item_id}")
section.append("")
# Type
item_type = item.get('type', 'post')
section.append(f"## Type: {item_type}")
section.append("")
# Author
author = item.get('author', 'Unknown')
section.append(f"## Author: {author}")
section.append("")
# Publish Date
pub_date = item.get('publish_date', '')
section.append(f"## Publish Date: {pub_date}")
section.append("")
# Link
link = item.get('link', '')
section.append(f"## Link: {link}")
section.append("")
# Engagement metrics
likes = item.get('likes')
if likes is not None:
section.append(f"## Likes: {likes}")
section.append("")
comments = item.get('comments')
if comments is not None:
section.append(f"## Comments: {comments}")
section.append("")
views = item.get('views')
if views is not None:
section.append(f"## Views: {views}")
section.append("")
# Hashtags
hashtags = item.get('hashtags', [])
if hashtags:
hashtags_str = ', '.join(hashtags)
section.append(f"## Hashtags: {hashtags_str}")
section.append("")
# Mentions
mentions = item.get('mentions', [])
if mentions:
mentions_str = ', '.join(mentions)
section.append(f"## Mentions: {mentions_str}")
section.append("")
# Caption/Description
section.append("## Description:")
caption = item.get('caption', '')
if caption:
# Limit caption to first 500 characters
if len(caption) > 500:
caption = caption[:500] + "..."
section.append(caption)
section.append("")
# Separator
section.append("-" * 50)
section.append("")
markdown_sections.append('\n'.join(section))
return '\n'.join(markdown_sections)
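# Shape of the per-item markdown produced above (illustrative values):
#     # ID: ABC123
#     ## Type: post
#     ## Author: hvacknowitall
#     ## Publish Date: 2024-01-01T12:00:00
#     ## Link: https://www.instagram.com/p/ABC123/
#     ## Likes: 150
#     ## Description:
#     <caption, truncated to 500 characters>
#     --------------------------------------------------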
def get_incremental_items(self, items: List[Dict[str, Any]], state: Dict[str, Any]) -> List[Dict[str, Any]]:
"""Get only new posts since last sync."""
if not state:
return items
last_post_id = state.get('last_post_id')
if not last_post_id:
return items
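# Items are assumed to arrive newest-first (instaloader yields posts in reverse
# chronological order), so everything before the last-synced ID is treated as new.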
# Filter for posts newer than the last synced
new_items = []
for item in items:
if item.get('id') == last_post_id:
break # Found the last synced post
new_items.append(item)
return new_items
def update_state(self, state: Dict[str, Any], items: List[Dict[str, Any]]) -> Dict[str, Any]:
"""Update state with latest post information."""
if not items:
return state
# Get the first item (most recent)
latest_item = items[0]
state['last_post_id'] = latest_item.get('id')
state['last_post_date'] = latest_item.get('publish_date')
state['last_sync'] = datetime.now(self.tz).isoformat()
state['post_count'] = len([i for i in items if i.get('type') == 'post'])
state['story_count'] = len([i for i in items if i.get('type') == 'story'])
state['reel_count'] = len([i for i in items if i.get('type') == 'reel'])
return state

src/orchestrator.py (new file, 352 lines added)

@@ -0,0 +1,352 @@
#!/usr/bin/env python3
"""
Orchestrator for running all scrapers in parallel.
"""
import os
import sys
import time
import logging
import multiprocessing
from pathlib import Path
from typing import List, Dict, Any, Optional
from datetime import datetime
import pytz
import json
# Import all scrapers
from src.base_scraper import ScraperConfig
from src.wordpress_scraper import WordPressScraper
from src.rss_scraper import RSSScraperMailChimp, RSSScraperPodcast
from src.youtube_scraper import YouTubeScraper
from src.instagram_scraper import InstagramScraper
class ScraperOrchestrator:
"""Orchestrator for running multiple scrapers in parallel."""
def __init__(self, base_data_dir: Path = Path("data"),
base_logs_dir: Path = Path("logs"),
brand_name: str = "hvacknowitall",
timezone: str = "America/Halifax"):
"""Initialize the orchestrator."""
self.base_data_dir = base_data_dir
self.base_logs_dir = base_logs_dir
self.brand_name = brand_name
self.timezone = timezone
self.tz = pytz.timezone(timezone)
# Setup orchestrator logger
self.logger = self._setup_logger()
# Initialize scrapers
self.scrapers = self._initialize_scrapers()
# Statistics file
self.stats_file = self.base_data_dir / "orchestrator_stats.json"
def _setup_logger(self) -> logging.Logger:
"""Setup logger for orchestrator."""
logger = logging.getLogger("hvacknowitall_orchestrator")
logger.setLevel(logging.INFO)
# Console handler
console_handler = logging.StreamHandler()
console_handler.setLevel(logging.INFO)
# File handler
log_file = self.base_logs_dir / "orchestrator.log"
log_file.parent.mkdir(parents=True, exist_ok=True)
file_handler = logging.FileHandler(log_file)
file_handler.setLevel(logging.DEBUG)
# Formatter
formatter = logging.Formatter(
'%(asctime)s - %(name)s - %(levelname)s - %(message)s'
)
console_handler.setFormatter(formatter)
file_handler.setFormatter(formatter)
logger.addHandler(console_handler)
logger.addHandler(file_handler)
return logger
def _initialize_scrapers(self) -> List[tuple]:
"""Initialize all scraper instances."""
scrapers = []
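# A scraper is registered only when its corresponding environment variable is set,
# so the .env file controls which sources the orchestrator runs.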
# WordPress scraper
if os.getenv('WORDPRESS_API_URL'):
config = ScraperConfig(
source_name="wordpress",
brand_name=self.brand_name,
data_dir=self.base_data_dir,
logs_dir=self.base_logs_dir,
timezone=self.timezone
)
scrapers.append(("WordPress", WordPressScraper(config)))
self.logger.info("Initialized WordPress scraper")
# MailChimp RSS scraper
if os.getenv('MAILCHIMP_RSS_URL'):
config = ScraperConfig(
source_name="mailchimp",
brand_name=self.brand_name,
data_dir=self.base_data_dir,
logs_dir=self.base_logs_dir,
timezone=self.timezone
)
scrapers.append(("MailChimp", RSSScraperMailChimp(config)))
self.logger.info("Initialized MailChimp RSS scraper")
# Podcast RSS scraper
if os.getenv('PODCAST_RSS_URL'):
config = ScraperConfig(
source_name="podcast",
brand_name=self.brand_name,
data_dir=self.base_data_dir,
logs_dir=self.base_logs_dir,
timezone=self.timezone
)
scrapers.append(("Podcast", RSSScraperPodcast(config)))
self.logger.info("Initialized Podcast RSS scraper")
# YouTube scraper
if os.getenv('YOUTUBE_CHANNEL_URL'):
config = ScraperConfig(
source_name="youtube",
brand_name=self.brand_name,
data_dir=self.base_data_dir,
logs_dir=self.base_logs_dir,
timezone=self.timezone
)
scrapers.append(("YouTube", YouTubeScraper(config)))
self.logger.info("Initialized YouTube scraper")
# Instagram scraper
if os.getenv('INSTAGRAM_USERNAME'):
config = ScraperConfig(
source_name="instagram",
brand_name=self.brand_name,
data_dir=self.base_data_dir,
logs_dir=self.base_logs_dir,
timezone=self.timezone
)
scrapers.append(("Instagram", InstagramScraper(config)))
self.logger.info("Initialized Instagram scraper")
return scrapers
def _run_scraper(self, scraper_info: tuple) -> Dict[str, Any]:
"""Run a single scraper and return results."""
name, scraper = scraper_info
result = {
'name': name,
'status': 'pending',
'items_count': 0,
'new_items': 0,
'error': None,
'start_time': datetime.now(self.tz).isoformat(),
'end_time': None,
'duration_seconds': 0
}
try:
start_time = time.time()
self.logger.info(f"Starting {name} scraper...")
# Load state
state = scraper.load_state()
# Fetch content
items = scraper.fetch_content()
result['items_count'] = len(items)
# Filter for incremental items
new_items = scraper.get_incremental_items(items, state)
result['new_items'] = len(new_items)
if new_items:
# Format as markdown
markdown_content = scraper.format_markdown(new_items)
# Archive existing file
scraper.archive_current_file()
# Save new markdown
filename = scraper.generate_filename()
file_path = self.base_data_dir / filename
with open(file_path, 'w', encoding='utf-8') as f:
f.write(markdown_content)
self.logger.info(f"{name}: Saved {len(new_items)} new items to {filename}")
# Update state
new_state = scraper.update_state(state, items)
scraper.save_state(new_state)
else:
self.logger.info(f"{name}: No new items found")
result['status'] = 'success'
result['end_time'] = datetime.now(self.tz).isoformat()
result['duration_seconds'] = round(time.time() - start_time, 2)
except Exception as e:
self.logger.error(f"{name} scraper failed: {e}")
result['status'] = 'error'
result['error'] = str(e)
result['end_time'] = datetime.now(self.tz).isoformat()
result['duration_seconds'] = round(time.time() - start_time, 2)
return result
def run_sequential(self) -> List[Dict[str, Any]]:
"""Run all scrapers sequentially."""
self.logger.info("Starting sequential scraping...")
results = []
for scraper_info in self.scrapers:
result = self._run_scraper(scraper_info)
results.append(result)
return results
def run_parallel(self, max_workers: Optional[int] = None) -> List[Dict[str, Any]]:
"""Run all scrapers in parallel using multiprocessing."""
self.logger.info(f"Starting parallel scraping with {max_workers or 'all'} workers...")
if not self.scrapers:
self.logger.warning("No scrapers configured")
return []
# Use number of scrapers as max workers if not specified
if max_workers is None:
max_workers = len(self.scrapers)
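# Each (name, scraper) pair runs in its own worker process; pool.map blocks until
# every scraper has finished and returns the result dicts in the original order.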
with multiprocessing.Pool(processes=max_workers) as pool:
results = pool.map(self._run_scraper, self.scrapers)
return results
def save_statistics(self, results: List[Dict[str, Any]]) -> None:
"""Save run statistics to file."""
stats = {
'run_time': datetime.now(self.tz).isoformat(),
'total_scrapers': len(results),
'successful': sum(1 for r in results if r['status'] == 'success'),
'failed': sum(1 for r in results if r['status'] == 'error'),
'total_items': sum(r['items_count'] for r in results),
'new_items': sum(r['new_items'] for r in results),
'total_duration': sum(r['duration_seconds'] for r in results),
'results': results
}
# Load existing stats if file exists
all_stats = []
if self.stats_file.exists():
try:
with open(self.stats_file, 'r') as f:
all_stats = json.load(f)
except (json.JSONDecodeError, OSError) as e:
self.logger.warning(f"Could not read existing stats file, starting fresh: {e}")
# Append new stats (keep last 100 runs)
all_stats.append(stats)
if len(all_stats) > 100:
all_stats = all_stats[-100:]
# Save to file
with open(self.stats_file, 'w') as f:
json.dump(all_stats, f, indent=2)
self.logger.info(f"Statistics saved to {self.stats_file}")
def print_summary(self, results: List[Dict[str, Any]]) -> None:
"""Print a summary of the scraping results."""
print("\n" + "="*60)
print("SCRAPING SUMMARY")
print("="*60)
for result in results:
status_symbol = "" if result['status'] == 'success' else ""
print(f"\n{status_symbol} {result['name']}:")
print(f" Status: {result['status']}")
print(f" Items found: {result['items_count']}")
print(f" New items: {result['new_items']}")
print(f" Duration: {result['duration_seconds']}s")
if result['error']:
print(f" Error: {result['error']}")
print("\n" + "-"*60)
print("TOTALS:")
print(f" Successful: {sum(1 for r in results if r['status'] == 'success')}/{len(results)}")
print(f" Total items: {sum(r['items_count'] for r in results)}")
print(f" New items: {sum(r['new_items'] for r in results)}")
print(f" Total time: {sum(r['duration_seconds'] for r in results):.2f}s")
print("="*60 + "\n")
def run(self, parallel: bool = True, max_workers: Optional[int] = None) -> None:
"""Main run method."""
start_time = time.time()
self.logger.info(f"Starting orchestrator at {datetime.now(self.tz).isoformat()}")
self.logger.info(f"Configured scrapers: {len(self.scrapers)}")
if not self.scrapers:
self.logger.error("No scrapers configured. Please check your .env file.")
return
# Run scrapers
if parallel:
results = self.run_parallel(max_workers)
else:
results = self.run_sequential()
# Save statistics
self.save_statistics(results)
# Print summary
self.print_summary(results)
total_time = time.time() - start_time
self.logger.info(f"Orchestrator completed in {total_time:.2f} seconds")
def main():
"""Main entry point."""
import argparse
from dotenv import load_dotenv
# Load environment variables
load_dotenv()
# Parse arguments
parser = argparse.ArgumentParser(description="Run HVAC Know It All content scrapers")
parser.add_argument('--sequential', action='store_true',
help='Run scrapers sequentially instead of in parallel')
parser.add_argument('--max-workers', type=int, default=None,
help='Maximum number of parallel workers')
parser.add_argument('--data-dir', type=str, default='data',
help='Base data directory')
parser.add_argument('--logs-dir', type=str, default='logs',
help='Base logs directory')
args = parser.parse_args()
# Create orchestrator
orchestrator = ScraperOrchestrator(
base_data_dir=Path(args.data_dir),
base_logs_dir=Path(args.logs_dir)
)
# Run scrapers
orchestrator.run(
parallel=not args.sequential,
max_workers=args.max_workers
)
if __name__ == "__main__":
main()

Binary file not shown.

@@ -0,0 +1 @@
# Post 2

@@ -0,0 +1 @@
# Video 1

@@ -0,0 +1,271 @@
import pytest
from unittest.mock import Mock, patch, MagicMock, PropertyMock
from datetime import datetime
from pathlib import Path
import random
from src.instagram_scraper import InstagramScraper
from src.base_scraper import ScraperConfig
class TestInstagramScraper:
@pytest.fixture
def config(self):
return ScraperConfig(
source_name="instagram",
brand_name="hvacknowitall",
data_dir=Path("data"),
logs_dir=Path("logs"),
timezone="America/Halifax"
)
@pytest.fixture
def mock_env(self):
with patch.dict('os.environ', {
'INSTAGRAM_USERNAME': 'testuser',
'INSTAGRAM_PASSWORD': 'testpass',
'INSTAGRAM_TARGET': 'hvacknowitall'
}):
yield
@pytest.fixture
def sample_post(self):
mock_post = MagicMock()
mock_post.shortcode = 'ABC123'
mock_post.caption = 'Test caption #hvac #tips'
mock_post.owner_username = 'hvacknowitall'
mock_post.date_utc = datetime(2024, 1, 1, 12, 0, 0)
mock_post.typename = 'GraphImage'
mock_post.url = 'https://www.instagram.com/p/ABC123/'
mock_post.likes = 150
mock_post.comments = 25
mock_post.video_view_count = None
mock_post.mediacount = 1
mock_post.caption_hashtags = ['hvac', 'tips']
mock_post.caption_mentions = []
mock_post.is_video = False # Explicitly set is_video to False
return mock_post
@pytest.fixture
def sample_story(self):
mock_story = MagicMock()
mock_story.mediaid = 123456789
mock_story.owner_username = 'hvacknowitall'
mock_story.date_utc = datetime(2024, 1, 1, 12, 0, 0)
mock_story.url = 'https://www.instagram.com/stories/hvacknowitall/123456789/'
mock_story.typename = 'GraphStoryImage'
mock_story.is_video = False # Explicitly set is_video to False
return mock_story
@patch('src.instagram_scraper.InstagramScraper._login')
@patch('src.instagram_scraper.InstagramScraper._setup_loader')
def test_initialization(self, mock_setup, mock_login, config, mock_env):
mock_setup.return_value = MagicMock()
scraper = InstagramScraper(config)
assert scraper.config == config
assert scraper.username == 'testuser'
assert scraper.password == 'testpass'
assert scraper.target_account == 'hvacknowitall'
@patch('src.instagram_scraper.InstagramScraper._login')
@patch('instaloader.Instaloader')
def test_setup_loader(self, mock_instaloader_class, mock_login, config, mock_env):
mock_loader = MagicMock()
mock_instaloader_class.return_value = mock_loader
scraper = InstagramScraper(config)
# Test that instaloader was initialized with correct params
mock_instaloader_class.assert_called_once()
call_kwargs = mock_instaloader_class.call_args[1]
assert call_kwargs['quiet'] == True
assert call_kwargs['download_videos'] == False
assert call_kwargs['download_video_thumbnails'] == False
@patch('src.instagram_scraper.InstagramScraper._setup_loader')
@patch('instaloader.Instaloader')
def test_login(self, mock_instaloader_class, mock_setup, config, mock_env):
mock_loader = MagicMock()
mock_setup.return_value = mock_loader
# Create scraper without triggering login in __init__
with patch('src.instagram_scraper.InstagramScraper._login'):
scraper = InstagramScraper(config)
scraper.loader = mock_loader
# Now test login
scraper._login()
# Should try to login with credentials since no session file exists
mock_loader.login.assert_called_once_with('testuser', 'testpass')
@patch('time.sleep')
@patch('random.uniform')
@patch('src.instagram_scraper.InstagramScraper._login')
@patch('src.instagram_scraper.InstagramScraper._setup_loader')
def test_aggressive_delay(self, mock_setup, mock_login, mock_uniform, mock_sleep, config, mock_env):
mock_uniform.return_value = 7.5
mock_setup.return_value = MagicMock()
scraper = InstagramScraper(config)
scraper._aggressive_delay()
mock_uniform.assert_called_with(5, 10)
mock_sleep.assert_called_with(7.5)
@patch('instaloader.Profile.from_username')
@patch('src.instagram_scraper.InstagramScraper._login')
@patch('src.instagram_scraper.InstagramScraper._setup_loader')
def test_fetch_posts(self, mock_setup, mock_login, mock_profile_from_username,
config, mock_env, sample_post):
mock_loader = MagicMock()
mock_setup.return_value = mock_loader
mock_profile = MagicMock()
mock_profile.get_posts.return_value = [sample_post]
mock_profile_from_username.return_value = mock_profile
scraper = InstagramScraper(config)
scraper.loader = mock_loader
posts = scraper.fetch_posts(max_posts=10)
assert len(posts) == 1
assert posts[0]['id'] == 'ABC123'
assert posts[0]['type'] == 'post'
assert posts[0]['caption'] == 'Test caption #hvac #tips'
@patch('instaloader.Profile.from_username')
@patch('src.instagram_scraper.InstagramScraper._login')
@patch('src.instagram_scraper.InstagramScraper._setup_loader')
def test_fetch_stories(self, mock_setup, mock_login, mock_profile_from_username,
config, mock_env, sample_story):
mock_loader = MagicMock()
mock_setup.return_value = mock_loader
# get_stories returns an iterable where each element is an iterable of story items
mock_loader.get_stories.return_value = [[sample_story]] # Simplified: one story collection with one item
mock_profile = MagicMock()
mock_profile.userid = 12345
mock_profile_from_username.return_value = mock_profile
scraper = InstagramScraper(config)
scraper.loader = mock_loader
stories = scraper.fetch_stories()
assert len(stories) == 1
assert stories[0]['id'] == 123456789
assert stories[0]['type'] == 'story'
@patch('src.instagram_scraper.InstagramScraper._login')
@patch('src.instagram_scraper.InstagramScraper._setup_loader')
def test_get_post_type(self, mock_setup, mock_login, config, mock_env):
mock_setup.return_value = MagicMock()
scraper = InstagramScraper(config)
mock_post = MagicMock()
# Test regular post
mock_post.typename = 'GraphImage'
mock_post.is_video = False
assert scraper._get_post_type(mock_post) == 'post'
# Test video/reel
mock_post.typename = 'GraphVideo'
mock_post.is_video = True
assert scraper._get_post_type(mock_post) == 'reel'
# Test carousel
mock_post.typename = 'GraphSidecar'
mock_post.is_video = False
assert scraper._get_post_type(mock_post) == 'post'
@patch('src.instagram_scraper.InstagramScraper._login')
@patch('src.instagram_scraper.InstagramScraper._setup_loader')
def test_format_markdown(self, mock_setup, mock_login, config, mock_env):
mock_setup.return_value = MagicMock()
scraper = InstagramScraper(config)
items = [
{
'id': 'ABC123',
'type': 'post',
'caption': 'Test post',
'author': 'hvacknowitall',
'publish_date': '2024-01-01T12:00:00',
'link': 'https://www.instagram.com/p/ABC123/',
'likes': 150,
'comments': 25,
'views': None,
'hashtags': ['hvac', 'tips']
}
]
markdown = scraper.format_markdown(items)
assert '# ID: ABC123' in markdown
assert '## Type: post' in markdown
assert '## Author: hvacknowitall' in markdown
assert '## Publish Date: 2024-01-01T12:00:00' in markdown
assert '## Link: https://www.instagram.com/p/ABC123/' in markdown
assert '## Likes: 150' in markdown
assert '## Comments: 25' in markdown
assert '## Hashtags: hvac, tips' in markdown
assert 'Test post' in markdown
@patch('src.instagram_scraper.InstagramScraper._login')
@patch('src.instagram_scraper.InstagramScraper._setup_loader')
def test_get_incremental_items(self, mock_setup, mock_login, config, mock_env):
mock_setup.return_value = MagicMock()
scraper = InstagramScraper(config)
items = [
{'id': 'post3', 'publish_date': '2024-01-03T12:00:00'},
{'id': 'post2', 'publish_date': '2024-01-02T12:00:00'},
{'id': 'post1', 'publish_date': '2024-01-01T12:00:00'}
]
# Test with no previous state
state = {}
new_items = scraper.get_incremental_items(items, state)
assert len(new_items) == 3
# Test with existing state
state = {'last_post_id': 'post2'}
new_items = scraper.get_incremental_items(items, state)
assert len(new_items) == 1
assert new_items[0]['id'] == 'post3'
@patch('src.instagram_scraper.InstagramScraper._login')
@patch('src.instagram_scraper.InstagramScraper._setup_loader')
def test_update_state(self, mock_setup, mock_login, config, mock_env):
mock_setup.return_value = MagicMock()
scraper = InstagramScraper(config)
state = {}
items = [
{'id': 'post2', 'publish_date': '2024-01-02T12:00:00', 'type': 'post'},
{'id': 'post1', 'publish_date': '2024-01-01T12:00:00', 'type': 'post'}
]
updated_state = scraper.update_state(state, items)
assert updated_state['last_post_id'] == 'post2'
assert updated_state['last_post_date'] == '2024-01-02T12:00:00'
assert updated_state['post_count'] == 2
@patch('src.instagram_scraper.InstagramScraper._setup_loader')
@patch('instaloader.Instaloader')
def test_error_handling(self, mock_instaloader_class, mock_setup, config, mock_env):
mock_loader = MagicMock()
mock_setup.return_value = mock_loader
mock_loader.login.side_effect = Exception("Login failed")
# Test that login error is handled gracefully
with patch('src.instagram_scraper.InstagramScraper._login'):
scraper = InstagramScraper(config)
scraper.loader = mock_loader
scraper._login() # Should not raise, just log error
# Test fetch error handling
posts = scraper.fetch_posts()
assert posts == []

tests/test_orchestrator.py (new file, 186 lines added)

@@ -0,0 +1,186 @@
import pytest
from unittest.mock import Mock, patch, MagicMock
from pathlib import Path
import json
from src.orchestrator import ScraperOrchestrator
from src.base_scraper import ScraperConfig
class TestScraperOrchestrator:
@pytest.fixture
def orchestrator(self):
return ScraperOrchestrator(
base_data_dir=Path("test_data"),
base_logs_dir=Path("test_logs"),
brand_name="test_brand",
timezone="America/Halifax"
)
@pytest.fixture
def mock_scrapers(self):
"""Create mock scrapers."""
mock_wordpress = MagicMock()
mock_wordpress.load_state.return_value = {}
mock_wordpress.fetch_content.return_value = [
{'id': '1', 'title': 'Post 1'},
{'id': '2', 'title': 'Post 2'}
]
mock_wordpress.get_incremental_items.return_value = [{'id': '2', 'title': 'Post 2'}]
mock_wordpress.format_markdown.return_value = "# Post 2"
mock_wordpress.generate_filename.return_value = "test_wordpress.md"
mock_wordpress.update_state.return_value = {'last_id': '2'}
mock_youtube = MagicMock()
mock_youtube.load_state.return_value = {}
mock_youtube.fetch_content.return_value = [
{'id': 'vid1', 'title': 'Video 1'}
]
mock_youtube.get_incremental_items.return_value = [{'id': 'vid1', 'title': 'Video 1'}]
mock_youtube.format_markdown.return_value = "# Video 1"
mock_youtube.generate_filename.return_value = "test_youtube.md"
mock_youtube.update_state.return_value = {'last_video_id': 'vid1'}
return [
("WordPress", mock_wordpress),
("YouTube", mock_youtube)
]
def test_initialization(self, orchestrator):
assert orchestrator.base_data_dir == Path("test_data")
assert orchestrator.base_logs_dir == Path("test_logs")
assert orchestrator.brand_name == "test_brand"
assert orchestrator.timezone == "America/Halifax"
@patch('src.orchestrator.InstagramScraper')
@patch('src.orchestrator.RSSScraperPodcast')
@patch('src.orchestrator.RSSScraperMailChimp')
@patch('src.orchestrator.WordPressScraper')
@patch('src.orchestrator.YouTubeScraper')
def test_initialize_scrapers(self, mock_youtube_class, mock_wordpress_class,
mock_mailchimp_class, mock_podcast_class, mock_instagram_class):
# Create a clean environment with only specific scrapers enabled
with patch.dict('os.environ', {
'WORDPRESS_API_URL': 'https://test.com/wp-json',
'YOUTUBE_CHANNEL_URL': 'https://youtube.com/@test',
# Clear other environment variables
'MAILCHIMP_RSS_URL': '',
'PODCAST_RSS_URL': '',
'INSTAGRAM_USERNAME': ''
}, clear=True):
orchestrator = ScraperOrchestrator()
# Should only have WordPress and YouTube scrapers
assert len(orchestrator.scrapers) == 2
names = [name for name, _ in orchestrator.scrapers]
assert 'WordPress' in names
assert 'YouTube' in names
def test_run_scraper_success(self, orchestrator, mock_scrapers):
orchestrator.scrapers = mock_scrapers
# Run first scraper
result = orchestrator._run_scraper(mock_scrapers[0])
assert result['name'] == 'WordPress'
assert result['status'] == 'success'
assert result['items_count'] == 2
assert result['new_items'] == 1
assert result['error'] is None
def test_run_scraper_error(self, orchestrator):
mock_scraper = MagicMock()
mock_scraper.load_state.side_effect = Exception("Test error")
result = orchestrator._run_scraper(("TestScraper", mock_scraper))
assert result['name'] == 'TestScraper'
assert result['status'] == 'error'
assert result['error'] == "Test error"
def test_run_sequential(self, orchestrator, mock_scrapers):
orchestrator.scrapers = mock_scrapers
results = orchestrator.run_sequential()
assert len(results) == 2
assert results[0]['name'] == 'WordPress'
assert results[1]['name'] == 'YouTube'
assert all(r['status'] == 'success' for r in results)
@patch('multiprocessing.Pool')
def test_run_parallel(self, mock_pool_class, orchestrator, mock_scrapers):
mock_pool = MagicMock()
mock_pool_class.return_value.__enter__.return_value = mock_pool
# Mock the map function to return results
mock_pool.map.return_value = [
{'name': 'WordPress', 'status': 'success', 'items_count': 2, 'new_items': 1,
'error': None, 'duration_seconds': 1.0},
{'name': 'YouTube', 'status': 'success', 'items_count': 1, 'new_items': 1,
'error': None, 'duration_seconds': 2.0}
]
orchestrator.scrapers = mock_scrapers
results = orchestrator.run_parallel(max_workers=2)
assert len(results) == 2
mock_pool_class.assert_called_once_with(processes=2)
mock_pool.map.assert_called_once()
def test_save_statistics(self, orchestrator, tmp_path):
orchestrator.stats_file = tmp_path / "stats.json"
results = [
{'name': 'WordPress', 'status': 'success', 'items_count': 2,
'new_items': 1, 'duration_seconds': 1.0, 'error': None},
{'name': 'YouTube', 'status': 'error', 'items_count': 0,
'new_items': 0, 'duration_seconds': 0.5, 'error': 'Connection failed'}
]
orchestrator.save_statistics(results)
# Check file was created
assert orchestrator.stats_file.exists()
# Load and verify stats
with open(orchestrator.stats_file, 'r') as f:
stats = json.load(f)
assert len(stats) == 1
assert stats[0]['total_scrapers'] == 2
assert stats[0]['successful'] == 1
assert stats[0]['failed'] == 1
assert stats[0]['total_items'] == 2
assert stats[0]['new_items'] == 1
def test_print_summary(self, orchestrator, capsys):
results = [
{'name': 'WordPress', 'status': 'success', 'items_count': 2,
'new_items': 1, 'duration_seconds': 1.0, 'error': None},
{'name': 'YouTube', 'status': 'error', 'items_count': 0,
'new_items': 0, 'duration_seconds': 0.5, 'error': 'Connection failed'}
]
orchestrator.print_summary(results)
captured = capsys.readouterr()
assert "SCRAPING SUMMARY" in captured.out
assert "✓ WordPress:" in captured.out
assert "✗ YouTube:" in captured.out
assert "Successful: 1/2" in captured.out
assert "Total items: 2" in captured.out
@patch('src.orchestrator.ScraperOrchestrator.run_parallel')
@patch('src.orchestrator.ScraperOrchestrator.save_statistics')
@patch('src.orchestrator.ScraperOrchestrator.print_summary')
def test_run_method(self, mock_print, mock_save, mock_parallel, orchestrator):
mock_parallel.return_value = [
{'name': 'Test', 'status': 'success', 'items_count': 1,
'new_items': 1, 'duration_seconds': 1.0, 'error': None}
]
orchestrator.scrapers = [("Test", MagicMock())]
orchestrator.run(parallel=True)
mock_parallel.assert_called_once_with(None)
mock_save.assert_called_once()
mock_print.assert_called_once()