#!/usr/bin/env python3
"""
Create an incremental Instagram markdown file while a backlog scrape is running.

This script does not read data out of the running scraper process. It checks
(via ``ps aux``) that a scraper appears to be running, then builds a separate
scraper instance pointed at its own session file, fetches a small sample of
recent posts, and writes them to a timestamped markdown file.
"""

import os
import subprocess
import sys
import time
from pathlib import Path
from datetime import datetime
from typing import Optional

import pytz
from dotenv import load_dotenv

# Add src to path
sys.path.insert(0, str(Path(__file__).parent / 'src'))

from base_scraper import ScraperConfig
from instagram_scraper import InstagramScraper


def _find_scraper_process(ps_output: str) -> Optional[str]:
    """Return the first ``ps`` line that looks like a running Instagram scraper.

    A line matches when it mentions both ``instagram_scraper`` and ``python``
    and does not mention ``grep``. All three checks are case-insensitive, so an
    interpreter reported with a capitalised name (e.g. ``Python``) is detected
    as well.

    Args:
        ps_output: Raw text output of ``ps aux``.

    Returns:
        The matching line with surrounding whitespace removed, or ``None`` when
        no line matches.
    """
    for line in ps_output.split('\n'):
        lowered = line.lower()
        if 'instagram_scraper' in lowered and 'python' in lowered and 'grep' not in lowered:
            return line.strip()
    return None


def create_incremental_output() -> None:
    """Write a markdown sample of recent Instagram posts alongside a running scrape.

    Steps:
      1. Load environment variables from ``.env``.
      2. Look for a running Instagram scraper in the ``ps aux`` output and
         return early (after printing a warning) when none is found.
      3. Build a separate ``InstagramScraper`` with its own session file and
         fetch at most 20 recent posts.
      4. Write the formatted markdown to
         ``data/markdown_current/hvacnkowitall_instagram_incremental_<timestamp>.md``.

    Any exception raised while scraping or writing is caught and reported on
    stdout; the function always returns ``None``.
    """
    print("=== INSTAGRAM INCREMENTAL OUTPUT ===")
    print("Safely creating incremental markdown without stopping running process")
    print()

    # Load environment
    load_dotenv()

    # Check if Instagram scraper is running
    ps_result = subprocess.run(["ps", "aux"], capture_output=True, text=True)
    running_line = _find_scraper_process(ps_result.stdout)
    if running_line is None:
        print("⚠️ No running Instagram scraper detected")
        print(" This script is designed to work with a running scraper process")
        return
    print(f"✓ Found running Instagram scraper: {running_line}")

    # Get Atlantic timezone timestamp
    tz = pytz.timezone('America/Halifax')
    now = datetime.now(tz)
    timestamp = now.strftime('%Y-%m-%dT%H%M%S')

    print(f"Creating incremental output at: {now.strftime('%Y-%m-%d %H:%M:%S %Z')}")
    print()

    # Setup config - use a distinct source name to avoid conflicts
    # NOTE(review): 'hvacnkowitall' may be a misspelling of the brand name; it is
    # left unchanged because the same spelling is used in the output filename
    # below - confirm before renaming.
    config = ScraperConfig(
        source_name='instagram_incremental',
        brand_name='hvacnkowitall',
        data_dir=Path('data'),
        logs_dir=Path('logs'),
        timezone='America/Halifax',
    )

    try:
        # Create a separate scraper instance with different session
        scraper = InstagramScraper(config)

        # Override session file to avoid conflicts with running process.
        # NOTE(review): this happens after construction; if InstagramScraper
        # already loads or creates its session inside __init__, the override
        # comes too late to isolate it - confirm against instagram_scraper.
        scraper.session_file = (
            scraper.session_file.parent / f'{scraper.username}_incremental.session'
        )

        print("Initializing separate Instagram connection for incremental output...")

        # Try to create incremental output with limited posts to avoid rate limiting conflicts
        print("Fetching recent posts for incremental output (max 20 to avoid conflicts)...")

        # Fetch a small number of recent posts
        items = scraper.fetch_content(max_posts=20)

        if items:
            # Format as markdown
            markdown_content = scraper.format_markdown(items)

            # Save with incremental naming
            output_file = (
                Path('data/markdown_current')
                / f'hvacnkowitall_instagram_incremental_{timestamp}.md'
            )
            output_file.parent.mkdir(parents=True, exist_ok=True)
            output_file.write_text(markdown_content, encoding='utf-8')

            print()
            print("=" * 60)
            print("INSTAGRAM INCREMENTAL OUTPUT CREATED")
            print("=" * 60)
            print(f"Posts captured: {len(items)}")
            print(f"Output file: {output_file}")
            print("=" * 60)
            print()
            print("NOTE: This is a sample of recent posts.")
            print("The main backlog process is still running and will create")
            print("a complete file with all 1000 posts when finished.")
        else:
            print("❌ No Instagram posts captured for incremental output")
            print(" This may be due to rate limiting or session conflicts")
            print(" The main backlog process should continue normally")

    except Exception as e:
        # Top-level boundary of a best-effort helper script: report the failure
        # and point the operator at the main process log instead of crashing.
        print(f"❌ Error creating incremental output: {e}")
        print()
        print("This is expected if the main Instagram process is using")
        print("all available API quota. The main process will continue")
        print("and create the complete output when finished.")
        print()
        print("To check progress of the main process:")
        print(" tail -f logs/instagram.log")


if __name__ == "__main__":
    create_incremental_output()