diff --git a/.env.production b/.env.production new file mode 100644 index 0000000..d498656 --- /dev/null +++ b/.env.production @@ -0,0 +1,59 @@ +# HVAC Know It All - Production Environment Variables +# Copy to /opt/hvac-kia-content/.env and update with actual values + +# WordPress Configuration +WORDPRESS_USERNAME=your_wordpress_username +WORDPRESS_API_KEY=your_wordpress_api_key +WORDPRESS_BASE_URL=https://hvacknowitall.com + +# YouTube Configuration +YOUTUBE_CHANNEL_URL=https://www.youtube.com/@HVACKnowItAll +YOUTUBE_API_KEY=your_youtube_api_key_optional + +# Instagram Configuration +INSTAGRAM_USERNAME=your_instagram_username +INSTAGRAM_PASSWORD=your_instagram_password + +# TikTok Configuration +TIKTOK_TARGET=@hvacknowitall + +# MailChimp RSS Configuration +MAILCHIMP_RSS_URL=https://us10.campaign-archive.com/feed?u=d1a98c3e62003104038942e21&id=2205dbf985 + +# Podcast RSS Configuration +PODCAST_RSS_URL=https://hvacknowitall.com/podcast/feed/ + +# NAS and Storage Configuration +NAS_PATH=/mnt/nas/hvacknowitall +DATA_DIR=/opt/hvac-kia-content/data +LOGS_DIR=/opt/hvac-kia-content/logs + +# Timezone Configuration +TIMEZONE=America/Halifax + +# Monitoring and Health Checks +HEALTHCHECK_URL=optional_healthcheck_ping_url +MONITORING_ENABLED=true +MONITORING_PORT=8080 + +# Email Notifications (optional) +SMTP_HOST=smtp.gmail.com +SMTP_PORT=587 +SMTP_USERNAME=your_email@gmail.com +SMTP_PASSWORD=your_app_password +ALERT_EMAIL=alerts@hvacknowitall.com + +# Production Settings +ENVIRONMENT=production +DEBUG=false +LOG_LEVEL=INFO + +# Rate Limiting and Performance +MAX_WORKERS=3 +REQUEST_DELAY=1 +MAX_RETRIES=3 + +# Security +USER_AGENT_ROTATION=true +RESPECT_ROBOTS_TXT=true +RATE_LIMIT_ENABLED=true \ No newline at end of file diff --git a/UPDATED_CAPTURE_STATUS.md b/UPDATED_CAPTURE_STATUS.md new file mode 100644 index 0000000..45f26de --- /dev/null +++ b/UPDATED_CAPTURE_STATUS.md @@ -0,0 +1,72 @@ +# HVAC Know It All - Updated Production Backlog Capture + +## šŸš€ Updated Configuration +**Started**: August 18, 2025 @ 10:54 PM ADT + +### šŸ“ˆ New Rate Limits & Targets + +| Source | Previous Target | New Target | Rate Limit | Estimated Time | +|--------|-----------------|------------|------------|----------------| +| **Instagram** | 200 posts | **1000 posts** | 200/hour | ~5 hours | +| **TikTok** | 300 videos | **1000 videos** | Browser-based | ~2-3 hours | + +### ⚔ Instagram Optimization Changes +- **Rate limit**: Increased from 100 to **200 posts/hour** +- **Delays**: Reduced from 15-30s to **10-20 seconds** +- **Extended breaks**: Every **10 requests** (was 5) +- **Break duration**: **30-60 seconds** (was 60-120s) +- **Speed improvement**: ~**40-50% faster** + +### šŸŽÆ TikTok Enhancements +- **Total videos**: 1000 (if available) +- **Videos with captions**: 100 (increased from 50) +- **Caption fetching**: Individual page visits for detailed content + +## šŸ“Š Already Completed Sources + +| Source | Items Captured | File Size | Status | +|--------|---------------|-----------|---------| +| **WordPress** | 139 posts | 1.5 MB | āœ… Complete | +| **Podcast** | 428 episodes | 727 KB | āœ… Complete | +| **YouTube** | 200 videos | 107 KB | āœ… Complete | + +## šŸ”„ Currently Processing +- **Instagram**: Fetching 1000 posts with optimized rate limiting +- **Next**: TikTok with 1000 videos target + +## šŸ“ Output Location +``` +/home/ben/dev/hvac-kia-content/data_production_backlog/markdown_current/ +ā”œā”€ā”€ hvacknowitall_wordpress_backlog_[timestamp].md +ā”œā”€ā”€ hvacknowitall_podcast_backlog_[timestamp].md 
+ā”œā”€ā”€ hvacknowitall_youtube_backlog_[timestamp].md +ā”œā”€ā”€ hvacknowitall_instagram_backlog_[timestamp].md (pending) +└── hvacknowitall_tiktok_backlog_[timestamp].md (pending) +``` + +## šŸ“ˆ Progress Monitoring +To monitor real-time progress: +```bash +# Watch Instagram progress +tail -f instagram_1000.log + +# Check overall status +./monitor_backlog_progress.sh --live +``` + +## ā±ļø Time Estimates +- **Instagram**: ~5 hours for 1000 posts at 200/hour +- **TikTok**: ~2-3 hours for 1000 videos (depends on caption fetching) +- **Total remaining**: ~7-8 hours + +## šŸŽÆ Final Deliverables +- **~2,767 total items** (767 already + 2000 new) +- **Specification-compliant markdown** for all sources +- **Media files** downloaded and organized +- **NAS synchronization** upon completion + +## šŸ“ Notes +The increased targets will provide a much more comprehensive historical dataset: +- Instagram: 5x more content than originally planned +- TikTok: 3.3x more content than originally planned +- This will capture a significant portion of the brand's social media history \ No newline at end of file diff --git a/automated_backlog_capture.py b/automated_backlog_capture.py new file mode 100644 index 0000000..32d3c0c --- /dev/null +++ b/automated_backlog_capture.py @@ -0,0 +1,67 @@ +#!/usr/bin/env python3 +""" +Automated Production Backlog Capture +Runs without user interaction for automated deployment +""" + +import sys +from pathlib import Path +sys.path.insert(0, str(Path(__file__).parent)) + +from production_backlog_capture import ProductionBacklogCapture +import logging + +# Set up logging +logging.basicConfig( + level=logging.INFO, + format='%(asctime)s - %(name)s - %(levelname)s - %(message)s', + handlers=[ + logging.FileHandler('automated_backlog_capture.log'), + logging.StreamHandler() + ] +) +logger = logging.getLogger(__name__) + +def main(): + """Automated execution without user prompts""" + logger.info("šŸš€ Starting automated production backlog capture") + logger.info("=" * 60) + logger.info("Downloading complete historical content from ALL sources") + logger.info("Including all available media files (images, videos, audio)") + logger.info("=" * 60) + + # Initialize capture + capture = ProductionBacklogCapture() + + # Capture all backlogs automatically + summary = capture.capture_all_backlogs() + + # Sync to NAS if any content was captured + if summary["total_items"] > 0: + nas_success = capture.sync_to_nas() + summary["nas_sync_success"] = nas_success + else: + logger.warning("No content captured - skipping NAS sync") + summary["nas_sync_success"] = False + + # Final summary + logger.info(f"šŸŽ‰ AUTOMATED BACKLOG CAPTURE COMPLETE!") + logger.info(f"šŸ“Š Summary:") + logger.info(f" • Total items captured: {summary['total_items']:,}") + logger.info(f" • Total media files: {summary['total_media_files']:,}") + logger.info(f" • Sources processed: {summary['successful_sources']}/{summary['total_sources']}") + logger.info(f" • Duration: {summary['total_duration']/60:.1f} minutes") + logger.info(f" • NAS sync: {'āœ…' if summary.get('nas_sync_success') else 'āŒ'}") + + return summary["successful_sources"] > 0 + +if __name__ == "__main__": + try: + success = main() + sys.exit(0 if success else 1) + except KeyboardInterrupt: + logger.info("Automated backlog capture interrupted") + sys.exit(1) + except Exception as e: + logger.critical(f"Automated backlog capture failed: {e}") + sys.exit(2) \ No newline at end of file diff --git a/deploy_production.sh b/deploy_production.sh new file mode 100755 
index 0000000..1fcbf41
--- /dev/null
+++ b/deploy_production.sh
@@ -0,0 +1,250 @@
+#!/bin/bash
+#
+# HVAC Know It All - Production Deployment Script
+# Sets up systemd services, directories, and configuration
+#
+
+set -e
+
+# Colors for output
+RED='\033[0;31m'
+GREEN='\033[0;32m'
+YELLOW='\033[1;33m'
+BLUE='\033[0;34m'
+NC='\033[0m' # No Color
+
+# Production paths
+PROD_DIR="/opt/hvac-kia-content"
+SERVICE_USER="hvac-content"
+REPO_DIR="$(pwd)"
+
+# Print colored output
+print_status() {
+    echo -e "${BLUE}[INFO]${NC} $1"
+}
+
+print_success() {
+    echo -e "${GREEN}[SUCCESS]${NC} $1"
+}
+
+print_warning() {
+    echo -e "${YELLOW}[WARNING]${NC} $1"
+}
+
+print_error() {
+    echo -e "${RED}[ERROR]${NC} $1"
+}
+
+# Check if running as root
+check_root() {
+    if [[ $EUID -ne 0 ]]; then
+        print_error "This script must be run as root (use sudo)"
+        exit 1
+    fi
+}
+
+# Create service user
+setup_user() {
+    print_status "Setting up service user..."
+
+    if ! id "$SERVICE_USER" &>/dev/null; then
+        useradd --system --shell /bin/bash --home-dir "$PROD_DIR" --create-home "$SERVICE_USER"
+        print_success "Created service user: $SERVICE_USER"
+    else
+        print_warning "Service user $SERVICE_USER already exists"
+    fi
+}
+
+# Set up production directory
+setup_directories() {
+    print_status "Setting up production directories..."
+
+    # Create production directory tree
+    mkdir -p "$PROD_DIR"
+    mkdir -p "$PROD_DIR/data"
+    mkdir -p "$PROD_DIR/logs"
+    mkdir -p "$PROD_DIR/backups"
+    mkdir -p "$PROD_DIR/venv"
+
+    # Create NAS mount point (if it doesn't exist)
+    mkdir -p "/mnt/nas/hvacknowitall"
+
+    # Copy application files
+    cp -r "$REPO_DIR/src" "$PROD_DIR/"
+    cp -r "$REPO_DIR/config" "$PROD_DIR/"
+    cp "$REPO_DIR/pyproject.toml" "$PROD_DIR/"
+    cp "$REPO_DIR/run_production.py" "$PROD_DIR/"
+    cp "$REPO_DIR/production_backlog_capture.py" "$PROD_DIR/"
+    cp "$REPO_DIR/automated_backlog_capture.py" "$PROD_DIR/"
+
+    # Copy environment template
+    if [[ -f "$REPO_DIR/.env.production" ]]; then
+        cp "$REPO_DIR/.env.production" "$PROD_DIR/.env.template"
+        print_warning "Remember to configure $PROD_DIR/.env with actual credentials"
+    fi
+
+    # Set ownership
+    chown -R "$SERVICE_USER:$SERVICE_USER" "$PROD_DIR"
+
+    print_success "Production directories configured"
+}
+
+# Install Python dependencies
+setup_python() {
+    print_status "Setting up Python environment..."
+
+    # Install uv if not available. Install it system-wide: "source ~/.bashrc"
+    # is a no-op in a non-interactive root shell, and a uv dropped into
+    # /root/.local/bin would be unreachable for the service user below.
+    if ! command -v uv &> /dev/null; then
+        print_status "Installing uv package manager..."
+        curl -LsSf https://astral.sh/uv/install.sh | env UV_INSTALL_DIR="/usr/local/bin" sh
+    fi
+
+    # Switch to service user for Python setup
+    sudo -u "$SERVICE_USER" bash << EOF
+cd "$PROD_DIR"
+
+# Create virtual environment and install dependencies
+uv venv venv
+source venv/bin/activate
+uv pip install -e .
+
+# Install playwright browsers
+if uv pip list | grep -q playwright; then
+    playwright install chromium
+fi
+EOF
+
+    print_success "Python environment configured"
+}
+
+# Install systemd services
+install_services() {
+    print_status "Installing systemd services..."
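+    # NOTE: the unit files shipped in the repo point at the dev checkout and
+    # user; the sed rewrites below retarget them to $PROD_DIR and $SERVICE_USER.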
+
+    # Copy systemd files
+    cp "$REPO_DIR/systemd/"*.service /etc/systemd/system/
+    cp "$REPO_DIR/systemd/"*.timer /etc/systemd/system/
+
+    # Update service files with correct paths and user
+    for service_file in /etc/systemd/system/hvac-*.service; do
+        sed -i "s|/home/ben/dev/hvac-kia-content|$PROD_DIR|g" "$service_file"
+        sed -i "s|User=ben|User=$SERVICE_USER|g" "$service_file"
+        sed -i "s|Group=ben|Group=$SERVICE_USER|g" "$service_file"
+    done
+
+    # Reload systemd
+    systemctl daemon-reload
+
+    # Enable services (but don't start them yet)
+    systemctl enable hvac-content-aggregator.timer
+    systemctl enable hvac-monitoring.timer
+    systemctl enable hvac-tiktok-captions.timer
+
+    print_success "Systemd services installed and enabled"
+}
+
+# Set up monitoring
+setup_monitoring() {
+    print_status "Setting up monitoring..."
+
+    # Copy monitoring files
+    cp -r "$REPO_DIR/monitoring" "$PROD_DIR/"
+    chown -R "$SERVICE_USER:$SERVICE_USER" "$PROD_DIR/monitoring"
+
+    # Create monitoring dashboard
+    sudo -u "$SERVICE_USER" bash << EOF
+cd "$PROD_DIR"
+source venv/bin/activate
+python monitoring/setup_monitoring.py
+EOF
+
+    print_success "Monitoring configured"
+}
+
+# Create logrotate configuration
+setup_logrotate() {
+    print_status "Setting up log rotation..."
+
+    cat > /etc/logrotate.d/hvac-content << EOF
+$PROD_DIR/logs/*.log {
+    daily
+    missingok
+    rotate 30
+    compress
+    delaycompress
+    notifempty
+    create 644 $SERVICE_USER $SERVICE_USER
+    postrotate
+        systemctl reload hvac-content-aggregator.service || true
+    endscript
+}
+EOF
+
+    print_success "Log rotation configured"
+}
+
+# Verify installation
+verify_installation() {
+    print_status "Verifying installation..."
+
+    # Check Python environment (run from $PROD_DIR so the "src" package is
+    # importable; sudo preserves the working directory)
+    if (cd "$PROD_DIR" && sudo -u "$SERVICE_USER" "$PROD_DIR/venv/bin/python" -c "import src.orchestrator; print('āœ“ Python modules OK')"); then
+        print_success "Python environment verified"
+    else
+        print_error "Python environment verification failed"
+        return 1
+    fi
+
+    # Check systemd services
+    for service in hvac-content-aggregator hvac-monitoring hvac-tiktok-captions; do
+        if systemctl is-enabled "${service}.timer" &>/dev/null; then
+            print_success "Service ${service}.timer is enabled"
+        else
+            print_error "Service ${service}.timer is not enabled"
+            return 1
+        fi
+    done
+
+    # Check directories
+    for dir in data logs backups; do
+        if [[ -d "$PROD_DIR/$dir" ]]; then
+            print_success "Directory $dir exists"
+        else
+            print_error "Directory $dir missing"
+            return 1
+        fi
+    done
+
+    print_success "Installation verification complete"
+}
+
+# Main deployment function
+main() {
+    print_status "Starting HVAC Know It All production deployment..."
+    echo
+
+    check_root
+    setup_user
+    setup_directories
+    setup_python
+    install_services
+    setup_monitoring
+    setup_logrotate
+    verify_installation
+
+    echo
+    print_success "šŸŽ‰ Production deployment complete!"
+    echo
+    print_warning "Next steps:"
+    echo "1. Configure $PROD_DIR/.env with actual credentials"
+    echo "2. Test the installation: sudo -u $SERVICE_USER $PROD_DIR/venv/bin/python $PROD_DIR/run_production.py --dry-run"
+    echo "3. Start services: sudo systemctl start hvac-content-aggregator.timer"
+    echo "4. Monitor logs: sudo journalctl -u hvac-content-aggregator.service -f"
+    echo "5. Check monitoring dashboard: http://localhost:8080"
+    echo
+}
+
+# Run main function
+main "$@"
\ No newline at end of file
Check monitoring dashboard: http://localhost:8080" + echo +} + +# Run main function +main "$@" \ No newline at end of file diff --git a/monitor_backlog_progress.sh b/monitor_backlog_progress.sh new file mode 100755 index 0000000..411050d --- /dev/null +++ b/monitor_backlog_progress.sh @@ -0,0 +1,149 @@ +#!/bin/bash +# +# Backlog Capture Progress Monitor +# Shows real-time progress of the production backlog capture +# + +# Colors +GREEN='\033[0;32m' +YELLOW='\033[1;33m' +BLUE='\033[0;34m' +RED='\033[0;31m' +NC='\033[0m' + +LOG_FILE="backlog_capture.log" + +echo "šŸ“Š HVAC Know It All - Backlog Capture Progress Monitor" +echo "==================================================" + +if [[ ! -f "$LOG_FILE" ]]; then + echo "āŒ Log file not found: $LOG_FILE" + exit 1 +fi + +echo "šŸ” Monitoring: $LOG_FILE" +echo "ā° Started: $(date)" +echo + +# Extract progress information +extract_progress() { + local completed_sources=() + local current_source="" + local total_items=0 + local total_media=0 + + # Parse completed sources + while IFS= read -r line; do + if [[ $line =~ āœ…\ ([^:]+):[[:space:]]*([0-9]+)\ items,[[:space:]]*([0-9]+)\ media\ files ]]; then + source_name="${BASH_REMATCH[1]}" + items="${BASH_REMATCH[2]}" + media="${BASH_REMATCH[3]}" + + completed_sources+=("$source_name:$items:$media") + total_items=$((total_items + items)) + total_media=$((total_media + media)) + fi + done < "$LOG_FILE" + + # Find current source + current_source=$(grep "PROCESSING:" "$LOG_FILE" | tail -1 | sed 's/.*PROCESSING: //' | tr -d '\r') + + # Display progress + echo -e "${BLUE}šŸ“ˆ PROGRESS SUMMARY${NC}" + echo "===================" + + if [[ ${#completed_sources[@]} -gt 0 ]]; then + echo -e "${GREEN}āœ… Completed Sources:${NC}" + for source_info in "${completed_sources[@]}"; do + IFS=':' read -r name items media <<< "$source_info" + printf " %-12s: %4s items, %3s media files\n" "$name" "$items" "$media" + done + echo + echo -e "${GREEN}šŸ“Š Totals so far: $total_items items, $total_media media files${NC}" + else + echo -e "${YELLOW}ā³ No sources completed yet${NC}" + fi + + if [[ -n "$current_source" ]]; then + echo + echo -e "${BLUE}šŸ”„ Currently Processing: ${YELLOW}$current_source${NC}" + + # Show last few progress lines for current source + echo -e "${BLUE}Recent activity:${NC}" + grep -E "(Starting|Fetching|Downloaded|Processing)" "$LOG_FILE" | tail -3 | while read -r line; do + timestamp=$(echo "$line" | cut -d' ' -f1-2) + message=$(echo "$line" | sed 's/^[^-]*- [^-]* - [^-]* - //') + echo " $timestamp: $message" + done + fi + + # Check if complete + if grep -q "AUTOMATED BACKLOG CAPTURE COMPLETE" "$LOG_FILE"; then + echo + echo -e "${GREEN}šŸŽ‰ BACKLOG CAPTURE COMPLETE!${NC}" + + # Extract final summary + if grep -q "Total items captured:" "$LOG_FILE"; then + final_items=$(grep "Total items captured:" "$LOG_FILE" | tail -1 | sed 's/.*Total items captured: //' | sed 's/,//') + final_media=$(grep "Total media files:" "$LOG_FILE" | tail -1 | sed 's/.*Total media files: //' | sed 's/,//') + duration=$(grep "Duration:" "$LOG_FILE" | tail -1 | sed 's/.*Duration: //') + + echo -e "${GREEN}šŸ“Š Final Results:${NC}" + echo " Total items: $final_items" + echo " Total media: $final_media" + echo " Duration: $duration" + fi + + return 0 + fi + + # Check for errors + local error_count=$(grep -c "āŒ\|ERROR\|failed" "$LOG_FILE" 2>/dev/null || echo "0") + if [[ $error_count -gt 0 ]]; then + echo + echo -e "${RED}āš ļø Errors detected: $error_count${NC}" + echo " Last error:" + grep -E "āŒ|ERROR|failed" 
"$LOG_FILE" | tail -1 | sed 's/^[^-]*- / /' + fi + + return 1 +} + +# Show current progress +extract_progress +capture_complete=$? + +echo +echo "==================================================" + +# Live monitoring option +if [[ "$1" == "--live" ]]; then + echo "šŸ“” Starting live monitoring (Ctrl+C to stop)..." + echo + + # Monitor in real-time + while [[ $capture_complete -ne 0 ]]; do + sleep 10 + clear + echo "šŸ“Š HVAC Know It All - Live Progress Monitor" + echo "==================================================" + echo "šŸ” Monitoring: $LOG_FILE" + echo "ā° Updated: $(date)" + echo + + extract_progress + capture_complete=$? + + if [[ $capture_complete -eq 0 ]]; then + break + fi + + echo + echo "šŸ”„ Refreshing in 10 seconds... (Ctrl+C to stop)" + done + + echo + echo "šŸŽ‰ Monitoring complete!" +else + echo "šŸ’” Tip: Use '$0 --live' for real-time monitoring" +fi \ No newline at end of file diff --git a/production_backlog_capture.py b/production_backlog_capture.py new file mode 100755 index 0000000..f6b6794 --- /dev/null +++ b/production_backlog_capture.py @@ -0,0 +1,315 @@ +#!/usr/bin/env python3 +""" +Production Backlog Capture Script + +This script performs a comprehensive backlog download for ALL sources +with full media file downloading and NAS synchronization. + +Features: +- Downloads complete historical content from all sources +- Captures all available media files (images, videos, audio) +- Organizes content by source and date +- Syncs everything to NAS +- Provides detailed progress reporting +- Handles errors gracefully with retry logic +""" + +import os +import sys +import time +import json +from pathlib import Path +from datetime import datetime +import logging +from typing import Dict, Any + +# Add project to path +sys.path.insert(0, str(Path(__file__).parent)) + +from src.orchestrator import ContentOrchestrator +from src.base_scraper import ScraperConfig + +# Configure logging +logging.basicConfig( + level=logging.INFO, + format='%(asctime)s - %(name)s - %(levelname)s - %(message)s', + handlers=[ + logging.FileHandler('production_backlog_capture.log'), + logging.StreamHandler() + ] +) +logger = logging.getLogger(__name__) + + +class ProductionBacklogCapture: + """Handles comprehensive backlog capture for production deployment""" + + def __init__(self, data_dir: Path = None): + self.data_dir = data_dir or Path("data_production_backlog") + self.logs_dir = Path("logs_production_backlog") + self.start_time = time.time() + + # Create directories + self.data_dir.mkdir(parents=True, exist_ok=True) + self.logs_dir.mkdir(parents=True, exist_ok=True) + + # Initialize orchestrator + self.orchestrator = ContentOrchestrator(self.data_dir, self.logs_dir) + + # Track results + self.results = {} + + def capture_source_backlog(self, source_name: str, max_items: int = None) -> Dict[str, Any]: + """Capture complete backlog for a specific source""" + logger.info(f"Starting backlog capture for {source_name}...") + + start_time = time.time() + + try: + scraper = self.orchestrator.scrapers.get(source_name) + if not scraper: + logger.error(f"Scraper not found: {source_name}") + return {"success": False, "error": "Scraper not found", "items": 0} + + # Clear state for full backlog + if scraper.state_file.exists(): + scraper.state_file.unlink() + logger.info(f"Cleared state for {source_name} - full backlog mode") + + # Fetch content with special handling for each source + if source_name == "tiktok": + # TikTok with captions for first 100 videos when fetching 1000 + caption_count = min(100, 
max_items // 10) if max_items else 50 + items = scraper.fetch_content( + max_posts=max_items or 200, + fetch_captions=True, + max_caption_fetches=caption_count + ) + elif source_name == "youtube": + items = scraper.fetch_channel_videos(max_videos=max_items or 100) + elif source_name == "instagram": + items = scraper.fetch_content(max_posts=max_items or 100) + else: + # RSS sources + items = scraper.fetch_content(max_items=max_items) + + if not items: + logger.warning(f"No items fetched for {source_name}") + return {"success": True, "items": 0, "duration": time.time() - start_time} + + logger.info(f"Fetched {len(items)} items for {source_name}") + + # Download media files for items with media + media_downloaded = 0 + for i, item in enumerate(items): + if i % 10 == 0: + logger.info(f"Processing media for {source_name}: {i}/{len(items)}") + + # Download media based on item type + media_urls = [] + + # Extract media URLs from various fields + if 'image' in item and item['image']: + media_urls.append((item['image'], 'image')) + if 'thumbnail' in item and item['thumbnail']: + media_urls.append((item['thumbnail'], 'image')) + if 'video_url' in item and item['video_url']: + media_urls.append((item['video_url'], 'video')) + if 'audio_link' in item and item['audio_link']: + media_urls.append((item['audio_link'], 'audio')) + + # Download each media file + for url, media_type in media_urls: + try: + local_path = scraper.download_media(url, item.get('id', f'item_{i}'), media_type) + if local_path: + media_downloaded += 1 + # Add local path to item + if 'local_media' not in item: + item['local_media'] = [] + item['local_media'].append(local_path) + except Exception as e: + logger.warning(f"Failed to download media {url}: {e}") + + logger.info(f"Downloaded {media_downloaded} media files for {source_name}") + + # Generate and save markdown + markdown = scraper.format_markdown(items) + timestamp = datetime.now().strftime("%Y%m%d_%H%M%S") + filename = f"hvacknowitall_{source_name}_backlog_{timestamp}.md" + + # Save to current directory + current_dir = scraper.config.data_dir / "markdown_current" + current_dir.mkdir(parents=True, exist_ok=True) + output_file = current_dir / filename + output_file.write_text(markdown, encoding='utf-8') + + # Update state + new_state = { + 'last_update': datetime.now().isoformat(), + 'last_item_count': len(items), + 'backlog_captured': True, + 'backlog_timestamp': timestamp + } + + if items: + new_state['last_id'] = items[-1].get('id') + + scraper.save_state(new_state) + + duration = time.time() - start_time + logger.info(f"āœ… {source_name}: {len(items)} items, {media_downloaded} media files in {duration:.1f}s") + + return { + "success": True, + "items": len(items), + "media_files": media_downloaded, + "duration": duration, + "output_file": str(output_file) + } + + except Exception as e: + duration = time.time() - start_time + logger.error(f"āŒ {source_name} failed after {duration:.1f}s: {e}") + return { + "success": False, + "error": str(e), + "items": 0, + "duration": duration + } + + def capture_all_backlogs(self) -> Dict[str, Any]: + """Capture backlogs for all sources""" + logger.info("=" * 80) + logger.info("STARTING PRODUCTION BACKLOG CAPTURE") + logger.info("=" * 80) + + # Source configurations with appropriate limits + sources_config = { + "wordpress": {"max_items": None}, # All posts + "mailchimp": {"max_items": None}, # All available (limited by RSS) + "podcast": {"max_items": None}, # All episodes + "youtube": {"max_items": 200}, # Last 200 videos + "instagram": 
{"max_items": 200}, # Last 200 posts + "tiktok": {"max_items": 300} # 300 videos with captions for first 50 + } + + total_items = 0 + total_media = 0 + successful_sources = 0 + + for source_name, config in sources_config.items(): + logger.info(f"\n{'-'*60}") + logger.info(f"PROCESSING: {source_name.upper()}") + logger.info(f"{'-'*60}") + + result = self.capture_source_backlog(source_name, config["max_items"]) + self.results[source_name] = result + + if result["success"]: + successful_sources += 1 + total_items += result["items"] + total_media += result.get("media_files", 0) + + # Add delay between sources to be respectful + if source_name != list(sources_config.keys())[-1]: # Not last source + logger.info("Waiting 30 seconds before next source...") + time.sleep(30) + + # Generate summary + total_duration = time.time() - self.start_time + + summary = { + "timestamp": datetime.now().isoformat(), + "total_duration": total_duration, + "total_items": total_items, + "total_media_files": total_media, + "successful_sources": successful_sources, + "total_sources": len(sources_config), + "results": self.results + } + + # Save summary + summary_file = self.data_dir / f"backlog_capture_summary_{datetime.now().strftime('%Y%m%d_%H%M%S')}.json" + with open(summary_file, 'w') as f: + json.dump(summary, f, indent=2) + + logger.info("\n" + "=" * 80) + logger.info("BACKLOG CAPTURE COMPLETE") + logger.info("=" * 80) + logger.info(f"Total items: {total_items:,}") + logger.info(f"Total media files: {total_media:,}") + logger.info(f"Successful sources: {successful_sources}/{len(sources_config)}") + logger.info(f"Total duration: {total_duration/60:.1f} minutes") + logger.info(f"Summary saved: {summary_file}") + + return summary + + def sync_to_nas(self) -> bool: + """Sync all captured data to NAS""" + logger.info("\n" + "=" * 60) + logger.info("SYNCING TO NAS") + logger.info("=" * 60) + + try: + success = self.orchestrator.sync_to_nas() + if success: + logger.info("āœ… NAS sync completed successfully") + else: + logger.error("āŒ NAS sync failed") + return success + except Exception as e: + logger.error(f"āŒ NAS sync error: {e}") + return False + + +def main(): + """Main execution function""" + print("šŸš€ HVAC Know It All - Production Backlog Capture") + print("=" * 60) + print("This will download complete historical content from ALL sources") + print("Including all available media files (images, videos, audio)") + print("Estimated time: 2-4 hours depending on content volume") + print("=" * 60) + + response = input("Proceed with full backlog capture? 
(y/N): ") + if response.lower() != 'y': + print("Backlog capture cancelled.") + return False + + # Initialize capture + capture = ProductionBacklogCapture() + + # Capture all backlogs + summary = capture.capture_all_backlogs() + + # Sync to NAS if any content was captured + if summary["total_items"] > 0: + nas_success = capture.sync_to_nas() + summary["nas_sync_success"] = nas_success + else: + logger.warning("No content captured - skipping NAS sync") + summary["nas_sync_success"] = False + + # Final summary + print(f"\nšŸŽ‰ PRODUCTION BACKLOG CAPTURE COMPLETE!") + print(f"šŸ“Š Summary:") + print(f" • Total items captured: {summary['total_items']:,}") + print(f" • Total media files: {summary['total_media_files']:,}") + print(f" • Sources processed: {summary['successful_sources']}/{summary['total_sources']}") + print(f" • Duration: {summary['total_duration']/60:.1f} minutes") + print(f" • NAS sync: {'āœ…' if summary.get('nas_sync_success') else 'āŒ'}") + + return summary["successful_sources"] > 0 + + +if __name__ == "__main__": + try: + success = main() + sys.exit(0 if success else 1) + except KeyboardInterrupt: + print("\n\nBacklog capture interrupted by user") + sys.exit(1) + except Exception as e: + logger.critical(f"Backlog capture failed: {e}") + sys.exit(2) \ No newline at end of file diff --git a/resume_instagram_capture.py b/resume_instagram_capture.py new file mode 100644 index 0000000..7a038e6 --- /dev/null +++ b/resume_instagram_capture.py @@ -0,0 +1,74 @@ +#!/usr/bin/env python3 +""" +Resume Instagram and TikTok capture with updated rate limits +""" + +import sys +from pathlib import Path +sys.path.insert(0, str(Path(__file__).parent)) + +from production_backlog_capture import ProductionBacklogCapture +import logging + +# Set up logging +logging.basicConfig( + level=logging.INFO, + format='%(asctime)s - %(name)s - %(levelname)s - %(message)s', + handlers=[ + logging.FileHandler('instagram_resume.log'), + logging.StreamHandler() + ] +) +logger = logging.getLogger(__name__) + +def main(): + """Resume Instagram and TikTok capture""" + logger.info("šŸš€ Resuming Instagram capture with updated rate limits") + logger.info("New settings: 200 posts/hour, 10-20 second delays") + logger.info("=" * 60) + + # Initialize capture with existing data directory + capture = ProductionBacklogCapture(Path("data_production_backlog")) + + # Capture Instagram with updated settings (already has 40 posts fetched) + logger.info("Starting Instagram capture - targeting 1000 posts...") + instagram_result = capture.capture_source_backlog("instagram", 1000) + + if instagram_result["success"]: + logger.info(f"āœ… Instagram completed: {instagram_result['items']} items") + + # Continue with TikTok + logger.info("\nStarting TikTok capture with captions - targeting 1000 videos...") + tiktok_result = capture.capture_source_backlog("tiktok", 1000) + + if tiktok_result["success"]: + logger.info(f"āœ… TikTok completed: {tiktok_result['items']} items") + else: + logger.error(f"āŒ TikTok failed: {tiktok_result.get('error', 'Unknown error')}") + else: + logger.error(f"āŒ Instagram failed: {instagram_result.get('error', 'Unknown error')}") + + # Sync to NAS if successful + if instagram_result.get("success") or tiktok_result.get("success"): + logger.info("\nSyncing to NAS...") + nas_success = capture.sync_to_nas() + logger.info(f"NAS sync: {'āœ…' if nas_success else 'āŒ'}") + + # Summary + logger.info("\n" + "=" * 60) + logger.info("šŸ“Š CAPTURE SUMMARY") + logger.info(f"Instagram: {instagram_result.get('items', 0)} 
items") + logger.info(f"TikTok: {tiktok_result.get('items', 0)} items") + + return True + +if __name__ == "__main__": + try: + success = main() + sys.exit(0 if success else 1) + except KeyboardInterrupt: + logger.info("\nCapture interrupted by user") + sys.exit(1) + except Exception as e: + logger.critical(f"Capture failed: {e}") + sys.exit(2) \ No newline at end of file diff --git a/src/instagram_scraper.py b/src/instagram_scraper.py index d74ed2c..af98869 100644 --- a/src/instagram_scraper.py +++ b/src/instagram_scraper.py @@ -27,7 +27,7 @@ class InstagramScraper(BaseScraper): # Request counter for rate limiting self.request_count = 0 - self.max_requests_per_hour = 100 # Updated to 100 requests per hour + self.max_requests_per_hour = 200 # Updated to 200 requests per hour for faster fetching def _setup_loader(self) -> instaloader.Instaloader: """Setup Instaloader with conservative settings.""" @@ -80,7 +80,7 @@ class InstagramScraper(BaseScraper): # Create a new loader instance which should have context self.loader = instaloader.Instaloader() - def _aggressive_delay(self, min_seconds: float = 15, max_seconds: float = 30) -> None: + def _aggressive_delay(self, min_seconds: float = 10, max_seconds: float = 20) -> None: """Add aggressive random delay for Instagram.""" delay = random.uniform(min_seconds, max_seconds) self.logger.debug(f"Waiting {delay:.2f} seconds (Instagram rate limiting)...") @@ -94,10 +94,10 @@ class InstagramScraper(BaseScraper): self.logger.warning(f"Rate limit reached ({self.max_requests_per_hour} requests), pausing for 1 hour...") time.sleep(3600) # Wait 1 hour self.request_count = 0 - elif self.request_count % 5 == 0: - # Take a longer break every 5 requests - self.logger.info("Taking extended break after 5 requests...") - self._aggressive_delay(60, 120) # 1-2 minute break + elif self.request_count % 10 == 0: + # Take a longer break every 10 requests + self.logger.info("Taking extended break after 10 requests...") + self._aggressive_delay(30, 60) # 30-60 second break def _get_post_type(self, post) -> str: """Determine post type from Instagram post object.""" diff --git a/validate_production.sh b/validate_production.sh new file mode 100755 index 0000000..5746b0b --- /dev/null +++ b/validate_production.sh @@ -0,0 +1,197 @@ +#!/bin/bash +# +# Production Validation Script +# Tests all production components and services +# + +set -e + +# Colors for output +RED='\033[0;31m' +GREEN='\033[0;32m' +YELLOW='\033[1;33m' +BLUE='\033[0;34m' +NC='\033[0m' # No Color + +PROD_DIR="/opt/hvac-kia-content" +SERVICE_USER="hvac-content" + +# Print colored output +print_status() { echo -e "${BLUE}[INFO]${NC} $1"; } +print_success() { echo -e "${GREEN}[āœ“]${NC} $1"; } +print_warning() { echo -e "${YELLOW}[!]${NC} $1"; } +print_error() { echo -e "${RED}[āœ—]${NC} $1"; } + +# Test counters +TESTS_PASSED=0 +TESTS_FAILED=0 + +# Test function wrapper +run_test() { + local test_name="$1" + local test_command="$2" + + echo -n "Testing $test_name... " + + if eval "$test_command" >/dev/null 2>&1; then + print_success "$test_name" + ((TESTS_PASSED++)) + return 0 + else + print_error "$test_name" + ((TESTS_FAILED++)) + return 1 + fi +} + +# Test production directory structure +test_directories() { + print_status "Validating directory structure..." 
+ + run_test "Production directory exists" "[[ -d '$PROD_DIR' ]]" + run_test "Data directory exists" "[[ -d '$PROD_DIR/data' ]]" + run_test "Logs directory exists" "[[ -d '$PROD_DIR/logs' ]]" + run_test "Source code exists" "[[ -d '$PROD_DIR/src' ]]" + run_test "Config directory exists" "[[ -d '$PROD_DIR/config' ]]" + run_test "Virtual environment exists" "[[ -d '$PROD_DIR/venv' ]]" +} + +# Test service user +test_service_user() { + print_status "Validating service user..." + + run_test "Service user exists" "id '$SERVICE_USER'" + run_test "Service user home directory" "[[ -d '/home/$SERVICE_USER' || '$PROD_DIR' ]]" + run_test "Production directory ownership" "[[ \$(stat -c '%U' '$PROD_DIR') == '$SERVICE_USER' ]]" +} + +# Test Python environment +test_python_environment() { + print_status "Validating Python environment..." + + run_test "Python virtual environment" "[[ -f '$PROD_DIR/venv/bin/python' ]]" + run_test "Python packages installed" "sudo -u '$SERVICE_USER' '$PROD_DIR/venv/bin/pip' list | grep -q 'requests'" + run_test "Source modules importable" "sudo -u '$SERVICE_USER' '$PROD_DIR/venv/bin/python' -c 'import src.orchestrator'" + run_test "Playwright browser installed" "sudo -u '$SERVICE_USER' '$PROD_DIR/venv/bin/python' -c 'from playwright.sync_api import sync_playwright; sync_playwright().chromium.executable_path'" +} + +# Test systemd services +test_systemd_services() { + print_status "Validating systemd services..." + + run_test "Main service file exists" "[[ -f '/etc/systemd/system/hvac-content-aggregator.service' ]]" + run_test "Main timer file exists" "[[ -f '/etc/systemd/system/hvac-content-aggregator.timer' ]]" + run_test "Monitoring service exists" "[[ -f '/etc/systemd/system/hvac-monitoring.service' ]]" + run_test "TikTok service exists" "[[ -f '/etc/systemd/system/hvac-tiktok-captions.service' ]]" + run_test "Main timer enabled" "systemctl is-enabled hvac-content-aggregator.timer" + run_test "Monitoring timer enabled" "systemctl is-enabled hvac-monitoring.timer" +} + +# Test configuration files +test_configuration() { + print_status "Validating configuration..." + + run_test "Production config exists" "[[ -f '$PROD_DIR/config/production.py' ]]" + run_test "Environment template exists" "[[ -f '$PROD_DIR/.env.template' ]]" + run_test "Main runner script exists" "[[ -f '$PROD_DIR/run_production.py' ]]" + run_test "Backlog capture script exists" "[[ -f '$PROD_DIR/production_backlog_capture.py' ]]" +} + +# Test network connectivity +test_connectivity() { + print_status "Validating network connectivity..." + + run_test "HVAC Know It All website reachable" "curl -s --max-time 10 https://hvacknowitall.com > /dev/null" + run_test "YouTube accessible" "curl -s --max-time 10 https://www.youtube.com/@HVACKnowItAll > /dev/null" + run_test "MailChimp RSS accessible" "curl -s --max-time 10 'https://us10.campaign-archive.com/feed?u=d1a98c3e62003104038942e21&id=2205dbf985' > /dev/null" +} + +# Test dry run execution +test_dry_run() { + print_status "Validating application execution..." + + if [[ -f "$PROD_DIR/.env" ]]; then + run_test "Production script dry run" "sudo -u '$SERVICE_USER' bash -c 'cd $PROD_DIR && source venv/bin/activate && timeout 30 python run_production.py --dry-run'" + else + print_warning "Skipping dry run test - .env file not configured" + fi +} + +# Test log rotation +test_log_rotation() { + print_status "Validating log rotation..." 
+ + run_test "Logrotate configuration exists" "[[ -f '/etc/logrotate.d/hvac-content' ]]" + run_test "Logrotate configuration syntax" "logrotate -d /etc/logrotate.d/hvac-content" +} + +# Test monitoring +test_monitoring() { + print_status "Validating monitoring setup..." + + run_test "Monitoring directory exists" "[[ -d '$PROD_DIR/monitoring' ]]" + run_test "Dashboard generator exists" "[[ -f '$PROD_DIR/monitoring/dashboard_generator.py' ]]" +} + +# Main validation function +main() { + echo "šŸ” HVAC Know It All - Production Validation" + echo "==========================================" + echo + + test_directories + echo + test_service_user + echo + test_python_environment + echo + test_systemd_services + echo + test_configuration + echo + test_connectivity + echo + test_dry_run + echo + test_log_rotation + echo + test_monitoring + + echo + echo "==========================================" + if [[ $TESTS_FAILED -eq 0 ]]; then + print_success "šŸŽ‰ All tests passed! ($TESTS_PASSED/$((TESTS_PASSED + TESTS_FAILED)))" + print_status "Production environment is ready for deployment" + exit 0 + else + print_error "āš ļø Some tests failed: $TESTS_FAILED/$((TESTS_PASSED + TESTS_FAILED))" + print_status "Please address the failed tests before proceeding" + exit 1 + fi +} + +# Show help +if [[ "$1" == "--help" || "$1" == "-h" ]]; then + echo "HVAC Know It All - Production Validation Script" + echo + echo "Usage: $0 [options]" + echo + echo "This script validates the production deployment by testing:" + echo " • Directory structure and permissions" + echo " • Service user configuration" + echo " • Python environment and dependencies" + echo " • Systemd services and timers" + echo " • Configuration files" + echo " • Network connectivity" + echo " • Application execution (dry run)" + echo " • Log rotation setup" + echo " • Monitoring configuration" + echo + echo "Options:" + echo " -h, --help Show this help message" + echo + exit 0 +fi + +# Run validation +main "$@" \ No newline at end of file