hvac-kia-content/src/base_scraper.py
Ben Reed 8b83185130 Fix HTML/XML contamination in WordPress markdown extraction
- Update base_scraper.py convert_to_markdown() to properly clean HTML
- Remove script/style blocks and their content before conversion
- Strip inline JavaScript event handlers
- Clean up br tags and excessive blank lines
- Fix malformed comparison operators that look like tags
- Add comprehensive HTML cleaning during content extraction (not after)
- Test confirms WordPress content now generates clean markdown without HTML

This ensures all future WordPress scraping produces specification-compliant
markdown without any HTML/XML contamination.
2025-08-18 23:11:08 -03:00

import json
import logging
import shutil
import hashlib
from abc import ABC, abstractmethod
from dataclasses import dataclass
from datetime import datetime
from logging.handlers import RotatingFileHandler
from pathlib import Path
from typing import Any, Dict, List, Optional
from urllib.parse import urlparse, unquote
import pytz
import requests
from markitdown import MarkItDown
from tenacity import retry, stop_after_attempt, wait_exponential, retry_if_exception_type


@dataclass
class ScraperConfig:
    source_name: str
    brand_name: str
    data_dir: Path
    logs_dir: Path
    timezone: str = "America/Halifax"


class BaseScraper(ABC):
    """Abstract base class providing shared scraper infrastructure: an HTTP
    session with retries and user-agent rotation, state persistence, markdown
    conversion, and file/media management."""

    def __init__(self, config: ScraperConfig):
        self.config = config
        self.state_file = config.data_dir / ".state" / f"{config.source_name}_state.json"
        self.tz = pytz.timezone(config.timezone)
        self.converter = MarkItDown()

        # HTTP session for connection pooling
        self.session = requests.Session()

        # User agent rotation pool
        self.user_agents = [
            'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/120.0.0.0 Safari/537.36',
            'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/120.0.0.0 Safari/537.36',
            'Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/120.0.0.0 Safari/537.36',
            'Mozilla/5.0 (Windows NT 10.0; Win64; x64; rv:121.0) Gecko/20100101 Firefox/121.0',
            'Mozilla/5.0 (Macintosh; Intel Mac OS X 10.15; rv:121.0) Gecko/20100101 Firefox/121.0',
            'HVAC-KnowItAll-Bot/1.0 (+https://hvacknowitall.com)'  # Fallback bot UA
        ]
        self.current_ua_index = 0

        # Retry configuration from production config
        self.retry_config = {
            "max_attempts": 3,
            "initial_delay": 5,
            "backoff_factor": 2,
            "max_delay": 60
        }

        # Ensure directories exist BEFORE setting up logger
        self.state_file.parent.mkdir(parents=True, exist_ok=True)
        (config.data_dir / "markdown_current").mkdir(parents=True, exist_ok=True)
        (config.data_dir / "markdown_archives" / config.source_name.title()).mkdir(parents=True, exist_ok=True)
        (config.data_dir / "media" / config.source_name.title()).mkdir(parents=True, exist_ok=True)
        (config.logs_dir / config.source_name.title()).mkdir(parents=True, exist_ok=True)

        # Now set up logger after directories exist
        self.logger = self._setup_logger()

        # Set initial user agent (after logger is set up)
        self.rotate_user_agent()

    def _setup_logger(self) -> logging.Logger:
        logger = logging.getLogger(f"{self.config.brand_name}_{self.config.source_name}")
        logger.setLevel(logging.DEBUG)

        # Console handler
        console_handler = logging.StreamHandler()
        console_handler.setLevel(logging.INFO)

        # File handler with rotation
        log_file = self.config.logs_dir / self.config.source_name.title() / f"{self.config.source_name}.log"
        file_handler = RotatingFileHandler(
            log_file,
            maxBytes=10 * 1024 * 1024,  # 10MB
            backupCount=5
        )
        file_handler.setLevel(logging.DEBUG)

        # Formatter
        formatter = logging.Formatter(
            '%(asctime)s - %(name)s - %(levelname)s - %(message)s',
            datefmt='%Y-%m-%d %H:%M:%S'
        )
        console_handler.setFormatter(formatter)
        file_handler.setFormatter(formatter)

        logger.addHandler(console_handler)
        logger.addHandler(file_handler)

        return logger

    def get_retry_decorator(self):
        """Get a configured retry decorator for HTTP requests"""
        return retry(
            stop=stop_after_attempt(self.retry_config["max_attempts"]),
            wait=wait_exponential(
                multiplier=self.retry_config["backoff_factor"],
                min=self.retry_config["initial_delay"],
                max=self.retry_config["max_delay"]
            ),
            retry=retry_if_exception_type((requests.RequestException, ConnectionError, TimeoutError)),
            before_sleep=lambda retry_state: self.logger.warning(
                f"Retry attempt {retry_state.attempt_number} after {retry_state.next_action.sleep} seconds"
            )
        )

    def make_request(self, *args, **kwargs):
        """Make an HTTP request with retry logic, connection pooling, and user agent rotation"""
        # Rotate the user agent roughly every 5 requests (1-in-5 chance per call) to avoid detection
        import random
        if random.randint(1, 5) == 1:
            self.rotate_user_agent()

        @self.get_retry_decorator()
        def _make_request():
            return self.session.request(*args, **kwargs)

        return _make_request()

    def rotate_user_agent(self):
        """Rotate to the next user agent in the pool"""
        self.current_ua_index = (self.current_ua_index + 1) % len(self.user_agents)
        user_agent = self.user_agents[self.current_ua_index]
        self.session.headers.update({'User-Agent': user_agent})
        self.logger.debug(f"Rotated to user agent: {user_agent[:50]}...")

    def load_state(self) -> Dict[str, Any]:
        if not self.state_file.exists():
            self.logger.info(f"No state file found at {self.state_file}, starting fresh")
            return {}
        try:
            with open(self.state_file, 'r') as f:
                state = json.load(f)
            self.logger.debug(f"Loaded state: {state}")
            return state
        except Exception as e:
            self.logger.error(f"Error loading state: {e}")
            return {}

    def save_state(self, state: Dict[str, Any]) -> None:
        try:
            self.state_file.parent.mkdir(parents=True, exist_ok=True)
            with open(self.state_file, 'w') as f:
                json.dump(state, f, indent=2)
            self.logger.debug(f"Saved state: {state}")
        except Exception as e:
            self.logger.error(f"Error saving state: {e}")

    def generate_filename(self) -> str:
        now = datetime.now(self.tz)
        timestamp = now.strftime("%Y-%d-%m-T%H%M%S")
        return f"{self.config.brand_name}_{self.config.source_name}_{timestamp}.md"

    def archive_current_file(self) -> None:
        current_dir = self.config.data_dir / "markdown_current"
        archive_dir = self.config.data_dir / "markdown_archives" / self.config.source_name.title()

        pattern = f"{self.config.brand_name}_{self.config.source_name}_*.md"
        current_files = list(current_dir.glob(pattern))

        for file in current_files:
            archive_path = archive_dir / file.name
            try:
                shutil.move(str(file), str(archive_path))
                self.logger.info(f"Archived {file.name} to {archive_dir}")
            except Exception as e:
                self.logger.error(f"Error archiving {file.name}: {e}")

    def convert_to_markdown(self, content: str, content_type: str = "text/html") -> str:
        try:
            if content_type == "text/html":
                # Use markdownify for HTML conversion - it handles Unicode properly
                from markdownify import markdownify as md
                import re

                # First, clean the HTML content
                # Remove script blocks and their content completely
                content = re.sub(r'<script[^>]*>.*?</script>', '', content, flags=re.DOTALL | re.IGNORECASE)
                # Remove style blocks and their content completely
                content = re.sub(r'<style[^>]*>.*?</style>', '', content, flags=re.DOTALL | re.IGNORECASE)
                # Remove inline JavaScript event handlers
                content = re.sub(r'\s*on\w+\s*=\s*"[^"]*"', '', content, flags=re.IGNORECASE)
                content = re.sub(r"\s*on\w+\s*=\s*'[^']*'", '', content, flags=re.IGNORECASE)

                # Convert HTML to Markdown with sensible defaults
                markdown = md(content,
                              heading_style="ATX",  # Use # for headings
                              bullets="-",  # Use - for bullet points
                              strip=["script", "style", "meta", "link", "noscript"])  # Remove these tags completely

                # Post-process to clean up any remaining issues
                # Remove any remaining HTML tags that shouldn't be in markdown
                markdown = re.sub(r'<br\s*/?>', '\n', markdown, flags=re.IGNORECASE)
                # Clean up excessive blank lines
                markdown = re.sub(r'\n{3,}', '\n\n', markdown)
                # Fix malformed comparison operators that look like tags
                markdown = re.sub(r'<(\d+\s*ppm[^>]*)>', r'\1', markdown)

                return markdown.strip()
            else:
                # For other content types, return as-is
                return content
        except ImportError:
            # Fall back to MarkItDown if markdownify is not available
            try:
                if content_type == "text/html":
                    # Use file-based conversion which handles Unicode better
                    import tempfile
                    import os
                    with tempfile.NamedTemporaryFile(mode='w', encoding='utf-8',
                                                     suffix='.html', delete=False) as f:
                        f.write(content)
                        temp_path = f.name
                    try:
                        result = self.converter.convert(temp_path)
                        return result.text_content if hasattr(result, 'text_content') else str(result)
                    finally:
                        os.unlink(temp_path)
                else:
                    return content
            except Exception as e:
                self.logger.error(f"Error converting to markdown: {e}")
                return content
        except Exception as e:
            self.logger.error(f"Error converting to markdown: {e}")
            # Fall back to returning the content as-is
            return content
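
    # Illustrative sketch (not part of the original module): how the cleaning in
    # convert_to_markdown() is expected to behave on a small, hypothetical
    # WordPress-style fragment. The exact spacing depends on markdownify, so the
    # result shown is approximate.
    #
    #   html = ('<h2 onclick="track()">CO exposure</h2>'
    #           '<script>var x = 1;</script>'
    #           '<p>Alarm at <35 ppm> indoors</p>')
    #   scraper.convert_to_markdown(html)
    #   # roughly: "## CO exposure\n\nAlarm at 35 ppm indoors"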

    def save_markdown(self, content: str) -> Path:
        self.archive_current_file()

        filename = self.generate_filename()
        filepath = self.config.data_dir / "markdown_current" / filename

        try:
            with open(filepath, 'w', encoding='utf-8') as f:
                f.write(content)
            self.logger.info(f"Saved markdown to {filepath}")
            return filepath
        except Exception as e:
            self.logger.error(f"Error saving markdown: {e}")
            raise

    @abstractmethod
    def fetch_content(self) -> List[Dict[str, Any]]:
        pass

    def format_markdown(self, items: List[Dict[str, Any]]) -> str:
        """Format items according to specification markdown format."""
        if not items:
            return ""

        formatted_items = []
        for item in items:
            # Use spec-compliant format
            formatted_item = self.format_item_to_spec(item)
            formatted_items.append(formatted_item)

        return "\n\n--------------\n\n".join(formatted_items)

    def format_item_to_spec(self, item: Dict[str, Any]) -> str:
        """Format a single item according to the specification format."""
        lines = []

        # ID (required)
        item_id = item.get('id', item.get('url', 'unknown'))
        lines.append(f"# ID: {item_id}")
        lines.append("")

        # Title (required)
        title = item.get('title', 'Untitled')
        lines.append(f"## Title: {title}")
        lines.append("")

        # Type (required)
        content_type = item.get('type', self.config.source_name)
        lines.append(f"## Type: {content_type}")
        lines.append("")

        # Permalink (required)
        permalink = item.get('url', item.get('link', 'N/A'))
        lines.append(f"## Permalink: {permalink}")
        lines.append("")

        # Description (required)
        description = item.get('description', item.get('content', ''))
        if isinstance(description, list):
            description = ' '.join(description)
        # Clean up description
        description = description.strip() if description else 'No description available'
        lines.append("## Description:")
        lines.append(description)
        lines.append("")

        # Metadata section
        lines.append("## Metadata:")
        lines.append("")

        # Comments
        comments = item.get('comments', item.get('comment_count', 0))
        lines.append(f"### Comments: {comments}")
        lines.append("")

        # Likes
        likes = item.get('likes', item.get('like_count', 0))
        lines.append(f"### Likes: {likes}")
        lines.append("")

        # Tags
        tags = item.get('tags', item.get('categories', []))
        if tags:
            lines.append("### Tags:")
            for tag in tags:
                tag_name = tag if isinstance(tag, str) else tag.get('name', str(tag))
                lines.append(f"- {tag_name}")
        else:
            lines.append("### Tags:")
            lines.append("- No tags")

        # Additional metadata (optional)
        if 'views' in item:
            lines.append("")
            lines.append(f"### Views: {item['views']}")
        if 'publish_date' in item:
            lines.append("")
            lines.append(f"### Published: {item['publish_date']}")
        if 'author' in item:
            lines.append("")
            lines.append(f"### Author: {item['author']}")

        return "\n".join(lines)

    def download_media(self, url: str, item_id: str, media_type: str = "image") -> Optional[str]:
        """Download media file and return local path"""
        if not url:
            return None

        try:
            # Parse URL to get filename
            parsed = urlparse(url)
            original_filename = Path(unquote(parsed.path)).name

            # Generate safe filename
            if not original_filename or '.' not in original_filename:
                # Use hash if no proper filename
                url_hash = hashlib.md5(url.encode()).hexdigest()[:8]
                ext = self._guess_extension(url, media_type)
                filename = f"{item_id}_{url_hash}{ext}"
            else:
                # Clean filename
                filename = self._sanitize_filename(f"{item_id}_{original_filename}")

            # Media directory path
            media_dir = self.config.data_dir / "media" / self.config.source_name.title()
            media_dir.mkdir(parents=True, exist_ok=True)
            file_path = media_dir / filename

            # Skip if already downloaded
            if file_path.exists():
                self.logger.debug(f"Media already exists: {filename}")
                return str(file_path)

            # Download with retry logic
            self.logger.info(f"Downloading media: {url}")
            response = self.make_request('GET', url, stream=True, timeout=30)
            response.raise_for_status()

            # Write file
            with open(file_path, 'wb') as f:
                for chunk in response.iter_content(chunk_size=8192):
                    f.write(chunk)

            self.logger.info(f"Downloaded media: {filename} ({file_path.stat().st_size} bytes)")
            return str(file_path)

        except Exception as e:
            self.logger.warning(f"Failed to download media {url}: {e}")
            return None

    def _sanitize_filename(self, filename: str) -> str:
        """Sanitize filename for filesystem safety"""
        import re
        # Remove or replace problematic characters
        filename = re.sub(r'[<>:"/\\|?*]', '_', filename)
        # Limit length
        name, ext = filename.rsplit('.', 1) if '.' in filename else (filename, '')
        if len(name) > 100:
            name = name[:100]
        return f"{name}.{ext}" if ext else name

    def _guess_extension(self, url: str, media_type: str) -> str:
        """Guess file extension from URL or media type"""
        if 'image' in media_type.lower():
            return '.jpg'
        elif 'video' in media_type.lower():
            return '.mp4'
        elif 'audio' in media_type.lower():
            return '.mp3'
        else:
            # Try to guess from URL
            if any(x in url.lower() for x in ['.jpg', '.jpeg', '.png', '.gif']):
                return '.jpg'
            elif any(x in url.lower() for x in ['.mp4', '.mov', '.avi']):
                return '.mp4'
            elif any(x in url.lower() for x in ['.mp3', '.wav', '.m4a']):
                return '.mp3'
            else:
                return '.bin'  # Generic binary

    @abstractmethod
    def get_incremental_items(self, items: List[Dict[str, Any]], state: Dict[str, Any]) -> List[Dict[str, Any]]:
        pass

    def run(self) -> None:
        try:
            self.logger.info(f"Starting {self.config.source_name} scraper")

            # Load state
            state = self.load_state()

            # Fetch content
            all_items = self.fetch_content()

            # Filter for new items only
            new_items = self.get_incremental_items(all_items, state)

            if not new_items:
                self.logger.info("No new items found")
                return

            self.logger.info(f"Found {len(new_items)} new items")

            # Convert to markdown
            markdown_content = self.format_markdown(new_items)

            # Save markdown
            filepath = self.save_markdown(markdown_content)

            # Update state
            if new_items:
                # Update state with latest item info
                state['last_update'] = datetime.now(self.tz).isoformat()
                state['last_item_count'] = len(new_items)
                # Subclasses should update specific tracking fields
                state = self.update_state(state, new_items)
                self.save_state(state)

            self.logger.info(f"Successfully processed {len(new_items)} items")

        except Exception as e:
            self.logger.error(f"Error in scraper run: {e}")
            raise

    @abstractmethod
    def update_state(self, state: Dict[str, Any], items: List[Dict[str, Any]]) -> Dict[str, Any]:
        pass
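

# --- Illustrative usage sketch (not part of the original module) ---
# A minimal subclass showing how the abstract hooks are expected to fit together
# when run() executes. The class name, URL, brand/source names, and the
# 'last_id' state key are hypothetical placeholders.
#
# class ExampleWordPressScraper(BaseScraper):
#     def fetch_content(self) -> List[Dict[str, Any]]:
#         response = self.make_request('GET', 'https://example.com/wp-json/wp/v2/posts', timeout=30)
#         response.raise_for_status()
#         return [
#             {
#                 'id': post['id'],
#                 'title': post['title']['rendered'],
#                 'url': post['link'],
#                 'description': self.convert_to_markdown(post['content']['rendered']),
#             }
#             for post in response.json()
#         ]
#
#     def get_incremental_items(self, items, state):
#         last_id = state.get('last_id', 0)
#         return [item for item in items if item['id'] > last_id]
#
#     def update_state(self, state, items):
#         state['last_id'] = max(item['id'] for item in items)
#         return state
#
# if __name__ == "__main__":
#     config = ScraperConfig(
#         source_name='wordpress',
#         brand_name='HVACKnowItAll',
#         data_dir=Path('data'),
#         logs_dir=Path('logs'),
#     )
#     ExampleWordPressScraper(config).run()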