import json
import logging
import shutil
import hashlib
from abc import ABC, abstractmethod
from dataclasses import dataclass
from datetime import datetime
from logging.handlers import RotatingFileHandler
from pathlib import Path
from typing import Any, Dict, List, Optional
from urllib.parse import urlparse, unquote

import pytz
import requests
from markitdown import MarkItDown
from tenacity import retry, stop_after_attempt, wait_exponential, retry_if_exception_type


@dataclass
class ScraperConfig:
    source_name: str
    brand_name: str
    data_dir: Path
    logs_dir: Path
    timezone: str = "America/Halifax"


class BaseScraper(ABC):
    def __init__(self, config: ScraperConfig):
        self.config = config
        self.state_file = config.data_dir / ".state" / f"{config.source_name}_state.json"
        self.tz = pytz.timezone(config.timezone)
        self.converter = MarkItDown()

        # HTTP Session for connection pooling
        self.session = requests.Session()

        # User agent rotation pool
        self.user_agents = [
            'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/120.0.0.0 Safari/537.36',
            'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/120.0.0.0 Safari/537.36',
            'Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/120.0.0.0 Safari/537.36',
            'Mozilla/5.0 (Windows NT 10.0; Win64; x64; rv:121.0) Gecko/20100101 Firefox/121.0',
            'Mozilla/5.0 (Macintosh; Intel Mac OS X 10.15; rv:121.0) Gecko/20100101 Firefox/121.0',
            'HVAC-KnowItAll-Bot/1.0 (+https://hvacknowitall.com)'  # Fallback bot UA
        ]
        self.current_ua_index = 0

        # Retry configuration from production config
        self.retry_config = {
            "max_attempts": 3,
            "initial_delay": 5,
            "backoff_factor": 2,
            "max_delay": 60
        }

        # Ensure directories exist BEFORE setting up logger
        self.state_file.parent.mkdir(parents=True, exist_ok=True)
        (config.data_dir / "markdown_current").mkdir(parents=True, exist_ok=True)
        (config.data_dir / "markdown_archives" / config.source_name.title()).mkdir(parents=True, exist_ok=True)
        (config.data_dir / "media" / config.source_name.title()).mkdir(parents=True, exist_ok=True)
        (config.logs_dir / config.source_name.title()).mkdir(parents=True, exist_ok=True)

        # Now setup logger after directories exist
        self.logger = self._setup_logger()

        # Set initial user agent (after logger is set up)
        self.rotate_user_agent()

    def _setup_logger(self) -> logging.Logger:
        logger = logging.getLogger(f"{self.config.brand_name}_{self.config.source_name}")
        logger.setLevel(logging.DEBUG)

        # Console handler
        console_handler = logging.StreamHandler()
        console_handler.setLevel(logging.INFO)

        # File handler with rotation
        log_file = self.config.logs_dir / self.config.source_name.title() / f"{self.config.source_name}.log"
        file_handler = RotatingFileHandler(
            log_file,
            maxBytes=10 * 1024 * 1024,  # 10MB
            backupCount=5
        )
        file_handler.setLevel(logging.DEBUG)

        # Formatter
        formatter = logging.Formatter(
            '%(asctime)s - %(name)s - %(levelname)s - %(message)s',
            datefmt='%Y-%m-%d %H:%M:%S'
        )
        console_handler.setFormatter(formatter)
        file_handler.setFormatter(formatter)

        logger.addHandler(console_handler)
        logger.addHandler(file_handler)

        return logger

    def get_retry_decorator(self):
        """Get a configured retry decorator for HTTP requests"""
        return retry(
            stop=stop_after_attempt(self.retry_config["max_attempts"]),
            wait=wait_exponential(
                multiplier=self.retry_config["backoff_factor"],
                min=self.retry_config["initial_delay"],
                max=self.retry_config["max_delay"]
            ),
            retry=retry_if_exception_type((requests.RequestException, ConnectionError, TimeoutError)),
            before_sleep=lambda retry_state: self.logger.warning(
                f"Retry attempt {retry_state.attempt_number} after {retry_state.next_action.sleep} seconds"
            )
        )

    def make_request(self, *args, **kwargs):
        """Make an HTTP request with retry logic, connection pooling, and user agent rotation"""
        # Rotate the user agent roughly every fifth request (1-in-5 chance) to avoid detection
        import random
        if random.randint(1, 5) == 1:
            self.rotate_user_agent()

        @self.get_retry_decorator()
        def _make_request():
            return self.session.request(*args, **kwargs)

        return _make_request()

    def rotate_user_agent(self):
        """Rotate to the next user agent in the pool"""
        self.current_ua_index = (self.current_ua_index + 1) % len(self.user_agents)
        user_agent = self.user_agents[self.current_ua_index]
        self.session.headers.update({'User-Agent': user_agent})
        self.logger.debug(f"Rotated to user agent: {user_agent[:50]}...")

    def load_state(self) -> Dict[str, Any]:
        if not self.state_file.exists():
            self.logger.info(f"No state file found at {self.state_file}, starting fresh")
            return {}
        try:
            with open(self.state_file, 'r') as f:
                state = json.load(f)
            self.logger.debug(f"Loaded state: {state}")
            return state
        except Exception as e:
            self.logger.error(f"Error loading state: {e}")
            return {}

    def save_state(self, state: Dict[str, Any]) -> None:
        try:
            self.state_file.parent.mkdir(parents=True, exist_ok=True)
            with open(self.state_file, 'w') as f:
                json.dump(state, f, indent=2)
            self.logger.debug(f"Saved state: {state}")
        except Exception as e:
            self.logger.error(f"Error saving state: {e}")

    def generate_filename(self) -> str:
        now = datetime.now(self.tz)
        timestamp = now.strftime("%Y-%d-%m-T%H%M%S")
        return f"{self.config.brand_name}_{self.config.source_name}_{timestamp}.md"

    def archive_current_file(self) -> None:
        current_dir = self.config.data_dir / "markdown_current"
        archive_dir = self.config.data_dir / "markdown_archives" / self.config.source_name.title()
        pattern = f"{self.config.brand_name}_{self.config.source_name}_*.md"
        current_files = list(current_dir.glob(pattern))

        for file in current_files:
            archive_path = archive_dir / file.name
            try:
                shutil.move(str(file), str(archive_path))
                self.logger.info(f"Archived {file.name} to {archive_dir}")
            except Exception as e:
                self.logger.error(f"Error archiving {file.name}: {e}")

    def convert_to_markdown(self, content: str, content_type: str = "text/html") -> str:
        try:
            if content_type == "text/html":
                # Use markdownify for HTML conversion - it handles Unicode properly
                from markdownify import markdownify as md
                import re

                # First, clean the HTML content
                # Remove script blocks and their content completely
                content = re.sub(r'<script[^>]*>.*?</script>', '', content, flags=re.DOTALL | re.IGNORECASE)
                # Remove style blocks and their content completely
                content = re.sub(r'<style[^>]*>.*?</style>', '', content, flags=re.DOTALL | re.IGNORECASE)
                # Remove inline JavaScript event handlers
                content = re.sub(r'\s*on\w+\s*=\s*"[^"]*"', '', content, flags=re.IGNORECASE)
                content = re.sub(r"\s*on\w+\s*=\s*'[^']*'", '', content, flags=re.IGNORECASE)

                # Convert HTML to Markdown with sensible defaults
                markdown = md(
                    content,
                    heading_style="ATX",  # Use # for headings
                    bullets="-",  # Use - for bullet points
                    strip=["script", "style", "meta", "link", "noscript"]  # Remove these tags completely
                )

                # Post-process to clean up any remaining issues
                # Replace stray <br> tags that shouldn't appear in markdown
                markdown = re.sub(r'<br\s*/?>', '\n', markdown, flags=re.IGNORECASE)
                # Clean up excessive blank lines
                markdown = re.sub(r'\n{3,}', '\n\n', markdown)
                # Fix malformed comparison operators that look like tags
                markdown = re.sub(r'<(\d+\s*ppm[^>]*)>', r'\1', markdown)

                return markdown.strip()
            else:
                # For other content types, return as-is
                return content
        except ImportError:
            # Fall back to MarkItDown if markdownify is not available
            try:
                if content_type == "text/html":
                    # Use file-based conversion which handles Unicode better
                    import tempfile
                    import os
                    with tempfile.NamedTemporaryFile(mode='w', encoding='utf-8', suffix='.html', delete=False) as f:
                        f.write(content)
                        temp_path = f.name
                    try:
                        result = self.converter.convert(temp_path)
                        return result.text_content if hasattr(result, 'text_content') else str(result)
                    finally:
                        os.unlink(temp_path)
                else:
                    return content
            except Exception as e:
                self.logger.error(f"Error converting to markdown: {e}")
                return content
        except Exception as e:
            self.logger.error(f"Error converting to markdown: {e}")
            # Fall back to returning the content as-is
            return content

    def save_markdown(self, content: str) -> Path:
        self.archive_current_file()

        filename = self.generate_filename()
        filepath = self.config.data_dir / "markdown_current" / filename

        try:
            with open(filepath, 'w', encoding='utf-8') as f:
                f.write(content)
            self.logger.info(f"Saved markdown to {filepath}")
            return filepath
        except Exception as e:
            self.logger.error(f"Error saving markdown: {e}")
            raise

    @abstractmethod
    def fetch_content(self) -> List[Dict[str, Any]]:
        pass

    def format_markdown(self, items: List[Dict[str, Any]]) -> str:
        """Format items according to specification markdown format."""
        if not items:
            return ""

        formatted_items = []
        for item in items:
            # Use spec-compliant format
            formatted_item = self.format_item_to_spec(item)
            formatted_items.append(formatted_item)

        return "\n\n--------------\n\n".join(formatted_items)

    def format_item_to_spec(self, item: Dict[str, Any]) -> str:
        """Format a single item according to the specification format."""
        lines = []

        # ID (required)
        item_id = item.get('id', item.get('url', 'unknown'))
        lines.append(f"# ID: {item_id}")
        lines.append("")

        # Title (required)
        title = item.get('title', 'Untitled')
        lines.append(f"## Title: {title}")
        lines.append("")

        # Type (required)
        content_type = item.get('type', self.config.source_name)
        lines.append(f"## Type: {content_type}")
        lines.append("")

        # Permalink (required)
        permalink = item.get('url', item.get('link', 'N/A'))
        lines.append(f"## Permalink: {permalink}")
        lines.append("")

        # Description (required)
        description = item.get('description', item.get('content', ''))
        if isinstance(description, list):
            description = ' '.join(description)
        # Clean up description
        description = description.strip() if description else 'No description available'
        lines.append("## Description:")
        lines.append(description)
        lines.append("")

        # Metadata section
        lines.append("## Metadata:")
        lines.append("")

        # Comments
        comments = item.get('comments', item.get('comment_count', 0))
        lines.append(f"### Comments: {comments}")
        lines.append("")

        # Likes
        likes = item.get('likes', item.get('like_count', 0))
        lines.append(f"### Likes: {likes}")
        lines.append("")

        # Tags
        tags = item.get('tags', item.get('categories', []))
        if tags:
            lines.append("### Tags:")
            for tag in tags:
                tag_name = tag if isinstance(tag, str) else tag.get('name', str(tag))
                lines.append(f"- {tag_name}")
        else:
            lines.append("### Tags:")
            lines.append("- No tags")

        # Additional metadata (optional)
        if 'views' in item:
            lines.append("")
            lines.append(f"### Views: {item['views']}")

        if 'publish_date' in item:
            lines.append("")
            lines.append(f"### Published: {item['publish_date']}")

        if 'author' in item:
            lines.append("")
            lines.append(f"### Author: {item['author']}")

        return "\n".join(lines)

    def download_media(self, url: str, item_id: str, media_type: str = "image") -> Optional[str]:
        """Download media file and return local path"""
        if not url:
            return None

        try:
            # Parse URL to get filename
            parsed = urlparse(url)
            original_filename = Path(unquote(parsed.path)).name

            # Generate safe filename
            if not original_filename or '.' not in original_filename:
                # Use hash if no proper filename
                url_hash = hashlib.md5(url.encode()).hexdigest()[:8]
                ext = self._guess_extension(url, media_type)
                filename = f"{item_id}_{url_hash}{ext}"
            else:
                # Clean filename
                filename = self._sanitize_filename(f"{item_id}_{original_filename}")

            # Media directory path
            media_dir = self.config.data_dir / "media" / self.config.source_name.title()
            media_dir.mkdir(parents=True, exist_ok=True)
            file_path = media_dir / filename

            # Skip if already downloaded
            if file_path.exists():
                self.logger.debug(f"Media already exists: {filename}")
                return str(file_path)

            # Download with retry logic
            self.logger.info(f"Downloading media: {url}")
            response = self.make_request('GET', url, stream=True, timeout=30)
            response.raise_for_status()

            # Write file
            with open(file_path, 'wb') as f:
                for chunk in response.iter_content(chunk_size=8192):
                    f.write(chunk)

            self.logger.info(f"Downloaded media: {filename} ({file_path.stat().st_size} bytes)")
            return str(file_path)

        except Exception as e:
            self.logger.warning(f"Failed to download media {url}: {e}")
            return None

    def _sanitize_filename(self, filename: str) -> str:
        """Sanitize filename for filesystem safety"""
        import re
        # Remove or replace problematic characters
        filename = re.sub(r'[<>:"/\\|?*]', '_', filename)
        # Limit length
        name, ext = filename.rsplit('.', 1) if '.' in filename else (filename, '')
        if len(name) > 100:
            name = name[:100]
        return f"{name}.{ext}" if ext else name

    def _guess_extension(self, url: str, media_type: str) -> str:
        """Guess file extension from URL or media type"""
        if 'image' in media_type.lower():
            return '.jpg'
        elif 'video' in media_type.lower():
            return '.mp4'
        elif 'audio' in media_type.lower():
            return '.mp3'
        else:
            # Try to guess from URL
            if any(x in url.lower() for x in ['.jpg', '.jpeg', '.png', '.gif']):
                return '.jpg'
            elif any(x in url.lower() for x in ['.mp4', '.mov', '.avi']):
                return '.mp4'
            elif any(x in url.lower() for x in ['.mp3', '.wav', '.m4a']):
                return '.mp3'
            else:
                return '.bin'  # Generic binary

    @abstractmethod
    def get_incremental_items(self, items: List[Dict[str, Any]], state: Dict[str, Any]) -> List[Dict[str, Any]]:
        pass

    def run(self) -> None:
        try:
            self.logger.info(f"Starting {self.config.source_name} scraper")

            # Load state
            state = self.load_state()

            # Fetch content
            all_items = self.fetch_content()

            # Filter for new items only
            new_items = self.get_incremental_items(all_items, state)

            if not new_items:
                self.logger.info("No new items found")
                return

            self.logger.info(f"Found {len(new_items)} new items")

            # Convert to markdown
            markdown_content = self.format_markdown(new_items)

            # Save markdown
            filepath = self.save_markdown(markdown_content)

            # Update state
            if new_items:
                # Update state with latest item info
                state['last_update'] = datetime.now(self.tz).isoformat()
                state['last_item_count'] = len(new_items)

                # Subclasses should update specific tracking fields
                state = self.update_state(state, new_items)

            self.save_state(state)
            self.logger.info(f"Successfully processed {len(new_items)} items")

        except Exception as e:
            self.logger.error(f"Error in scraper run: {e}")
            raise

    @abstractmethod
    def update_state(self, state: Dict[str, Any], items: List[Dict[str, Any]]) -> Dict[str, Any]:
        pass
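

# Illustrative usage sketch only: a minimal concrete subclass showing how the
# three abstract hooks (fetch_content, get_incremental_items, update_state)
# plug into run(). The FeedScraper name, the example feed URL, and the 'guid'
# field are hypothetical stand-ins, not part of the production scrapers.
class FeedScraper(BaseScraper):
    FEED_URL = "https://example.com/feed.json"  # hypothetical endpoint

    def fetch_content(self) -> List[Dict[str, Any]]:
        # Fetch the raw feed through the shared session (retries + UA rotation)
        response = self.make_request('GET', self.FEED_URL, timeout=30)
        response.raise_for_status()
        return response.json().get('items', [])

    def get_incremental_items(self, items: List[Dict[str, Any]], state: Dict[str, Any]) -> List[Dict[str, Any]]:
        # Keep only items whose IDs were not processed in a previous run
        seen = set(state.get('seen_ids', []))
        return [item for item in items if item.get('guid') not in seen]

    def update_state(self, state: Dict[str, Any], items: List[Dict[str, Any]]) -> Dict[str, Any]:
        # Record processed IDs so the next run can skip them
        seen = set(state.get('seen_ids', []))
        seen.update(item['guid'] for item in items if item.get('guid'))
        state['seen_ids'] = sorted(seen)
        return state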