hvac-kia-content/src/base_scraper.py
Ben Reed 8b83185130 Fix HTML/XML contamination in WordPress markdown extraction
- Update base_scraper.py convert_to_markdown() to properly clean HTML
- Remove script/style blocks and their content before conversion
- Strip inline JavaScript event handlers
- Clean up br tags and excessive blank lines
- Fix malformed comparison operators that look like tags
- Add comprehensive HTML cleaning during content extraction (not after)
- Test confirms WordPress content now generates clean markdown without HTML

This ensures all future WordPress scraping produces specification-compliant
markdown without any HTML/XML contamination.
2025-08-18 23:11:08 -03:00

import json
import logging
import shutil
import hashlib
from abc import ABC, abstractmethod
from dataclasses import dataclass
from datetime import datetime
from logging.handlers import RotatingFileHandler
from pathlib import Path
from typing import Any, Dict, List, Optional
from urllib.parse import urlparse, unquote
import pytz
import requests
from markitdown import MarkItDown
from tenacity import retry, stop_after_attempt, wait_exponential, retry_if_exception_type


@dataclass
class ScraperConfig:
    source_name: str
    brand_name: str
    data_dir: Path
    logs_dir: Path
    timezone: str = "America/Halifax"


class BaseScraper(ABC):
    """Abstract base class providing shared scraper infrastructure: an HTTP
    session with retries and user-agent rotation, state persistence, markdown
    conversion, and file/media management."""

    def __init__(self, config: ScraperConfig):
        self.config = config
        self.state_file = config.data_dir / ".state" / f"{config.source_name}_state.json"
        self.tz = pytz.timezone(config.timezone)
        self.converter = MarkItDown()

        # HTTP session for connection pooling
        self.session = requests.Session()

        # User agent rotation pool
        self.user_agents = [
            'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/120.0.0.0 Safari/537.36',
            'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/120.0.0.0 Safari/537.36',
            'Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/120.0.0.0 Safari/537.36',
            'Mozilla/5.0 (Windows NT 10.0; Win64; x64; rv:121.0) Gecko/20100101 Firefox/121.0',
            'Mozilla/5.0 (Macintosh; Intel Mac OS X 10.15; rv:121.0) Gecko/20100101 Firefox/121.0',
            'HVAC-KnowItAll-Bot/1.0 (+https://hvacknowitall.com)'  # Fallback bot UA
        ]
        self.current_ua_index = 0

        # Retry configuration from production config
        self.retry_config = {
            "max_attempts": 3,
            "initial_delay": 5,
            "backoff_factor": 2,
            "max_delay": 60
        }

        # Ensure directories exist BEFORE setting up logger
        self.state_file.parent.mkdir(parents=True, exist_ok=True)
        (config.data_dir / "markdown_current").mkdir(parents=True, exist_ok=True)
        (config.data_dir / "markdown_archives" / config.source_name.title()).mkdir(parents=True, exist_ok=True)
        (config.data_dir / "media" / config.source_name.title()).mkdir(parents=True, exist_ok=True)
        (config.logs_dir / config.source_name.title()).mkdir(parents=True, exist_ok=True)

        # Now set up logger after directories exist
        self.logger = self._setup_logger()

        # Set initial user agent (after logger is set up)
        self.rotate_user_agent()

    def _setup_logger(self) -> logging.Logger:
        logger = logging.getLogger(f"{self.config.brand_name}_{self.config.source_name}")
        logger.setLevel(logging.DEBUG)

        # Console handler
        console_handler = logging.StreamHandler()
        console_handler.setLevel(logging.INFO)

        # File handler with rotation
        log_file = self.config.logs_dir / self.config.source_name.title() / f"{self.config.source_name}.log"
        file_handler = RotatingFileHandler(
            log_file,
            maxBytes=10 * 1024 * 1024,  # 10MB
            backupCount=5
        )
        file_handler.setLevel(logging.DEBUG)

        # Formatter
        formatter = logging.Formatter(
            '%(asctime)s - %(name)s - %(levelname)s - %(message)s',
            datefmt='%Y-%m-%d %H:%M:%S'
        )
        console_handler.setFormatter(formatter)
        file_handler.setFormatter(formatter)

        logger.addHandler(console_handler)
        logger.addHandler(file_handler)

        return logger

    def get_retry_decorator(self):
        """Get a configured retry decorator for HTTP requests"""
        return retry(
            stop=stop_after_attempt(self.retry_config["max_attempts"]),
            wait=wait_exponential(
                multiplier=self.retry_config["backoff_factor"],
                min=self.retry_config["initial_delay"],
                max=self.retry_config["max_delay"]
            ),
            retry=retry_if_exception_type((requests.RequestException, ConnectionError, TimeoutError)),
            before_sleep=lambda retry_state: self.logger.warning(
                f"Retry attempt {retry_state.attempt_number} after {retry_state.next_action.sleep} seconds"
            )
        )

    def make_request(self, *args, **kwargs):
        """Make an HTTP request with retry logic, connection pooling, and user agent rotation"""
        # Rotate the user agent roughly every 5 requests (1-in-5 chance per call) to avoid detection
        import random
        if random.randint(1, 5) == 1:
            self.rotate_user_agent()

        @self.get_retry_decorator()
        def _make_request():
            return self.session.request(*args, **kwargs)

        return _make_request()

    def rotate_user_agent(self):
        """Rotate to the next user agent in the pool"""
        self.current_ua_index = (self.current_ua_index + 1) % len(self.user_agents)
        user_agent = self.user_agents[self.current_ua_index]
        self.session.headers.update({'User-Agent': user_agent})
        self.logger.debug(f"Rotated to user agent: {user_agent[:50]}...")

    def load_state(self) -> Dict[str, Any]:
        if not self.state_file.exists():
            self.logger.info(f"No state file found at {self.state_file}, starting fresh")
            return {}
        try:
            with open(self.state_file, 'r') as f:
                state = json.load(f)
            self.logger.debug(f"Loaded state: {state}")
            return state
        except Exception as e:
            self.logger.error(f"Error loading state: {e}")
            return {}

    def save_state(self, state: Dict[str, Any]) -> None:
        try:
            self.state_file.parent.mkdir(parents=True, exist_ok=True)
            with open(self.state_file, 'w') as f:
                json.dump(state, f, indent=2)
            self.logger.debug(f"Saved state: {state}")
        except Exception as e:
            self.logger.error(f"Error saving state: {e}")

    def generate_filename(self) -> str:
        now = datetime.now(self.tz)
        timestamp = now.strftime("%Y-%d-%m-T%H%M%S")
        return f"{self.config.brand_name}_{self.config.source_name}_{timestamp}.md"

    def archive_current_file(self) -> None:
        current_dir = self.config.data_dir / "markdown_current"
        archive_dir = self.config.data_dir / "markdown_archives" / self.config.source_name.title()

        pattern = f"{self.config.brand_name}_{self.config.source_name}_*.md"
        current_files = list(current_dir.glob(pattern))

        for file in current_files:
            archive_path = archive_dir / file.name
            try:
                shutil.move(str(file), str(archive_path))
                self.logger.info(f"Archived {file.name} to {archive_dir}")
            except Exception as e:
                self.logger.error(f"Error archiving {file.name}: {e}")

    def convert_to_markdown(self, content: str, content_type: str = "text/html") -> str:
        try:
            if content_type == "text/html":
                # Use markdownify for HTML conversion - it handles Unicode properly
                from markdownify import markdownify as md
                import re

                # First, clean the HTML content
                # Remove script blocks and their content completely
                content = re.sub(r'<script[^>]*>.*?</script>', '', content, flags=re.DOTALL | re.IGNORECASE)
                # Remove style blocks and their content completely
                content = re.sub(r'<style[^>]*>.*?</style>', '', content, flags=re.DOTALL | re.IGNORECASE)
                # Remove inline JavaScript event handlers
                content = re.sub(r'\s*on\w+\s*=\s*"[^"]*"', '', content, flags=re.IGNORECASE)
                content = re.sub(r"\s*on\w+\s*=\s*'[^']*'", '', content, flags=re.IGNORECASE)

                # Convert HTML to Markdown with sensible defaults
                markdown = md(content,
                              heading_style="ATX",  # Use # for headings
                              bullets="-",  # Use - for bullet points
                              strip=["script", "style", "meta", "link", "noscript"])  # Remove these tags completely

                # Post-process to clean up any remaining issues
                # Remove any remaining HTML tags that shouldn't be in markdown
                markdown = re.sub(r'<br\s*/?>', '\n', markdown, flags=re.IGNORECASE)
                # Clean up excessive blank lines
                markdown = re.sub(r'\n{3,}', '\n\n', markdown)
                # Fix malformed comparison operators that look like tags
                markdown = re.sub(r'<(\d+\s*ppm[^>]*)>', r'\1', markdown)

                return markdown.strip()
            else:
                # For other content types, return as-is
                return content
        except ImportError:
            # Fall back to MarkItDown if markdownify is not available
            try:
                if content_type == "text/html":
                    # Use file-based conversion which handles Unicode better
                    import tempfile
                    import os
                    with tempfile.NamedTemporaryFile(mode='w', encoding='utf-8',
                                                     suffix='.html', delete=False) as f:
                        f.write(content)
                        temp_path = f.name
                    try:
                        result = self.converter.convert(temp_path)
                        return result.text_content if hasattr(result, 'text_content') else str(result)
                    finally:
                        os.unlink(temp_path)
                else:
                    return content
            except Exception as e:
                self.logger.error(f"Error converting to markdown: {e}")
                return content
        except Exception as e:
            self.logger.error(f"Error converting to markdown: {e}")
            # Fall back to returning the content as-is
            return content
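
    # Illustrative sketch (not part of the original module): how the cleaning in
    # convert_to_markdown() is expected to behave on a small, hypothetical
    # WordPress-style fragment. The exact spacing depends on markdownify, so the
    # result shown is approximate.
    #
    #   html = ('<h2 onclick="track()">CO exposure</h2>'
    #           '<script>var x = 1;</script>'
    #           '<p>Alarm at <35 ppm> indoors</p>')
    #   scraper.convert_to_markdown(html)
    #   # roughly: "## CO exposure\n\nAlarm at 35 ppm indoors"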

    def save_markdown(self, content: str) -> Path:
        self.archive_current_file()

        filename = self.generate_filename()
        filepath = self.config.data_dir / "markdown_current" / filename

        try:
            with open(filepath, 'w', encoding='utf-8') as f:
                f.write(content)
            self.logger.info(f"Saved markdown to {filepath}")
            return filepath
        except Exception as e:
            self.logger.error(f"Error saving markdown: {e}")
            raise

    @abstractmethod
    def fetch_content(self) -> List[Dict[str, Any]]:
        pass

    def format_markdown(self, items: List[Dict[str, Any]]) -> str:
        """Format items according to specification markdown format."""
        if not items:
            return ""

        formatted_items = []
        for item in items:
            # Use spec-compliant format
            formatted_item = self.format_item_to_spec(item)
            formatted_items.append(formatted_item)

        return "\n\n--------------\n\n".join(formatted_items)

    def format_item_to_spec(self, item: Dict[str, Any]) -> str:
        """Format a single item according to the specification format."""
        lines = []

        # ID (required)
        item_id = item.get('id', item.get('url', 'unknown'))
        lines.append(f"# ID: {item_id}")
        lines.append("")

        # Title (required)
        title = item.get('title', 'Untitled')
        lines.append(f"## Title: {title}")
        lines.append("")

        # Type (required)
        content_type = item.get('type', self.config.source_name)
        lines.append(f"## Type: {content_type}")
        lines.append("")

        # Permalink (required)
        permalink = item.get('url', item.get('link', 'N/A'))
        lines.append(f"## Permalink: {permalink}")
        lines.append("")

        # Description (required)
        description = item.get('description', item.get('content', ''))
        if isinstance(description, list):
            description = ' '.join(description)
        # Clean up description
        description = description.strip() if description else 'No description available'
        lines.append("## Description:")
        lines.append(description)
        lines.append("")

        # Metadata section
        lines.append("## Metadata:")
        lines.append("")

        # Comments
        comments = item.get('comments', item.get('comment_count', 0))
        lines.append(f"### Comments: {comments}")
        lines.append("")

        # Likes
        likes = item.get('likes', item.get('like_count', 0))
        lines.append(f"### Likes: {likes}")
        lines.append("")

        # Tags
        tags = item.get('tags', item.get('categories', []))
        if tags:
            lines.append("### Tags:")
            for tag in tags:
                tag_name = tag if isinstance(tag, str) else tag.get('name', str(tag))
                lines.append(f"- {tag_name}")
        else:
            lines.append("### Tags:")
            lines.append("- No tags")

        # Additional metadata (optional)
        if 'views' in item:
            lines.append("")
            lines.append(f"### Views: {item['views']}")
        if 'publish_date' in item:
            lines.append("")
            lines.append(f"### Published: {item['publish_date']}")
        if 'author' in item:
            lines.append("")
            lines.append(f"### Author: {item['author']}")

        return "\n".join(lines)

    def download_media(self, url: str, item_id: str, media_type: str = "image") -> Optional[str]:
        """Download media file and return local path"""
        if not url:
            return None

        try:
            # Parse URL to get filename
            parsed = urlparse(url)
            original_filename = Path(unquote(parsed.path)).name

            # Generate safe filename
            if not original_filename or '.' not in original_filename:
                # Use hash if no proper filename
                url_hash = hashlib.md5(url.encode()).hexdigest()[:8]
                ext = self._guess_extension(url, media_type)
                filename = f"{item_id}_{url_hash}{ext}"
            else:
                # Clean filename
                filename = self._sanitize_filename(f"{item_id}_{original_filename}")

            # Media directory path
            media_dir = self.config.data_dir / "media" / self.config.source_name.title()
            media_dir.mkdir(parents=True, exist_ok=True)
            file_path = media_dir / filename

            # Skip if already downloaded
            if file_path.exists():
                self.logger.debug(f"Media already exists: {filename}")
                return str(file_path)

            # Download with retry logic
            self.logger.info(f"Downloading media: {url}")
            response = self.make_request('GET', url, stream=True, timeout=30)
            response.raise_for_status()

            # Write file
            with open(file_path, 'wb') as f:
                for chunk in response.iter_content(chunk_size=8192):
                    f.write(chunk)

            self.logger.info(f"Downloaded media: {filename} ({file_path.stat().st_size} bytes)")
            return str(file_path)

        except Exception as e:
            self.logger.warning(f"Failed to download media {url}: {e}")
            return None

    def _sanitize_filename(self, filename: str) -> str:
        """Sanitize filename for filesystem safety"""
        import re
        # Remove or replace problematic characters
        filename = re.sub(r'[<>:"/\\|?*]', '_', filename)
        # Limit length
        name, ext = filename.rsplit('.', 1) if '.' in filename else (filename, '')
        if len(name) > 100:
            name = name[:100]
        return f"{name}.{ext}" if ext else name

    def _guess_extension(self, url: str, media_type: str) -> str:
        """Guess file extension from URL or media type"""
        if 'image' in media_type.lower():
            return '.jpg'
        elif 'video' in media_type.lower():
            return '.mp4'
        elif 'audio' in media_type.lower():
            return '.mp3'
        else:
            # Try to guess from URL
            if any(x in url.lower() for x in ['.jpg', '.jpeg', '.png', '.gif']):
                return '.jpg'
            elif any(x in url.lower() for x in ['.mp4', '.mov', '.avi']):
                return '.mp4'
            elif any(x in url.lower() for x in ['.mp3', '.wav', '.m4a']):
                return '.mp3'
            else:
                return '.bin'  # Generic binary

    @abstractmethod
    def get_incremental_items(self, items: List[Dict[str, Any]], state: Dict[str, Any]) -> List[Dict[str, Any]]:
        pass

    def run(self) -> None:
        try:
            self.logger.info(f"Starting {self.config.source_name} scraper")

            # Load state
            state = self.load_state()

            # Fetch content
            all_items = self.fetch_content()

            # Filter for new items only
            new_items = self.get_incremental_items(all_items, state)

            if not new_items:
                self.logger.info("No new items found")
                return

            self.logger.info(f"Found {len(new_items)} new items")

            # Convert to markdown
            markdown_content = self.format_markdown(new_items)

            # Save markdown
            filepath = self.save_markdown(markdown_content)

            # Update state
            if new_items:
                # Update state with latest item info
                state['last_update'] = datetime.now(self.tz).isoformat()
                state['last_item_count'] = len(new_items)
                # Subclasses should update specific tracking fields
                state = self.update_state(state, new_items)
                self.save_state(state)

            self.logger.info(f"Successfully processed {len(new_items)} items")

        except Exception as e:
            self.logger.error(f"Error in scraper run: {e}")
            raise

    @abstractmethod
    def update_state(self, state: Dict[str, Any], items: List[Dict[str, Any]]) -> Dict[str, Any]:
        pass
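

# --- Illustrative usage sketch (not part of the original module) ---
# A minimal subclass showing how the abstract hooks are expected to fit together
# when run() executes. The class name, URL, brand/source names, and the
# 'last_id' state key are hypothetical placeholders.
#
# class ExampleWordPressScraper(BaseScraper):
#     def fetch_content(self) -> List[Dict[str, Any]]:
#         response = self.make_request('GET', 'https://example.com/wp-json/wp/v2/posts', timeout=30)
#         response.raise_for_status()
#         return [
#             {
#                 'id': post['id'],
#                 'title': post['title']['rendered'],
#                 'url': post['link'],
#                 'description': self.convert_to_markdown(post['content']['rendered']),
#             }
#             for post in response.json()
#         ]
#
#     def get_incremental_items(self, items, state):
#         last_id = state.get('last_id', 0)
#         return [item for item in items if item['id'] > last_id]
#
#     def update_state(self, state, items):
#         state['last_id'] = max(item['id'] for item in items)
#         return state
#
# if __name__ == "__main__":
#     config = ScraperConfig(
#         source_name='wordpress',
#         brand_name='HVACKnowItAll',
#         data_dir=Path('data'),
#         logs_dir=Path('logs'),
#     )
#     ExampleWordPressScraper(config).run()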