- Update base_scraper.py convert_to_markdown() to properly clean HTML
- Remove script/style blocks and their content before conversion
- Strip inline JavaScript event handlers
- Clean up br tags and excessive blank lines
- Fix malformed comparison operators that look like tags
- Add comprehensive HTML cleaning during content extraction (not after)
- Test confirms WordPress content now generates clean markdown without HTML

This ensures all future WordPress scraping produces specification-compliant markdown without any HTML/XML contamination.
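As a rough sketch of the behavior this change targets (the SampleScraper class, config object, and input HTML below are hypothetical illustrations, not taken from the repository):

    scraper = SampleScraper(config)  # any concrete BaseScraper subclass
    raw_html = (
        '<script>trackPageView();</script>'
        '<p onclick="expand()">CO2 readings of <1000 ppm> indicate good ventilation.</p>'
    )
    clean_md = scraper.convert_to_markdown(raw_html, content_type="text/html")
    # The <script> block and the onclick handler are stripped, and "<1000 ppm>" is
    # unwrapped to plain "1000 ppm" rather than surviving as a bogus tag, so the result
    # is roughly: "CO2 readings of 1000 ppm indicate good ventilation."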
base_scraper.py · 483 lines · 19 KiB · Python · No EOL
import json
import logging
import shutil
import hashlib
from abc import ABC, abstractmethod
from dataclasses import dataclass
from datetime import datetime
from logging.handlers import RotatingFileHandler
from pathlib import Path
from typing import Any, Dict, List, Optional
from urllib.parse import urlparse, unquote

import pytz
import requests
from markitdown import MarkItDown
from tenacity import retry, stop_after_attempt, wait_exponential, retry_if_exception_type

@dataclass
class ScraperConfig:
    source_name: str
    brand_name: str
    data_dir: Path
    logs_dir: Path
    timezone: str = "America/Halifax"

class BaseScraper(ABC):
    """Abstract base class for all content scrapers: shared logging, state
    persistence, HTTP retries, user-agent rotation, markdown conversion,
    and file archiving."""

    def __init__(self, config: ScraperConfig):
        self.config = config
        self.state_file = config.data_dir / ".state" / f"{config.source_name}_state.json"
        self.tz = pytz.timezone(config.timezone)
        self.converter = MarkItDown()

        # HTTP Session for connection pooling
        self.session = requests.Session()

        # User agent rotation pool
        self.user_agents = [
            'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/120.0.0.0 Safari/537.36',
            'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/120.0.0.0 Safari/537.36',
            'Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/120.0.0.0 Safari/537.36',
            'Mozilla/5.0 (Windows NT 10.0; Win64; x64; rv:121.0) Gecko/20100101 Firefox/121.0',
            'Mozilla/5.0 (Macintosh; Intel Mac OS X 10.15; rv:121.0) Gecko/20100101 Firefox/121.0',
            'HVAC-KnowItAll-Bot/1.0 (+https://hvacknowitall.com)'  # Fallback bot UA
        ]
        self.current_ua_index = 0

        # Retry configuration from production config
        self.retry_config = {
            "max_attempts": 3,
            "initial_delay": 5,
            "backoff_factor": 2,
            "max_delay": 60
        }

        # Ensure directories exist BEFORE setting up logger
        self.state_file.parent.mkdir(parents=True, exist_ok=True)
        (config.data_dir / "markdown_current").mkdir(parents=True, exist_ok=True)
        (config.data_dir / "markdown_archives" / config.source_name.title()).mkdir(parents=True, exist_ok=True)
        (config.data_dir / "media" / config.source_name.title()).mkdir(parents=True, exist_ok=True)
        (config.logs_dir / config.source_name.title()).mkdir(parents=True, exist_ok=True)

        # Now setup logger after directories exist
        self.logger = self._setup_logger()

        # Set initial user agent (after logger is set up)
        self.rotate_user_agent()

    def _setup_logger(self) -> logging.Logger:
        logger = logging.getLogger(f"{self.config.brand_name}_{self.config.source_name}")
        logger.setLevel(logging.DEBUG)

        # Console handler
        console_handler = logging.StreamHandler()
        console_handler.setLevel(logging.INFO)

        # File handler with rotation
        log_file = self.config.logs_dir / self.config.source_name.title() / f"{self.config.source_name}.log"
        file_handler = RotatingFileHandler(
            log_file,
            maxBytes=10 * 1024 * 1024,  # 10MB
            backupCount=5
        )
        file_handler.setLevel(logging.DEBUG)

        # Formatter
        formatter = logging.Formatter(
            '%(asctime)s - %(name)s - %(levelname)s - %(message)s',
            datefmt='%Y-%m-%d %H:%M:%S'
        )
        console_handler.setFormatter(formatter)
        file_handler.setFormatter(formatter)

        logger.addHandler(console_handler)
        logger.addHandler(file_handler)

        return logger

    def get_retry_decorator(self):
        """Get a configured retry decorator for HTTP requests"""
        return retry(
            stop=stop_after_attempt(self.retry_config["max_attempts"]),
            wait=wait_exponential(
                multiplier=self.retry_config["backoff_factor"],
                min=self.retry_config["initial_delay"],
                max=self.retry_config["max_delay"]
            ),
            retry=retry_if_exception_type((requests.RequestException, ConnectionError, TimeoutError)),
            before_sleep=lambda retry_state: self.logger.warning(
                f"Retry attempt {retry_state.attempt_number} after {retry_state.next_action.sleep} seconds"
            )
        )

    def make_request(self, *args, **kwargs):
        """Make an HTTP request with retry logic, connection pooling, and user agent rotation"""
        # Rotate the user agent on roughly one in five requests to avoid detection
        import random
        if random.randint(1, 5) == 1:
            self.rotate_user_agent()

        @self.get_retry_decorator()
        def _make_request():
            return self.session.request(*args, **kwargs)

        return _make_request()

    def rotate_user_agent(self):
        """Rotate to the next user agent in the pool"""
        self.current_ua_index = (self.current_ua_index + 1) % len(self.user_agents)
        user_agent = self.user_agents[self.current_ua_index]
        self.session.headers.update({'User-Agent': user_agent})
        self.logger.debug(f"Rotated to user agent: {user_agent[:50]}...")

    def load_state(self) -> Dict[str, Any]:
        if not self.state_file.exists():
            self.logger.info(f"No state file found at {self.state_file}, starting fresh")
            return {}

        try:
            with open(self.state_file, 'r') as f:
                state = json.load(f)
            self.logger.debug(f"Loaded state: {state}")
            return state
        except Exception as e:
            self.logger.error(f"Error loading state: {e}")
            return {}

    def save_state(self, state: Dict[str, Any]) -> None:
        try:
            self.state_file.parent.mkdir(parents=True, exist_ok=True)
            with open(self.state_file, 'w') as f:
                json.dump(state, f, indent=2)
            self.logger.debug(f"Saved state: {state}")
        except Exception as e:
            self.logger.error(f"Error saving state: {e}")

    def generate_filename(self) -> str:
        now = datetime.now(self.tz)
        timestamp = now.strftime("%Y-%d-%m-T%H%M%S")
        return f"{self.config.brand_name}_{self.config.source_name}_{timestamp}.md"

    def archive_current_file(self) -> None:
        current_dir = self.config.data_dir / "markdown_current"
        archive_dir = self.config.data_dir / "markdown_archives" / self.config.source_name.title()

        pattern = f"{self.config.brand_name}_{self.config.source_name}_*.md"
        current_files = list(current_dir.glob(pattern))

        for file in current_files:
            archive_path = archive_dir / file.name
            try:
                shutil.move(str(file), str(archive_path))
                self.logger.info(f"Archived {file.name} to {archive_dir}")
            except Exception as e:
                self.logger.error(f"Error archiving {file.name}: {e}")

    def convert_to_markdown(self, content: str, content_type: str = "text/html") -> str:
        """Convert raw HTML (or other content) to clean markdown, stripping
        scripts, styles, and inline event handlers before conversion."""
        try:
            if content_type == "text/html":
                # Use markdownify for HTML conversion - it handles Unicode properly
                from markdownify import markdownify as md
                import re

                # First, clean the HTML content
                # Remove script blocks and their content completely
                content = re.sub(r'<script[^>]*>.*?</script>', '', content, flags=re.DOTALL | re.IGNORECASE)

                # Remove style blocks and their content completely
                content = re.sub(r'<style[^>]*>.*?</style>', '', content, flags=re.DOTALL | re.IGNORECASE)

                # Remove inline JavaScript event handlers
                content = re.sub(r'\s*on\w+\s*=\s*"[^"]*"', '', content, flags=re.IGNORECASE)
                content = re.sub(r"\s*on\w+\s*=\s*'[^']*'", '', content, flags=re.IGNORECASE)

                # Convert HTML to Markdown with sensible defaults
                markdown = md(
                    content,
                    heading_style="ATX",  # Use # for headings
                    bullets="-",  # Use - for bullet points
                    strip=["script", "style", "meta", "link", "noscript"]  # Remove these tags completely
                )

                # Post-process to clean up any remaining issues
                # Remove any remaining HTML tags that shouldn't be in markdown
                markdown = re.sub(r'<br\s*/?>', '\n', markdown, flags=re.IGNORECASE)

                # Clean up excessive blank lines
                markdown = re.sub(r'\n{3,}', '\n\n', markdown)

                # Fix malformed comparison operators that look like tags
                markdown = re.sub(r'<(\d+\s*ppm[^>]*)>', r'\1', markdown)

                return markdown.strip()
            else:
                # For other content types, return as-is
                return content
        except ImportError:
            # Fall back to MarkItDown if markdownify is not available
            try:
                if content_type == "text/html":
                    # Use file-based conversion which handles Unicode better
                    import tempfile
                    import os

                    with tempfile.NamedTemporaryFile(mode='w', encoding='utf-8',
                                                     suffix='.html', delete=False) as f:
                        f.write(content)
                        temp_path = f.name

                    try:
                        result = self.converter.convert(temp_path)
                        return result.text_content if hasattr(result, 'text_content') else str(result)
                    finally:
                        os.unlink(temp_path)
                else:
                    return content
            except Exception as e:
                self.logger.error(f"Error converting to markdown: {e}")
                return content
        except Exception as e:
            self.logger.error(f"Error converting to markdown: {e}")
            # Fall back to returning the content as-is
            return content

    def save_markdown(self, content: str) -> Path:
        self.archive_current_file()

        filename = self.generate_filename()
        filepath = self.config.data_dir / "markdown_current" / filename

        try:
            with open(filepath, 'w', encoding='utf-8') as f:
                f.write(content)
            self.logger.info(f"Saved markdown to {filepath}")
            return filepath
        except Exception as e:
            self.logger.error(f"Error saving markdown: {e}")
            raise

    @abstractmethod
    def fetch_content(self) -> List[Dict[str, Any]]:
        """Fetch all available items from the source and return them as dicts."""
        pass

    def format_markdown(self, items: List[Dict[str, Any]]) -> str:
        """Format items according to specification markdown format."""
        if not items:
            return ""

        formatted_items = []
        for item in items:
            # Use spec-compliant format
            formatted_item = self.format_item_to_spec(item)
            formatted_items.append(formatted_item)

        return "\n\n--------------\n\n".join(formatted_items)

    def format_item_to_spec(self, item: Dict[str, Any]) -> str:
        """Format a single item according to the specification format."""
        lines = []

        # ID (required)
        item_id = item.get('id', item.get('url', 'unknown'))
        lines.append(f"# ID: {item_id}")
        lines.append("")

        # Title (required)
        title = item.get('title', 'Untitled')
        lines.append(f"## Title: {title}")
        lines.append("")

        # Type (required)
        content_type = item.get('type', self.config.source_name)
        lines.append(f"## Type: {content_type}")
        lines.append("")

        # Permalink (required)
        permalink = item.get('url', item.get('link', 'N/A'))
        lines.append(f"## Permalink: {permalink}")
        lines.append("")

        # Description (required)
        description = item.get('description', item.get('content', ''))
        if isinstance(description, list):
            description = ' '.join(description)
        # Clean up description
        description = description.strip() if description else 'No description available'
        lines.append("## Description:")
        lines.append(description)
        lines.append("")

        # Metadata section
        lines.append("## Metadata:")
        lines.append("")

        # Comments
        comments = item.get('comments', item.get('comment_count', 0))
        lines.append(f"### Comments: {comments}")
        lines.append("")

        # Likes
        likes = item.get('likes', item.get('like_count', 0))
        lines.append(f"### Likes: {likes}")
        lines.append("")

        # Tags
        tags = item.get('tags', item.get('categories', []))
        if tags:
            lines.append("### Tags:")
            for tag in tags:
                tag_name = tag if isinstance(tag, str) else tag.get('name', str(tag))
                lines.append(f"- {tag_name}")
        else:
            lines.append("### Tags:")
            lines.append("- No tags")

        # Additional metadata (optional)
        if 'views' in item:
            lines.append("")
            lines.append(f"### Views: {item['views']}")

        if 'publish_date' in item:
            lines.append("")
            lines.append(f"### Published: {item['publish_date']}")

        if 'author' in item:
            lines.append("")
            lines.append(f"### Author: {item['author']}")

        return "\n".join(lines)

    def download_media(self, url: str, item_id: str, media_type: str = "image") -> Optional[str]:
        """Download media file and return local path"""
        if not url:
            return None

        try:
            # Parse URL to get filename
            parsed = urlparse(url)
            original_filename = Path(unquote(parsed.path)).name

            # Generate safe filename
            if not original_filename or '.' not in original_filename:
                # Use hash if no proper filename
                url_hash = hashlib.md5(url.encode()).hexdigest()[:8]
                ext = self._guess_extension(url, media_type)
                filename = f"{item_id}_{url_hash}{ext}"
            else:
                # Clean filename
                filename = self._sanitize_filename(f"{item_id}_{original_filename}")

            # Media directory path
            media_dir = self.config.data_dir / "media" / self.config.source_name.title()
            media_dir.mkdir(parents=True, exist_ok=True)

            file_path = media_dir / filename

            # Skip if already downloaded
            if file_path.exists():
                self.logger.debug(f"Media already exists: {filename}")
                return str(file_path)

            # Download with retry logic
            self.logger.info(f"Downloading media: {url}")
            response = self.make_request('GET', url, stream=True, timeout=30)
            response.raise_for_status()

            # Write file
            with open(file_path, 'wb') as f:
                for chunk in response.iter_content(chunk_size=8192):
                    f.write(chunk)

            self.logger.info(f"Downloaded media: {filename} ({file_path.stat().st_size} bytes)")
            return str(file_path)

        except Exception as e:
            self.logger.warning(f"Failed to download media {url}: {e}")
            return None

    def _sanitize_filename(self, filename: str) -> str:
        """Sanitize filename for filesystem safety"""
        import re
        # Remove or replace problematic characters
        filename = re.sub(r'[<>:"/\\|?*]', '_', filename)
        # Limit length
        name, ext = filename.rsplit('.', 1) if '.' in filename else (filename, '')
        if len(name) > 100:
            name = name[:100]
        return f"{name}.{ext}" if ext else name

    def _guess_extension(self, url: str, media_type: str) -> str:
        """Guess file extension from URL or media type"""
        if 'image' in media_type.lower():
            return '.jpg'
        elif 'video' in media_type.lower():
            return '.mp4'
        elif 'audio' in media_type.lower():
            return '.mp3'
        else:
            # Try to guess from URL
            if any(x in url.lower() for x in ['.jpg', '.jpeg', '.png', '.gif']):
                return '.jpg'
            elif any(x in url.lower() for x in ['.mp4', '.mov', '.avi']):
                return '.mp4'
            elif any(x in url.lower() for x in ['.mp3', '.wav', '.m4a']):
                return '.mp3'
            else:
                return '.bin'  # Generic binary

    @abstractmethod
    def get_incremental_items(self, items: List[Dict[str, Any]], state: Dict[str, Any]) -> List[Dict[str, Any]]:
        """Filter the fetched items down to those not yet processed, based on saved state."""
        pass

    def run(self) -> None:
        try:
            self.logger.info(f"Starting {self.config.source_name} scraper")

            # Load state
            state = self.load_state()

            # Fetch content
            all_items = self.fetch_content()

            # Filter for new items only
            new_items = self.get_incremental_items(all_items, state)

            if not new_items:
                self.logger.info("No new items found")
                return

            self.logger.info(f"Found {len(new_items)} new items")

            # Convert to markdown
            markdown_content = self.format_markdown(new_items)

            # Save markdown
            filepath = self.save_markdown(markdown_content)

            # Update state
            if new_items:
                # Update state with latest item info
                state['last_update'] = datetime.now(self.tz).isoformat()
                state['last_item_count'] = len(new_items)
                # Subclasses should update specific tracking fields
                state = self.update_state(state, new_items)
                self.save_state(state)

            self.logger.info(f"Successfully processed {len(new_items)} items")

        except Exception as e:
            self.logger.error(f"Error in scraper run: {e}")
            raise

    @abstractmethod
    def update_state(self, state: Dict[str, Any], items: List[Dict[str, Any]]) -> Dict[str, Any]:
        """Record tracking fields for the newly processed items and return the updated state."""
        pass
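
# ---------------------------------------------------------------------------
# Illustrative sketch only (not part of base_scraper.py): a minimal concrete
# subclass showing how the abstract hooks fit together. The ExampleBlogScraper
# name, the endpoint URL, and the "last_post_id" state key are hypothetical.
# ---------------------------------------------------------------------------
class ExampleBlogScraper(BaseScraper):
    def fetch_content(self) -> List[Dict[str, Any]]:
        # Fetch posts through the shared session (retries and UA rotation included).
        response = self.make_request("GET", "https://example.com/wp-json/wp/v2/posts", timeout=30)
        response.raise_for_status()
        return [
            {
                "id": post["id"],
                "title": post.get("title", {}).get("rendered", "Untitled"),
                "url": post.get("link", ""),
                "description": self.convert_to_markdown(post.get("content", {}).get("rendered", "")),
                "publish_date": post.get("date", ""),
            }
            for post in response.json()
        ]

    def get_incremental_items(self, items: List[Dict[str, Any]], state: Dict[str, Any]) -> List[Dict[str, Any]]:
        # Keep only items newer than the last id recorded in state.
        last_id = state.get("last_post_id", 0)
        return [item for item in items if item["id"] > last_id]

    def update_state(self, state: Dict[str, Any], items: List[Dict[str, Any]]) -> Dict[str, Any]:
        state["last_post_id"] = max(item["id"] for item in items)
        return state

# Usage sketch: ExampleBlogScraper(ScraperConfig(source_name="blog",
# brand_name="hvacknowitall", data_dir=Path("data"), logs_dir=Path("logs"))).run()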