hvac-kia-content/src/mailchimp_archive_scraper.py
Ben Reed 05218a873b Fix critical production issues and improve spec compliance
Production Readiness Improvements:
- Fixed scheduling to match spec (8 AM & 12 PM ADT instead of 6 AM/6 PM)
- Enabled NAS synchronization in production runner with error handling
- Fixed file naming convention to spec format (hvacknowitall_combined_YYYY-MM-DD-THHMMSS.md)
- Made systemd services portable (removed hardcoded user/paths)
- Added environment variable validation on startup
- Moved DISPLAY/XAUTHORITY to .env configuration
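A minimal sketch of the startup validation and the spec filename format described above; the variable names here (REQUIRED_VARS and the specific env keys) are illustrative assumptions, not the project's actual settings:

import os
from datetime import datetime

# Illustrative only: the real required-variable list lives in the project's config.
REQUIRED_VARS = ["MAILCHIMP_RSS_URL", "DISPLAY", "XAUTHORITY"]

def validate_environment() -> None:
    missing = [name for name in REQUIRED_VARS if not os.getenv(name)]
    if missing:
        raise RuntimeError(f"Missing required environment variables: {', '.join(missing)}")

def combined_filename(now: datetime) -> str:
    # Spec format: hvacknowitall_combined_YYYY-MM-DD-THHMMSS.md
    return f"hvacknowitall_combined_{now.strftime('%Y-%m-%d-T%H%M%S')}.md"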

Systemd Improvements:
- Created template service file (@.service) for any user
- Changed all paths to /opt/hvac-kia-content
- Updated installation script for portable deployment
- Fixed service dependencies and resource limits

Documentation:
- Created comprehensive PRODUCTION_TODO.md with 25 tasks
- Added PRODUCTION_GUIDE.md with deployment instructions
- Documented spec compliance gaps (65% complete)

Remaining work includes retry logic, connection pooling, media downloads,
and a pytest test suite, as documented in PRODUCTION_TODO.md
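
As a rough, hypothetical illustration of where that pytest suite might start (test names and parameter values below are invented; the unbound call works only because _extract_param never touches self):

import pytest
from src.mailchimp_archive_scraper import MailChimpArchiveScraper

@pytest.mark.parametrize("url, param, expected", [
    ("https://us10.campaign-archive.com/home/?u=abc123&id=def456", "u", "abc123"),
    ("https://us10.campaign-archive.com/home/?u=abc123&id=def456", "id", "def456"),
    ("https://example.com/?foo=bar", "u", ""),
])
def test_extract_param(url, param, expected):
    # _extract_param does not use self, so it can be exercised without
    # constructing a full scraper (ScraperConfig wiring is out of scope here).
    assert MailChimpArchiveScraper._extract_param(None, url, param) == expected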

🤖 Generated with [Claude Code](https://claude.ai/code)

Co-Authored-By: Claude <noreply@anthropic.com>
2025-08-18 20:07:55 -03:00


import os
import random
import re
import time
from datetime import datetime
from pathlib import Path
from typing import Any, Dict, List, Optional

import requests
from bs4 import BeautifulSoup

from src.base_scraper import BaseScraper, ScraperConfig


class MailChimpArchiveScraper(BaseScraper):
    """MailChimp campaign archive scraper using web scraping to access historical content."""

    def __init__(self, config: ScraperConfig):
        super().__init__(config)

        # Extract user and list IDs from the RSS URL
        rss_url = os.getenv('MAILCHIMP_RSS_URL', '')
        self.user_id = self._extract_param(rss_url, 'u')
        self.list_id = self._extract_param(rss_url, 'id')

        if not self.user_id or not self.list_id:
            self.logger.error("Could not extract user ID and list ID from MAILCHIMP_RSS_URL")

        # Archive base URL
        self.archive_base = f"https://us10.campaign-archive.com/home/?u={self.user_id}&id={self.list_id}"

        # Session for persistent connections
        self.session = requests.Session()
        self.session.headers.update({
            'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/91.0.4472.124 Safari/537.36'
        })

    def _extract_param(self, url: str, param: str) -> str:
        """Extract parameter value from URL."""
        match = re.search(f'{param}=([^&]+)', url)
        return match.group(1) if match else ''

    def _human_delay(self, min_seconds: float = 1, max_seconds: float = 3) -> None:
        """Add human-like delays between requests."""
        delay = random.uniform(min_seconds, max_seconds)
        self.logger.debug(f"Waiting {delay:.2f} seconds...")
        time.sleep(delay)

    def fetch_archive_pages(self, max_pages: int = 50) -> List[str]:
        """Fetch campaign archive pages and extract individual campaign URLs."""
        campaign_urls = []
        page = 1

        try:
            while page <= max_pages:
                # MailChimp archive pagination (if it exists)
                if page == 1:
                    url = self.archive_base
                else:
                    # Try common pagination patterns
                    url = f"{self.archive_base}&page={page}"

                self.logger.info(f"Fetching archive page {page}: {url}")
                response = self.session.get(url, timeout=30)
                response.raise_for_status()

                soup = BeautifulSoup(response.content, 'html.parser')

                # Look for campaign links in various formats
                campaign_links = []

                # Method 1: Look for direct campaign links
                for link in soup.find_all('a', href=True):
                    href = link['href']
                    if 'campaign-archive.com' in href and '&e=' in href:
                        if href not in campaign_links:
                            campaign_links.append(href)

                # Method 2: Look for JavaScript-embedded campaign IDs
                scripts = soup.find_all('script')
                for script in scripts:
                    if script.string:
                        # Look for campaign IDs in JavaScript
                        campaign_ids = re.findall(r'id["\']?\s*:\s*["\']([a-f0-9]+)["\']', script.string)
                        for campaign_id in campaign_ids:
                            campaign_url = f"https://us10.campaign-archive.com/?u={self.user_id}&id={campaign_id}"
                            if campaign_url not in campaign_links:
                                campaign_links.append(campaign_url)

                if not campaign_links:
                    self.logger.info(f"No more campaigns found on page {page}, stopping")
                    break

                campaign_urls.extend(campaign_links)
                self.logger.info(f"Found {len(campaign_links)} campaigns on page {page}")

                # Check for pagination indicators
                has_next = soup.find('a', string=re.compile(r'next|more|older', re.I))
                if not has_next and page > 1:
                    self.logger.info("No more pages found")
                    break

                page += 1
                self._human_delay(2, 5)  # Be respectful to MailChimp

        except Exception as e:
            self.logger.error(f"Error fetching archive pages: {e}")
        # Remove duplicates (set() does not preserve order)
        unique_urls = list(set(campaign_urls))
        self.logger.info(f"Found {len(unique_urls)} unique campaign URLs")
        return unique_urls

    def fetch_campaign_content(self, campaign_url: str) -> Optional[Dict[str, Any]]:
        """Fetch content from a single campaign URL."""
        try:
            self.logger.debug(f"Fetching campaign: {campaign_url}")

            response = self.session.get(campaign_url, timeout=30)
            response.raise_for_status()

            soup = BeautifulSoup(response.content, 'html.parser')

            # Extract campaign data
            campaign_data = {
                'id': self._extract_campaign_id(campaign_url),
                'title': self._extract_title(soup),
                'date': self._extract_date(soup),
                'content': self._extract_content(soup),
                'link': campaign_url
            }

            return campaign_data

        except Exception as e:
            self.logger.error(f"Error fetching campaign {campaign_url}: {e}")
            return None

    def _extract_campaign_id(self, url: str) -> str:
        """Extract campaign ID from URL."""
        match = re.search(r'id=([a-f0-9]+)', url)
        return match.group(1) if match else ''

    def _extract_title(self, soup: BeautifulSoup) -> str:
        """Extract campaign title."""
        # Try multiple selectors for title
        title_selectors = ['title', 'h1', '.mcnTextContent h1', '.headerContainer h1']

        for selector in title_selectors:
            element = soup.select_one(selector)
            if element and element.get_text(strip=True):
                title = element.get_text(strip=True)
                # Clean up common MailChimp title artifacts
                title = re.sub(r'\s*\|\s*HVAC Know It All.*$', '', title)
                return title

        return "Untitled Campaign"

    def _extract_date(self, soup: BeautifulSoup) -> str:
        """Extract campaign send date."""
        # Date formats seen on archive pages, paired with the strptime
        # format needed to standardize them
        date_patterns = [
            (r'(\w+ \d{1,2}, \d{4})', '%B %d, %Y'),    # January 15, 2023
            (r'(\d{1,2}/\d{1,2}/\d{4})', '%m/%d/%Y'),  # 1/15/2023
            (r'(\d{4}-\d{2}-\d{2})', '%Y-%m-%d'),      # 2023-01-15
        ]

        # Search in text content
        text = soup.get_text()
        for pattern, date_format in date_patterns:
            match = re.search(pattern, text)
            if match:
                date_str = match.group(1)
                try:
                    # Standardize to ISO format when the date parses cleanly
                    return datetime.strptime(date_str, date_format).date().isoformat()
                except ValueError:
                    # Keep the raw matched string if it does not parse exactly
                    return date_str

        # Fallback to current date if no date found
        return datetime.now(self.tz).isoformat()

    def _extract_content(self, soup: BeautifulSoup) -> str:
        """Extract campaign content."""
        # Remove script and style elements
        for script in soup(["script", "style"]):
            script.decompose()

        # Try to find the main content area
        content_selectors = [
            '.mcnTextContent',
            '.bodyContainer',
            '.templateContainer',
            '#templateBody',
            'body'
        ]

        for selector in content_selectors:
            content_elem = soup.select_one(selector)
            if content_elem:
                # Convert to markdown-like format
                content = self.convert_to_markdown(str(content_elem))
                if content and len(content.strip()) > 100:  # Reasonable content length
                    return content

        # Fallback to all text
        return soup.get_text(separator='\n', strip=True)

    def fetch_content(self, max_campaigns: int = 100) -> List[Dict[str, Any]]:
        """Fetch historical MailChimp campaigns."""
        campaigns_data = []

        try:
            self.logger.info(f"Starting MailChimp archive scraping for {max_campaigns} campaigns")

            # Get campaign URLs from archive pages
            campaign_urls = self.fetch_archive_pages(max_pages=20)

            if not campaign_urls:
                self.logger.warning("No campaign URLs found")
                return campaigns_data

            # Limit to requested number
            campaign_urls = campaign_urls[:max_campaigns]

            # Fetch content from each campaign
            for i, url in enumerate(campaign_urls):
                campaign_data = self.fetch_campaign_content(url)
                if campaign_data:
                    campaigns_data.append(campaign_data)

                if (i + 1) % 10 == 0:
                    self.logger.info(f"Processed {i + 1}/{len(campaign_urls)} campaigns")

                # Rate limiting
                self._human_delay(1, 3)

            self.logger.info(f"Successfully fetched {len(campaigns_data)} campaigns")

        except Exception as e:
            self.logger.error(f"Error in fetch_content: {e}")

        return campaigns_data
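
    # For reference, format_markdown() below emits one section per campaign shaped
    # roughly like this (values are illustrative, not real campaign data):
    #
    #   # ID: 0123abcdef
    #
    #   ## Title: Weekly HVAC Tips
    #
    #   ## Date: January 15, 2023
    #
    #   ## Link: https://us10.campaign-archive.com/?u=...&id=...
    #
    #   ## Content:
    #   ...campaign body converted to markdown, truncated at 5000 characters...
    #
    #   --------------------------------------------------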

    def format_markdown(self, items: List[Dict[str, Any]]) -> str:
        """Format MailChimp campaigns as markdown."""
        markdown_sections = []

        for item in items:
            section = []

            # ID
            section.append(f"# ID: {item.get('id', 'N/A')}")
            section.append("")

            # Title
            section.append(f"## Title: {item.get('title', 'Untitled')}")
            section.append("")

            # Date
            section.append(f"## Date: {item.get('date', '')}")
            section.append("")

            # Link
            section.append(f"## Link: {item.get('link', '')}")
            section.append("")

            # Content
            section.append("## Content:")
            content = item.get('content', '')
            if content:
                # Limit content length for readability
                if len(content) > 5000:
                    content = content[:5000] + "..."
                section.append(content)
            section.append("")

            # Separator
            section.append("-" * 50)
            section.append("")

            markdown_sections.append('\n'.join(section))

        return '\n'.join(markdown_sections)

    def get_incremental_items(self, items: List[Dict[str, Any]], state: Dict[str, Any]) -> List[Dict[str, Any]]:
        """Get only new campaigns since last sync."""
        if not state:
            return items

        last_campaign_id = state.get('last_campaign_id')
        if not last_campaign_id:
            return items

        # Filter for campaigns newer than the last synced
        new_items = []
        for item in items:
            if item.get('id') == last_campaign_id:
                break  # Found the last synced campaign
            new_items.append(item)

        return new_items

    def update_state(self, state: Dict[str, Any], items: List[Dict[str, Any]]) -> Dict[str, Any]:
        """Update state with latest campaign information."""
        if not items:
            return state

        # Get the first item (most recent)
        latest_item = items[0]

        state['last_campaign_id'] = latest_item.get('id')
        state['last_campaign_date'] = latest_item.get('date')
        state['last_sync'] = datetime.now(self.tz).isoformat()
        state['campaign_count'] = len(items)

        return state
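

# Usage illustration: a minimal sketch of the incremental-sync flow. The campaign
# dicts below are fabricated examples shaped like fetch_campaign_content() output;
# get_incremental_items() is called unbound here only because it does not touch
# instance state (a real run would construct the scraper with a ScraperConfig,
# which is out of scope for this sketch).
if __name__ == "__main__":
    sample_campaigns = [
        {'id': 'c3', 'title': 'Newest issue', 'date': '2025-08-10', 'content': 'example', 'link': 'https://example.com/c3'},
        {'id': 'c2', 'title': 'Middle issue', 'date': '2025-08-03', 'content': 'example', 'link': 'https://example.com/c2'},
        {'id': 'c1', 'title': 'Oldest issue', 'date': '2025-07-27', 'content': 'example', 'link': 'https://example.com/c1'},
    ]
    saved_state = {'last_campaign_id': 'c2'}

    # Campaigns listed before the last synced ID are treated as new.
    new_items = MailChimpArchiveScraper.get_incremental_items(None, sample_campaigns, saved_state)
    print([item['id'] for item in new_items])  # expected output: ['c3']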