hvac-kia-content/test_wordpress_clean.py
Ben Reed 8b83185130 Fix HTML/XML contamination in WordPress markdown extraction
- Update base_scraper.py convert_to_markdown() to properly clean HTML
- Remove script/style blocks and their content before conversion
- Strip inline JavaScript event handlers
- Clean up br tags and excessive blank lines
- Fix malformed comparison operators that look like tags
- Add comprehensive HTML cleaning during content extraction (not after)
- Test confirms WordPress content now generates clean markdown without HTML

This ensures all future WordPress scraping produces specification-compliant
markdown without any HTML/XML contamination.
2025-08-18 23:11:08 -03:00

73 lines
No EOL
2 KiB
Python

#!/usr/bin/env python3
"""Test WordPress scraper HTML cleaning"""
import re
import sys
from pathlib import Path

sys.path.insert(0, str(Path(__file__).parent))

from src.base_scraper import ScraperConfig
from src.wordpress_scraper import WordPressScraper
# Directories handed to the scraper config; the markdown sample is also
# written under DATA_DIR.
DATA_DIR = Path("test_data/wordpress_clean")
LOGS_DIR = Path("test_logs/wordpress_clean")

# Anything shaped like <...> counts as a leftover tag, except angle-bracket
# autolinks such as <https://example.com>, which are valid markdown.
HTML_TAG_RE = re.compile(r'<(?!https?://)[^>]+>')

# Regex fragments whose presence suggests inline JavaScript leaked through.
JS_PATTERNS = (
    r'document\.',
    r'function\s*\(',
    r'gtag\(',
    r'addEventListener',
)


def find_html_tags(markdown):
    """Return every tag-like ``<...>`` span left in *markdown*, in order.

    Autolinks of the form ``<http://...>`` / ``<https://...>`` are ignored.
    """
    return HTML_TAG_RE.findall(markdown)


def find_js_patterns(markdown):
    """Return the entries of ``JS_PATTERNS`` that match anywhere in *markdown*.

    The result keeps the order of ``JS_PATTERNS``; it is empty when none match.
    """
    return [pattern for pattern in JS_PATTERNS if re.search(pattern, markdown)]


def main():
    """Fetch one WordPress post and report HTML/JavaScript contamination.

    Builds the markdown for a single fetched post, prints the result of the
    tag and JavaScript checks, saves the markdown to ``DATA_DIR/sample.md``
    and prints a 500-character preview.

    Returns:
        0 when a post was fetched and both checks came back clean,
        1 when no post was fetched or any contamination was found.
    """
    config = ScraperConfig(
        source_name="wordpress",
        brand_name="hvacknowitall",
        data_dir=DATA_DIR,
        logs_dir=LOGS_DIR,
        timezone="America/Halifax",
    )
    scraper = WordPressScraper(config)

    # A single post is enough to exercise the cleaning path.
    print("Fetching 1 WordPress post to test HTML cleaning...")
    posts = scraper.fetch_content(max_items=1)
    if not posts:
        print("❌ No posts fetched")
        return 1

    print(f"✅ Fetched {len(posts)} post")
    markdown = scraper.format_markdown(posts)

    # Check for HTML contamination.
    html_tags = find_html_tags(markdown)
    print("\nHTML tag check:")
    if html_tags:
        print(f" ⚠️ Found {len(html_tags)} HTML tags:")
        # Cap the listing so a badly contaminated post does not flood stdout.
        for tag in html_tags[:10]:
            print(f" - {tag}")
    else:
        print(" ✅ No HTML tags found - content is clean!")

    # Check for JavaScript.
    js_hits = find_js_patterns(markdown)
    for pattern in js_hits:
        print(f" ⚠️ Found JavaScript pattern: {pattern}")
    if not js_hits:
        print(" ✅ No JavaScript found - content is clean!")

    # Save the sample even when checks fail so the output can be inspected.
    output_file = DATA_DIR / "sample.md"
    output_file.parent.mkdir(parents=True, exist_ok=True)
    output_file.write_text(markdown, encoding='utf-8')
    print(f"\n📄 Sample saved to: {output_file}")

    print("\n📝 Content preview (first 500 chars):")
    print("-" * 60)
    print(markdown[:500])

    return 1 if (html_tags or js_hits) else 0


# Guarded so that importing this module (e.g. pytest collecting test_*.py
# files) does not trigger a live fetch or write files.
if __name__ == "__main__":
    sys.exit(main())