- Update base_scraper.py convert_to_markdown() to properly clean HTML
- Remove script/style blocks and their content before conversion
- Strip inline JavaScript event handlers
- Clean up br tags and excessive blank lines
- Fix malformed comparison operators that look like tags
- Add comprehensive HTML cleaning during content extraction (not after)
- Test confirms WordPress content now generates clean markdown without HTML

This ensures all future WordPress scraping produces specification-compliant markdown without any HTML/XML contamination.
73 lines
No EOL
2 KiB
Python
73 lines
No EOL
2 KiB
Python
#!/usr/bin/env python3
"""Test WordPress scraper HTML cleaning.

Fetches WordPress content through ``WordPressScraper``, renders it to
markdown, and scans the result for leftover HTML tags and JavaScript
fragments. A sample of the markdown is written to disk and a short preview
is printed.

All work happens in :func:`main`, which only runs when the file is executed
as a script, so importing this module performs no fetch and does not touch
``sys.path``. The process exits with status 1 when nothing was fetched or
when contamination was detected, and 0 otherwise.
"""

import re
import sys
from pathlib import Path
from typing import List

# Directory handed to the scraper config and used for the saved sample.
DATA_DIR = Path("test_data/wordpress_clean")

# Anything shaped like <...>, except markdown autolinks such as
# <https://example.com>, which are legitimate markdown.
HTML_TAG_RE = re.compile(r'<(?!https?://)[^>]+>')

# Heuristic fingerprints of JavaScript that leaked into the output.
# ``document`` must be followed by a member name so that ordinary prose
# ending a sentence with "document." is not reported as JavaScript.
JS_PATTERNS = (
    r'\bdocument\.[A-Za-z_$]',
    r'function\s*\(',
    r'gtag\(',
    r'addEventListener',
)


def find_html_tags(markdown: str) -> List[str]:
    """Return every tag-like ``<...>`` span found in *markdown*.

    Autolinks starting with ``http://`` or ``https://`` are ignored.
    """
    return HTML_TAG_RE.findall(markdown)


def find_js_patterns(markdown: str) -> List[str]:
    """Return the entries of ``JS_PATTERNS`` that match *markdown*, in order."""
    return [pattern for pattern in JS_PATTERNS if re.search(pattern, markdown)]


def main() -> int:
    """Run the cleaning check and return a process exit status.

    Returns:
        0 when content was fetched and neither HTML tags nor JavaScript
        patterns were found; 1 when nothing was fetched or contamination
        was detected.
    """
    # Make the ``src`` package next to this file importable.
    sys.path.insert(0, str(Path(__file__).parent))

    # Imported here because they rely on the sys.path entry added above.
    from src.base_scraper import ScraperConfig
    from src.wordpress_scraper import WordPressScraper

    # Create test config
    config = ScraperConfig(
        source_name="wordpress",
        brand_name="hvacknowitall",
        data_dir=DATA_DIR,
        logs_dir=Path("test_logs/wordpress_clean"),
        timezone="America/Halifax",
    )

    # Initialize scraper
    scraper = WordPressScraper(config)

    # Request a single item to keep the check small.
    print("Fetching 1 WordPress post to test HTML cleaning...")
    posts = scraper.fetch_content(max_items=1)

    if not posts:
        print("❌ No posts fetched")
        return 1

    print(f"✅ Fetched {len(posts)} post")

    # Generate markdown
    markdown = scraper.format_markdown(posts)

    # Check for HTML contamination
    html_tags = find_html_tags(markdown)

    print("\nHTML tag check:")
    if html_tags:
        print(f" ⚠️ Found {len(html_tags)} HTML tags:")
        # Only the first few are listed to keep the report readable.
        for tag in html_tags[:10]:
            print(f" - {tag}")
    else:
        print(" ✅ No HTML tags found - content is clean!")

    # Check for JavaScript
    js_hits = find_js_patterns(markdown)
    for pattern in js_hits:
        print(f" ⚠️ Found JavaScript pattern: {pattern}")
    if not js_hits:
        print(" ✅ No JavaScript found - content is clean!")

    # Save sample
    output_file = DATA_DIR / "sample.md"
    output_file.parent.mkdir(parents=True, exist_ok=True)
    output_file.write_text(markdown, encoding='utf-8')
    print(f"\n📄 Sample saved to: {output_file}")

    # Show preview
    print("\n📝 Content preview (first 500 chars):")
    print("-" * 60)
    print(markdown[:500])

    # Report contamination through the exit status so callers can detect it.
    return 1 if (html_tags or js_hits) else 0


if __name__ == "__main__":
    sys.exit(main())