hvac-kia-content/clean_markdown.py
Ben Reed 8b83185130 Fix HTML/XML contamination in WordPress markdown extraction
- Update base_scraper.py convert_to_markdown() to properly clean HTML
- Remove script/style blocks and their content before conversion
- Strip inline JavaScript event handlers
- Clean up br tags and excessive blank lines
- Fix malformed comparison operators that look like tags
- Add comprehensive HTML cleaning during content extraction (not after)
- Test confirms WordPress content now generates clean markdown without HTML

This ensures all future WordPress scraping produces specification-compliant
markdown without any HTML/XML contamination.
2025-08-18 23:11:08 -03:00

135 lines
No EOL
4.1 KiB
Python

#!/usr/bin/env python3
"""
Clean HTML/XML contamination from markdown files
"""
import re
from pathlib import Path
import sys
# Line-start patterns that mark a line as leftover JavaScript.
_JS_LINE_PATTERNS = tuple(
    re.compile(pattern)
    for pattern in (
        r'^\s*document\.',
        r'^\s*var\s+\w+\s*=',
        r'^\s*function\s*\(',
        r'^\s*if\s*\(typeof',
        r'^\s*gtag\(',
        r'^\s*}\);?\s*$',
        r'^\s*{\s*$',
        r'^\s*}\s*$',
    )
)


def _strip_js_lines(lines: list[str]) -> list[str]:
    """Return *lines* without JavaScript residue.

    A line matching one of ``_JS_LINE_PATTERNS`` is dropped. If that line
    leaves a brace open, every following line is dropped as well until the
    braces balance again. A one-line statement (or a stray closing brace)
    therefore removes only itself instead of swallowing the rest of the
    document.
    """
    kept: list[str] = []
    depth = 0  # number of currently unclosed '{' inside a JS block
    for line in lines:
        if depth > 0:
            # Inside a JS block: drop the line and keep tracking braces.
            depth = max(depth + line.count('{') - line.count('}'), 0)
            continue
        if any(pattern.match(line) for pattern in _JS_LINE_PATTERNS):
            # Start of JS residue; only enter block mode if a brace is open.
            depth = max(line.count('{') - line.count('}'), 0)
            continue
        kept.append(line)
    return kept


def clean_html_from_markdown(content: str) -> str:
    """Remove HTML tags and JavaScript from markdown content.

    Processing order:

    1. ``<script>`` and ``<style>`` blocks are removed with their content.
    2. ``<br>`` / ``<br />`` become newlines.
    3. Pseudo-tags such as ``<50 ppm>`` are unwrapped to their text.
    4. Every remaining ``<...>`` span is removed, except autolinks of the
       form ``<http://...>`` / ``<https://...>``.
    5. Lines that look like leftover JavaScript are dropped.
    6. Runs of three or more newlines collapse to one blank line.

    Args:
        content: Markdown text that may contain HTML/JS contamination.

    Returns:
        The cleaned markdown text.
    """
    # Remove script blocks and their content
    content = re.sub(r'<script[^>]*>.*?</script>', '', content,
                     flags=re.DOTALL | re.IGNORECASE)

    # Remove style blocks and their content
    content = re.sub(r'<style[^>]*>.*?</style>', '', content,
                     flags=re.DOTALL | re.IGNORECASE)

    # Convert <br /> and <br> tags to markdown line breaks
    content = re.sub(r'<br\s*/?>', '\n', content, flags=re.IGNORECASE)

    # Fix malformed comparison operators that look like tags. This has to run
    # BEFORE the generic tag removal below, otherwise the generic pattern
    # deletes the whole span and there is nothing left to unwrap.
    content = re.sub(r'<(\d+\s*ppm[^>]*)>', r'\1', content)

    # Remove any remaining HTML tags (but preserve URLs in angle brackets).
    # NOTE: [^>]+ also spans newlines, so prose such as "a < b ... c > d"
    # is treated as a tag and removed as well.
    content = re.sub(r'<(?!https?://)[^>]+>', '', content)

    # Clean up JavaScript code that might remain after tag removal
    content = '\n'.join(_strip_js_lines(content.split('\n')))

    # Clean up excessive blank lines
    content = re.sub(r'\n{3,}', '\n\n', content)

    return content
def process_markdown_file(file_path: Path) -> tuple[int, int]:
    """Clean one markdown file in place, keeping the original as a backup.

    The file is read as UTF-8. If it contains no ``<...>`` spans (other than
    ``<http(s)://...>`` autolinks) it is left untouched. Otherwise the
    original is moved to ``<name>.md.backup`` and the cleaned text is written
    to the original path.

    Args:
        file_path: Path of the markdown file to process.

    Returns:
        ``(original_html_count, cleaned_html_count)``: the number of tag-like
        spans before and after cleaning. ``(0, 0)`` when nothing was found.

    Raises:
        OSError: If the file cannot be read, backed up, or rewritten. When
            rewriting fails, the original file is moved back into place
            before the error propagates.
    """
    # Same pattern the cleaner uses: any <...> span except <http(s)://...>.
    tag_pattern = re.compile(r'<(?!https?://)[^>]+>')

    content = file_path.read_text(encoding='utf-8')

    # Count HTML tags before cleaning
    original_html = len(tag_pattern.findall(content))
    if original_html == 0:
        return 0, 0

    print(f"Cleaning {file_path.name}: {original_html} HTML tags found")

    # Clean the content
    cleaned_content = clean_html_from_markdown(content)

    # Count HTML tags after cleaning
    cleaned_html = len(tag_pattern.findall(cleaned_content))

    # Save cleaned version. replace() overwrites a stale backup on every
    # platform, whereas rename() raises FileExistsError on Windows when the
    # target already exists.
    backup_path = file_path.with_suffix('.md.backup')
    file_path.replace(backup_path)
    try:
        file_path.write_text(cleaned_content, encoding='utf-8')
    except OSError:
        # Do not leave the directory without the markdown file.
        backup_path.replace(file_path)
        raise

    print(f" → Cleaned! Backup saved as {backup_path.name}")
    print(f" → Remaining HTML tags: {cleaned_html}")

    return original_html, cleaned_html
# Directory scanned when main() is called without an argument.
DEFAULT_MARKDOWN_DIR = Path("data_production_backlog/markdown_current")


def main(markdown_dir: Path = DEFAULT_MARKDOWN_DIR) -> bool:
    """Clean all markdown files in a directory and print a summary.

    Every ``*.md`` file directly inside *markdown_dir* is passed to
    ``process_markdown_file``. Files are handled in sorted order so the
    output is reproducible between runs.

    Args:
        markdown_dir: Directory to scan. Defaults to the production backlog
            directory ``data_production_backlog/markdown_current``.

    Returns:
        ``False`` if *markdown_dir* is not an existing directory, otherwise
        ``True`` once all files have been processed.
    """
    markdown_dir = Path(markdown_dir)
    if not markdown_dir.is_dir():
        print(f"Error: Directory {markdown_dir} not found")
        return False

    print("🧹 Cleaning HTML contamination from markdown files")
    print("=" * 60)

    total_original = 0
    total_cleaned = 0
    files_processed = 0

    # glob("*.md") only yields names ending in ".md", so backup files
    # ("*.md.backup") never show up here and need no explicit filter.
    for md_file in sorted(markdown_dir.glob("*.md")):
        original, cleaned = process_markdown_file(md_file)
        if original > 0:
            total_original += original
            total_cleaned += cleaned
            files_processed += 1

    print()
    print("=" * 60)
    print("✅ Cleaning complete!")
    print(f" Files processed: {files_processed}")
    print(f" HTML tags removed: {total_original - total_cleaned}")
    print(f" Remaining tags: {total_cleaned}")

    return True
if __name__ == "__main__":
    # Map main()'s result to a process exit status: 0 on success, 1 otherwise.
    exit_code = 0 if main() else 1
    sys.exit(exit_code)