- Update base_scraper.py convert_to_markdown() to properly clean HTML
- Remove script/style blocks and their content before conversion
- Strip inline JavaScript event handlers
- Clean up br tags and excessive blank lines
- Fix malformed comparison operators that look like tags
- Add comprehensive HTML cleaning during content extraction (not after)
- Test confirms WordPress content now generates clean markdown without HTML

This ensures all future WordPress scraping produces specification-compliant markdown without any HTML/XML contamination.
135 lines
No EOL
4.1 KiB
Python
135 lines
No EOL
4.1 KiB
Python
#!/usr/bin/env python3
|
|
"""
|
|
Clean HTML/XML contamination from markdown files
|
|
"""
|
|
|
|
import re
|
|
from pathlib import Path
|
|
import sys
|
|
|
|
# Line-start signatures of JavaScript residue that can survive tag stripping.
# Compiled once at import time instead of being rebuilt for every line.
_JS_LINE_RE = re.compile(
    r'^\s*(?:'
    r'document\.'
    r'|var\s+\w+\s*='
    r'|function\s*\('
    r'|if\s*\(typeof'
    r'|gtag\('
    r'|\}\);?\s*$'
    r'|\{\s*$'
    r'|\}\s*$'
    r')'
)


def _strip_js_lines(content: str) -> str:
    """Drop lines that look like leftover JavaScript.

    A line matching ``_JS_LINE_RE`` is always dropped. If it opens more braces
    than it closes, the following lines are dropped as well until the brace
    depth returns to zero. A single-line statement or a stray closing brace
    therefore removes only itself instead of swallowing the rest of the
    document.

    NOTE: braces are counted textually, so braces inside JavaScript string
    literals can skew the depth. This is a heuristic, not a parser.
    """
    kept: list[str] = []
    depth = 0
    for line in content.split('\n'):
        if depth > 0:
            # Inside a multi-line JS construct: discard and keep counting.
            depth = max(depth + line.count('{') - line.count('}'), 0)
            continue
        if _JS_LINE_RE.match(line):
            depth = max(line.count('{') - line.count('}'), 0)
            continue
        kept.append(line)
    return '\n'.join(kept)


def clean_html_from_markdown(content: str) -> str:
    """Remove HTML tags and JavaScript residue from markdown content.

    Steps, in order:

    1. Delete ``<script>`` and ``<style>`` elements together with their bodies.
    2. Turn ``<br>`` / ``<br />`` into newlines.
    3. Unwrap comparison-like spans such as ``<50 ppm ...>`` so their text is
       kept. This must run before step 4, which would otherwise delete the
       whole span.
    4. Delete every remaining ``<...>`` span except autolinks that start with
       ``<http://`` or ``<https://``.
    5. Drop lines that look like leftover JavaScript.
    6. Collapse runs of three or more newlines into a single blank line.

    Args:
        content: Markdown text that may contain HTML/JavaScript contamination.

    Returns:
        The cleaned text.
    """
    # Script and style blocks: remove the element and everything inside it.
    content = re.sub(r'<script[^>]*>.*?</script>', '', content, flags=re.DOTALL | re.IGNORECASE)
    content = re.sub(r'<style[^>]*>.*?</style>', '', content, flags=re.DOTALL | re.IGNORECASE)

    # <br> and <br /> become plain line breaks.
    content = re.sub(r'<br\s*/?>', '\n', content, flags=re.IGNORECASE)

    # Malformed comparison operators that look like tags (e.g. "<50 ppm>"):
    # keep the inner text. Done before generic tag removal so it can match.
    content = re.sub(r'<(\d+\s*ppm[^>]*)>', r'\1', content)

    # Any remaining tag-like span, but preserve URLs in angle brackets.
    # NOTE: this also removes non-tag text of the form "< ... >".
    content = re.sub(r'<(?!https?://)[^>]+>', '', content)

    content = _strip_js_lines(content)

    # Clean up excessive blank lines.
    return re.sub(r'\n{3,}', '\n\n', content)
|
|
|
|
|
|
def _write_backup(file_path: Path) -> Path:
    """Copy ``file_path`` byte-for-byte to the first unused backup name.

    The first candidate is ``file_path.with_suffix('.md.backup')``. If that
    name is already taken, ``.1``, ``.2``, ... are appended until a free name
    is found. The file is opened in exclusive-create mode so an existing
    backup is never overwritten.
    """
    base = file_path.with_suffix('.md.backup')
    candidate = base
    attempt = 0
    while True:
        try:
            with candidate.open('xb') as backup_file:
                backup_file.write(file_path.read_bytes())
            return candidate
        except FileExistsError:
            attempt += 1
            candidate = base.with_name(f"{base.name}.{attempt}")


def process_markdown_file(file_path: Path) -> tuple[int, int]:
    """Clean one markdown file in place, keeping a backup of the original.

    Files with no tag-like ``<...>`` spans are left untouched. Otherwise the
    original bytes are copied to a backup first, and only then is the cleaned
    text written over ``file_path``. The original path therefore never goes
    missing, and an earlier backup is never clobbered.

    Args:
        file_path: Path of the markdown file to process.

    Returns:
        ``(original_html_count, cleaned_html_count)``: the number of tag-like
        spans before and after cleaning. ``(0, 0)`` means nothing was written.
    """
    # Same pattern the cleaner uses: any <...> span except <http(s)://...>.
    tag_pattern = r'<(?!https?://)[^>]+>'

    content = file_path.read_text(encoding='utf-8')

    original_html = len(re.findall(tag_pattern, content))
    if original_html == 0:
        return 0, 0

    print(f"Cleaning {file_path.name}: {original_html} HTML tags found")

    cleaned_content = clean_html_from_markdown(content)
    cleaned_html = len(re.findall(tag_pattern, cleaned_content))

    # Back up before overwriting so a failed write cannot lose the original.
    backup_path = _write_backup(file_path)
    file_path.write_text(cleaned_content, encoding='utf-8')

    print(f" → Cleaned! Backup saved as {backup_path.name}")
    print(f" → Remaining HTML tags: {cleaned_html}")

    return original_html, cleaned_html
|
|
|
|
|
|
def main() -> bool:
    """Clean all markdown files in the production backlog directory.

    Runs ``process_markdown_file`` on every ``*.md`` file directly inside
    ``data_production_backlog/markdown_current`` and prints a summary.

    Returns:
        ``False`` if the directory is missing (or is not a directory),
        ``True`` otherwise.
    """
    markdown_dir = Path("data_production_backlog/markdown_current")

    if not markdown_dir.is_dir():
        print(f"Error: Directory {markdown_dir} not found")
        return False

    print("🧹 Cleaning HTML contamination from markdown files")
    print("=" * 60)

    total_original = 0
    total_cleaned = 0
    files_processed = 0

    # Snapshot and sort the listing up front: processing creates backup files
    # in this same directory, and a fixed order keeps the log reproducible.
    for md_file in sorted(markdown_dir.glob("*.md")):
        # "*.md" can also match a directory name; only regular files are read.
        if not md_file.is_file():
            continue

        original, cleaned = process_markdown_file(md_file)
        if original > 0:
            total_original += original
            total_cleaned += cleaned
            files_processed += 1
            print()

    print("=" * 60)
    print("✅ Cleaning complete!")
    print(f" Files processed: {files_processed}")
    print(f" HTML tags removed: {total_original - total_cleaned}")
    print(f" Remaining tags: {total_cleaned}")

    return True
|
|
|
|
|
|
if __name__ == "__main__":
    # Translate main()'s boolean result into a process exit status:
    # 0 when it returned a truthy value, 1 otherwise.
    exit_code = 0 if main() else 1
    sys.exit(exit_code)