# hvac-kia-content/clean_markdown.py

#!/usr/bin/env python3
"""
Clean HTML/XML contamination from markdown files
"""

import re
from pathlib import Path
import sys

# Lines that start a leftover JavaScript statement (what remains after the
# surrounding <script> tag was stripped without its body).
_JS_STATEMENT_RE = re.compile(
    r'^\s*(?:document\.|var\s+\w+\s*=|function\s*\(|if\s*\(typeof|gtag\()'
)

# Lines that consist solely of a brace or a callback terminator.
_JS_BRACE_ONLY_RE = re.compile(r'^\s*(?:\}\);?|\{|\})\s*$')


def _strip_js_residue(content: str) -> str:
    """Drop leftover JavaScript lines, tracking brace depth for multi-line blocks.

    A line is treated as JavaScript when it matches one of the statement
    prefixes or is a lone brace / ``});``. Such a line is always removed.
    If it opens more braces than it closes, every following line is removed
    too until the braces balance again. A single-line statement or a stray
    closing brace therefore removes only itself instead of swallowing the
    text that follows it.

    Brace counting is a heuristic (braces inside string literals are counted),
    and a block whose braces never balance is removed through end of input.
    """
    kept = []
    depth = 0  # number of currently unclosed '{' inside a JS block

    for line in content.split('\n'):
        brace_delta = line.count('{') - line.count('}')

        if depth > 0:
            # Inside a JS block: discard the line and update nesting depth.
            depth = max(0, depth + brace_delta)
            continue

        if _JS_STATEMENT_RE.match(line) or _JS_BRACE_ONLY_RE.match(line):
            # Only stay in "block" mode if this line leaves braces open.
            depth = max(0, brace_delta)
            continue

        kept.append(line)

    return '\n'.join(kept)


def clean_html_from_markdown(content: str) -> str:
    """Remove HTML tags and JavaScript residue from markdown content.

    Steps, in order:
      1. Delete ``<script>`` and ``<style>`` elements including their bodies.
      2. Turn ``<br>`` / ``<br />`` into newlines.
      3. Unwrap measurements such as ``<50 ppm ...>`` that merely look like
         tags, keeping the text between the angle brackets.
      4. Delete every remaining ``<...>`` span except autolinks that start
         with ``http://`` or ``https://``.
      5. Remove leftover JavaScript lines (see ``_strip_js_residue``).
      6. Collapse runs of three or more newlines to a single blank line.

    Args:
        content: Markdown text that may contain HTML/JS contamination.

    Returns:
        The cleaned markdown text.
    """
    # Remove script blocks and their content
    content = re.sub(r'<script[^>]*>.*?</script>', '', content, flags=re.DOTALL | re.IGNORECASE)

    # Remove style blocks and their content
    content = re.sub(r'<style[^>]*>.*?</style>', '', content, flags=re.DOTALL | re.IGNORECASE)

    # Convert <br /> and <br> tags to markdown line breaks
    content = re.sub(r'<br\s*/?>', '\n', content, flags=re.IGNORECASE)

    # Unwrap "<50 ppm ...>" style text that only looks like a tag. This has
    # to happen BEFORE the generic tag removal below, otherwise the generic
    # pattern deletes the measurement and there is nothing left to unwrap.
    content = re.sub(r'<(\d+\s*ppm[^>]*)>', r'\1', content)

    # Remove any remaining HTML tags (but preserve URLs in angle brackets
    # such as <https://...>)
    content = re.sub(r'<(?!https?://)[^>]+>', '', content)

    # Clean up JavaScript code that might remain outside of script tags
    content = _strip_js_residue(content)

    # Clean up excessive blank lines
    content = re.sub(r'\n{3,}', '\n\n', content)

    return content


def process_markdown_file(file_path: Path) -> tuple[int, int]:
    """Clean one markdown file in place, keeping a backup of the original.

    The file is left untouched (and no backup is made) when it contains no
    tag-like ``<...>`` spans. Otherwise the original bytes are copied to
    ``<name>.backup`` next to the file and the cleaned text is written back
    to ``file_path``. A backup that already exists is never overwritten, so
    re-running the script cannot replace the pristine original with an
    already-cleaned version.

    Args:
        file_path: Path of the markdown file to process.

    Returns:
        ``(original_html_count, cleaned_html_count)`` - the number of
        tag-like spans before and after cleaning; ``(0, 0)`` when nothing
        needed cleaning.
    """
    # Tag-like spans, excluding autolinks such as <https://...>
    tag_pattern = r'<(?!https?://)[^>]+>'

    content = file_path.read_text(encoding='utf-8')

    # Count HTML tags before cleaning
    original_html = len(re.findall(tag_pattern, content))

    if original_html == 0:
        return 0, 0

    print(f"Cleaning {file_path.name}: {original_html} HTML tags found")

    # Clean the content
    cleaned_content = clean_html_from_markdown(content)

    # Count HTML tags after cleaning
    cleaned_html = len(re.findall(tag_pattern, cleaned_content))

    # Back up first, then overwrite in place. Copying (instead of renaming
    # the source away) means the file never disappears from its path if the
    # write below fails, and an existing backup is preserved.
    backup_path = file_path.with_name(file_path.name + '.backup')
    if backup_path.exists():
        backup_note = f"Existing backup kept as {backup_path.name}"
    else:
        backup_path.write_bytes(file_path.read_bytes())
        backup_note = f"Backup saved as {backup_path.name}"

    file_path.write_text(cleaned_content, encoding='utf-8')

    print(f"  → Cleaned! {backup_note}")
    print(f"  → Remaining HTML tags: {cleaned_html}")

    return original_html, cleaned_html


def main():
    """Run the cleaner over every ``*.md`` file in the backlog directory.

    Prints a per-file report (via ``process_markdown_file``) followed by a
    summary. Returns ``False`` when the directory is missing, ``True``
    otherwise.
    """
    source_dir = Path("data_production_backlog/markdown_current")

    # Guard clause: nothing to do without the input directory.
    if not source_dir.exists():
        print(f"Error: Directory {source_dir} not found")
        return False

    divider = "=" * 60
    print("🧹 Cleaning HTML contamination from markdown files")
    print(divider)

    tags_before = 0
    tags_after = 0
    touched = 0

    for candidate in source_dir.glob("*.md"):
        # Never process backup copies.
        if candidate.suffix == '.backup':
            continue

        found, left = process_markdown_file(candidate)
        if found <= 0:
            # File had no tags; it was not modified and is not counted.
            continue

        tags_before += found
        tags_after += left
        touched += 1
        print()

    removed = tags_before - tags_after
    print(divider)
    print("✅ Cleaning complete!")
    print(f"   Files processed: {touched}")
    print(f"   HTML tags removed: {removed}")
    print(f"   Remaining tags: {tags_after}")

    return True


if __name__ == "__main__":
    # Map main()'s truthy/falsy result onto a conventional process exit code.
    exit_code = 0 if main() else 1
    sys.exit(exit_code)