#!/usr/bin/env python3
"""
Clean HTML/XML contamination from markdown files
"""

import re
from pathlib import Path
import sys

def clean_html_from_markdown(content: str) -> str:
    """Strip HTML tags and leftover JavaScript from markdown text.

    The cleaning steps run in this order:

    1. ``<script>`` and ``<style>`` elements are removed with their content.
    2. ``<br>`` / ``<br />`` tags become newlines.
    3. Pseudo-tags such as ``<5 ppm ...>`` are unwrapped (angle brackets
       dropped, inner text kept).
    4. Every remaining ``<...>`` span is removed, except autolinks that
       start with ``http://`` or ``https://``.
    5. Lines that look like JavaScript are dropped; when such a line leaves
       a ``{`` open, following lines are dropped too until the braces
       balance again.
    6. Runs of three or more newlines collapse to a single blank line.

    Args:
        content: The markdown text to clean.

    Returns:
        The cleaned text.
    """
    # Remove script and style elements together with everything inside them.
    content = re.sub(r'<script[^>]*>.*?</script>', '', content, flags=re.DOTALL | re.IGNORECASE)
    content = re.sub(r'<style[^>]*>.*?</style>', '', content, flags=re.DOTALL | re.IGNORECASE)

    # Convert <br /> and <br> tags to line breaks.
    content = re.sub(r'<br\s*/?>', '\n', content, flags=re.IGNORECASE)

    # Unwrap measurements that merely look like tags. This has to happen
    # BEFORE the generic tag removal below, otherwise that step deletes the
    # whole span and there is nothing left to unwrap.
    content = re.sub(r'<(\d+\s*ppm[^>]*)>', r'\1', content)

    # Remove any remaining tags but keep URLs written as <https://...>.
    content = re.sub(r'<(?!https?://)[^>]+>', '', content)

    # One compiled alternation of the line-start patterns treated as
    # JavaScript residue; built once instead of once per line.
    js_line = re.compile(
        r'^\s*(?:'
        r'document\.'
        r'|var\s+\w+\s*='
        r'|function\s*\('
        r'|if\s*\(typeof'
        r'|gtag\('
        r'|\}\);?\s*$'
        r'|\{\s*$'
        r'|\}\s*$'
        r')'
    )

    kept_lines = []
    open_braces = 0  # > 0 while inside a multi-line JavaScript block

    for line in content.split('\n'):
        brace_delta = line.count('{') - line.count('}')

        if open_braces > 0:
            # Inside a block: drop the line and update the nesting depth so
            # the block ends exactly when its braces are balanced.
            open_braces = max(open_braces + brace_delta, 0)
            continue

        if js_line.match(line):
            # Drop the line itself. Only start swallowing following lines
            # when this one actually leaves a brace open; a one-line
            # statement or a stray closing brace must not consume the rest
            # of the document.
            open_braces = max(brace_delta, 0)
            continue

        kept_lines.append(line)

    content = '\n'.join(kept_lines)

    # Clean up excessive blank lines.
    content = re.sub(r'\n{3,}', '\n\n', content)

    return content


def process_markdown_file(file_path: Path) -> tuple[int, int]:
    """Clean one markdown file in place, keeping a backup of the original.

    The file is read as UTF-8 and its tag-like ``<...>`` spans (excluding
    ``<http://...>`` / ``<https://...>`` autolinks) are counted. If there
    are none, the file is left untouched. Otherwise the content is passed
    through ``clean_html_from_markdown`` and written back, and the original
    is preserved next to it as ``<name>.backup``.

    Args:
        file_path: Path of the markdown file to process.

    Returns:
        ``(original_html_count, cleaned_html_count)``: the number of
        tag-like spans before and after cleaning. ``(0, 0)`` when the file
        contained none and was not modified.
    """
    tag_pattern = r'<(?!https?://)[^>]+>'

    content = file_path.read_text(encoding='utf-8')

    # Count HTML tags before cleaning; nothing to do for clean files.
    original_html = len(re.findall(tag_pattern, content))
    if original_html == 0:
        return 0, 0

    print(f"Cleaning {file_path.name}: {original_html} HTML tags found")

    cleaned_content = clean_html_from_markdown(content)
    cleaned_html = len(re.findall(tag_pattern, cleaned_content))

    # Append ".backup" to the full file name so the backup name is correct
    # whatever the file's own suffix is (foo.md -> foo.md.backup).
    backup_path = file_path.with_name(file_path.name + '.backup')

    if backup_path.exists():
        # A backup from an earlier run holds the pristine original; renaming
        # over it would replace it with already-cleaned content, so keep it
        # and overwrite the working file directly.
        file_path.write_text(cleaned_content, encoding='utf-8')
        print(f"  → Cleaned! Existing backup {backup_path.name} kept")
    else:
        file_path.rename(backup_path)
        file_path.write_text(cleaned_content, encoding='utf-8')
        print(f"  → Cleaned! Backup saved as {backup_path.name}")

    print(f"  → Remaining HTML tags: {cleaned_html}")

    return original_html, cleaned_html


def main() -> bool:
    """Clean all markdown files in the production backlog directory.

    Every ``*.md`` file directly inside
    ``data_production_backlog/markdown_current`` is handed to
    ``process_markdown_file``, and a summary of the totals is printed.

    Returns:
        ``False`` when the directory is missing (or is not a directory),
        ``True`` after all files have been processed.
    """
    markdown_dir = Path("data_production_backlog/markdown_current")

    # is_dir() rather than exists(): a regular file at this path would
    # otherwise pass the check and report success with nothing processed.
    if not markdown_dir.is_dir():
        print(f"Error: Directory {markdown_dir} not found")
        return False

    print("🧹 Cleaning HTML contamination from markdown files")
    print("=" * 60)

    total_original = 0
    total_cleaned = 0
    files_processed = 0

    # sorted() fixes the processing order so the log is reproducible.
    # "*.md" never matches the "<name>.md.backup" files written during
    # processing, so no separate backup filter is needed.
    for md_file in sorted(markdown_dir.glob("*.md")):
        if not md_file.is_file():
            # A directory whose name ends in .md cannot be read as text.
            continue

        original, cleaned = process_markdown_file(md_file)
        if original > 0:
            total_original += original
            total_cleaned += cleaned
            files_processed += 1
            print()

    print("=" * 60)
    print("✅ Cleaning complete!")
    print(f"   Files processed: {files_processed}")
    print(f"   HTML tags removed: {total_original - total_cleaned}")
    print(f"   Remaining tags: {total_cleaned}")

    return True


if __name__ == "__main__":
    # Translate main()'s boolean result into a process exit status:
    # 0 when it reports success, 1 otherwise.
    exit_code = 0 if main() else 1
    sys.exit(exit_code)