#!/usr/bin/env python3
"""Clean HTML/XML contamination from markdown files.

Running this script strips HTML tags, ``<script>``/``<style>`` blocks and
leftover JavaScript lines from every ``*.md`` file in the production backlog
directory, keeping the untouched original next to it as ``<name>.md.backup``.
"""

import re
import sys
from pathlib import Path

# <script>...</script> and <style>...</style> blocks, including their content.
_SCRIPT_BLOCK_RE = re.compile(
    r'<script\b[^>]*>.*?</script\s*>', re.DOTALL | re.IGNORECASE
)
_STYLE_BLOCK_RE = re.compile(
    r'<style\b[^>]*>.*?</style\s*>', re.DOTALL | re.IGNORECASE
)

# <br>, <br/>, <p ...> and </p>; the (?:\s[^>]*)? part keeps <pre> from matching.
_LINE_BREAK_RE = re.compile(r'<br\s*/?>|</?p(?:\s[^>]*)?>', re.IGNORECASE)

# Measurements such as "<5 ppm>" that only look like tags.
_PPM_PSEUDO_TAG_RE = re.compile(r'<(\d+\s*ppm[^>]*)>')

# Any remaining tag, but not autolinks such as <https://example.com>.
_HTML_TAG_RE = re.compile(r'<(?!https?://)[^>]+>')

_EXCESS_BLANK_LINES_RE = re.compile(r'\n{3,}')

# Line shapes that are treated as leftover JavaScript.
_JS_LINE_PATTERNS = (
    r'^\s*document\.',
    r'^\s*var\s+\w+\s*=',
    r'^\s*function\s*\(',
    r'^\s*if\s*\(typeof',
    r'^\s*gtag\(',
    r'^\s*}\);?\s*$',
    r'^\s*{\s*$',
    r'^\s*}\s*$',
)
_JS_LINE_RE = re.compile('|'.join(f'(?:{p})' for p in _JS_LINE_PATTERNS))

# Lines that terminate a multi-line JavaScript block.
_JS_BLOCK_CLOSERS = ('}', '});')


def _strip_js_lines(content: str) -> str:
    """Drop lines that look like JavaScript left behind after tag removal."""
    kept_lines = []
    in_js_block = False

    for line in content.split('\n'):
        stripped = line.strip()

        if in_js_block:
            # Everything inside a block is discarded; a lone closer ends it.
            if stripped in _JS_BLOCK_CLOSERS:
                in_js_block = False
            continue

        if _JS_LINE_RE.match(line):
            # Only start swallowing following lines when this line leaves a
            # brace open. A single-line statement or a stray closer is dropped
            # on its own, so it cannot consume the rest of the document.
            if (stripped not in _JS_BLOCK_CLOSERS
                    and line.count('{') > line.count('}')):
                in_js_block = True
            continue

        kept_lines.append(line)

    return '\n'.join(kept_lines)


def clean_html_from_markdown(content: str) -> str:
    """Remove HTML tags and JavaScript from markdown content.

    Args:
        content: Raw markdown text, possibly polluted with HTML and JavaScript.

    Returns:
        The text with script/style blocks, HTML tags and JavaScript-looking
        lines removed. ``<br>`` and ``<p>`` tags become line breaks, autolinks
        such as ``<https://example.com>`` are preserved, and runs of three or
        more newlines are collapsed to two.
    """
    # Remove script and style blocks together with their content.
    content = _SCRIPT_BLOCK_RE.sub('', content)
    content = _STYLE_BLOCK_RE.sub('', content)

    # Convert <br> and <p> tags to markdown line breaks.
    content = _LINE_BREAK_RE.sub('\n', content)

    # Unwrap measurements like "<5 ppm>" that merely look like tags. This has
    # to happen before the generic tag strip, which would delete them whole.
    content = _PPM_PSEUDO_TAG_RE.sub(r'\1', content)

    # Remove any remaining HTML tags, but keep URLs in angle brackets
    # such as <https://example.com>.
    content = _HTML_TAG_RE.sub('', content)

    # Clean up JavaScript code that might remain once the tags are gone.
    content = _strip_js_lines(content)

    # Clean up excessive blank lines.
    return _EXCESS_BLANK_LINES_RE.sub('\n\n', content)


def process_markdown_file(file_path: Path) -> tuple[int, int]:
    """Clean one markdown file in place, keeping a backup of the original.

    Args:
        file_path: Path of the markdown file to clean.

    Returns:
        ``(original_html_count, cleaned_html_count)``: the number of HTML tags
        found before and after cleaning. ``(0, 0)`` is returned, and the file
        is left untouched, when no tags are found.
    """
    content = file_path.read_text(encoding='utf-8')

    # Count HTML tags before cleaning.
    original_html = len(_HTML_TAG_RE.findall(content))
    if original_html == 0:
        return 0, 0

    print(f"Cleaning {file_path.name}: {original_html} HTML tags found")

    cleaned_content = clean_html_from_markdown(content)

    # Count HTML tags after cleaning.
    cleaned_html = len(_HTML_TAG_RE.findall(cleaned_content))

    # Move the original aside (foo.md -> foo.md.backup), then write the
    # cleaned text under the original name.
    backup_path = file_path.with_suffix('.md.backup')
    file_path.rename(backup_path)
    file_path.write_text(cleaned_content, encoding='utf-8')

    print(f"  → Cleaned! Backup saved as {backup_path.name}")
    print(f"  → Remaining HTML tags: {cleaned_html}")

    return original_html, cleaned_html


def main():
    """Clean all markdown files in the production backlog directory.

    Returns:
        ``True`` when the directory was processed, ``False`` when it does not
        exist.
    """
    markdown_dir = Path("data_production_backlog/markdown_current")

    if not markdown_dir.exists():
        print(f"Error: Directory {markdown_dir} not found")
        return False

    print("🧹 Cleaning HTML contamination from markdown files")
    print("=" * 60)

    total_original = 0
    total_cleaned = 0
    files_processed = 0

    # Take a snapshot of the listing first: processing renames and creates
    # files in this same directory, which must not disturb the iteration.
    # "*.md" never matches the "*.md.backup" files, so no extra filter is
    # needed.
    for md_file in sorted(markdown_dir.glob("*.md")):
        original, cleaned = process_markdown_file(md_file)
        if original > 0:
            total_original += original
            total_cleaned += cleaned
            files_processed += 1
            print()

    print("=" * 60)
    print("✅ Cleaning complete!")
    print(f"   Files processed: {files_processed}")
    print(f"   HTML tags removed: {total_original - total_cleaned}")
    print(f"   Remaining tags: {total_cleaned}")

    return True


if __name__ == "__main__":
    success = main()
    sys.exit(0 if success else 1)