#!/usr/bin/env python3
"""Test WordPress scraper HTML cleaning"""

import sys
from pathlib import Path
sys.path.insert(0, str(Path(__file__).parent))

from src.base_scraper import ScraperConfig
from src.wordpress_scraper import WordPressScraper

# Manual smoke check: pull a single post through the WordPress scraper,
# render it to markdown, and report any leftover HTML tags or JavaScript
# fragments. Results are printed; a copy of the markdown is written to disk.

# Scraper configuration pointing at throwaway test directories
config = ScraperConfig(
    source_name="wordpress",
    brand_name="hvacknowitall",
    data_dir=Path("test_data/wordpress_clean"),
    logs_dir=Path("test_logs/wordpress_clean"),
    timezone="America/Halifax",
)

# Scraper instance under test
scraper = WordPressScraper(config)

# One post is requested to exercise the cleaning path
print("Fetching 1 WordPress post to test HTML cleaning...")
posts = scraper.fetch_content(max_items=1)

if not posts:
    # Nothing came back, so there is nothing to inspect
    print("❌ No posts fetched")
else:
    import re

    print(f"✅ Fetched {len(posts)} post")

    # Render the fetched posts to markdown
    markdown = scraper.format_markdown(posts)

    # --- HTML contamination check ---
    # Matches any <...> span except those beginning with http:// or https://
    # (presumably so markdown autolinks are not reported as tags).
    tag_regex = r'<(?!https?://)[^>]+>'
    html_tags = re.findall(tag_regex, markdown)

    print("\nHTML tag check:")
    if not html_tags:
        print("  ✅ No HTML tags found - content is clean!")
    else:
        print(f"  ⚠️ Found {len(html_tags)} HTML tags:")
        # Only the first ten offenders are listed to keep the output short
        preview_tags = html_tags[:10]
        for leftover in preview_tags:
            print(f"    - {leftover}")

    # --- JavaScript contamination check ---
    # Regex fragments that indicate script code made it into the markdown
    js_patterns = [
        r'document\.',
        r'function\s*\(',
        r'gtag\(',
        r'addEventListener',
    ]

    # Collect every pattern that matches, preserving list order for reporting
    matched_js = [expr for expr in js_patterns if re.search(expr, markdown)]
    for expr in matched_js:
        print(f"  ⚠️ Found JavaScript pattern: {expr}")
    js_found = bool(matched_js)

    if not js_found:
        print("  ✅ No JavaScript found - content is clean!")

    # --- Persist a sample for manual inspection ---
    output_file = Path("test_data/wordpress_clean/sample.md")
    sample_dir = output_file.parent
    sample_dir.mkdir(parents=True, exist_ok=True)
    output_file.write_text(markdown, encoding='utf-8')
    print(f"\n📄 Sample saved to: {output_file}")

    # --- Short on-screen preview ---
    divider = "-" * 60
    head = markdown[:500]
    print("\n📝 Content preview (first 500 chars):")
    print(divider)
    print(head)