#!/usr/bin/env python3
"""Test WordPress scraper HTML cleaning

Manual smoke test: fetch one WordPress post through the project scraper,
render it to markdown, report any leftover HTML tags or JavaScript fragments,
save the markdown as a sample file and print a short preview.

All side effects (sys.path change, network fetch, file write) happen inside
``main()``, so importing this module -- for example during test collection --
does nothing beyond defining the helpers below.
"""

import re
import sys
from pathlib import Path

# Matches anything written as <...>, except angle-bracket autolinks that
# start with http:// or https:// (those are excluded by the lookahead).
HTML_TAG_RE = re.compile(r'<(?!https?://)[^>]+>')

# Regex sources searched for in the markdown; a hit is reported as leaked
# JavaScript. Order here is the order in which hits are reported.
JS_PATTERNS = (
    r'document\.',
    r'function\s*\(',
    r'gtag\(',
    r'addEventListener',
)

# Single source of truth for the scraper data dir and the saved sample.
OUTPUT_DIR = Path("test_data/wordpress_clean")


def find_html_tags(markdown: str) -> list:
    """Return every HTML-looking tag found in *markdown*, in order of appearance.

    Autolinks of the form ``<http://...>`` / ``<https://...>`` are not
    counted as tags.
    """
    return HTML_TAG_RE.findall(markdown)


def find_js_patterns(markdown: str) -> list:
    """Return the entries of ``JS_PATTERNS`` that match somewhere in *markdown*.

    The result keeps the order of ``JS_PATTERNS``; an empty list means no
    JavaScript marker was found.
    """
    return [pattern for pattern in JS_PATTERNS if re.search(pattern, markdown)]


def main() -> None:
    """Fetch one post, check its markdown for HTML/JS residue, save a sample."""
    # The project package "src" is imported relative to this script's own
    # directory, so that directory must be on sys.path before the imports.
    sys.path.insert(0, str(Path(__file__).parent))

    from src.base_scraper import ScraperConfig
    from src.wordpress_scraper import WordPressScraper

    # Create test config
    config = ScraperConfig(
        source_name="wordpress",
        brand_name="hvacknowitall",
        data_dir=OUTPUT_DIR,
        logs_dir=Path("test_logs/wordpress_clean"),
        timezone="America/Halifax",
    )

    # Initialize scraper
    scraper = WordPressScraper(config)

    # Fetch just 1 post to test
    print("Fetching 1 WordPress post to test HTML cleaning...")
    posts = scraper.fetch_content(max_items=1)

    if not posts:
        print("❌ No posts fetched")
        return

    print(f"✅ Fetched {len(posts)} post")

    # Generate markdown
    markdown = scraper.format_markdown(posts)

    # Check for HTML contamination
    html_tags = find_html_tags(markdown)
    print("\nHTML tag check:")
    if html_tags:
        print(f" ⚠️ Found {len(html_tags)} HTML tags:")
        # Cap the listing at ten tags so a badly cleaned post does not
        # flood the terminal.
        for tag in html_tags[:10]:
            print(f" - {tag}")
    else:
        print(" ✅ No HTML tags found - content is clean!")

    # Check for JavaScript
    js_hits = find_js_patterns(markdown)
    for pattern in js_hits:
        print(f" ⚠️ Found JavaScript pattern: {pattern}")
    if not js_hits:
        print(" ✅ No JavaScript found - content is clean!")

    # Save sample
    output_file = OUTPUT_DIR / "sample.md"
    output_file.parent.mkdir(parents=True, exist_ok=True)
    output_file.write_text(markdown, encoding='utf-8')
    print(f"\n📄 Sample saved to: {output_file}")

    # Show preview
    print("\n📝 Content preview (first 500 chars):")
    print("-" * 60)
    print(markdown[:500])


if __name__ == "__main__":
    main()