#!/usr/bin/env python3
"""
Debug WordPress content to see what's causing the conversion failure.
"""

import os
import sys
import json
from pathlib import Path

from dotenv import load_dotenv

# Add src to path
sys.path.insert(0, str(Path(__file__).parent))

from src.base_scraper import ScraperConfig
from src.wordpress_scraper import WordPressScraper


def debug_wordpress():
    """Debug WordPress content fetching."""
    load_dotenv()

    config = ScraperConfig(
        source_name="wordpress",
        brand_name="hvacknowitall",
        data_dir=Path("test_data"),
        logs_dir=Path("test_logs"),
        timezone="America/Halifax"
    )

    scraper = WordPressScraper(config)

    print("Fetching WordPress posts...")
    posts = scraper.fetch_content()

    if posts:
        print(f"\nFetched {len(posts)} posts")

        # Look at first post
        first_post = posts[0]
        print("\nFirst post details:")
        print(f"  Title: {first_post.get('title', 'N/A')}")
        print(f"  Date: {first_post.get('date', 'N/A')}")
        print(f"  Link: {first_post.get('link', 'N/A')}")

        # Check content field
        content = first_post.get('content', '')
        print(f"\nContent length: {len(content)} characters")
        print(f"Content type: {type(content)}")

        # Check for problematic characters
        print("\nChecking for problematic bytes...")

        if content:
            # Show first 500 chars
            print("\nFirst 500 characters of content:")
            print("-" * 50)
            print(content[:500])
            print("-" * 50)

            # Look for non-ASCII characters
            non_ascii_positions = []
            for i, char in enumerate(content[:1000]):  # Check first 1000 chars
                if ord(char) > 127:
                    non_ascii_positions.append((i, char, hex(ord(char))))

            if non_ascii_positions:
                print(f"\nFound {len(non_ascii_positions)} non-ASCII characters in first 1000 chars:")
                for pos, char, hex_val in non_ascii_positions[:10]:  # Show first 10
                    print(f"  Position {pos}: '{char}' ({hex_val})")

            # Try to identify the encoding
            print("\nTrying different encodings...")

            if isinstance(content, str):
                # It's already a string, let's see if we can encode it
                try:
                    utf8_bytes = content.encode('utf-8')
                    print(f"✅ UTF-8 encoding works: {len(utf8_bytes)} bytes")
                except UnicodeEncodeError as e:
                    print(f"❌ UTF-8 encoding failed: {e}")

                try:
                    ascii_bytes = content.encode('ascii')
                    print(f"✅ ASCII encoding works: {len(ascii_bytes)} bytes")
                except UnicodeEncodeError as e:
                    print(f"❌ ASCII encoding failed: {e}")
                    # Show the specific problem character
                    problem_pos = e.start
                    problem_char = content[problem_pos]
                    context = content[max(0, problem_pos - 20):min(len(content), problem_pos + 20)]
                    print(f"  Problem at position {problem_pos}: '{problem_char}' (U+{ord(problem_char):04X})")
                    print(f"  Context: ...{context}...")

            # Save raw content for inspection
            debug_file = Path("test_data/wordpress_raw_content.html")
            debug_file.parent.mkdir(exist_ok=True)
            with open(debug_file, 'w', encoding='utf-8') as f:
                f.write(content)
            print(f"\nSaved raw content to {debug_file}")

            # Try the conversion directly
            print("\nTrying MarkItDown conversion...")
            try:
                from markitdown import MarkItDown
                import io

                converter = MarkItDown()

                # Method 1: Direct string
                try:
                    stream = io.BytesIO(content.encode('utf-8'))
                    result = converter.convert_stream(stream)
                    print("✅ Direct UTF-8 conversion succeeded")
                    print(f"  Result type: {type(result)}")
                    print(f"  Has text_content: {hasattr(result, 'text_content')}")
                except Exception as e:
                    print(f"❌ Direct UTF-8 conversion failed: {e}")

                # Method 2: With error handling
                try:
                    stream = io.BytesIO(content.encode('utf-8', errors='ignore'))
                    result = converter.convert_stream(stream)
                    print("✅ UTF-8 with 'ignore' errors succeeded")
                except Exception as e:
                    print(f"❌ UTF-8 with 'ignore' failed: {e}")

                # Method 3: Latin-1 encoding
                try:
                    stream = io.BytesIO(content.encode('latin-1', errors='ignore'))
                    result = converter.convert_stream(stream)
                    print("✅ Latin-1 conversion succeeded")
                except Exception as e:
                    print(f"❌ Latin-1 conversion failed: {e}")

            except ImportError:
                print("❌ MarkItDown not available")
    else:
        print("No posts fetched")


if __name__ == "__main__":
    debug_wordpress()
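
# Usage sketch: the sys.path insert above resolves imports relative to this
# file's directory, so the `src/` package must sit alongside this script, and
# load_dotenv() means any settings WordPressScraper reads from the environment
# should live in a local `.env` file (the exact variable names are not defined
# in this script). A typical invocation, with an assumed filename:
#
#     python debug_wordpress.py
#
# Output artifacts land under test_data/ and test_logs/ per ScraperConfig above.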