Production Readiness Improvements:
- Fixed scheduling to match spec (8 AM & 12 PM ADT instead of 6 AM/6 PM)
- Enabled NAS synchronization in production runner with error handling
- Fixed file naming convention to spec format (hvacknowitall_combined_YYYY-MM-DD-THHMMSS.md; see the sketch below)
- Made systemd services portable (removed hardcoded user/paths)
- Added environment variable validation on startup (sketched below)
- Moved DISPLAY/XAUTHORITY to .env configuration

Systemd Improvements:
- Created template service file (@.service) for any user
- Changed all paths to /opt/hvac-kia-content
- Updated installation script for portable deployment
- Fixed service dependencies and resource limits

Documentation:
- Created comprehensive PRODUCTION_TODO.md with 25 tasks
- Added PRODUCTION_GUIDE.md with deployment instructions
- Documented spec compliance gaps (65% complete)

Remaining work includes retry logic, connection pooling, media downloads, and the pytest test suite, as documented in PRODUCTION_TODO.md (a retry sketch follows the debug script below).

🤖 Generated with [Claude Code](https://claude.ai/code)

Co-Authored-By: Claude <noreply@anthropic.com>
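The spec filename pattern above is concrete enough to sketch. A minimal example, assuming the timestamp is taken in America/Halifax (the timezone the scraper is configured with below) and using `combined_filename` as a hypothetical helper name:

```python
# Hypothetical helper for the spec naming convention; only the
# hvacknowitall_combined_YYYY-MM-DD-THHMMSS.md pattern comes from the spec.
from datetime import datetime
from zoneinfo import ZoneInfo


def combined_filename(brand: str = "hvacknowitall") -> str:
    """Build the spec-format filename with an Atlantic-time stamp."""
    now = datetime.now(ZoneInfo("America/Halifax"))  # ADT in summer, matching the schedule
    return f"{brand}_combined_{now.strftime('%Y-%m-%d-T%H%M%S')}.md"


# e.g. "hvacknowitall_combined_2025-08-15-T080000.md" for an 8 AM run
```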
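The startup validation of environment variables could look like the following sketch; the required-key list is a guess, since the commit only names DISPLAY and XAUTHORITY as having moved into .env:

```python
# Illustrative startup check; REQUIRED_VARS is an assumed key list.
import os
import sys

from dotenv import load_dotenv

REQUIRED_VARS = ["DISPLAY", "XAUTHORITY"]  # plus whatever keys the scrapers need


def validate_env() -> None:
    """Exit early with a clear message if any required variable is unset."""
    load_dotenv()
    missing = [name for name in REQUIRED_VARS if not os.getenv(name)]
    if missing:
        sys.exit(f"Missing required environment variables: {', '.join(missing)}")
```

The debug script below is the tool behind the WordPress conversion-failure investigation.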
#!/usr/bin/env python3
"""Debug WordPress content to see what's causing the conversion failure."""

import sys
from pathlib import Path

from dotenv import load_dotenv

# Add the script's directory to sys.path so the src package resolves
sys.path.insert(0, str(Path(__file__).parent))

from src.base_scraper import ScraperConfig
from src.wordpress_scraper import WordPressScraper


def debug_wordpress():
    """Debug WordPress content fetching."""
    load_dotenv()

    config = ScraperConfig(
        source_name="wordpress",
        brand_name="hvacknowitall",
        data_dir=Path("test_data"),
        logs_dir=Path("test_logs"),
        timezone="America/Halifax",
    )

    scraper = WordPressScraper(config)

    print("Fetching WordPress posts...")
    posts = scraper.fetch_content()

    if not posts:
        print("No posts fetched")
        return

    print(f"\nFetched {len(posts)} posts")

    # Look at the first post
    first_post = posts[0]
    print("\nFirst post details:")
    print(f"  Title: {first_post.get('title', 'N/A')}")
    print(f"  Date: {first_post.get('date', 'N/A')}")
    print(f"  Link: {first_post.get('link', 'N/A')}")

    # Check the content field
    content = first_post.get('content', '')
    print(f"\nContent length: {len(content)} characters")
    print(f"Content type: {type(content)}")

    # Check for problematic characters
    print("\nChecking for problematic bytes...")
    if not content:
        return

    # Show the first 500 characters
    print("\nFirst 500 characters of content:")
    print("-" * 50)
    print(content[:500])
    print("-" * 50)

    # Look for non-ASCII characters
    non_ascii_positions = []
    for i, char in enumerate(content[:1000]):  # Check first 1000 chars
        if ord(char) > 127:
            non_ascii_positions.append((i, char, hex(ord(char))))

    if non_ascii_positions:
        print(f"\nFound {len(non_ascii_positions)} non-ASCII characters in first 1000 chars:")
        for pos, char, hex_val in non_ascii_positions[:10]:  # Show first 10
            print(f"  Position {pos}: '{char}' ({hex_val})")

    # Try to identify the encoding
    print("\nTrying different encodings...")
    if isinstance(content, str):
        # It's already a string; see whether it round-trips through common codecs
        try:
            utf8_bytes = content.encode('utf-8')
            print(f"✅ UTF-8 encoding works: {len(utf8_bytes)} bytes")
        except UnicodeEncodeError as e:
            print(f"❌ UTF-8 encoding failed: {e}")

        try:
            ascii_bytes = content.encode('ascii')
            print(f"✅ ASCII encoding works: {len(ascii_bytes)} bytes")
        except UnicodeEncodeError as e:
            print(f"❌ ASCII encoding failed: {e}")
            # Show the specific problem character with surrounding context
            problem_pos = e.start
            problem_char = content[problem_pos]
            context = content[max(0, problem_pos - 20):min(len(content), problem_pos + 20)]
            print(f"  Problem at position {problem_pos}: '{problem_char}' (U+{ord(problem_char):04X})")
            print(f"  Context: ...{context}...")

    # Save the raw content for inspection
    debug_file = Path("test_data/wordpress_raw_content.html")
    debug_file.parent.mkdir(exist_ok=True)
    with open(debug_file, 'w', encoding='utf-8') as f:
        f.write(content)
    print(f"\nSaved raw content to {debug_file}")

    # Try the conversion directly
    print("\nTrying MarkItDown conversion...")
    try:
        import io

        from markitdown import MarkItDown

        converter = MarkItDown()

        # Method 1: direct UTF-8 bytes
        try:
            stream = io.BytesIO(content.encode('utf-8'))
            result = converter.convert_stream(stream)
            print("✅ Direct UTF-8 conversion succeeded")
            print(f"  Result type: {type(result)}")
            print(f"  Has text_content: {hasattr(result, 'text_content')}")
        except Exception as e:
            print(f"❌ Direct UTF-8 conversion failed: {e}")

        # Method 2: UTF-8 with unencodable characters dropped
        try:
            stream = io.BytesIO(content.encode('utf-8', errors='ignore'))
            result = converter.convert_stream(stream)
            print("✅ UTF-8 with 'ignore' errors succeeded")
        except Exception as e:
            print(f"❌ UTF-8 with 'ignore' failed: {e}")

        # Method 3: Latin-1 encoding
        try:
            stream = io.BytesIO(content.encode('latin-1', errors='ignore'))
            result = converter.convert_stream(stream)
            print("✅ Latin-1 conversion succeeded")
        except Exception as e:
            print(f"❌ Latin-1 conversion failed: {e}")

    except ImportError:
        print("❌ MarkItDown not available")


if __name__ == "__main__":
    debug_wordpress()
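The script narrows the failure down by trying three encodings in sequence, which separates encoding problems from MarkItDown problems. Of the remaining work items, retry logic is the most mechanical to sketch; the attempt count and backoff values here are illustrative, not taken from PRODUCTION_TODO.md:

```python
# Hypothetical retry decorator for flaky calls (NAS sync, WordPress API);
# the name and parameters are illustrative, not the project's actual code.
import time
from functools import wraps


def with_retries(attempts: int = 3, backoff: float = 2.0):
    """Retry a call with exponential backoff, re-raising the final failure."""
    def decorator(fn):
        @wraps(fn)
        def wrapper(*args, **kwargs):
            for attempt in range(1, attempts + 1):
                try:
                    return fn(*args, **kwargs)
                except Exception:
                    if attempt == attempts:
                        raise
                    time.sleep(backoff ** attempt)  # 2s, then 4s, between tries
        return wrapper
    return decorator
```

Wrapping a call such as `scraper.fetch_content()` with this decorator would cover transient API failures without touching the call sites.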