# hvac-kia-content/test_markitdown_fix.py

#!/usr/bin/env python3
"""
Test different approaches to fix MarkItDown conversion.
"""

import json
from markitdown import MarkItDown
import io

# Read the previously saved WordPress post and pull out its rendered HTML body
with open('test_data/wordpress_post_raw.json', 'r', encoding='utf-8') as post_file:
    post = json.loads(post_file.read())

content_html = post['content']['rendered']
print(f"Content length: {len(content_html)} characters")

# Find the problematic character (the first em-dash) and show the text around it
em_dash_pos = content_html.find('—')
if em_dash_pos != -1:
    print(f"Found em-dash at position {em_dash_pos}")
    # Clamp the window start at 0. A negative start index is interpreted
    # relative to the END of the string, so without the clamp an em-dash
    # within the first 20 characters would yield a wrong (usually empty)
    # context snippet.
    context_start = max(0, em_dash_pos - 20)
    print(f"Context: ...{content_html[context_start:em_dash_pos+20]}...")

# Single MarkItDown instance used by the MarkItDown-based tests in this script
converter = MarkItDown()

# Banner announcing the start of the test runs
banner_rule = "=" * 50
print(f"\n{banner_rule}")
print("Testing different conversion approaches:")
print(banner_rule)

# Test 1: Direct file path approach
# Writes the HTML to a temporary .html file and asks MarkItDown to convert it
# by path. The temp file is removed whether or not the conversion succeeds.
print("\n1. Testing file path approach...")
import os
import tempfile

temp_path = None
try:
    # delete=False keeps the file on disk after the `with` block closes it,
    # so the converter can reopen it by path.
    with tempfile.NamedTemporaryFile(mode='w', encoding='utf-8', suffix='.html', delete=False) as f:
        # Record the path before writing so a failed write is still cleaned up.
        temp_path = f.name
        f.write(content_html)

    # Try converting from file path
    result = converter.convert(temp_path)
    print("✅ File path conversion succeeded!")
    print(f"   Result has text_content: {hasattr(result, 'text_content')}")

except Exception as e:
    print(f"❌ File path conversion failed: {e}")

finally:
    # Clean up on every path; previously a failed conversion leaked the file.
    if temp_path is not None:
        try:
            os.unlink(temp_path)
        except OSError as cleanup_error:
            # Report cleanup problems separately so they are not mistaken
            # for a conversion failure.
            print(f"⚠️ Could not remove temp file {temp_path}: {cleanup_error}")

# Test 2: Convert the HTML string in memory, but only when this MarkItDown
# object actually exposes a convert_text method
print("\n2. Testing direct text conversion...")
try:
    if not hasattr(converter, 'convert_text'):
        print("❌ convert_text method not available")
    else:
        result = converter.convert_text(content_html, file_extension='.html')
        print("✅ convert_text succeeded!")
except Exception as err:
    print(f"❌ convert_text failed: {err}")

# Test 3: Skip MarkItDown and run the HTML through markdownify itself
print("\n3. Testing markdownify directly...")
try:
    # Imported here so a missing package is reported as a failed test
    from markdownify import markdownify as md

    markdown = md(content_html)
    print("✅ markdownify succeeded!")
    print(f"   Markdown length: {len(markdown)} characters")

    # Write the full conversion to disk for later inspection
    with open('test_data/wordpress_markdownify.md', 'w', encoding='utf-8') as out_file:
        out_file.write(markdown)
    print("   Saved to test_data/wordpress_markdownify.md")

    # Preview only the start of the converted text
    preview = markdown[:500]
    print("\nFirst 500 chars:")
    print("-" * 40)
    print(preview)

except Exception as err:
    print(f"❌ markdownify failed: {err}")

# Test 4: Using BeautifulSoup for preprocessing
# Re-serializes the HTML through BeautifulSoup, then feeds the UTF-8 bytes to
# MarkItDown as an in-memory stream.
print("\n4. Testing with BeautifulSoup preprocessing...")
try:
    # Imported here so a missing package is reported as a failed test
    from bs4 import BeautifulSoup

    # Parse and re-encode
    soup = BeautifulSoup(content_html, 'html.parser')
    clean_html = str(soup)

    # Try conversion on cleaned HTML.
    # An in-memory byte stream carries no filename, so pass the same '.html'
    # hint that Test 2 uses; otherwise a failure here could reflect format
    # detection rather than the preprocessing being tested.
    # NOTE(review): confirm the installed MarkItDown accepts `file_extension`
    # on convert_stream.
    stream = io.BytesIO(clean_html.encode('utf-8'))
    result = converter.convert_stream(stream, file_extension='.html')
    print("✅ BeautifulSoup preprocessing succeeded!")

except Exception as e:
    print(f"❌ BeautifulSoup preprocessing failed: {e}")

# Closing summary: a banner followed by the recommended approach
print("", "=" * 50, "Recommendation:", "=" * 50, sep="\n")
for advice_line in (
    "Use markdownify directly instead of MarkItDown for HTML conversion",
    "It handles Unicode properly and is more reliable for HTML content",
):
    print(advice_line)