#!/usr/bin/env python3
"""
Test different approaches to fix MarkItDown conversion.

Loads a saved WordPress post (JSON), extracts its rendered HTML and tries
four ways of converting it to Markdown, printing whether each one succeeded.
"""

import io
import json
import os
import tempfile

POST_PATH = 'test_data/wordpress_post_raw.json'
MARKDOWNIFY_OUTPUT_PATH = 'test_data/wordpress_markdownify.md'


def excerpt_around(text, pos, radius=20):
    """Return the slice of *text* from ``pos - radius`` up to ``pos + radius``.

    The start is clamped at 0: a plain ``text[pos - radius:pos + radius]``
    would get a negative start index whenever ``pos < radius`` and wrap
    around to the end of the string, yielding an empty or unrelated excerpt.
    """
    start = max(pos - radius, 0)
    return text[start:pos + radius]


def load_post_html(path=POST_PATH):
    """Read the saved post JSON at *path* and return ``post['content']['rendered']``."""
    with open(path, 'r', encoding='utf-8') as f:
        post = json.load(f)
    return post['content']['rendered']


def print_banner(title):
    """Print *title* framed by two 50-character ``=`` rules."""
    print("\n" + "=" * 50)
    print(title)
    print("=" * 50)


def try_file_path(converter, content_html):
    """Approach 1: write the HTML to a temporary ``.html`` file and convert by path."""
    print("\n1. Testing file path approach...")
    temp_path = None
    # Broad catch is deliberate: this is a diagnostic script that reports
    # whatever failure the approach hits and then moves on to the next one.
    try:
        # Save to temp file
        with tempfile.NamedTemporaryFile(mode='w', encoding='utf-8',
                                         suffix='.html', delete=False) as f:
            # Record the name first so the file is cleaned up even if the
            # write itself fails.
            temp_path = f.name
            f.write(content_html)

        # Try converting from file path
        result = converter.convert(temp_path)
        print("✅ File path conversion succeeded!")
        print(f" Result has text_content: {hasattr(result, 'text_content')}")
    except Exception as e:
        print(f"❌ File path conversion failed: {e}")
    finally:
        # Clean up on both success and failure (delete=False means the
        # file would otherwise be left behind when conversion raises).
        if temp_path is not None:
            try:
                os.unlink(temp_path)
            except OSError as e:
                print(f"⚠️ Could not remove temp file {temp_path}: {e}")


def try_convert_text(converter, content_html):
    """Approach 2: call ``converter.convert_text`` if the converter has such a method."""
    print("\n2. Testing direct text conversion...")
    try:
        if hasattr(converter, 'convert_text'):
            converter.convert_text(content_html, file_extension='.html')
            print("✅ convert_text succeeded!")
        else:
            print("❌ convert_text method not available")
    except Exception as e:
        print(f"❌ convert_text failed: {e}")


def try_markdownify(content_html):
    """Approach 3: convert with markdownify, save the result and preview it."""
    print("\n3. Testing markdownify directly...")
    try:
        # Imported here so that a missing package is reported as this
        # approach failing rather than aborting the whole script.
        from markdownify import markdownify as md

        # Convert HTML to Markdown
        markdown = md(content_html)
        print("✅ markdownify succeeded!")
        print(f" Markdown length: {len(markdown)} characters")

        # Save the result
        with open(MARKDOWNIFY_OUTPUT_PATH, 'w', encoding='utf-8') as f:
            f.write(markdown)
        print(" Saved to test_data/wordpress_markdownify.md")

        # Show first 500 chars
        print("\nFirst 500 chars:")
        print("-" * 40)
        print(markdown[:500])
    except Exception as e:
        print(f"❌ markdownify failed: {e}")


def try_beautifulsoup(converter, content_html):
    """Approach 4: round-trip the HTML through BeautifulSoup, then convert it as a byte stream."""
    print("\n4. Testing with BeautifulSoup preprocessing...")
    try:
        # Imported here so that a missing package is reported as this
        # approach failing rather than aborting the whole script.
        from bs4 import BeautifulSoup

        # Parse and re-encode
        soup = BeautifulSoup(content_html, 'html.parser')
        clean_html = str(soup)

        # Try conversion on cleaned HTML
        stream = io.BytesIO(clean_html.encode('utf-8'))
        converter.convert_stream(stream)
        print("✅ BeautifulSoup preprocessing succeeded!")
    except Exception as e:
        print(f"❌ BeautifulSoup preprocessing failed: {e}")


def main():
    """Run every conversion approach against the saved post and print a recommendation."""
    # Imported at the start of main() rather than at module level so the
    # module itself can be imported with only the standard library present;
    # when run as a script a missing package still fails before any work.
    from markitdown import MarkItDown

    # Load the saved WordPress post
    content_html = load_post_html()
    print(f"Content length: {len(content_html)} characters")

    # Find the problematic character
    em_dash_pos = content_html.find('—')
    if em_dash_pos != -1:
        print(f"Found em-dash at position {em_dash_pos}")
        print(f"Context: ...{excerpt_around(content_html, em_dash_pos)}...")

    converter = MarkItDown()

    print_banner("Testing different conversion approaches:")

    try_file_path(converter, content_html)
    try_convert_text(converter, content_html)
    try_markdownify(content_html)
    try_beautifulsoup(converter, content_html)

    print_banner("Recommendation:")
    print("Use markdownify directly instead of MarkItDown for HTML conversion")
    print("It handles Unicode properly and is more reliable for HTML content")


if __name__ == "__main__":
    main()