#!/usr/bin/env python3
"""
Debug WordPress raw content.

Fetches a single post from the WordPress REST API, saves the raw JSON and the
rendered HTML under ``test_data/``, reports the non-ASCII characters found in
the rendered content, and finally runs that HTML through MarkItDown so the
conversion can be inspected in isolation.

Configuration is read from the environment (optionally via a ``.env`` file):
``WORDPRESS_API_URL``, ``WORDPRESS_USERNAME`` and ``WORDPRESS_API_KEY``.
"""

import io
import json
import os
import traceback

import requests
from dotenv import load_dotenv
from requests.auth import HTTPBasicAuth

# Seconds to wait for the WordPress server before giving up; without a
# timeout ``requests`` waits indefinitely on an unresponsive host.
REQUEST_TIMEOUT_SECONDS = 30

# Output artefacts written by this script.
RAW_POST_PATH = 'test_data/wordpress_post_raw.json'
RAW_HTML_PATH = 'test_data/wordpress_content.html'
MARKDOWN_PATH = 'test_data/wordpress_content.md'


def _ensure_parent_dir(path):
    """Create the directory that will contain *path* if it does not exist."""
    parent = os.path.dirname(path)
    if parent:
        os.makedirs(parent, exist_ok=True)


def _write_text(path, text):
    """Write *text* to *path* as UTF-8, creating the parent directory first."""
    _ensure_parent_dir(path)
    with open(path, 'w', encoding='utf-8') as f:
        f.write(text)


def find_non_ascii(content):
    """Return one tuple per non-ASCII character in *content*.

    Each tuple is ``(position, char, "U+XXXX", utf8_bytes)``.
    """
    return [
        (pos, char, f"U+{ord(char):04X}", char.encode('utf-8', errors='replace'))
        for pos, char in enumerate(content)
        if ord(char) > 127
    ]


def report_content(content):
    """Print type, length, a 500-character preview and non-ASCII findings."""
    print(f"\nContent details:")
    print(f"  Type: {type(content)}")
    print(f"  Length: {len(content)} characters")

    # Show first 500 chars
    print(f"\nFirst 500 characters:")
    print("-" * 50)
    print(content[:500])
    print("-" * 50)

    # Look for problematic characters
    print("\nChecking for special characters...")
    special_chars = find_non_ascii(content)
    if special_chars:
        print(f"Found {len(special_chars)} non-ASCII characters")
        print("First 10:")
        for pos, char, unicode_point, utf8_bytes in special_chars[:10]:
            print(f"  Pos {pos}: '{char}' ({unicode_point}) = {utf8_bytes}")


def convert_with_markitdown(content):
    """Run *content* (HTML text) through MarkItDown and save the Markdown.

    Conversion errors are printed with a traceback instead of propagating,
    because the point of this script is to observe the failure.
    """
    print("\nTesting MarkItDown conversion...")
    # Imported lazily so the fetch/inspect steps work without markitdown.
    from markitdown import MarkItDown

    converter = MarkItDown()

    try:
        # Create BytesIO with UTF-8 encoding
        content_bytes = content.encode('utf-8')
        print(f"Encoded to UTF-8: {len(content_bytes)} bytes")

        stream = io.BytesIO(content_bytes)
        print("Created BytesIO stream")

        result = converter.convert_stream(stream)
        print(f"Conversion result type: {type(result)}")
        print(f"Has text_content: {hasattr(result, 'text_content')}")

        if hasattr(result, 'text_content'):
            md_content = result.text_content
            print(f"Markdown length: {len(md_content)} characters")

            # Save markdown
            _write_text(MARKDOWN_PATH, md_content)
            print("Saved markdown to test_data/wordpress_content.md")

            # Show first 500 chars of markdown
            print("\nFirst 500 chars of markdown:")
            print("-" * 50)
            print(md_content[:500])
    except Exception as e:
        print(f"❌ Conversion failed: {e}")
        traceback.print_exc()


def main():
    """Fetch one post, dump it to disk, and exercise the conversion."""
    load_dotenv()

    # Get credentials
    api_url = os.getenv('WORDPRESS_API_URL')
    username = os.getenv('WORDPRESS_USERNAME')
    api_key = os.getenv('WORDPRESS_API_KEY')

    print(f"API URL: {api_url}")
    print(f"Username: {username}")
    # NOTE(review): this echoes the first 10 characters of the secret to the
    # terminal; avoid pasting this output into tickets or logs.
    print(f"API Key: {api_key[:10]}..." if api_key else "No API key")

    if not api_url:
        # Without this guard the request would target the literal "None/posts".
        raise SystemExit("WORDPRESS_API_URL is not set; cannot fetch posts.")

    # Fetch just one post
    url = f"{api_url}/posts"
    params = {
        'per_page': 1,
        'page': 1,
        '_embed': True
    }

    # Fall back to an anonymous request when credentials are incomplete.
    auth = HTTPBasicAuth(username, api_key) if username and api_key else None

    print(f"\nFetching from: {url}")
    print(f"Params: {params}")

    try:
        response = requests.get(
            url, params=params, auth=auth, timeout=REQUEST_TIMEOUT_SECONDS
        )
    except requests.RequestException as exc:
        raise SystemExit(f"Request failed: {exc}") from exc

    print(f"Status: {response.status_code}")

    if response.status_code != 200:
        print(f"Failed to fetch posts: {response.status_code}")
        print(response.text)
        return

    try:
        posts = response.json()
    except ValueError as exc:
        # A 200 with a non-JSON body (e.g. an HTML login or proxy page).
        print(f"Response was not valid JSON: {exc}")
        print(response.text[:500])
        return

    if not isinstance(posts, list) or not posts:
        print("No posts returned.")
        return

    post = posts[0]

    # Save full post data
    _ensure_parent_dir(RAW_POST_PATH)
    with open(RAW_POST_PATH, 'w', encoding='utf-8') as f:
        json.dump(post, f, indent=2, ensure_ascii=False)
    print(f"\nSaved full post to test_data/wordpress_post_raw.json")

    # Check the content field
    if 'content' not in post or 'rendered' not in post['content']:
        print("Post has no content.rendered field.")
        return

    content = post['content']['rendered']
    report_content(content)

    # Save raw HTML content
    _write_text(RAW_HTML_PATH, content)
    print(f"\nSaved raw HTML to test_data/wordpress_content.html")

    # Test MarkItDown directly
    convert_with_markitdown(content)


if __name__ == "__main__":
    main()