hvac-kia-content/debug_wordpress_raw.py
Ben Reed 05218a873b Fix critical production issues and improve spec compliance
Production Readiness Improvements:
- Fixed scheduling to match spec (8 AM & 12 PM ADT instead of 6 AM/6 PM)
- Enabled NAS synchronization in production runner with error handling
- Fixed file naming convention to spec format (hvacknowitall_combined_YYYY-MM-DD-THHMMSS.md)
- Made systemd services portable (removed hardcoded user/paths)
- Added environment variable validation on startup
- Moved DISPLAY/XAUTHORITY to .env configuration

Systemd Improvements:
- Created template service file (@.service) for any user
- Changed all paths to /opt/hvac-kia-content
- Updated installation script for portable deployment
- Fixed service dependencies and resource limits

Documentation:
- Created comprehensive PRODUCTION_TODO.md with 25 tasks
- Added PRODUCTION_GUIDE.md with deployment instructions
- Documented spec compliance gaps (65% complete)

Remaining work includes retry logic, connection pooling, media downloads,
and a pytest test suite, as documented in PRODUCTION_TODO.md.

🤖 Generated with [Claude Code](https://claude.ai/code)

Co-Authored-By: Claude <noreply@anthropic.com>
2025-08-18 20:07:55 -03:00

123 lines
No EOL
4.3 KiB
Python

#!/usr/bin/env python3
"""
Debug WordPress raw content without conversion.
"""
import os
import requests
from requests.auth import HTTPBasicAuth
from dotenv import load_dotenv
import json
# Seconds to wait for the WordPress API before giving up; without a timeout
# requests.get() can block indefinitely on an unresponsive server.
REQUEST_TIMEOUT_SECONDS = 30

# Directory that receives every dump written by this script.
OUTPUT_DIR = 'test_data'


def find_non_ascii(content):
    """Return details for every non-ASCII character in *content*.

    Each entry is a tuple ``(position, character, code_point, utf8_bytes)``,
    where ``code_point`` is formatted as ``U+XXXX`` and ``utf8_bytes`` is the
    character encoded as UTF-8 with ``errors='replace'``.
    """
    return [
        (pos, char, f"U+{ord(char):04X}", char.encode('utf-8', errors='replace'))
        for pos, char in enumerate(content)
        if ord(char) > 127
    ]


def fetch_first_post(api_url, username, api_key):
    """Request a single post from ``{api_url}/posts`` and return the response.

    HTTP basic auth is used only when both *username* and *api_key* are set;
    otherwise the request is sent unauthenticated.
    """
    # Strip a trailing slash so the request never goes to ".../wp/v2//posts".
    url = f"{api_url.rstrip('/')}/posts"
    params = {
        'per_page': 1,
        'page': 1,
        '_embed': True,
    }
    auth = HTTPBasicAuth(username, api_key) if username and api_key else None

    print(f"\nFetching from: {url}")
    print(f"Params: {params}")
    response = requests.get(
        url, params=params, auth=auth, timeout=REQUEST_TIMEOUT_SECONDS
    )
    print(f"Status: {response.status_code}")
    return response


def inspect_content(content):
    """Print a summary of the rendered HTML and save it to OUTPUT_DIR."""
    print(f"\nContent details:")
    print(f"  Type: {type(content)}")
    print(f"  Length: {len(content)} characters")

    # Show first 500 chars
    print(f"\nFirst 500 characters:")
    print("-" * 50)
    print(content[:500])
    print("-" * 50)

    # Look for problematic characters
    print("\nChecking for special characters...")
    special_chars = find_non_ascii(content)
    if special_chars:
        print(f"Found {len(special_chars)} non-ASCII characters")
        print("First 10:")
        for pos, char, unicode_point, utf8_bytes in special_chars[:10]:
            print(f"  Pos {pos}: '{char}' ({unicode_point}) = {utf8_bytes}")

    # Save raw HTML content
    with open('test_data/wordpress_content.html', 'w', encoding='utf-8') as f:
        f.write(content)
    print(f"\nSaved raw HTML to test_data/wordpress_content.html")


def convert_with_markitdown(content):
    """Run *content* through MarkItDown and save the resulting markdown.

    Conversion errors are reported with a full traceback instead of being
    raised, so the rest of the debug output stays visible.
    """
    print("\nTesting MarkItDown conversion...")
    # Imported here, as in the original script, so the fetch/inspect steps
    # still run before any markitdown import problem surfaces.
    from markitdown import MarkItDown
    import io

    converter = MarkItDown()

    try:
        # MarkItDown is fed a byte stream, so encode the HTML as UTF-8 first.
        content_bytes = content.encode('utf-8')
        print(f"Encoded to UTF-8: {len(content_bytes)} bytes")

        stream = io.BytesIO(content_bytes)
        print("Created BytesIO stream")

        result = converter.convert_stream(stream)
        print(f"Conversion result type: {type(result)}")
        print(f"Has text_content: {hasattr(result, 'text_content')}")

        if hasattr(result, 'text_content'):
            md_content = result.text_content
            print(f"Markdown length: {len(md_content)} characters")

            # Save markdown
            with open('test_data/wordpress_content.md', 'w', encoding='utf-8') as f:
                f.write(md_content)
            print("Saved markdown to test_data/wordpress_content.md")

            # Show first 500 chars of markdown
            print("\nFirst 500 chars of markdown:")
            print("-" * 50)
            print(md_content[:500])
    except Exception as e:
        # Deliberately broad: this is a debugging aid and any failure in the
        # converter should be shown, not hidden.
        print(f"❌ Conversion failed: {e}")
        import traceback
        traceback.print_exc()


def main():
    """Fetch one WordPress post, dump it raw, and test markdown conversion.

    Reads WORDPRESS_API_URL, WORDPRESS_USERNAME and WORDPRESS_API_KEY from the
    environment (after loading ``.env``) and writes the raw post JSON, the
    rendered HTML and the converted markdown into ``test_data/``.
    """
    load_dotenv()

    # Get credentials
    api_url = os.getenv('WORDPRESS_API_URL')
    username = os.getenv('WORDPRESS_USERNAME')
    api_key = os.getenv('WORDPRESS_API_KEY')

    print(f"API URL: {api_url}")
    print(f"Username: {username}")
    # NOTE(review): this echoes the first 10 characters of the credential to
    # stdout; confirm that is acceptable wherever this output is captured.
    print(f"API Key: {api_key[:10]}..." if api_key else "No API key")

    # Without a base URL the request would go to "None/posts"; stop early
    # with a clear message instead.
    if not api_url:
        raise SystemExit("WORDPRESS_API_URL is not set; cannot fetch posts")

    # Fetch just one post
    response = fetch_first_post(api_url, username, api_key)
    if response.status_code != 200:
        print(f"Failed to fetch posts: {response.status_code}")
        print(response.text)
        return

    posts = response.json()
    if not posts:
        print("No posts returned")
        return
    post = posts[0]

    # The output directory may not exist on a fresh checkout.
    os.makedirs(OUTPUT_DIR, exist_ok=True)

    # Save full post data
    with open('test_data/wordpress_post_raw.json', 'w', encoding='utf-8') as f:
        json.dump(post, f, indent=2, ensure_ascii=False)
    print(f"\nSaved full post to test_data/wordpress_post_raw.json")

    # Check the content field
    if 'content' not in post or 'rendered' not in post['content']:
        print("Post has no content.rendered field")
        return

    content = post['content']['rendered']
    inspect_content(content)
    convert_with_markitdown(content)


if __name__ == "__main__":
    main()