Production Readiness Improvements:
- Fixed scheduling to match spec (8 AM & 12 PM ADT instead of 6 AM/6 PM)
- Enabled NAS synchronization in production runner with error handling
- Fixed file naming convention to spec format (hvacknowitall_combined_YYYY-MM-DD-THHMMSS.md)
- Made systemd services portable (removed hardcoded user/paths)
- Added environment variable validation on startup
- Moved DISPLAY/XAUTHORITY to .env configuration

Systemd Improvements:
- Created template service file (@.service) for any user
- Changed all paths to /opt/hvac-kia-content
- Updated installation script for portable deployment
- Fixed service dependencies and resource limits

Documentation:
- Created comprehensive PRODUCTION_TODO.md with 25 tasks
- Added PRODUCTION_GUIDE.md with deployment instructions
- Documented spec compliance gaps (65% complete)

Remaining work includes retry logic, connection pooling, media downloads, and a pytest test suite, as documented in PRODUCTION_TODO.md.

🤖 Generated with [Claude Code](https://claude.ai/code)

Co-Authored-By: Claude <noreply@anthropic.com>
105 lines
No EOL
3.1 KiB
Python
105 lines
No EOL
3.1 KiB
Python
#!/usr/bin/env python3
|
|
"""
|
|
Test different approaches to fix MarkItDown conversion.
|
|
"""
|
|
|
|
import json
|
|
from markitdown import MarkItDown
|
|
import io
|
|
|
|
# Read the saved WordPress post from disk and pull out the rendered HTML
# body; every conversion experiment further down operates on content_html.
with open('test_data/wordpress_post_raw.json', encoding='utf-8') as post_file:
    post = json.loads(post_file.read())

content_html = post['content']['rendered']
print(f"Content length: {len(content_html)} characters")
|
|
|
|
# Locate the first em-dash (U+2014), the character flagged as problematic,
# and print a short window of the surrounding HTML for inspection.
em_dash_pos = content_html.find('—')
if em_dash_pos != -1:
    print(f"Found em-dash at position {em_dash_pos}")
    # Clamp the window start at 0. If the em-dash sits within the first 20
    # characters, a negative start index would be interpreted relative to
    # the END of the string and the slice would come back empty or wrong.
    context_start = max(0, em_dash_pos - 20)
    print(f"Context: ...{content_html[context_start:em_dash_pos + 20]}...")
|
|
|
|
# A single MarkItDown instance is reused by the MarkItDown-based tests below.
converter = MarkItDown()

# Banner announcing the start of the comparison run.
print(
    "\n" + "=" * 50,
    "Testing different conversion approaches:",
    "=" * 50,
    sep="\n",
)
|
|
|
|
# Test 1: Direct file path approach
# Write the HTML to a temporary .html file and hand converter.convert() the
# file's path instead of an in-memory stream.
print("\n1. Testing file path approach...")

# Imported before the try block so that os is guaranteed to be available to
# the cleanup code in the finally clause.
import os
import tempfile

temp_path = None
try:
    # delete=False keeps the file on disk after the handle is closed, so it
    # still exists when converter.convert() is given the path below.
    with tempfile.NamedTemporaryFile(mode='w', encoding='utf-8', suffix='.html', delete=False) as f:
        # Record the path before writing so a failed write is cleaned up too.
        temp_path = f.name
        f.write(content_html)

    # Try converting from file path
    result = converter.convert(temp_path)
    print("✅ File path conversion succeeded!")
    print(f" Result has text_content: {hasattr(result, 'text_content')}")
except Exception as e:
    print(f"❌ File path conversion failed: {e}")
finally:
    # Always remove the temp file; previously it was leaked whenever the
    # conversion raised, because the unlink only ran on the success path.
    if temp_path is not None:
        try:
            os.unlink(temp_path)
        except OSError as cleanup_error:
            # Best-effort cleanup: report the problem but keep the script
            # running so the remaining tests still execute.
            print(f"⚠️ Could not remove temp file {temp_path}: {cleanup_error}")
|
|
|
|
# Test 2: Using convert_text if it exists
# The method is probed with hasattr first, so an absent method is reported
# as "not available" rather than as a conversion failure.
print("\n2. Testing direct text conversion...")
try:
    if not hasattr(converter, 'convert_text'):
        print("❌ convert_text method not available")
    else:
        result = converter.convert_text(content_html, file_extension='.html')
        print("✅ convert_text succeeded!")
except Exception as error:
    print(f"❌ convert_text failed: {error}")
|
|
|
|
# Test 3: Try with markdownify directly
# Bypass MarkItDown entirely and convert the HTML with markdownify.
print("\n3. Testing markdownify directly...")
try:
    # Imported inside the try so an import failure is reported like any
    # other failure of this test instead of aborting the script.
    from markdownify import markdownify as md

    # HTML -> Markdown
    markdown = md(content_html)
    print("✅ markdownify succeeded!")
    print(f" Markdown length: {len(markdown)} characters")

    # Persist the converted document next to the input data.
    with open('test_data/wordpress_markdownify.md', 'w', encoding='utf-8') as out_file:
        out_file.write(markdown)
    print(" Saved to test_data/wordpress_markdownify.md")

    # Preview: heading, 40-dash rule, then the first 500 characters.
    print("\nFirst 500 chars:", "-" * 40, markdown[:500], sep="\n")
except Exception as error:
    print(f"❌ markdownify failed: {error}")
|
|
|
|
# Test 4: Using BeautifulSoup for preprocessing
# Parse the HTML with BeautifulSoup, serialize it back to a string, and feed
# the UTF-8 bytes to MarkItDown as a stream.
print("\n4. Testing with BeautifulSoup preprocessing...")
try:
    # Imported inside the try so an import failure is reported like any
    # other failure of this test instead of aborting the script.
    from bs4 import BeautifulSoup

    # Parse and re-serialize the markup.
    soup = BeautifulSoup(content_html, 'html.parser')
    clean_html = str(soup)

    # Wrap the re-serialized HTML as an in-memory byte stream and convert it.
    stream = io.BytesIO(bytes(clean_html, 'utf-8'))
    result = converter.convert_stream(stream)
    print("✅ BeautifulSoup preprocessing succeeded!")
except Exception as error:
    print(f"❌ BeautifulSoup preprocessing failed: {error}")
|
|
|
|
# Closing banner with the conclusion drawn from the experiments above.
print(
    "\n" + "=" * 50,
    "Recommendation:",
    "=" * 50,
    "Use markdownify directly instead of MarkItDown for HTML conversion",
    "It handles Unicode properly and is more reliable for HTML content",
    sep="\n",
)