hvac-kia-content/debug_wordpress.py
Ben Reed 05218a873b Fix critical production issues and improve spec compliance
Production Readiness Improvements:
- Fixed scheduling to match spec (8 AM & 12 PM ADT instead of 6 AM/6 PM)
- Enabled NAS synchronization in production runner with error handling
- Fixed file naming convention to spec format (hvacknowitall_combined_YYYY-MM-DD-THHMMSS.md)
- Made systemd services portable (removed hardcoded user/paths)
- Added environment variable validation on startup
- Moved DISPLAY/XAUTHORITY to .env configuration

Systemd Improvements:
- Created template service file (@.service) for any user
- Changed all paths to /opt/hvac-kia-content
- Updated installation script for portable deployment
- Fixed service dependencies and resource limits

Documentation:
- Created comprehensive PRODUCTION_TODO.md with 25 tasks
- Added PRODUCTION_GUIDE.md with deployment instructions
- Documented spec compliance gaps (65% complete)

Remaining work includes retry logic, connection pooling, media downloads,
and a pytest test suite, as documented in PRODUCTION_TODO.md.

🤖 Generated with [Claude Code](https://claude.ai/code)

Co-Authored-By: Claude <noreply@anthropic.com>
2025-08-18 20:07:55 -03:00

141 lines
No EOL
5.4 KiB
Python

#!/usr/bin/env python3
"""
Debug WordPress content to see what's causing the conversion failure.
"""
import os
import sys
import json
from pathlib import Path
from dotenv import load_dotenv
# Add src to path
sys.path.insert(0, str(Path(__file__).parent))
from src.base_scraper import ScraperConfig
from src.wordpress_scraper import WordPressScraper
def _report_non_ascii(content, limit=1000, show=10):
    """Print up to *show* non-ASCII characters found in the first *limit* chars."""
    non_ascii_positions = [
        (i, char, hex(ord(char)))
        for i, char in enumerate(content[:limit])
        if ord(char) > 127
    ]
    if non_ascii_positions:
        print(f"\nFound {len(non_ascii_positions)} non-ASCII characters in first {limit} chars:")
        for pos, char, hex_val in non_ascii_positions[:show]:
            print(f" Position {pos}: '{char}' ({hex_val})")


def _check_encodings(content):
    """Report whether *content* can be encoded as UTF-8 and as ASCII."""
    try:
        utf8_bytes = content.encode('utf-8')
    except UnicodeEncodeError as e:
        print(f"❌ UTF-8 encoding failed: {e}")
    else:
        print(f"✅ UTF-8 encoding works: {len(utf8_bytes)} bytes")

    try:
        ascii_bytes = content.encode('ascii')
    except UnicodeEncodeError as e:
        print(f"❌ ASCII encoding failed: {e}")
        # Show the first character that ASCII cannot represent, with context.
        problem_pos = e.start
        problem_char = content[problem_pos]
        # Slicing clamps at the end of the string, so no explicit min() is needed.
        context = content[max(0, problem_pos - 20):problem_pos + 20]
        print(f" Problem at position {problem_pos}: '{problem_char}' (U+{ord(problem_char):04X})")
        print(f" Context: ...{context}...")
    else:
        print(f"✅ ASCII encoding works: {len(ascii_bytes)} bytes")


def _save_raw_content(content):
    """Write *content* to test_data/wordpress_raw_content.html for inspection."""
    debug_file = Path("test_data/wordpress_raw_content.html")
    debug_file.parent.mkdir(parents=True, exist_ok=True)
    # backslashreplace: text that strict UTF-8 rejects (the very case the
    # encoding probe reports) must not abort the run before the conversion
    # attempts; the offending code points are written as visible escapes.
    with open(debug_file, 'w', encoding='utf-8', errors='backslashreplace') as f:
        f.write(content)
    print(f"\nSaved raw content to {debug_file}")


def _try_markitdown(content):
    """Run MarkItDown on *content* under three byte encodings and report each."""
    try:
        from markitdown import MarkItDown
        import io

        converter = MarkItDown()

        # Method 1: strict UTF-8 bytes
        try:
            stream = io.BytesIO(content.encode('utf-8'))
            result = converter.convert_stream(stream)
            print("✅ Direct UTF-8 conversion succeeded")
            print(f" Result type: {type(result)}")
            print(f" Has text_content: {hasattr(result, 'text_content')}")
        except Exception as e:
            print(f"❌ Direct UTF-8 conversion failed: {e}")

        # Method 2: UTF-8, dropping anything that cannot be encoded
        try:
            stream = io.BytesIO(content.encode('utf-8', errors='ignore'))
            result = converter.convert_stream(stream)
            print("✅ UTF-8 with 'ignore' errors succeeded")
        except Exception as e:
            print(f"❌ UTF-8 with 'ignore' failed: {e}")

        # Method 3: Latin-1, dropping characters outside that range
        try:
            stream = io.BytesIO(content.encode('latin-1', errors='ignore'))
            result = converter.convert_stream(stream)
            print("✅ Latin-1 conversion succeeded")
        except Exception as e:
            print(f"❌ Latin-1 conversion failed: {e}")
    except ImportError:
        print("❌ MarkItDown not available")


def debug_wordpress():
    """Debug WordPress content fetching.

    Builds a WordPressScraper, fetches posts, and inspects the ``content``
    field of the first post: prints a preview, lists non-ASCII characters,
    probes UTF-8/ASCII encodability, saves the raw content to
    ``test_data/wordpress_raw_content.html`` and tries several MarkItDown
    conversions. Everything is reported via ``print``; nothing is returned.
    """
    load_dotenv()
    config = ScraperConfig(
        source_name="wordpress",
        brand_name="hvacknowitall",
        data_dir=Path("test_data"),
        logs_dir=Path("test_logs"),
        timezone="America/Halifax",
    )
    scraper = WordPressScraper(config)

    print("Fetching WordPress posts...")
    posts = scraper.fetch_content()
    if not posts:
        print("No posts fetched")
        return

    print(f"\nFetched {len(posts)} posts")

    # Look at first post
    first_post = posts[0]
    print("\nFirst post details:")
    print(f" Title: {first_post.get('title', 'N/A')}")
    print(f" Date: {first_post.get('date', 'N/A')}")
    print(f" Link: {first_post.get('link', 'N/A')}")

    # Check content field
    content = first_post.get('content', '')
    print(f"\nContent length: {len(content)} characters")
    print(f"Content type: {type(content)}")

    # Check for problematic characters
    print("\nChecking for problematic bytes...")
    if not content:
        return
    if not isinstance(content, str):
        # Every check below slices, ord()s, encodes or writes text, so a
        # non-str value would raise part-way through; report it and stop.
        print(f"❌ Content is {type(content).__name__}, not str; skipping text checks")
        return

    # Show first 500 chars
    print("\nFirst 500 characters of content:")
    print("-" * 50)
    print(content[:500])
    print("-" * 50)

    _report_non_ascii(content)

    # Try to identify the encoding
    print("\nTrying different encodings...")
    _check_encodings(content)

    # Save raw content for inspection
    _save_raw_content(content)

    # Try the conversion directly
    print("\nTrying MarkItDown conversion...")
    _try_markitdown(content)
# Script entry point: run the diagnostic only when this file is executed
# directly, not when it is imported.
if __name__ == "__main__":
    debug_wordpress()