Production Readiness Improvements:
- Fixed scheduling to match spec (8 AM & 12 PM ADT instead of 6 AM/6 PM)
- Enabled NAS synchronization in production runner with error handling
- Fixed file naming convention to spec format (hvacknowitall_combined_YYYY-MM-DD-THHMMSS.md; see the sketch below)
- Made systemd services portable (removed hardcoded user/paths)
- Added environment variable validation on startup (sketched below)
- Moved DISPLAY/XAUTHORITY to .env configuration

Systemd Improvements:
- Created template service file (@.service) for any user
- Changed all paths to /opt/hvac-kia-content
- Updated installation script for portable deployment
- Fixed service dependencies and resource limits

Documentation:
- Created comprehensive PRODUCTION_TODO.md with 25 tasks
- Added PRODUCTION_GUIDE.md with deployment instructions
- Documented spec compliance gaps (65% complete)

Remaining work includes retry logic, connection pooling, media downloads, and the pytest test suite, as documented in PRODUCTION_TODO.md (a retry sketch follows the debug script below).

🤖 Generated with [Claude Code](https://claude.ai/code)

Co-Authored-By: Claude <noreply@anthropic.com>
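The spec filename pattern above is concrete enough to sketch. A minimal example, assuming the timestamp is taken in America/Halifax (the timezone the scraper is configured with below) and using `combined_filename` as a hypothetical helper name:

```python
# Hypothetical helper for the spec naming convention; only the
# hvacknowitall_combined_YYYY-MM-DD-THHMMSS.md pattern comes from the spec.
from datetime import datetime
from zoneinfo import ZoneInfo


def combined_filename(brand: str = "hvacknowitall") -> str:
    """Build the spec-format filename with an Atlantic-time stamp."""
    now = datetime.now(ZoneInfo("America/Halifax"))  # ADT in summer, matching the schedule
    return f"{brand}_combined_{now.strftime('%Y-%m-%d-T%H%M%S')}.md"


# e.g. "hvacknowitall_combined_2025-08-15-T080000.md" for an 8 AM run
```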
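The startup validation of environment variables could look like the following sketch; the required-key list is a guess, since the commit only names DISPLAY and XAUTHORITY as having moved into .env:

```python
# Illustrative startup check; REQUIRED_VARS is an assumed key list.
import os
import sys

from dotenv import load_dotenv

REQUIRED_VARS = ["DISPLAY", "XAUTHORITY"]  # plus whatever keys the scrapers need


def validate_env() -> None:
    """Exit early with a clear message if any required variable is unset."""
    load_dotenv()
    missing = [name for name in REQUIRED_VARS if not os.getenv(name)]
    if missing:
        sys.exit(f"Missing required environment variables: {', '.join(missing)}")
```

The debug script below is the tool behind the WordPress conversion-failure investigation.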
#!/usr/bin/env python3
"""Debug WordPress content to see what's causing the conversion failure."""

import sys
from pathlib import Path

from dotenv import load_dotenv

# Add the script's directory to sys.path so the src package resolves
sys.path.insert(0, str(Path(__file__).parent))

from src.base_scraper import ScraperConfig
from src.wordpress_scraper import WordPressScraper


def debug_wordpress():
    """Debug WordPress content fetching."""
    load_dotenv()

    config = ScraperConfig(
        source_name="wordpress",
        brand_name="hvacknowitall",
        data_dir=Path("test_data"),
        logs_dir=Path("test_logs"),
        timezone="America/Halifax",
    )

    scraper = WordPressScraper(config)

    print("Fetching WordPress posts...")
    posts = scraper.fetch_content()

    if not posts:
        print("No posts fetched")
        return

    print(f"\nFetched {len(posts)} posts")

    # Look at the first post
    first_post = posts[0]
    print("\nFirst post details:")
    print(f"  Title: {first_post.get('title', 'N/A')}")
    print(f"  Date: {first_post.get('date', 'N/A')}")
    print(f"  Link: {first_post.get('link', 'N/A')}")

    # Check the content field
    content = first_post.get('content', '')
    print(f"\nContent length: {len(content)} characters")
    print(f"Content type: {type(content)}")

    # Check for problematic characters
    print("\nChecking for problematic bytes...")
    if not content:
        return

    # Show the first 500 characters
    print("\nFirst 500 characters of content:")
    print("-" * 50)
    print(content[:500])
    print("-" * 50)

    # Look for non-ASCII characters
    non_ascii_positions = []
    for i, char in enumerate(content[:1000]):  # Check first 1000 chars
        if ord(char) > 127:
            non_ascii_positions.append((i, char, hex(ord(char))))

    if non_ascii_positions:
        print(f"\nFound {len(non_ascii_positions)} non-ASCII characters in first 1000 chars:")
        for pos, char, hex_val in non_ascii_positions[:10]:  # Show first 10
            print(f"  Position {pos}: '{char}' ({hex_val})")

    # Try to identify the encoding
    print("\nTrying different encodings...")
    if isinstance(content, str):
        # It's already a string; see whether it round-trips through common codecs
        try:
            utf8_bytes = content.encode('utf-8')
            print(f"✅ UTF-8 encoding works: {len(utf8_bytes)} bytes")
        except UnicodeEncodeError as e:
            print(f"❌ UTF-8 encoding failed: {e}")

        try:
            ascii_bytes = content.encode('ascii')
            print(f"✅ ASCII encoding works: {len(ascii_bytes)} bytes")
        except UnicodeEncodeError as e:
            print(f"❌ ASCII encoding failed: {e}")
            # Show the specific problem character with surrounding context
            problem_pos = e.start
            problem_char = content[problem_pos]
            context = content[max(0, problem_pos - 20):min(len(content), problem_pos + 20)]
            print(f"  Problem at position {problem_pos}: '{problem_char}' (U+{ord(problem_char):04X})")
            print(f"  Context: ...{context}...")

    # Save the raw content for inspection
    debug_file = Path("test_data/wordpress_raw_content.html")
    debug_file.parent.mkdir(exist_ok=True)
    with open(debug_file, 'w', encoding='utf-8') as f:
        f.write(content)
    print(f"\nSaved raw content to {debug_file}")

    # Try the conversion directly
    print("\nTrying MarkItDown conversion...")
    try:
        import io

        from markitdown import MarkItDown

        converter = MarkItDown()

        # Method 1: direct UTF-8 bytes
        try:
            stream = io.BytesIO(content.encode('utf-8'))
            result = converter.convert_stream(stream)
            print("✅ Direct UTF-8 conversion succeeded")
            print(f"  Result type: {type(result)}")
            print(f"  Has text_content: {hasattr(result, 'text_content')}")
        except Exception as e:
            print(f"❌ Direct UTF-8 conversion failed: {e}")

        # Method 2: UTF-8 with unencodable characters dropped
        try:
            stream = io.BytesIO(content.encode('utf-8', errors='ignore'))
            result = converter.convert_stream(stream)
            print("✅ UTF-8 with 'ignore' errors succeeded")
        except Exception as e:
            print(f"❌ UTF-8 with 'ignore' failed: {e}")

        # Method 3: Latin-1 encoding
        try:
            stream = io.BytesIO(content.encode('latin-1', errors='ignore'))
            result = converter.convert_stream(stream)
            print("✅ Latin-1 conversion succeeded")
        except Exception as e:
            print(f"❌ Latin-1 conversion failed: {e}")

    except ImportError:
        print("❌ MarkItDown not available")


if __name__ == "__main__":
    debug_wordpress()
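The script narrows the failure down by trying three encodings in sequence, which separates encoding problems from MarkItDown problems. Of the remaining work items, retry logic is the most mechanical to sketch; the attempt count and backoff values here are illustrative, not taken from PRODUCTION_TODO.md:

```python
# Hypothetical retry decorator for flaky calls (NAS sync, WordPress API);
# the name and parameters are illustrative, not the project's actual code.
import time
from functools import wraps


def with_retries(attempts: int = 3, backoff: float = 2.0):
    """Retry a call with exponential backoff, re-raising the final failure."""
    def decorator(fn):
        @wraps(fn)
        def wrapper(*args, **kwargs):
            for attempt in range(1, attempts + 1):
                try:
                    return fn(*args, **kwargs)
                except Exception:
                    if attempt == attempts:
                        raise
                    time.sleep(backoff ** attempt)  # 2s, then 4s, between tries
        return wrapper
    return decorator
```

Wrapping a call such as `scraper.fetch_content()` with this decorator would cover transient API failures without touching the call sites.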