hvac-kia-content/quick_backlog_test.py
Ben Reed 8b83185130 Fix HTML/XML contamination in WordPress markdown extraction
- Update base_scraper.py convert_to_markdown() to properly clean HTML
- Remove script/style blocks and their content before conversion
- Strip inline JavaScript event handlers
- Clean up br tags and excessive blank lines
- Fix malformed comparison operators that look like tags
- Add comprehensive HTML cleaning during content extraction (not after)
- Test confirms WordPress content now generates clean markdown without HTML

This ensures all future WordPress scraping produces specification-compliant
markdown without any HTML/XML contamination.
2025-08-18 23:11:08 -03:00

72 lines
No EOL
2.2 KiB
Python

#!/usr/bin/env python3
"""
Quick backlog test - captures smaller amounts for immediate validation
"""
import sys
from pathlib import Path
# Put this script's own directory at the front of the module search path.
# This must run before the import on the next line; presumably
# production_backlog_capture.py sits next to this file (verify the layout).
sys.path.insert(0, str(Path(__file__).parent))
from production_backlog_capture import ProductionBacklogCapture
import logging
# Set up logging: INFO level and above, each record rendered as
# "<timestamp> - <message>".
logging.basicConfig(level=logging.INFO, format='%(asctime)s - %(message)s')
# Module-level logger named after this module.
logger = logging.getLogger(__name__)
def main() -> bool:
    """Run a small capture against every source and print a summary.

    Each source in the table below is captured with a small item cap through
    ``capture_source_backlog``. Per-source results are printed as they
    complete. If at least one item was captured, ``sync_to_nas`` is called
    and its outcome is reported; otherwise the sync is reported as skipped.

    Returns:
        True when at least one item was captured across all sources,
        False otherwise. The NAS sync outcome does not affect the result.
    """
    capture = ProductionBacklogCapture(Path("data_quick_test"))

    # Test each source with limited items
    test_sources = {
        "podcast": 5,      # 5 episodes
        "mailchimp": 10,   # 10 items (limited by RSS anyway)
        "wordpress": 10,   # 10 posts
        "youtube": 5,      # 5 videos
        "instagram": 5,    # 5 posts
        "tiktok": 10,      # 10 videos with captions
    }

    total_items = 0
    total_media = 0

    print("🧪 QUICK BACKLOG TEST")
    print("=" * 50)

    for source, max_items in test_sources.items():
        print(f"\nTesting {source} (max {max_items} items)...")
        result = capture.capture_source_backlog(source, max_items)

        # A result without a "success" key is reported as a failed source
        # instead of aborting the whole run with a KeyError.
        if result.get("success"):
            items = result["items"]
            media = result.get("media_files", 0)
            duration = result["duration"]
            total_items += items
            total_media += media
            print(f"{source}: {items} items, {media} media files in {duration:.1f}s")
        else:
            print(f"{source}: {result.get('error', 'Unknown error')}")

    # Test NAS sync (only meaningful when something was captured)
    print("\nTesting NAS sync...")
    if total_items > 0:
        nas_success = capture.sync_to_nas()
        # Both branches used to be empty strings, so success and failure
        # printed the same line; emit an explicit marker for each.
        print(f"NAS sync: {'✅' if nas_success else '❌'}")
    else:
        print("NAS sync: skipped (no items captured)")

    print("\n📊 TEST SUMMARY:")
    print(f" Total items: {total_items}")
    print(f" Total media: {total_media}")
    print(f" Data dir: {capture.data_dir}")

    return total_items > 0
if __name__ == "__main__":
    # Exit codes: 0 = at least one item captured, 1 = nothing captured,
    # 2 = the run was aborted by an unexpected exception.
    try:
        success = main()
        print(f"\n🎉 Quick test {'PASSED' if success else 'FAILED'}")
        # sys.exit raises SystemExit, which is not an Exception subclass,
        # so the handler below does not intercept the normal exits.
        sys.exit(0 if success else 1)
    except Exception as e:
        # Record the full traceback through the configured logger; the
        # one-line message alone does not show where the failure occurred.
        logger.exception("Quick backlog test aborted by an unexpected error")
        print(f"\n❌ Test failed: {e}")
        sys.exit(2)