- Update base_scraper.py convert_to_markdown() to properly clean HTML
- Remove script/style blocks and their content before conversion
- Strip inline JavaScript event handlers
- Clean up br tags and excessive blank lines
- Fix malformed comparison operators that look like tags
- Add comprehensive HTML cleaning during content extraction (not after)
- Test confirms WordPress content now generates clean markdown without HTML

This ensures all future WordPress scraping produces specification-compliant markdown without any HTML/XML contamination.
72 lines
No EOL
2.2 KiB
Python
72 lines
No EOL
2.2 KiB
Python
#!/usr/bin/env python3
|
|
"""
|
|
Quick backlog test - captures smaller amounts for immediate validation
|
|
"""
|
|
|
|
import sys
|
|
from pathlib import Path
|
|
sys.path.insert(0, str(Path(__file__).parent))
|
|
|
|
from production_backlog_capture import ProductionBacklogCapture
|
|
import logging
|
|
|
|
# Configure logging once at import time: INFO and above, each record
# prefixed with its timestamp.
_LOG_FORMAT = '%(asctime)s - %(message)s'
logging.basicConfig(format=_LOG_FORMAT, level=logging.INFO)

# Module-level logger named after this module.
logger = logging.getLogger(__name__)
|
|
|
|
def main():
    """Capture a small backlog from every source and print a summary.

    Each source is captured with a low item cap via
    ``ProductionBacklogCapture.capture_source_backlog``. Successful results
    contribute their ``items`` and ``media_files`` counts to running totals;
    failed results only have their ``error`` text printed. If anything was
    captured, ``sync_to_nas`` is invoked and its outcome reported.

    Returns:
        bool: True when at least one item was captured across all sources.
    """
    runner = ProductionBacklogCapture(Path("data_quick_test"))

    # (source name, item cap) pairs, processed in this order.
    source_limits = (
        ("podcast", 5),      # 5 episodes
        ("mailchimp", 10),   # 10 items (limited by RSS anyway)
        ("wordpress", 10),   # 10 posts
        ("youtube", 5),      # 5 videos
        ("instagram", 5),    # 5 posts
        ("tiktok", 10),      # 10 videos with captions
    )

    item_total = 0
    media_total = 0

    print("🧪 QUICK BACKLOG TEST")
    print("=" * 50)

    for name, limit in source_limits:
        print(f"\nTesting {name} (max {limit} items)...")
        outcome = runner.capture_source_backlog(name, limit)

        # Failed sources are reported and skipped; they add nothing to totals.
        if not outcome["success"]:
            print(f"❌ {name}: {outcome.get('error', 'Unknown error')}")
            continue

        item_count = outcome["items"]
        media_count = outcome.get("media_files", 0)
        elapsed = outcome["duration"]

        item_total += item_count
        media_total += media_count

        print(f"✅ {name}: {item_count} items, {media_count} media files in {elapsed:.1f}s")

    captured_any = item_total > 0

    # The NAS sync is only attempted when there is something to sync; the
    # heading is printed either way.
    print("\nTesting NAS sync...")
    if captured_any:
        synced = runner.sync_to_nas()
        marker = '✅' if synced else '❌'
        print(f"NAS sync: {marker}")

    print("\n📊 TEST SUMMARY:")
    print(f" Total items: {item_total}")
    print(f" Total media: {media_total}")
    print(f" Data dir: {runner.data_dir}")

    return captured_any
|
|
|
|
if __name__ == "__main__":
    # Script entry point. Exit codes:
    #   0 - main() reported at least one captured item
    #   1 - main() completed but captured nothing
    #   2 - main() aborted with an unexpected exception
    try:
        success = main()
    except Exception as e:
        # Top-level boundary: record the full traceback through logging so
        # the failure can be diagnosed, then keep the short console message
        # and the distinct exit code.
        logging.getLogger(__name__).exception("Quick backlog test aborted")
        print(f"\n❌ Test failed: {e}")
        sys.exit(2)
    else:
        # Kept out of the try body so only main() is guarded by the handler.
        print(f"\n🎉 Quick test {'PASSED' if success else 'FAILED'}")
        sys.exit(0 if success else 1)