Major Changes:
- Updated all code references from hvacknowitall/hvacnkowitall to hkia
- Renamed all existing markdown files to use hkia_ prefix
- Updated configuration files, scrapers, and production scripts
- Modified systemd service descriptions to use HKIA
- Changed NAS sync path to /mnt/nas/hkia

Files Updated:
- 20+ source files updated with new naming convention
- 34 markdown files renamed to hkia_* format
- All ScraperConfig brand_name parameters now use 'hkia'
- Documentation updated to reflect new naming

Rationale:
- Shorter, cleaner filenames
- Consistent branding across all outputs
- Easier to type and reference
- Maintains same functionality with improved naming

Next Steps:
- Deploy updated services to production
- Update any external references to old naming
- Monitor scrapers to ensure proper operation

🤖 Generated with [Claude Code](https://claude.ai/code)

Co-Authored-By: Claude <noreply@anthropic.com>
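For reference, this is roughly what the renamed configuration looks like in code. A minimal sketch only: `brand_name` and `data_dir` are the fields confirmed by this change set and by the script below; the rest of the `ScraperConfig` signature is assumed.

```python
# Sketch under the assumptions above -- ScraperConfig's real fields may differ.
from pathlib import Path

from src.base_scraper import ScraperConfig

config = ScraperConfig(
    brand_name="hkia",  # was "hvacknowitall" / "hvacnkowitall"
    data_dir=Path("data_production_backlog"),
)
# Markdown output then lands at <data_dir>/markdown_current/hkia_<source>_backlog_<timestamp>.md
```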
#!/usr/bin/env python3
"""
Production Backlog Capture Script

This script performs a comprehensive backlog download for ALL sources
with full media file downloading and NAS synchronization.

Features:
- Downloads complete historical content from all sources
- Captures all available media files (images, videos, audio)
- Organizes content by source and date
- Syncs everything to NAS
- Provides detailed progress reporting
- Handles errors gracefully with retry logic
"""

import os
import sys
import time
import json
from pathlib import Path
from datetime import datetime
import logging
from typing import Dict, Any, Optional

# Add project to path
sys.path.insert(0, str(Path(__file__).parent))

from src.orchestrator import ContentOrchestrator
from src.base_scraper import ScraperConfig

# Configure logging
logging.basicConfig(
    level=logging.INFO,
    format='%(asctime)s - %(name)s - %(levelname)s - %(message)s',
    handlers=[
        logging.FileHandler('production_backlog_capture.log'),
        logging.StreamHandler()
    ]
)
logger = logging.getLogger(__name__)


class ProductionBacklogCapture:
    """Handles comprehensive backlog capture for production deployment"""

    def __init__(self, data_dir: Optional[Path] = None):
        self.data_dir = data_dir or Path("data_production_backlog")
        self.logs_dir = Path("logs_production_backlog")
        self.start_time = time.time()

        # Create directories
        self.data_dir.mkdir(parents=True, exist_ok=True)
        self.logs_dir.mkdir(parents=True, exist_ok=True)

        # Initialize orchestrator
        self.orchestrator = ContentOrchestrator(self.data_dir, self.logs_dir)

        # Track results
        self.results = {}

    def capture_source_backlog(self, source_name: str, max_items: Optional[int] = None) -> Dict[str, Any]:
        """Capture complete backlog for a specific source"""
        logger.info(f"Starting backlog capture for {source_name}...")

        start_time = time.time()

        try:
            scraper = self.orchestrator.scrapers.get(source_name)
            if not scraper:
                logger.error(f"Scraper not found: {source_name}")
                return {"success": False, "error": "Scraper not found", "items": 0}

            # Clear state for full backlog
            if scraper.state_file.exists():
                scraper.state_file.unlink()
                logger.info(f"Cleared state for {source_name} - full backlog mode")

            # Fetch content with special handling for each source
            if source_name == "tiktok":
                # TikTok with captions for first 100 videos when fetching 1000
                caption_count = min(100, max_items // 10) if max_items else 50
                items = scraper.fetch_content(
                    max_posts=max_items or 200,
                    fetch_captions=True,
                    max_caption_fetches=caption_count
                )
            elif source_name == "youtube":
                items = scraper.fetch_channel_videos(max_videos=max_items or 100)
            elif source_name == "instagram":
                items = scraper.fetch_content(max_posts=max_items or 100)
            else:
                # RSS sources
                items = scraper.fetch_content(max_items=max_items)

            if not items:
                logger.warning(f"No items fetched for {source_name}")
                return {"success": True, "items": 0, "duration": time.time() - start_time}

            logger.info(f"Fetched {len(items)} items for {source_name}")

            # Download media files for items with media
            media_downloaded = 0
            for i, item in enumerate(items):
                if i % 10 == 0:
                    logger.info(f"Processing media for {source_name}: {i}/{len(items)}")

                # Download media based on item type
                media_urls = []

                # Extract media URLs from various fields
                if 'image' in item and item['image']:
                    media_urls.append((item['image'], 'image'))
                if 'thumbnail' in item and item['thumbnail']:
                    media_urls.append((item['thumbnail'], 'image'))
                if 'video_url' in item and item['video_url']:
                    media_urls.append((item['video_url'], 'video'))
                if 'audio_link' in item and item['audio_link']:
                    media_urls.append((item['audio_link'], 'audio'))

                # Download each media file
                for url, media_type in media_urls:
                    try:
                        local_path = scraper.download_media(url, item.get('id', f'item_{i}'), media_type)
                        if local_path:
                            media_downloaded += 1
                            # Add local path to item
                            if 'local_media' not in item:
                                item['local_media'] = []
                            item['local_media'].append(local_path)
                    except Exception as e:
                        logger.warning(f"Failed to download media {url}: {e}")

            logger.info(f"Downloaded {media_downloaded} media files for {source_name}")

            # Generate and save markdown
            markdown = scraper.format_markdown(items)
            timestamp = datetime.now().strftime("%Y%m%d_%H%M%S")
            filename = f"hkia_{source_name}_backlog_{timestamp}.md"

            # Save to current directory
            current_dir = scraper.config.data_dir / "markdown_current"
            current_dir.mkdir(parents=True, exist_ok=True)
            output_file = current_dir / filename
            output_file.write_text(markdown, encoding='utf-8')

            # Update state
            new_state = {
                'last_update': datetime.now().isoformat(),
                'last_item_count': len(items),
                'backlog_captured': True,
                'backlog_timestamp': timestamp
            }

            if items:
                new_state['last_id'] = items[-1].get('id')

            scraper.save_state(new_state)

            duration = time.time() - start_time
            logger.info(f"✅ {source_name}: {len(items)} items, {media_downloaded} media files in {duration:.1f}s")

            return {
                "success": True,
                "items": len(items),
                "media_files": media_downloaded,
                "duration": duration,
                "output_file": str(output_file)
            }

        except Exception as e:
            duration = time.time() - start_time
            logger.error(f"❌ {source_name} failed after {duration:.1f}s: {e}")
            return {
                "success": False,
                "error": str(e),
                "items": 0,
                "duration": duration
            }

    def capture_all_backlogs(self) -> Dict[str, Any]:
        """Capture backlogs for all sources"""
        logger.info("=" * 80)
        logger.info("STARTING PRODUCTION BACKLOG CAPTURE")
        logger.info("=" * 80)

        # Source configurations with appropriate limits
        sources_config = {
            "wordpress": {"max_items": None},   # All posts
            "mailchimp": {"max_items": None},   # All available (limited by RSS)
            "podcast": {"max_items": None},     # All episodes
            "youtube": {"max_items": 200},      # Last 200 videos
            "instagram": {"max_items": 200},    # Last 200 posts
            "tiktok": {"max_items": 300}        # 300 videos; captions fetched for the first 30 (min(100, 300 // 10))
        }

        total_items = 0
        total_media = 0
        successful_sources = 0

        for source_name, config in sources_config.items():
            logger.info(f"\n{'-'*60}")
            logger.info(f"PROCESSING: {source_name.upper()}")
            logger.info(f"{'-'*60}")

            result = self.capture_source_backlog(source_name, config["max_items"])
            self.results[source_name] = result

            if result["success"]:
                successful_sources += 1
                total_items += result["items"]
                total_media += result.get("media_files", 0)

            # Add delay between sources to be respectful
            if source_name != list(sources_config.keys())[-1]:  # Not last source
                logger.info("Waiting 30 seconds before next source...")
                time.sleep(30)

        # Generate summary
        total_duration = time.time() - self.start_time

        summary = {
            "timestamp": datetime.now().isoformat(),
            "total_duration": total_duration,
            "total_items": total_items,
            "total_media_files": total_media,
            "successful_sources": successful_sources,
            "total_sources": len(sources_config),
            "results": self.results
        }

        # Save summary
        summary_file = self.data_dir / f"backlog_capture_summary_{datetime.now().strftime('%Y%m%d_%H%M%S')}.json"
        with open(summary_file, 'w') as f:
            json.dump(summary, f, indent=2)

        logger.info("\n" + "=" * 80)
        logger.info("BACKLOG CAPTURE COMPLETE")
        logger.info("=" * 80)
        logger.info(f"Total items: {total_items:,}")
        logger.info(f"Total media files: {total_media:,}")
        logger.info(f"Successful sources: {successful_sources}/{len(sources_config)}")
        logger.info(f"Total duration: {total_duration/60:.1f} minutes")
        logger.info(f"Summary saved: {summary_file}")

        return summary

    def sync_to_nas(self) -> bool:
        """Sync all captured data to NAS"""
        logger.info("\n" + "=" * 60)
        logger.info("SYNCING TO NAS")
        logger.info("=" * 60)

        try:
            success = self.orchestrator.sync_to_nas()
            if success:
                logger.info("✅ NAS sync completed successfully")
            else:
                logger.error("❌ NAS sync failed")
            return success
        except Exception as e:
            logger.error(f"❌ NAS sync error: {e}")
            return False


def main():
    """Main execution function"""
    print("🚀 HKIA - Production Backlog Capture")
    print("=" * 60)
    print("This will download complete historical content from ALL sources")
    print("Including all available media files (images, videos, audio)")
    print("Estimated time: 2-4 hours depending on content volume")
    print("=" * 60)

    response = input("Proceed with full backlog capture? (y/N): ")
    if response.lower() != 'y':
        print("Backlog capture cancelled.")
        return False

    # Initialize capture
    capture = ProductionBacklogCapture()

    # Capture all backlogs
    summary = capture.capture_all_backlogs()

    # Sync to NAS if any content was captured
    if summary["total_items"] > 0:
        nas_success = capture.sync_to_nas()
        summary["nas_sync_success"] = nas_success
    else:
        logger.warning("No content captured - skipping NAS sync")
        summary["nas_sync_success"] = False

    # Final summary
    print(f"\n🎉 PRODUCTION BACKLOG CAPTURE COMPLETE!")
    print(f"📊 Summary:")
    print(f"  • Total items captured: {summary['total_items']:,}")
    print(f"  • Total media files: {summary['total_media_files']:,}")
    print(f"  • Sources processed: {summary['successful_sources']}/{summary['total_sources']}")
    print(f"  • Duration: {summary['total_duration']/60:.1f} minutes")
    print(f"  • NAS sync: {'✅' if summary.get('nas_sync_success') else '❌'}")

    return summary["successful_sources"] > 0


if __name__ == "__main__":
    try:
        success = main()
        sys.exit(0 if success else 1)
    except KeyboardInterrupt:
        print("\n\nBacklog capture interrupted by user")
        sys.exit(1)
    except Exception as e:
        logger.critical(f"Backlog capture failed: {e}")
        sys.exit(2)