Major Changes: - Updated all code references from hvacknowitall/hvacnkowitall to hkia - Renamed all existing markdown files to use hkia_ prefix - Updated configuration files, scrapers, and production scripts - Modified systemd service descriptions to use HKIA - Changed NAS sync path to /mnt/nas/hkia Files Updated: - 20+ source files updated with new naming convention - 34 markdown files renamed to hkia_* format - All ScraperConfig brand_name parameters now use 'hkia' - Documentation updated to reflect new naming Rationale: - Shorter, cleaner filenames - Consistent branding across all outputs - Easier to type and reference - Maintains same functionality with improved naming Next Steps: - Deploy updated services to production - Update any external references to old naming - Monitor scrapers to ensure proper operation 🤖 Generated with [Claude Code](https://claude.ai/code) Co-Authored-By: Claude <noreply@anthropic.com>
238 lines
No EOL
7.6 KiB
Python
238 lines
No EOL
7.6 KiB
Python
#!/usr/bin/env python3
|
||
"""
|
||
Production script with cumulative markdown and image downloads.
|
||
Uses cumulative updates for all sources.
|
||
"""
|
||
|
||
import sys
|
||
from pathlib import Path
|
||
sys.path.insert(0, str(Path(__file__).parent))
|
||
|
||
from src.youtube_api_scraper_with_thumbnails import YouTubeAPIScraperWithThumbnails
|
||
from src.mailchimp_api_scraper_v2 import MailChimpAPIScraper
|
||
from src.instagram_scraper_cumulative import InstagramScraperCumulative
|
||
from src.rss_scraper_with_images import RSSScraperPodcastWithImages
|
||
from src.wordpress_scraper import WordPressScraper
|
||
from src.tiktok_scraper_advanced import TikTokScraperAdvanced
|
||
from src.base_scraper import ScraperConfig
|
||
from src.cumulative_markdown_manager import CumulativeMarkdownManager
|
||
from datetime import datetime
|
||
import pytz
|
||
import time
|
||
import logging
|
||
import subprocess
|
||
import os
|
||
|
||
# Set up logging: mirror all pipeline output to a log file and the console.
# Create the logs directory first — logging.FileHandler raises
# FileNotFoundError if 'logs/' does not exist (e.g. on a fresh checkout).
Path('logs').mkdir(parents=True, exist_ok=True)
logging.basicConfig(
    level=logging.INFO,
    format='%(asctime)s - %(name)s - %(levelname)s - %(message)s',
    handlers=[
        logging.FileHandler('logs/production_cumulative.log'),
        logging.StreamHandler()
    ]
)
logger = logging.getLogger('production_cumulative')
|
||
|
||
|
||
def get_atlantic_timestamp() -> str:
    """Return the current Atlantic-time timestamp for file naming.

    Format: ``YYYY-MM-DDTHHMMSS`` (e.g. ``2024-01-05T143000``) in the
    America/Halifax timezone.
    """
    atlantic = pytz.timezone('America/Halifax')
    now = datetime.now(atlantic)
    return now.strftime('%Y-%m-%dT%H%M%S')
|
||
|
||
|
||
def run_instagram_incremental():
    """Run the Instagram incremental update with cumulative markdown.

    Returns:
        A ``(success, item_count, output_file)`` tuple. ``(False, 0, None)``
        when Instagram is not configured or the scraper raises.
    """
    banner = "=" * 60
    logger.info(banner)
    logger.info("INSTAGRAM INCREMENTAL UPDATE (CUMULATIVE)")
    logger.info(banner)

    # Credentials come from the environment; skip quietly when absent.
    if not os.getenv('INSTAGRAM_USERNAME'):
        logger.warning("Instagram not configured")
        return False, 0, None

    cfg = ScraperConfig(
        source_name='Instagram',
        brand_name='hkia',
        data_dir=Path('data'),
        logs_dir=Path('logs'),
        timezone='America/Halifax',
    )

    try:
        # Check for up to 50 new posts per run.
        return InstagramScraperCumulative(cfg).run_incremental(max_posts=50)
    except Exception as e:
        logger.error(f"Instagram error: {e}")
        return False, 0, None
|
||
|
||
|
||
def run_youtube_incremental():
    """Run the YouTube incremental update, downloading video thumbnails.

    Returns:
        A ``(success, video_count, output_file)`` tuple. ``(False, 0, None)``
        when no new videos were found or the scraper raises.
    """
    banner = "=" * 60
    logger.info(banner)
    logger.info("YOUTUBE INCREMENTAL UPDATE")
    logger.info(banner)

    cfg = ScraperConfig(
        source_name='YouTube',
        brand_name='hkia',
        data_dir=Path('data'),
        logs_dir=Path('logs'),
        timezone='America/Halifax',
    )

    try:
        scraper = YouTubeAPIScraperWithThumbnails(cfg)
        # Check for up to 20 new videos per run.
        videos = scraper.fetch_content(max_posts=20)

        if not videos:
            logger.info("No new YouTube videos")
            return False, 0, None

        # Fold the new videos into the cumulative markdown document.
        manager = CumulativeMarkdownManager(cfg)
        output_file = manager.update_cumulative_file(videos, 'YouTube')

        thumb_count = sum(1 for v in videos if v.get('local_thumbnail'))
        logger.info(f"✅ YouTube: {len(videos)} videos, {thumb_count} thumbnails")
        return True, len(videos), output_file

    except Exception as e:
        logger.error(f"YouTube error: {e}")
        return False, 0, None
|
||
|
||
|
||
def run_podcast_incremental():
    """Run the Podcast incremental update, downloading episode thumbnails.

    Returns:
        A ``(success, episode_count, output_file)`` tuple. ``(False, 0, None)``
        when the podcast feed is not configured, nothing new was found, or
        the scraper raises.
    """
    banner = "=" * 60
    logger.info(banner)
    logger.info("PODCAST INCREMENTAL UPDATE")
    logger.info(banner)

    # The RSS feed URL comes from the environment; skip quietly when absent.
    if not os.getenv('PODCAST_RSS_URL'):
        logger.warning("Podcast not configured")
        return False, 0, None

    cfg = ScraperConfig(
        source_name='Podcast',
        brand_name='hkia',
        data_dir=Path('data'),
        logs_dir=Path('logs'),
        timezone='America/Halifax',
    )

    try:
        scraper = RSSScraperPodcastWithImages(cfg)
        # Check for up to 10 new episodes per run.
        items = scraper.fetch_content(max_items=10)

        if not items:
            logger.info("No new podcast episodes")
            return False, 0, None

        # Fold the new episodes into the cumulative markdown document.
        manager = CumulativeMarkdownManager(cfg)
        output_file = manager.update_cumulative_file(items, 'Podcast')

        thumb_count = sum(1 for item in items if item.get('local_thumbnail'))
        logger.info(f"✅ Podcast: {len(items)} episodes, {thumb_count} thumbnails")
        return True, len(items), output_file

    except Exception as e:
        logger.error(f"Podcast error: {e}")
        return False, 0, None
|
||
|
||
|
||
def sync_to_nas_with_images():
    """Sync markdown files AND images to NAS.

    Copies ``data/markdown_current/*.md`` and image files under
    ``data/media/`` to the NAS mount at ``/mnt/nas/hkia`` via rsync.
    Failures are logged but never raised, so an unmounted NAS cannot abort
    the production run.
    """
    logger.info("\n" + "=" * 60)
    logger.info("SYNCING TO NAS - MARKDOWN AND IMAGES")
    logger.info("=" * 60)

    nas_base = Path('/mnt/nas/hkia')

    try:
        # Sync markdown files
        local_current = Path('data/markdown_current')
        if local_current.exists() and any(local_current.glob('*.md')):
            if _rsync(local_current, nas_base / 'markdown_current',
                      ['--include=*.md', '--exclude=*'], 'Markdown'):
                logger.info(f"✅ Markdown files synced to NAS")

        # Sync media files (images only)
        local_media = Path('data/media')
        if local_media.exists():
            if _rsync(local_media, nas_base / 'media',
                      ['--include=*/',
                       '--include=*.jpg', '--include=*.jpeg',
                       '--include=*.png', '--include=*.gif',
                       '--exclude=*'], 'Media'):
                logger.info(f"✅ Media files synced to NAS")

    except Exception as e:
        logger.error(f"Failed to sync to NAS: {e}")


def _rsync(src: Path, dst: Path, filters: list, label: str) -> bool:
    """Run rsync from ``src/`` to ``dst/`` with the given filter args.

    Creates ``dst`` if needed. Logs rsync's stderr as a warning on a
    non-zero exit (the original code silently ignored media-sync failures)
    and returns True on success, False otherwise.
    """
    dst.mkdir(parents=True, exist_ok=True)
    # Trailing slashes make rsync copy directory *contents*, not the dir itself.
    cmd = ['rsync', '-av', *filters, str(src) + '/', str(dst) + '/']
    result = subprocess.run(cmd, capture_output=True, text=True)
    if result.returncode != 0:
        logger.warning(f"{label} sync warning: {result.stderr}")
        return False
    return True
|
||
|
||
|
||
def main():
    """Main production run with cumulative updates and images.

    Runs each incremental scraper in turn (Instagram, YouTube, Podcast),
    syncs the outputs to the NAS, and logs a per-source summary.
    """
    banner = "=" * 70
    logger.info(banner)
    logger.info("HKIA - CUMULATIVE PRODUCTION")
    logger.info("With Image Downloads and Cumulative Markdown")
    logger.info(banner)

    atlantic_tz = pytz.timezone('America/Halifax')
    start_time = datetime.now(atlantic_tz)
    logger.info(f"Started at: {start_time.isoformat()}")

    # Run the incremental scrapers in order, pausing briefly between sources.
    runners = [
        ('Instagram', run_instagram_incremental),
        ('YouTube', run_youtube_incremental),
        ('Podcast', run_podcast_incremental),
    ]
    results = {}
    for idx, (source, runner) in enumerate(runners):
        if idx:
            time.sleep(2)  # small gap between sources
        success, count, file = runner()
        results[source] = {'success': success, 'count': count, 'file': file}

    # Also run MailChimp (already has cumulative support)
    # ... (add MailChimp, WordPress, TikTok as needed)

    # Sync to NAS
    sync_to_nas_with_images()

    # Summary
    divider = "=" * 60
    logger.info("\n" + divider)
    logger.info("PRODUCTION SUMMARY")
    logger.info(divider)

    for source, result in results.items():
        if result['success']:
            logger.info(f"✅ {source}: {result['count']} items")
        else:
            logger.info(f"ℹ️ {source}: No new items")

    logger.info(divider)
|
||
|
||
|
||
# Run the full production pipeline only when executed as a script,
# not when this module is imported.
if __name__ == "__main__":
    main()