hvac-kia-content/run_production_cumulative.py
Ben Reed daab901e35 refactor: Update naming convention from hvacknowitall to hkia
Major Changes:
- Updated all code references from hvacknowitall (and the misspelled variant hvacnkowitall) to hkia
- Renamed all existing markdown files to use the hkia_ prefix (rename pattern sketched below)
- Updated configuration files, scrapers, and production scripts
- Modified systemd service descriptions to use HKIA
- Changed NAS sync path to /mnt/nas/hkia
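
For reference, the rename pattern looks like this in Python terms (a minimal sketch; the example filename is hypothetical, only the old/new prefixes and the data/markdown_current directory come from this change):

    from pathlib import Path

    # Hypothetical example file; the commit renamed 34 such files in place
    old = Path('data/markdown_current/hvacknowitall_youtube_cumulative.md')
    new = old.with_name(old.name.replace('hvacknowitall_', 'hkia_'))
    old.rename(new)  # -> data/markdown_current/hkia_youtube_cumulative.md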

Files Updated:
- 20+ source files updated with new naming convention
- 34 markdown files renamed to hkia_* format
- All ScraperConfig brand_name parameters now use 'hkia' (see the sketch after this list)
- Documentation updated to reflect new naming
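
Concretely, every scraper config in the script below now passes the new brand name (a sketch mirroring run_production_cumulative.py; only brand_name changed, with the old value shown for contrast):

    from pathlib import Path
    from src.base_scraper import ScraperConfig

    config = ScraperConfig(
        source_name='YouTube',
        brand_name='hkia',  # was 'hvacknowitall'
        data_dir=Path('data'),
        logs_dir=Path('logs'),
        timezone='America/Halifax'
    )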

Rationale:
- Shorter, cleaner filenames
- Consistent branding across all outputs
- Easier to type and reference
- Maintains same functionality with improved naming

Next Steps:
- Deploy updated services to production
- Update any external references to old naming
- Monitor scrapers to ensure proper operation

🤖 Generated with [Claude Code](https://claude.ai/code)

Co-Authored-By: Claude <noreply@anthropic.com>
2025-08-19 13:35:23 -03:00

#!/usr/bin/env python3
"""
Production script with cumulative markdown and image downloads.
Uses cumulative updates for all sources.
"""
import sys
from pathlib import Path

# Make the repo's src/ package importable regardless of the working directory
sys.path.insert(0, str(Path(__file__).parent))

from src.youtube_api_scraper_with_thumbnails import YouTubeAPIScraperWithThumbnails
from src.mailchimp_api_scraper_v2 import MailChimpAPIScraper
from src.instagram_scraper_cumulative import InstagramScraperCumulative
from src.rss_scraper_with_images import RSSScraperPodcastWithImages
from src.wordpress_scraper import WordPressScraper
from src.tiktok_scraper_advanced import TikTokScraperAdvanced
from src.base_scraper import ScraperConfig
from src.cumulative_markdown_manager import CumulativeMarkdownManager

from datetime import datetime
import pytz
import time
import logging
import subprocess
import os

# Set up logging
logging.basicConfig(
    level=logging.INFO,
    format='%(asctime)s - %(name)s - %(levelname)s - %(message)s',
    handlers=[
        logging.FileHandler('logs/production_cumulative.log'),
        logging.StreamHandler()
    ]
)
logger = logging.getLogger('production_cumulative')


def get_atlantic_timestamp() -> str:
    """Get current timestamp in Atlantic timezone for file naming."""
    tz = pytz.timezone('America/Halifax')
    return datetime.now(tz).strftime('%Y-%m-%dT%H%M%S')


def run_instagram_incremental():
    """Run Instagram incremental update with cumulative markdown."""
    logger.info("=" * 60)
    logger.info("INSTAGRAM INCREMENTAL UPDATE (CUMULATIVE)")
    logger.info("=" * 60)
    if not os.getenv('INSTAGRAM_USERNAME'):
        logger.warning("Instagram not configured")
        return False, 0, None
    config = ScraperConfig(
        source_name='Instagram',
        brand_name='hkia',
        data_dir=Path('data'),
        logs_dir=Path('logs'),
        timezone='America/Halifax'
    )
    try:
        scraper = InstagramScraperCumulative(config)
        return scraper.run_incremental(max_posts=50)  # Check for 50 new posts
    except Exception as e:
        logger.error(f"Instagram error: {e}")
        return False, 0, None


def run_youtube_incremental():
    """Run YouTube incremental update with thumbnails."""
    logger.info("=" * 60)
    logger.info("YOUTUBE INCREMENTAL UPDATE")
    logger.info("=" * 60)
    config = ScraperConfig(
        source_name='YouTube',
        brand_name='hkia',
        data_dir=Path('data'),
        logs_dir=Path('logs'),
        timezone='America/Halifax'
    )
    try:
        scraper = YouTubeAPIScraperWithThumbnails(config)
        videos = scraper.fetch_content(max_posts=20)  # Check for 20 new videos
        if videos:
            manager = CumulativeMarkdownManager(config)
            output_file = manager.update_cumulative_file(videos, 'YouTube')
            thumb_count = sum(1 for v in videos if v.get('local_thumbnail'))
            logger.info(f"✅ YouTube: {len(videos)} videos, {thumb_count} thumbnails")
            return True, len(videos), output_file
        else:
            logger.info("No new YouTube videos")
            return False, 0, None
    except Exception as e:
        logger.error(f"YouTube error: {e}")
        return False, 0, None


def run_podcast_incremental():
    """Run Podcast incremental update with thumbnails."""
    logger.info("=" * 60)
    logger.info("PODCAST INCREMENTAL UPDATE")
    logger.info("=" * 60)
    if not os.getenv('PODCAST_RSS_URL'):
        logger.warning("Podcast not configured")
        return False, 0, None
    config = ScraperConfig(
        source_name='Podcast',
        brand_name='hkia',
        data_dir=Path('data'),
        logs_dir=Path('logs'),
        timezone='America/Halifax'
    )
    try:
        scraper = RSSScraperPodcastWithImages(config)
        items = scraper.fetch_content(max_items=10)  # Check for 10 new episodes
        if items:
            manager = CumulativeMarkdownManager(config)
            output_file = manager.update_cumulative_file(items, 'Podcast')
            thumb_count = sum(1 for item in items if item.get('local_thumbnail'))
            logger.info(f"✅ Podcast: {len(items)} episodes, {thumb_count} thumbnails")
            return True, len(items), output_file
        else:
            logger.info("No new podcast episodes")
            return False, 0, None
    except Exception as e:
        logger.error(f"Podcast error: {e}")
        return False, 0, None


def sync_to_nas_with_images():
    """Sync markdown files AND images to NAS."""
    logger.info("\n" + "=" * 60)
    logger.info("SYNCING TO NAS - MARKDOWN AND IMAGES")
    logger.info("=" * 60)
    nas_base = Path('/mnt/nas/hkia')
    try:
        # Sync markdown files
        local_current = Path('data/markdown_current')
        nas_current = nas_base / 'markdown_current'
        if local_current.exists() and any(local_current.glob('*.md')):
            nas_current.mkdir(parents=True, exist_ok=True)
            # rsync filters apply in order: include *.md, exclude everything else
            cmd = ['rsync', '-av', '--include=*.md', '--exclude=*',
                   str(local_current) + '/', str(nas_current) + '/']
            result = subprocess.run(cmd, capture_output=True, text=True)
            if result.returncode == 0:
                logger.info("✅ Markdown files synced to NAS")
            else:
                logger.warning(f"Markdown sync warning: {result.stderr}")
        # Sync media files
        local_media = Path('data/media')
        nas_media = nas_base / 'media'
        if local_media.exists():
            nas_media.mkdir(parents=True, exist_ok=True)
            # '--include=*/' must come first so rsync descends into
            # subdirectories before the catch-all exclude takes effect
            cmd = ['rsync', '-av',
                   '--include=*/',
                   '--include=*.jpg', '--include=*.jpeg',
                   '--include=*.png', '--include=*.gif',
                   '--exclude=*',
                   str(local_media) + '/', str(nas_media) + '/']
            result = subprocess.run(cmd, capture_output=True, text=True)
            if result.returncode == 0:
                logger.info("✅ Media files synced to NAS")
            else:
                logger.warning(f"Media sync warning: {result.stderr}")
    except Exception as e:
        logger.error(f"Failed to sync to NAS: {e}")


def main():
    """Main production run with cumulative updates and images."""
    logger.info("=" * 70)
    logger.info("HKIA - CUMULATIVE PRODUCTION")
    logger.info("With Image Downloads and Cumulative Markdown")
    logger.info("=" * 70)
    atlantic_tz = pytz.timezone('America/Halifax')
    start_time = datetime.now(atlantic_tz)
    logger.info(f"Started at: {start_time.isoformat()}")
    # Track results
    results = {}
    # Run incremental updates, pausing briefly between sources
    success, count, output_file = run_instagram_incremental()
    results['Instagram'] = {'success': success, 'count': count, 'file': output_file}
    time.sleep(2)
    success, count, output_file = run_youtube_incremental()
    results['YouTube'] = {'success': success, 'count': count, 'file': output_file}
    time.sleep(2)
    success, count, output_file = run_podcast_incremental()
    results['Podcast'] = {'success': success, 'count': count, 'file': output_file}
    # Also run MailChimp (already has cumulative support)
    # ... (add MailChimp, WordPress, TikTok as needed)
    # Sync to NAS
    sync_to_nas_with_images()
    # Summary
    logger.info("\n" + "=" * 60)
    logger.info("PRODUCTION SUMMARY")
    logger.info("=" * 60)
    for source, result in results.items():
        if result['success']:
            logger.info(f"✅ {source}: {result['count']} items")
        else:
            logger.info(f"   {source}: No new items")
    logger.info("=" * 60)


if __name__ == "__main__":
    main()