Major Updates:
- Added image downloading for Instagram, YouTube, and Podcast scrapers
- Implemented cumulative markdown system for maintaining single source-of-truth files
- Deployed production services with automatic NAS sync for images
- Standardized file naming conventions per project specification
New Features:
- Instagram: Downloads all post images, carousel images, and video thumbnails
- YouTube: Downloads video thumbnails (highest quality available)
- Podcast: Downloads episode artwork/thumbnails
- Consistent image naming: {source}_{item_id}_{type}.{ext} (see the sketch after this list)
- Cumulative markdown updates to prevent file proliferation
- Automatic media sync to NAS at /mnt/nas/hvacknowitall/media/
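
A minimal sketch of the image-naming convention, assuming a hypothetical `build_image_filename` helper (the real logic lives inside the scraper classes; lowercasing the source is an illustrative choice, not a confirmed detail):

```python
from pathlib import Path

def build_image_filename(source: str, item_id: str, img_type: str, ext: str) -> str:
    """Hypothetical helper illustrating {source}_{item_id}_{type}.{ext} naming."""
    # Lowercased source and the type suffix format are illustrative choices,
    # not confirmed details of the production scrapers.
    return f"{source.lower()}_{item_id}_{img_type}.{ext.lstrip('.')}"

# Example: the second carousel image of an Instagram post
path = Path('data/media/Instagram') / build_image_filename('Instagram', 'C8xYz123', 'carousel_02', 'jpg')
print(path)  # data/media/Instagram/instagram_C8xYz123_carousel_02.jpg
```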
Production Deployment:
- New systemd services: hvac-content-images-8am and hvac-content-images-12pm
- Runs twice daily at 8 AM and 12 PM Atlantic time
- Comprehensive rsync for both markdown and media files
File Structure Compliance:
- Renamed Instagram backlog to spec-compliant format
- Archived legacy directory structures
- Ensured all new files follow <brandName>_<source>_<dateTime>.md format (see the sketch below)
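
A sketch of how a spec-compliant markdown filename can be derived, reusing the Atlantic-timezone timestamp format from the production script below (the helper name is illustrative):

```python
from datetime import datetime
import pytz

def build_markdown_filename(brand_name: str, source: str) -> str:
    """Illustrative helper for <brandName>_<source>_<dateTime>.md naming."""
    tz = pytz.timezone('America/Halifax')
    date_time = datetime.now(tz).strftime('%Y-%m-%dT%H%M%S')  # matches get_atlantic_timestamp() below
    return f"{brand_name}_{source}_{date_time}.md"

print(build_markdown_filename('hvacknowitall', 'Instagram'))
# e.g. hvacknowitall_Instagram_2025-01-15T080000.md
```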
Testing:
- Successfully captured Instagram posts 1-1000 with images
- Launched next batch (posts 1001-2000) currently in progress
- Verified thumbnail downloads for YouTube and Podcast content
🤖 Generated with [Claude Code](https://claude.ai/code)
Co-Authored-By: Claude <noreply@anthropic.com>
#!/usr/bin/env python3
"""
Production script with comprehensive image downloading for all sources.
Downloads thumbnails and images from Instagram, YouTube, and Podcasts.
"""

import logging
import os
import subprocess
import sys
import time
from datetime import datetime
from pathlib import Path

import pytz

# Make the src package importable when running this script directly
sys.path.insert(0, str(Path(__file__).parent))

from src.youtube_api_scraper_with_thumbnails import YouTubeAPIScraperWithThumbnails
from src.instagram_scraper_with_images import InstagramScraperWithImages
from src.rss_scraper_with_images import RSSScraperPodcastWithImages
from src.base_scraper import ScraperConfig
from src.cumulative_markdown_manager import CumulativeMarkdownManager

# Image extensions downloaded, counted, and synced by this script
IMAGE_EXTS = ('*.jpg', '*.jpeg', '*.png', '*.gif')

# Set up logging; create the logs directory first so the FileHandler can open its file
Path('logs').mkdir(exist_ok=True)
logging.basicConfig(
    level=logging.INFO,
    format='%(asctime)s - %(name)s - %(levelname)s - %(message)s',
    handlers=[
        logging.FileHandler('logs/production_with_images.log'),
        logging.StreamHandler()
    ]
)
logger = logging.getLogger('production_with_images')


def get_atlantic_timestamp() -> str:
    """Get current timestamp in Atlantic timezone for file naming."""
    tz = pytz.timezone('America/Halifax')
    return datetime.now(tz).strftime('%Y-%m-%dT%H%M%S')


def run_youtube_with_thumbnails():
    """Run YouTube API scraper with thumbnail downloads."""
    logger.info("=" * 60)
    logger.info("YOUTUBE API SCRAPER WITH THUMBNAILS")
    logger.info("=" * 60)

    config = ScraperConfig(
        source_name='YouTube',
        brand_name='hvacknowitall',
        data_dir=Path('data'),
        logs_dir=Path('logs'),
        timezone='America/Halifax'
    )

    try:
        scraper = YouTubeAPIScraperWithThumbnails(config)

        # Fetch videos with thumbnails
        logger.info("Fetching YouTube videos and downloading thumbnails...")
        videos = scraper.fetch_content(max_posts=100)  # Limit for testing

        if videos:
            # Fold the new videos into the cumulative markdown file
            manager = CumulativeMarkdownManager(config)
            output_file = manager.update_cumulative_file(videos, 'YouTube')

            logger.info(f"✅ YouTube completed: {len(videos)} videos")
            logger.info(f"   Output: {output_file}")

            # Count downloaded thumbnails
            thumb_count = sum(1 for v in videos if v.get('local_thumbnail'))
            logger.info(f"   Thumbnails downloaded: {thumb_count}")

            return True, len(videos), output_file
        else:
            logger.warning("No YouTube videos fetched")
            return False, 0, None

    except Exception as e:
        # logger.exception logs the full traceback through the configured handlers
        logger.exception(f"YouTube scraper error: {e}")
        return False, 0, None


def run_instagram_with_images():
    """Run Instagram scraper with image downloads."""
    logger.info("=" * 60)
    logger.info("INSTAGRAM SCRAPER WITH IMAGES")
    logger.info("=" * 60)

    if not os.getenv('INSTAGRAM_USERNAME'):
        logger.warning("Instagram not configured (INSTAGRAM_USERNAME missing)")
        return False, 0, None

    config = ScraperConfig(
        source_name='Instagram',
        brand_name='hvacknowitall',
        data_dir=Path('data'),
        logs_dir=Path('logs'),
        timezone='America/Halifax'
    )

    try:
        scraper = InstagramScraperWithImages(config)

        # Fetch posts with images (limited for testing)
        logger.info("Fetching Instagram posts and downloading images...")
        items = scraper.fetch_content(max_posts=20)  # Start with 20 for testing

        if items:
            # Fold the new posts into the cumulative markdown file
            manager = CumulativeMarkdownManager(config)
            output_file = manager.update_cumulative_file(items, 'Instagram')

            logger.info(f"✅ Instagram completed: {len(items)} posts")
            logger.info(f"   Output: {output_file}")

            # Count downloaded images (a post may have several, e.g. carousels)
            img_count = sum(len(item.get('local_images', [])) for item in items)
            logger.info(f"   Images downloaded: {img_count}")

            return True, len(items), output_file
        else:
            logger.warning("No Instagram posts fetched")
            return False, 0, None

    except Exception as e:
        logger.exception(f"Instagram scraper error: {e}")
        return False, 0, None


def run_podcast_with_thumbnails():
    """Run Podcast RSS scraper with thumbnail downloads."""
    logger.info("=" * 60)
    logger.info("PODCAST RSS SCRAPER WITH THUMBNAILS")
    logger.info("=" * 60)

    if not os.getenv('PODCAST_RSS_URL'):
        logger.warning("Podcast not configured (PODCAST_RSS_URL missing)")
        return False, 0, None

    config = ScraperConfig(
        source_name='Podcast',
        brand_name='hvacknowitall',
        data_dir=Path('data'),
        logs_dir=Path('logs'),
        timezone='America/Halifax'
    )

    try:
        scraper = RSSScraperPodcastWithImages(config)

        # Fetch episodes with thumbnails
        logger.info("Fetching podcast episodes and downloading thumbnails...")
        items = scraper.fetch_content(max_items=50)  # Limit for testing

        if items:
            # Fold the new episodes into the cumulative markdown file
            manager = CumulativeMarkdownManager(config)
            output_file = manager.update_cumulative_file(items, 'Podcast')

            logger.info(f"✅ Podcast completed: {len(items)} episodes")
            logger.info(f"   Output: {output_file}")

            # Count downloaded thumbnails
            thumb_count = sum(1 for item in items if item.get('local_thumbnail'))
            logger.info(f"   Thumbnails downloaded: {thumb_count}")

            return True, len(items), output_file
        else:
            logger.warning("No podcast episodes fetched")
            return False, 0, None

    except Exception as e:
        logger.exception(f"Podcast scraper error: {e}")
        return False, 0, None


def sync_to_nas_with_images():
    """Sync markdown files AND images to NAS."""
    logger.info("\n" + "=" * 60)
    logger.info("SYNCING TO NAS - MARKDOWN AND IMAGES")
    logger.info("=" * 60)

    nas_base = Path('/mnt/nas/hvacknowitall')

    try:
        # Sync markdown files
        local_current = Path('data/markdown_current')
        nas_current = nas_base / 'markdown_current'

        if local_current.exists() and any(local_current.glob('*.md')):
            nas_current.mkdir(parents=True, exist_ok=True)

            # Copy only .md files; the trailing '--exclude=*' drops everything else
            cmd = ['rsync', '-av', '--include=*.md', '--exclude=*',
                   str(local_current) + '/', str(nas_current) + '/']
            result = subprocess.run(cmd, capture_output=True, text=True)

            if result.returncode == 0:
                logger.info(f"✅ Markdown files synced to NAS: {nas_current}")
                md_count = len(list(nas_current.glob('*.md')))
                logger.info(f"   Total markdown files: {md_count}")
            else:
                logger.warning(f"Markdown sync warning: {result.stderr}")

        # Sync media files
        local_media = Path('data/media')
        nas_media = nas_base / 'media'

        if local_media.exists():
            nas_media.mkdir(parents=True, exist_ok=True)

            # Recurse into subdirectories ('--include=*/') but copy only image files
            cmd = ['rsync', '-av',
                   '--include=*/',
                   '--include=*.jpg', '--include=*.jpeg',
                   '--include=*.png', '--include=*.gif',
                   '--exclude=*',
                   str(local_media) + '/', str(nas_media) + '/']
            result = subprocess.run(cmd, capture_output=True, text=True)

            if result.returncode == 0:
                logger.info(f"✅ Media files synced to NAS: {nas_media}")

                # Count images per source
                for source_dir in nas_media.glob('*'):
                    if source_dir.is_dir():
                        img_count = sum(len(list(source_dir.glob(ext))) for ext in IMAGE_EXTS)
                        if img_count > 0:
                            logger.info(f"   {source_dir.name}: {img_count} images")
            else:
                logger.warning(f"Media sync warning: {result.stderr}")

        # Sync archives
        for source in ['YouTube', 'MailChimp', 'Instagram', 'Podcast', 'WordPress', 'TikTok']:
            local_archive = Path(f'data/markdown_archives/{source}')
            nas_archive = nas_base / f'markdown_archives/{source}'

            if local_archive.exists() and any(local_archive.glob('*.md')):
                nas_archive.mkdir(parents=True, exist_ok=True)

                cmd = ['rsync', '-av', '--include=*.md', '--exclude=*',
                       str(local_archive) + '/', str(nas_archive) + '/']
                result = subprocess.run(cmd, capture_output=True, text=True)

                if result.returncode == 0:
                    logger.info(f"✅ {source} archives synced to NAS")

    except Exception as e:
        logger.error(f"Failed to sync to NAS: {e}")


def main():
    """Main production run with image downloads."""
    logger.info("=" * 70)
    logger.info("HVAC KNOW IT ALL - PRODUCTION WITH IMAGE DOWNLOADS")
    logger.info("Downloads all thumbnails and images (no videos)")
    logger.info("=" * 70)

    atlantic_tz = pytz.timezone('America/Halifax')
    start_time = datetime.now(atlantic_tz)
    logger.info(f"Started at: {start_time.isoformat()}")

    # Track results
    results = {
        'YouTube': {'success': False, 'count': 0, 'file': None},
        'Instagram': {'success': False, 'count': 0, 'file': None},
        'Podcast': {'success': False, 'count': 0, 'file': None}
    }

    # Run YouTube with thumbnails
    success, count, output_file = run_youtube_with_thumbnails()
    results['YouTube'] = {'success': success, 'count': count, 'file': output_file}

    # Pause briefly between scrapers
    time.sleep(2)

    # Run Instagram with images
    success, count, output_file = run_instagram_with_images()
    results['Instagram'] = {'success': success, 'count': count, 'file': output_file}

    time.sleep(2)

    # Run Podcast with thumbnails
    success, count, output_file = run_podcast_with_thumbnails()
    results['Podcast'] = {'success': success, 'count': count, 'file': output_file}

    # Sync to NAS including images
    sync_to_nas_with_images()

    # Summary
    end_time = datetime.now(atlantic_tz)
    duration = (end_time - start_time).total_seconds()

    logger.info("\n" + "=" * 60)
    logger.info("PRODUCTION RUN SUMMARY")
    logger.info("=" * 60)

    for source, result in results.items():
        if result['success']:
            logger.info(f"✅ {source}: {result['count']} items")
            if result['file']:
                logger.info(f"   File: {result['file']}")
        else:
            logger.info(f"❌ {source}: Failed")

    # Count total images downloaded locally
    media_dir = Path('data/media')
    total_images = 0
    if media_dir.exists():
        for source_dir in media_dir.glob('*'):
            if source_dir.is_dir():
                total_images += sum(len(list(source_dir.glob(ext))) for ext in IMAGE_EXTS)

    logger.info(f"\nTotal images downloaded: {total_images}")
    logger.info(f"Duration: {duration:.1f} seconds")
    logger.info("=" * 60)


if __name__ == "__main__":
    main()