Major Changes: - Updated all code references from hvacknowitall/hvacnkowitall to hkia - Renamed all existing markdown files to use hkia_ prefix - Updated configuration files, scrapers, and production scripts - Modified systemd service descriptions to use HKIA - Changed NAS sync path to /mnt/nas/hkia Files Updated: - 20+ source files updated with new naming convention - 34 markdown files renamed to hkia_* format - All ScraperConfig brand_name parameters now use 'hkia' - Documentation updated to reflect new naming Rationale: - Shorter, cleaner filenames - Consistent branding across all outputs - Easier to type and reference - Maintains same functionality with improved naming Next Steps: - Deploy updated services to production - Update any external references to old naming - Monitor scrapers to ensure proper operation 🤖 Generated with [Claude Code](https://claude.ai/code) Co-Authored-By: Claude <noreply@anthropic.com>
162 lines
No EOL
5.9 KiB
Python
162 lines
No EOL
5.9 KiB
Python
#!/usr/bin/env python3
|
|
"""
|
|
Test full backlog capture with new API scrapers
|
|
This will fetch all YouTube videos and MailChimp campaigns using APIs
|
|
"""
|
|
|
|
import sys
|
|
from pathlib import Path
|
|
sys.path.insert(0, str(Path(__file__).parent))
|
|
|
|
from src.youtube_api_scraper import YouTubeAPIScraper
|
|
from src.mailchimp_api_scraper import MailChimpAPIScraper
|
|
from src.base_scraper import ScraperConfig
|
|
import time
|
|
|
|
def test_youtube_api_full():
|
|
"""Test YouTube API scraper with full channel fetch"""
|
|
print("=" * 60)
|
|
print("TESTING YOUTUBE API SCRAPER - FULL CHANNEL")
|
|
print("=" * 60)
|
|
|
|
config = ScraperConfig(
|
|
source_name='youtube_api',
|
|
brand_name='hvacknowitall',
|
|
data_dir=Path('data_api_test/youtube'),
|
|
logs_dir=Path('logs_api_test/youtube'),
|
|
timezone='America/Halifax'
|
|
)
|
|
|
|
scraper = YouTubeAPIScraper(config)
|
|
|
|
print(f"Fetching all videos from channel...")
|
|
start = time.time()
|
|
|
|
# Fetch all videos (should be ~370)
|
|
# With transcripts for top 50 by views
|
|
videos = scraper.fetch_content(fetch_transcripts=True)
|
|
|
|
elapsed = time.time() - start
|
|
print(f"\n✅ Fetched {len(videos)} videos in {elapsed:.1f} seconds")
|
|
|
|
# Show statistics
|
|
total_views = sum(v.get('view_count', 0) for v in videos)
|
|
total_likes = sum(v.get('like_count', 0) for v in videos)
|
|
with_transcripts = sum(1 for v in videos if v.get('transcript'))
|
|
|
|
print(f"\nStatistics:")
|
|
print(f" Total videos: {len(videos)}")
|
|
print(f" Total views: {total_views:,}")
|
|
print(f" Total likes: {total_likes:,}")
|
|
print(f" Videos with transcripts: {with_transcripts}")
|
|
print(f" Quota used: {scraper.quota_used}/{scraper.daily_quota_limit} units")
|
|
|
|
# Show top 5 videos by views
|
|
print(f"\nTop 5 videos by views:")
|
|
top_videos = sorted(videos, key=lambda x: x.get('view_count', 0), reverse=True)[:5]
|
|
for i, video in enumerate(top_videos, 1):
|
|
views = video.get('view_count', 0)
|
|
title = video.get('title', 'Unknown')[:60]
|
|
has_transcript = '✓' if video.get('transcript') else '✗'
|
|
print(f" {i}. {views:,} views | {title}... | Transcript: {has_transcript}")
|
|
|
|
# Save markdown
|
|
markdown = scraper.format_markdown(videos)
|
|
output_file = Path('data_api_test/youtube/youtube_api_full.md')
|
|
output_file.parent.mkdir(parents=True, exist_ok=True)
|
|
output_file.write_text(markdown, encoding='utf-8')
|
|
print(f"\nMarkdown saved to: {output_file}")
|
|
|
|
return videos
|
|
|
|
|
|
def test_mailchimp_api_full():
|
|
"""Test MailChimp API scraper with full campaign fetch"""
|
|
print("\n" + "=" * 60)
|
|
print("TESTING MAILCHIMP API SCRAPER - ALL CAMPAIGNS")
|
|
print("=" * 60)
|
|
|
|
config = ScraperConfig(
|
|
source_name='mailchimp_api',
|
|
brand_name='hvacknowitall',
|
|
data_dir=Path('data_api_test/mailchimp'),
|
|
logs_dir=Path('logs_api_test/mailchimp'),
|
|
timezone='America/Halifax'
|
|
)
|
|
|
|
scraper = MailChimpAPIScraper(config)
|
|
|
|
print(f"Fetching all campaigns from 'Bi-Weekly Newsletter' folder...")
|
|
start = time.time()
|
|
|
|
# Fetch all campaigns (up to 100)
|
|
campaigns = scraper.fetch_content(max_items=100)
|
|
|
|
elapsed = time.time() - start
|
|
print(f"\n✅ Fetched {len(campaigns)} campaigns in {elapsed:.1f} seconds")
|
|
|
|
if campaigns:
|
|
# Show statistics
|
|
total_sent = sum(c.get('metrics', {}).get('emails_sent', 0) for c in campaigns)
|
|
total_opens = sum(c.get('metrics', {}).get('unique_opens', 0) for c in campaigns)
|
|
total_clicks = sum(c.get('metrics', {}).get('unique_clicks', 0) for c in campaigns)
|
|
|
|
print(f"\nStatistics:")
|
|
print(f" Total campaigns: {len(campaigns)}")
|
|
print(f" Total emails sent: {total_sent:,}")
|
|
print(f" Total unique opens: {total_opens:,}")
|
|
print(f" Total unique clicks: {total_clicks:,}")
|
|
|
|
# Calculate average rates
|
|
if campaigns:
|
|
avg_open_rate = sum(c.get('metrics', {}).get('open_rate', 0) for c in campaigns) / len(campaigns)
|
|
avg_click_rate = sum(c.get('metrics', {}).get('click_rate', 0) for c in campaigns) / len(campaigns)
|
|
print(f" Average open rate: {avg_open_rate*100:.1f}%")
|
|
print(f" Average click rate: {avg_click_rate*100:.1f}%")
|
|
|
|
# Show recent campaigns
|
|
print(f"\n5 Most Recent Campaigns:")
|
|
for i, campaign in enumerate(campaigns[:5], 1):
|
|
title = campaign.get('title', 'Unknown')[:50]
|
|
send_time = campaign.get('send_time', 'Unknown')[:10]
|
|
metrics = campaign.get('metrics', {})
|
|
opens = metrics.get('unique_opens', 0)
|
|
open_rate = metrics.get('open_rate', 0) * 100
|
|
print(f" {i}. {send_time} | {title}... | Opens: {opens} ({open_rate:.1f}%)")
|
|
|
|
# Save markdown
|
|
markdown = scraper.format_markdown(campaigns)
|
|
output_file = Path('data_api_test/mailchimp/mailchimp_api_full.md')
|
|
output_file.parent.mkdir(parents=True, exist_ok=True)
|
|
output_file.write_text(markdown, encoding='utf-8')
|
|
print(f"\nMarkdown saved to: {output_file}")
|
|
else:
|
|
print("\n⚠️ No campaigns found!")
|
|
|
|
return campaigns
|
|
|
|
|
|
def main():
|
|
"""Run full API scraper tests"""
|
|
print("HVAC Know It All - API Scraper Full Test")
|
|
print("This will fetch all content using the new API scrapers")
|
|
print("-" * 60)
|
|
|
|
# Test YouTube API
|
|
youtube_videos = test_youtube_api_full()
|
|
|
|
# Test MailChimp API
|
|
mailchimp_campaigns = test_mailchimp_api_full()
|
|
|
|
# Summary
|
|
print("\n" + "=" * 60)
|
|
print("SUMMARY")
|
|
print("=" * 60)
|
|
print(f"✅ YouTube API: {len(youtube_videos)} videos fetched")
|
|
print(f"✅ MailChimp API: {len(mailchimp_campaigns)} campaigns fetched")
|
|
print("\nAPI scrapers are working successfully!")
|
|
print("Ready for production deployment.")
|
|
|
|
|
|
if __name__ == "__main__":
|
|
main() |