hvac-kia-content/test_api_scrapers_full.py
Ben Reed daab901e35 refactor: Update naming convention from hvacknowitall to hkia
Major Changes:
- Updated all code references from hvacknowitall/hvacnkowitall to hkia
- Renamed all existing markdown files to use hkia_ prefix
- Updated configuration files, scrapers, and production scripts
- Modified systemd service descriptions to use HKIA
- Changed NAS sync path to /mnt/nas/hkia

Files Updated:
- 20+ source files updated with new naming convention
- 34 markdown files renamed to hkia_* format
- All ScraperConfig brand_name parameters now use 'hkia'
- Documentation updated to reflect new naming

Rationale:
- Shorter, cleaner filenames
- Consistent branding across all outputs
- Easier to type and reference
- Maintains same functionality with improved naming

Next Steps:
- Deploy updated services to production
- Update any external references to old naming
- Monitor scrapers to ensure proper operation

🤖 Generated with [Claude Code](https://claude.ai/code)

Co-Authored-By: Claude <noreply@anthropic.com>
2025-08-19 13:35:23 -03:00

162 lines
No EOL
5.9 KiB
Python

#!/usr/bin/env python3
"""
Test full backlog capture with new API scrapers
This will fetch all YouTube videos and MailChimp campaigns using APIs
"""
import sys
from pathlib import Path
sys.path.insert(0, str(Path(__file__).parent))
from src.youtube_api_scraper import YouTubeAPIScraper
from src.mailchimp_api_scraper import MailChimpAPIScraper
from src.base_scraper import ScraperConfig
import time
def test_youtube_api_full():
    """Exercise the YouTube API scraper against the full channel.

    Fetches every video (with transcripts for the top videos by view
    count), prints aggregate statistics and a top-5 leaderboard, and
    writes the formatted markdown to data_api_test/youtube/.

    Returns:
        list[dict]: Video metadata dicts as returned by
            ``YouTubeAPIScraper.fetch_content``.
    """
    print("=" * 60)
    print("TESTING YOUTUBE API SCRAPER - FULL CHANNEL")
    print("=" * 60)

    config = ScraperConfig(
        source_name='youtube_api',
        # 'hkia' per the project-wide rename from 'hvacknowitall'
        # (all ScraperConfig brand_name parameters now use 'hkia').
        brand_name='hkia',
        data_dir=Path('data_api_test/youtube'),
        logs_dir=Path('logs_api_test/youtube'),
        timezone='America/Halifax'
    )

    scraper = YouTubeAPIScraper(config)

    print("Fetching all videos from channel...")
    start = time.time()

    # Fetch all videos (should be ~370), with transcripts for the
    # top videos by views.
    videos = scraper.fetch_content(fetch_transcripts=True)

    elapsed = time.time() - start
    print(f"\n✅ Fetched {len(videos)} videos in {elapsed:.1f} seconds")

    # Aggregate statistics across the whole channel.
    total_views = sum(v.get('view_count', 0) for v in videos)
    total_likes = sum(v.get('like_count', 0) for v in videos)
    with_transcripts = sum(1 for v in videos if v.get('transcript'))

    print("\nStatistics:")
    print(f"  Total videos: {len(videos)}")
    print(f"  Total views: {total_views:,}")
    print(f"  Total likes: {total_likes:,}")
    print(f"  Videos with transcripts: {with_transcripts}")
    print(f"  Quota used: {scraper.quota_used}/{scraper.daily_quota_limit} units")

    # Show the top 5 videos by view count.
    print("\nTop 5 videos by views:")
    top_videos = sorted(videos, key=lambda x: x.get('view_count', 0), reverse=True)[:5]
    for i, video in enumerate(top_videos, 1):
        views = video.get('view_count', 0)
        title = video.get('title', 'Unknown')[:60]
        # BUG FIX: both branches were '' (indicator glyphs lost), so the
        # "Transcript:" column was always blank. Use explicit Yes/No.
        has_transcript = 'Yes' if video.get('transcript') else 'No'
        print(f"  {i}. {views:,} views | {title}... | Transcript: {has_transcript}")

    # Persist the rendered markdown for inspection.
    markdown = scraper.format_markdown(videos)
    output_file = Path('data_api_test/youtube/youtube_api_full.md')
    output_file.parent.mkdir(parents=True, exist_ok=True)
    output_file.write_text(markdown, encoding='utf-8')
    print(f"\nMarkdown saved to: {output_file}")

    return videos
def test_mailchimp_api_full():
    """Exercise the MailChimp API scraper against all campaigns.

    Fetches up to 100 campaigns from the 'Bi-Weekly Newsletter' folder,
    prints aggregate send/open/click statistics plus the five most
    recent campaigns, and writes the formatted markdown to
    data_api_test/mailchimp/.

    Returns:
        list[dict]: Campaign dicts as returned by
            ``MailChimpAPIScraper.fetch_content`` (may be empty).
    """
    print("\n" + "=" * 60)
    print("TESTING MAILCHIMP API SCRAPER - ALL CAMPAIGNS")
    print("=" * 60)

    config = ScraperConfig(
        source_name='mailchimp_api',
        # 'hkia' per the project-wide rename from 'hvacknowitall'
        # (all ScraperConfig brand_name parameters now use 'hkia').
        brand_name='hkia',
        data_dir=Path('data_api_test/mailchimp'),
        logs_dir=Path('logs_api_test/mailchimp'),
        timezone='America/Halifax'
    )

    scraper = MailChimpAPIScraper(config)

    print("Fetching all campaigns from 'Bi-Weekly Newsletter' folder...")
    start = time.time()

    # Fetch all campaigns (up to 100).
    campaigns = scraper.fetch_content(max_items=100)

    elapsed = time.time() - start
    print(f"\n✅ Fetched {len(campaigns)} campaigns in {elapsed:.1f} seconds")

    if campaigns:
        # Aggregate delivery/engagement statistics.
        total_sent = sum(c.get('metrics', {}).get('emails_sent', 0) for c in campaigns)
        total_opens = sum(c.get('metrics', {}).get('unique_opens', 0) for c in campaigns)
        total_clicks = sum(c.get('metrics', {}).get('unique_clicks', 0) for c in campaigns)

        print("\nStatistics:")
        print(f"  Total campaigns: {len(campaigns)}")
        print(f"  Total emails sent: {total_sent:,}")
        print(f"  Total unique opens: {total_opens:,}")
        print(f"  Total unique clicks: {total_clicks:,}")

        # Average rates across campaigns. (The redundant inner
        # `if campaigns:` guard was removed — we are already inside
        # the non-empty branch, so the division is safe.)
        avg_open_rate = sum(c.get('metrics', {}).get('open_rate', 0) for c in campaigns) / len(campaigns)
        avg_click_rate = sum(c.get('metrics', {}).get('click_rate', 0) for c in campaigns) / len(campaigns)
        print(f"  Average open rate: {avg_open_rate*100:.1f}%")
        print(f"  Average click rate: {avg_click_rate*100:.1f}%")

        # Show the five most recent campaigns (API returns newest first
        # here — confirm against MailChimpAPIScraper ordering).
        print("\n5 Most Recent Campaigns:")
        for i, campaign in enumerate(campaigns[:5], 1):
            title = campaign.get('title', 'Unknown')[:50]
            send_time = campaign.get('send_time', 'Unknown')[:10]
            metrics = campaign.get('metrics', {})
            opens = metrics.get('unique_opens', 0)
            open_rate = metrics.get('open_rate', 0) * 100
            print(f"  {i}. {send_time} | {title}... | Opens: {opens} ({open_rate:.1f}%)")

        # Persist the rendered markdown for inspection.
        markdown = scraper.format_markdown(campaigns)
        output_file = Path('data_api_test/mailchimp/mailchimp_api_full.md')
        output_file.parent.mkdir(parents=True, exist_ok=True)
        output_file.write_text(markdown, encoding='utf-8')
        print(f"\nMarkdown saved to: {output_file}")
    else:
        print("\n⚠️ No campaigns found!")

    return campaigns
def main():
    """Drive both full-backlog API scraper tests and print a summary."""
    header = "HVAC Know It All - API Scraper Full Test"
    print(header)
    print("This will fetch all content using the new API scrapers")
    print("-" * 60)

    # Run each scraper test in turn, collecting the fetched items.
    results = {
        "YouTube API": test_youtube_api_full(),
        "MailChimp API": test_mailchimp_api_full(),
    }

    # Final summary of what each scraper returned.
    print("\n" + "=" * 60)
    print("SUMMARY")
    print("=" * 60)
    youtube_videos = results["YouTube API"]
    mailchimp_campaigns = results["MailChimp API"]
    print(f"✅ YouTube API: {len(youtube_videos)} videos fetched")
    print(f"✅ MailChimp API: {len(mailchimp_campaigns)} campaigns fetched")
    print("\nAPI scrapers are working successfully!")
    print("Ready for production deployment.")


if __name__ == "__main__":
    main()