Major enhancements to HKIA content analysis system:

CRITICAL FIXES:
• Fix engagement data parsing from markdown (Views/Likes/Comments now extracted correctly)
• YouTube: 18.75% engagement rate working (16 views, 2 likes, 1 comment)
• Instagram: 7.37% average engagement rate across 20 posts
• High performer detection operational (1 YouTube + 20 Instagram above thresholds)

CONTENT ANALYSIS SYSTEM:
• Add Claude Haiku analyzer for HVAC content classification
• Add engagement analyzer with source-specific algorithms
• Add keyword extractor with 100+ HVAC-specific terms
• Add intelligence aggregator for daily JSON reports
• Add comprehensive unit test suite (73 tests, 90% coverage target)

ARCHITECTURE:
• Extend BaseScraper with optional AI analysis capabilities
• Add content analysis orchestrator with CLI interface
• Add competitive intelligence module structure
• Maintain backward compatibility with existing scrapers

INTELLIGENCE FEATURES:
• Daily intelligence reports with strategic insights
• Trending keyword analysis (813 refrigeration, 701 service mentions)
• Content opportunity identification
• Multi-source engagement benchmarking
• HVAC-specific topic and product categorization

PRODUCTION READY:
• Claude Haiku API integration validated ($15-25/month estimated)
• Graceful degradation when API unavailable
• Comprehensive logging and error handling
• State management for analytics tracking

Ready for Phase 2: Competitive Intelligence Infrastructure

🤖 Generated with [Claude Code](https://claude.ai/code)

Co-Authored-By: Claude <noreply@anthropic.com>
380 lines
No EOL
14 KiB
Python
380 lines
No EOL
14 KiB
Python
#!/usr/bin/env python3
|
|
"""
|
|
Comprehensive Unit Tests for Engagement Analyzer
|
|
|
|
Tests engagement metrics calculation, trending content identification,
|
|
virality scoring, and source-specific analysis.
|
|
"""
|
|
|
|
import pytest
|
|
from unittest.mock import Mock, patch
|
|
from datetime import datetime, timedelta
|
|
from pathlib import Path
|
|
import sys
|
|
|
|
# Put the directory two levels above this file on sys.path (once) so the
# ``src.*`` imports in this module resolve when the file is run directly.
_project_root = str(Path(__file__).parent.parent)
if _project_root not in sys.path:
    sys.path.insert(0, _project_root)
|
|
|
|
from src.content_analysis.engagement_analyzer import (
|
|
EngagementAnalyzer,
|
|
EngagementMetrics,
|
|
TrendingContent
|
|
)
|
|
|
|
|
|
class TestEngagementAnalyzer:
    """Unit tests for ``EngagementAnalyzer``.

    Exercises the engagement-rate and total-engagement helpers, per-source
    aggregate analysis, trending-content identification, virality scoring,
    engagement velocity, and handling of empty, missing or oddly-typed input.
    """

    # ------------------------------------------------------------------
    # Fixtures
    # ------------------------------------------------------------------

    @pytest.fixture
    def analyzer(self):
        """Return a fresh ``EngagementAnalyzer`` instance for each test."""
        return EngagementAnalyzer()

    @pytest.fixture
    def sample_youtube_items(self):
        """Return three YouTube items carrying views, likes and comments."""
        return [
            {
                'id': 'video1',
                'title': 'HVAC Troubleshooting Guide',
                'source': 'youtube',
                'views': 10000,
                'likes': 500,
                'comments': 50,
                'upload_date': '2025-08-27',
            },
            {
                'id': 'video2',
                'title': 'Heat Pump Installation',
                'source': 'youtube',
                'views': 5000,
                'likes': 200,
                'comments': 20,
                'upload_date': '2025-08-26',
            },
            {
                'id': 'video3',
                'title': 'AC Repair Tips',
                'source': 'youtube',
                'views': 1000,
                'likes': 30,
                'comments': 5,
                'upload_date': '2025-08-25',
            },
        ]

    @pytest.fixture
    def sample_instagram_items(self):
        """Return two Instagram items with likes and comments but no views."""
        return [
            {
                'id': 'post1',
                'title': 'HVAC tools showcase',
                'source': 'instagram',
                'likes': 150,
                'comments': 25,
                'upload_date': '2025-08-27',
            },
            {
                'id': 'post2',
                'title': 'Before and after AC install',
                'source': 'instagram',
                'likes': 80,
                'comments': 10,
                'upload_date': '2025-08-26',
            },
        ]

    # ------------------------------------------------------------------
    # Per-item helpers
    # ------------------------------------------------------------------

    def test_calculate_engagement_rate_youtube(self, analyzer):
        """Check the YouTube engagement rate for normal, zero-view and sparse items."""
        # Normal case: (50 + 10) / 1000.  approx() is used instead of ``==``
        # because the expected value is a float quotient.
        item = {'views': 1000, 'likes': 50, 'comments': 10}
        rate = analyzer._calculate_engagement_rate(item, 'youtube')
        assert rate == pytest.approx(0.06)

        # Zero views: the rate is expected to be 0.
        item = {'views': 0, 'likes': 50, 'comments': 10}
        rate = analyzer._calculate_engagement_rate(item, 'youtube')
        assert rate == 0

        # Views present but no likes/comments: the rate is expected to be 0.
        item = {'views': 1000}
        rate = analyzer._calculate_engagement_rate(item, 'youtube')
        assert rate == 0

    def test_calculate_engagement_rate_instagram(self, analyzer):
        """Check the Instagram engagement rate with and without view counts."""
        # Views available (preferred method): (100 + 20) / 1000 = 0.12
        item = {'views': 1000, 'likes': 100, 'comments': 20}
        rate = analyzer._calculate_engagement_rate(item, 'instagram')
        assert rate == pytest.approx(0.12)

        # No views: falls back to comments / likes = 20 / 100 = 0.2
        item = {'likes': 100, 'comments': 20}
        rate = analyzer._calculate_engagement_rate(item, 'instagram')
        assert rate == pytest.approx(0.2)

        # Only comments (no likes, no views): nothing to divide by, so 0.
        item = {'comments': 10}
        rate = analyzer._calculate_engagement_rate(item, 'instagram')
        assert rate == 0.0

    def test_get_total_engagement(self, analyzer):
        """Check that total engagement is likes + comments, and 0 for an empty item."""
        # YouTube: 50 + 10
        item = {'likes': 50, 'comments': 10}
        assert analyzer._get_total_engagement(item, 'youtube') == 60

        # Instagram: 100 + 25
        item = {'likes': 100, 'comments': 25}
        assert analyzer._get_total_engagement(item, 'instagram') == 125

        # No engagement fields at all
        assert analyzer._get_total_engagement({}, 'youtube') == 0

    # ------------------------------------------------------------------
    # Per-source aggregate analysis
    # ------------------------------------------------------------------

    def test_analyze_source_engagement_youtube(self, analyzer, sample_youtube_items):
        """Check the structure and aggregate values of the YouTube summary."""
        result = analyzer.analyze_source_engagement(sample_youtube_items, 'youtube')

        # Every summary field must be present.
        expected_keys = (
            'total_items',
            'avg_engagement_rate',
            'median_engagement_rate',
            'total_engagement',
            'trending_count',
            'high_performers',
            'trending_content',
        )
        for key in expected_keys:
            assert key in result, f"missing summary key: {key}"

        assert result['total_items'] == 3
        assert result['total_engagement'] == 805  # 550 + 220 + 35

        # Per-item rates: (500+50)/10000 = 0.055, (200+20)/5000 = 0.044,
        # (30+5)/1000 = 0.035
        expected_avg = (0.055 + 0.044 + 0.035) / 3
        assert result['avg_engagement_rate'] == pytest.approx(expected_avg, abs=0.001)

        # High performers (threshold 0.05 for YouTube): only video1 is above it.
        assert result['high_performers'] == 1

    def test_analyze_source_engagement_instagram(self, analyzer, sample_instagram_items):
        """Check aggregate values of the Instagram summary (items have no views)."""
        result = analyzer.analyze_source_engagement(sample_instagram_items, 'instagram')

        assert result['total_items'] == 2
        assert result['total_engagement'] == 265  # 175 + 90

        # Without views the comments/likes fallback applies per post.
        # Exact fractions are used so rounding of the intermediates does not
        # consume part of the tolerance.
        expected_avg = (25 / 150 + 10 / 80) / 2
        assert result['avg_engagement_rate'] == pytest.approx(expected_avg, abs=0.001)

    # ------------------------------------------------------------------
    # Trending / virality / velocity
    # ------------------------------------------------------------------

    def test_identify_trending_content(self, analyzer, sample_youtube_items):
        """Check that trending items are found and expose the expected fields."""
        # NOTE(review): the fixture uses fixed upload dates and the clock is not
        # frozen here; if trending detection depends on content age this test
        # may be time-sensitive -- confirm against the analyzer.
        trending = analyzer.identify_trending_content(sample_youtube_items, 'youtube')

        # Should identify high-engagement content
        assert trending, "expected at least one trending item"

        # Check trending content structure on the first entry
        first = trending[0]
        for key in ('content_id', 'source', 'title', 'engagement_score', 'trend_type'):
            assert key in first, f"missing trending field: {key}"

    def test_calculate_virality_score(self, analyzer):
        """Check that virality scores are positive for strong items and never negative."""
        # NOTE(review): upload dates are fixed and the clock is not frozen; if
        # the score depends on recency the values may drift over time -- confirm.

        # High engagement content
        high_engagement = {
            'views': 10000,
            'likes': 800,
            'comments': 200,
            'upload_date': '2025-08-27',
        }
        assert analyzer._calculate_virality_score(high_engagement, 'youtube') > 0

        # Low engagement content
        low_engagement = {
            'views': 100,
            'likes': 5,
            'comments': 1,
            'upload_date': '2025-08-27',
        }
        assert analyzer._calculate_virality_score(low_engagement, 'youtube') >= 0

    def test_get_engagement_velocity(self, analyzer):
        """Check views-per-day velocity with "now" frozen at 2025-08-28."""
        cases = [
            # (item, expected views per day)
            ({'views': 5000, 'upload_date': '2025-08-27'}, 5000),      # 1 day old
            ({'views': 1000, 'upload_date': '2025-08-25'}, 1000 / 3),  # 3 days old
        ]

        # Replace ``datetime`` inside the analyzer module so ``now()`` is
        # deterministic; ``strptime`` is wired back to the real implementation
        # so date strings still parse.
        with patch('src.content_analysis.engagement_analyzer.datetime') as mock_datetime:
            mock_datetime.now.return_value = datetime(2025, 8, 28)
            mock_datetime.strptime = datetime.strptime

            for item, expected in cases:
                velocity = analyzer._get_engagement_velocity(item)
                # abs=0.01 accepts both a 2-decimal rounded result (333.33)
                # and the unrounded quotient.
                assert velocity == pytest.approx(expected, abs=0.01)

    # ------------------------------------------------------------------
    # Edge cases
    # ------------------------------------------------------------------

    def test_empty_content_list(self, analyzer):
        """Check that an empty item list yields an all-zero summary."""
        result = analyzer.analyze_source_engagement([], 'youtube')

        zero_valued_keys = (
            'total_items',
            'avg_engagement_rate',
            'median_engagement_rate',
            'total_engagement',
            'trending_count',
            'high_performers',
        )
        for key in zero_valued_keys:
            assert result[key] == 0, f"{key} should be 0 for empty input"
        assert result['trending_content'] == []

    def test_missing_engagement_data(self, analyzer):
        """Check that items without usable engagement data contribute zero."""
        items = [
            {'id': 'test1', 'title': 'Test', 'source': 'youtube'},  # No engagement data
            {'id': 'test2', 'title': 'Test 2', 'source': 'youtube', 'views': 0},  # Zero views
        ]

        result = analyzer.analyze_source_engagement(items, 'youtube')

        assert result['total_items'] == 2
        assert result['avg_engagement_rate'] == 0
        assert result['total_engagement'] == 0

    def test_engagement_thresholds_configuration(self, analyzer):
        """Check that per-source threshold dictionaries expose the expected keys."""
        # Check YouTube thresholds
        youtube_thresholds = analyzer.engagement_thresholds['youtube']
        for key in ('high_engagement_rate', 'viral_threshold', 'view_velocity_threshold'):
            assert key in youtube_thresholds, f"missing YouTube threshold: {key}"

        # Check Instagram thresholds
        instagram_thresholds = analyzer.engagement_thresholds['instagram']
        for key in ('high_engagement_rate', 'viral_threshold'):
            assert key in instagram_thresholds, f"missing Instagram threshold: {key}"

    def test_wordpress_engagement_analysis(self, analyzer):
        """Check the WordPress summary for a post that only has a comment count."""
        items = [
            {
                'id': 'post1',
                'title': 'HVAC Blog Post',
                'source': 'wordpress',
                'comments': 15,
                'upload_date': '2025-08-27',
            },
        ]

        result = analyzer.analyze_source_engagement(items, 'wordpress')

        assert result['total_items'] == 1
        # Comments are the only engagement signal supplied, so the total is 15.
        assert result['total_engagement'] == 15

    def test_podcast_engagement_analysis(self, analyzer):
        """Check the podcast summary for an episode with no engagement fields."""
        items = [
            {
                'id': 'episode1',
                'title': 'HVAC Podcast Episode',
                'source': 'podcast',
                'upload_date': '2025-08-27',
            },
        ]

        result = analyzer.analyze_source_engagement(items, 'podcast')

        assert result['total_items'] == 1
        # No likes/comments/views supplied, so total engagement is expected to be 0.
        assert result['total_engagement'] == 0

    def test_edge_case_numeric_conversions(self, analyzer):
        """Check handling of string-formatted numbers and ``None`` values."""
        # String numeric values, including a thousands separator:
        # (50 + 10) / 1000 = 0.06
        item = {'views': '1,000', 'likes': '50', 'comments': '10'}
        rate = analyzer._calculate_engagement_rate(item, 'youtube')
        assert rate == pytest.approx(0.06)

        # None values
        item = {'views': None, 'likes': None, 'comments': None}
        rate = analyzer._calculate_engagement_rate(item, 'youtube')
        assert rate == 0

    def test_trending_content_types(self, analyzer):
        """Check that every trending item carries a recognised ``trend_type``."""
        # NOTE(review): upload dates are fixed and the clock is not frozen; if
        # classification depends on content age this test may be
        # time-sensitive -- confirm against the analyzer.

        # High engagement, recent = viral candidate
        viral_item = {
            'id': 'viral1',
            'title': 'Viral HVAC Video',
            'views': 100000,
            'likes': 5000,
            'comments': 500,
            'upload_date': '2025-08-27',
        }

        # Steady growth candidate
        steady_item = {
            'id': 'steady1',
            'title': 'Steady HVAC Content',
            'views': 10000,
            'likes': 300,
            'comments': 30,
            'upload_date': '2025-08-25',
        }

        trending = analyzer.identify_trending_content([viral_item, steady_item], 'youtube')

        # Should identify trending content with proper classification
        assert len(trending) > 0

        # A 'viral' label is deliberately not required: whether it triggers
        # depends on the analyzer's thresholds, so only the set of allowed
        # labels is verified.
        allowed_trend_types = ['viral', 'steady_growth', 'spike']
        for item in trending:
            assert item['trend_type'] in allowed_trend_types
|
|
|
|
|
|
if __name__ == "__main__":
    # Run this module's tests with coverage for the analyzer and propagate
    # pytest's exit status, so a failing run is visible to the shell / CI
    # instead of always exiting 0.
    sys.exit(
        pytest.main([
            __file__,
            "-v",
            "--cov=src.content_analysis.engagement_analyzer",
            "--cov-report=term-missing",
        ])
    )