hvac-kia-content/tests/test_engagement_analyzer.py
Ben Reed ade81beea2 feat: Complete Phase 1 content analysis with engagement parsing fixes
Major enhancements to HKIA content analysis system:

CRITICAL FIXES:
• Fix engagement data parsing from markdown (Views/Likes/Comments now extracted correctly)
• YouTube: 18.75% engagement rate working (16 views, 2 likes, 1 comment)
• Instagram: 7.37% average engagement rate across 20 posts
• High performer detection operational (1 YouTube + 20 Instagram above thresholds)

CONTENT ANALYSIS SYSTEM:
• Add Claude Haiku analyzer for HVAC content classification
• Add engagement analyzer with source-specific algorithms
• Add keyword extractor with 100+ HVAC-specific terms
• Add intelligence aggregator for daily JSON reports
• Add comprehensive unit test suite (73 tests, 90% coverage target)

ARCHITECTURE:
• Extend BaseScraper with optional AI analysis capabilities
• Add content analysis orchestrator with CLI interface
• Add competitive intelligence module structure
• Maintain backward compatibility with existing scrapers

INTELLIGENCE FEATURES:
• Daily intelligence reports with strategic insights
• Trending keyword analysis (813 refrigeration, 701 service mentions)
• Content opportunity identification
• Multi-source engagement benchmarking
• HVAC-specific topic and product categorization

PRODUCTION READY:
• Claude Haiku API integration validated ($15-25/month estimated)
• Graceful degradation when API unavailable
• Comprehensive logging and error handling
• State management for analytics tracking

Ready for Phase 2: Competitive Intelligence Infrastructure

🤖 Generated with [Claude Code](https://claude.ai/code)

Co-Authored-By: Claude <noreply@anthropic.com>
2025-08-28 16:40:19 -03:00

380 lines
No EOL
14 KiB
Python

#!/usr/bin/env python3
"""
Comprehensive Unit Tests for Engagement Analyzer
Tests engagement metrics calculation, trending content identification,
virality scoring, and source-specific analysis.
"""
import pytest
from unittest.mock import Mock, patch
from datetime import datetime, timedelta
from pathlib import Path
import sys
# Put the project root (the parent of this tests/ directory) on sys.path so
# that the ``src.*`` package imports below resolve when this file is run
# directly rather than through an installed package.
_PROJECT_ROOT = str(Path(__file__).parent.parent)
if _PROJECT_ROOT not in sys.path:
    sys.path.insert(0, _PROJECT_ROOT)
from src.content_analysis.engagement_analyzer import (
EngagementAnalyzer,
EngagementMetrics,
TrendingContent
)
class TestEngagementAnalyzer:
    """Unit tests for ``EngagementAnalyzer``.

    Exercises the per-item helpers (engagement rate, total engagement,
    virality score, engagement velocity), the per-source aggregate returned
    by ``analyze_source_engagement``, trending content identification, and
    the handling of empty, missing, ``None`` and string-typed metric fields.
    """

    @pytest.fixture
    def analyzer(self):
        """Return a fresh ``EngagementAnalyzer`` built with no arguments."""
        return EngagementAnalyzer()

    @pytest.fixture
    def sample_youtube_items(self):
        """Three YouTube items with views, likes, comments and upload dates."""
        return [
            {
                'id': 'video1',
                'title': 'HVAC Troubleshooting Guide',
                'source': 'youtube',
                'views': 10000,
                'likes': 500,
                'comments': 50,
                'upload_date': '2025-08-27',
            },
            {
                'id': 'video2',
                'title': 'Heat Pump Installation',
                'source': 'youtube',
                'views': 5000,
                'likes': 200,
                'comments': 20,
                'upload_date': '2025-08-26',
            },
            {
                'id': 'video3',
                'title': 'AC Repair Tips',
                'source': 'youtube',
                'views': 1000,
                'likes': 30,
                'comments': 5,
                'upload_date': '2025-08-25',
            },
        ]

    @pytest.fixture
    def sample_instagram_items(self):
        """Two Instagram items carrying likes and comments but no views."""
        return [
            {
                'id': 'post1',
                'title': 'HVAC tools showcase',
                'source': 'instagram',
                'likes': 150,
                'comments': 25,
                'upload_date': '2025-08-27',
            },
            {
                'id': 'post2',
                'title': 'Before and after AC install',
                'source': 'instagram',
                'likes': 80,
                'comments': 10,
                'upload_date': '2025-08-26',
            },
        ]

    def test_calculate_engagement_rate_youtube(self, analyzer):
        """YouTube rate is (likes + comments) / views, and 0 when not computable."""
        # Normal case: (50 + 10) / 1000. Compared with a tolerance because the
        # value is the result of floating-point division.
        item = {'views': 1000, 'likes': 50, 'comments': 10}
        rate = analyzer._calculate_engagement_rate(item, 'youtube')
        assert rate == pytest.approx(0.06)

        # Zero views must yield 0 rather than a division error.
        item = {'views': 0, 'likes': 50, 'comments': 10}
        rate = analyzer._calculate_engagement_rate(item, 'youtube')
        assert rate == 0

        # Views present but no likes/comments supplied.
        item = {'views': 1000}
        rate = analyzer._calculate_engagement_rate(item, 'youtube')
        assert rate == 0

    def test_calculate_engagement_rate_instagram(self, analyzer):
        """Instagram rate prefers views and falls back to comments / likes."""
        # Views available: (likes + comments) / views = (100 + 20) / 1000.
        item = {'views': 1000, 'likes': 100, 'comments': 20}
        rate = analyzer._calculate_engagement_rate(item, 'instagram')
        assert rate == pytest.approx(0.12)

        # No views: fallback is comments / likes = 20 / 100.
        item = {'likes': 100, 'comments': 20}
        rate = analyzer._calculate_engagement_rate(item, 'instagram')
        assert rate == pytest.approx(0.2)

        # Neither views nor likes: nothing to divide by, so the rate is 0.
        item = {'comments': 10}
        rate = analyzer._calculate_engagement_rate(item, 'instagram')
        assert rate == 0.0

    def test_get_total_engagement(self, analyzer):
        """Total engagement is likes + comments, and 0 for an empty item."""
        # YouTube: 50 + 10
        item = {'likes': 50, 'comments': 10}
        assert analyzer._get_total_engagement(item, 'youtube') == 60

        # Instagram: 100 + 25
        item = {'likes': 100, 'comments': 25}
        assert analyzer._get_total_engagement(item, 'instagram') == 125

        # No engagement fields at all
        assert analyzer._get_total_engagement({}, 'youtube') == 0

    def test_analyze_source_engagement_youtube(self, analyzer, sample_youtube_items):
        """YouTube aggregate exposes the expected keys and computed values."""
        result = analyzer.analyze_source_engagement(sample_youtube_items, 'youtube')

        # Structure of the returned summary.
        expected_keys = (
            'total_items',
            'avg_engagement_rate',
            'median_engagement_rate',
            'total_engagement',
            'trending_count',
            'high_performers',
            'trending_content',
        )
        for key in expected_keys:
            assert key in result

        assert result['total_items'] == 3
        assert result['total_engagement'] == 805  # 550 + 220 + 35

        # Per-item rates derived from the fixture:
        # video1 = 0.055, video2 = 0.044, video3 = 0.035
        expected_rates = [
            (500 + 50) / 10000,
            (200 + 20) / 5000,
            (30 + 5) / 1000,
        ]
        expected_avg = sum(expected_rates) / len(expected_rates)
        assert result['avg_engagement_rate'] == pytest.approx(expected_avg, abs=0.001)

        # The test expects a YouTube high-performer threshold of 0.05, which
        # only video1 (0.055) exceeds.
        assert result['high_performers'] == 1

    def test_analyze_source_engagement_instagram(self, analyzer, sample_instagram_items):
        """Instagram aggregate uses the comments / likes fallback (no views)."""
        result = analyzer.analyze_source_engagement(sample_instagram_items, 'instagram')

        assert result['total_items'] == 2
        assert result['total_engagement'] == 265  # 175 + 90

        # The fixture items have no views, so each rate is comments / likes.
        # Computed from the fixture values instead of hand-rounded literals.
        expected_avg = (25 / 150 + 10 / 80) / 2
        assert result['avg_engagement_rate'] == pytest.approx(expected_avg, abs=0.001)

    def test_identify_trending_content(self, analyzer, sample_youtube_items):
        """Trending detection returns at least one well-formed entry."""
        trending = analyzer.identify_trending_content(sample_youtube_items, 'youtube')

        # The sample data is expected to produce at least one trending item.
        assert len(trending) > 0

        # Shape of a trending entry.
        first = trending[0]
        for key in ('content_id', 'source', 'title', 'engagement_score', 'trend_type'):
            assert key in first

    def test_calculate_virality_score(self, analyzer):
        """Virality score is positive for strong engagement and never negative."""
        high_engagement = {
            'views': 10000,
            'likes': 800,
            'comments': 200,
            'upload_date': '2025-08-27',
        }
        assert analyzer._calculate_virality_score(high_engagement, 'youtube') > 0

        low_engagement = {
            'views': 100,
            'likes': 5,
            'comments': 1,
            'upload_date': '2025-08-27',
        }
        assert analyzer._calculate_virality_score(low_engagement, 'youtube') >= 0

    def test_get_engagement_velocity(self, analyzer):
        """Velocity is views divided by days since upload, with 'now' frozen."""
        datetime_target = 'src.content_analysis.engagement_analyzer.datetime'
        frozen_now = datetime(2025, 8, 28)
        cases = [
            # (item, expected views per day)
            ({'views': 5000, 'upload_date': '2025-08-27'}, 5000),      # 1 day old
            ({'views': 1000, 'upload_date': '2025-08-25'}, 1000 / 3),  # 3 days old
        ]

        for item, expected in cases:
            with patch(datetime_target) as mock_datetime:
                # Freeze "now" while keeping real date parsing available.
                mock_datetime.now.return_value = frozen_now
                mock_datetime.strptime = datetime.strptime
                velocity = analyzer._get_engagement_velocity(item)
            # Tolerance of one cent covers a result rounded to two decimals
            # (333.33) as well as the unrounded quotient.
            assert velocity == pytest.approx(expected, abs=0.01)

    def test_empty_content_list(self, analyzer):
        """An empty item list yields an all-zero summary with no trending content."""
        result = analyzer.analyze_source_engagement([], 'youtube')

        assert result['total_items'] == 0
        assert result['avg_engagement_rate'] == 0
        assert result['median_engagement_rate'] == 0
        assert result['total_engagement'] == 0
        assert result['trending_count'] == 0
        assert result['high_performers'] == 0
        assert result['trending_content'] == []

    def test_missing_engagement_data(self, analyzer):
        """Items without metrics, or with zero views, count but contribute 0."""
        items = [
            {'id': 'test1', 'title': 'Test', 'source': 'youtube'},  # No engagement data
            {'id': 'test2', 'title': 'Test 2', 'source': 'youtube', 'views': 0},  # Zero views
        ]
        result = analyzer.analyze_source_engagement(items, 'youtube')

        assert result['total_items'] == 2
        assert result['avg_engagement_rate'] == 0
        assert result['total_engagement'] == 0

    def test_engagement_thresholds_configuration(self, analyzer):
        """Threshold configuration defines the expected keys per source."""
        youtube_thresholds = analyzer.engagement_thresholds['youtube']
        for key in ('high_engagement_rate', 'viral_threshold', 'view_velocity_threshold'):
            assert key in youtube_thresholds

        instagram_thresholds = analyzer.engagement_thresholds['instagram']
        for key in ('high_engagement_rate', 'viral_threshold'):
            assert key in instagram_thresholds

    def test_wordpress_engagement_analysis(self, analyzer):
        """A WordPress item with only comments totals exactly those comments."""
        items = [
            {
                'id': 'post1',
                'title': 'HVAC Blog Post',
                'source': 'wordpress',
                'comments': 15,
                'upload_date': '2025-08-27',
            }
        ]
        result = analyzer.analyze_source_engagement(items, 'wordpress')

        assert result['total_items'] == 1
        # Comments are the only engagement field supplied.
        assert result['total_engagement'] == 15

    def test_podcast_engagement_analysis(self, analyzer):
        """A podcast item with no engagement fields totals zero engagement."""
        items = [
            {
                'id': 'episode1',
                'title': 'HVAC Podcast Episode',
                'source': 'podcast',
                'upload_date': '2025-08-27',
            }
        ]
        result = analyzer.analyze_source_engagement(items, 'podcast')

        assert result['total_items'] == 1
        assert result['total_engagement'] == 0

    def test_edge_case_numeric_conversions(self, analyzer):
        """String-formatted and ``None`` metric values are handled without error."""
        # Strings, including a thousands separator: (50 + 10) / 1000
        item = {'views': '1,000', 'likes': '50', 'comments': '10'}
        rate = analyzer._calculate_engagement_rate(item, 'youtube')
        assert rate == pytest.approx(0.06)

        # None in every metric field is treated as no data.
        item = {'views': None, 'likes': None, 'comments': None}
        rate = analyzer._calculate_engagement_rate(item, 'youtube')
        assert rate == 0

    def test_trending_content_types(self, analyzer):
        """Every trending entry carries one of the known ``trend_type`` labels."""
        # High engagement on a recent upload.
        viral_item = {
            'id': 'viral1',
            'title': 'Viral HVAC Video',
            'views': 100000,
            'likes': 5000,
            'comments': 500,
            'upload_date': '2025-08-27',
        }
        # Moderate engagement on a slightly older upload.
        steady_item = {
            'id': 'steady1',
            'title': 'Steady HVAC Content',
            'views': 10000,
            'likes': 300,
            'comments': 30,
            'upload_date': '2025-08-25',
        }

        trending = analyzer.identify_trending_content([viral_item, steady_item], 'youtube')
        assert len(trending) > 0

        # Which label an item receives depends on the configured thresholds,
        # so only membership in the allowed label set is asserted here.
        allowed_types = ('viral', 'steady_growth', 'spike')
        for entry in trending:
            assert entry['trend_type'] in allowed_types
if __name__ == "__main__":
    # Allow running this file directly. pytest.main() returns the exit status
    # instead of terminating the process, so pass it to sys.exit(); otherwise
    # a direct run would report success to the shell even when tests fail.
    sys.exit(
        pytest.main([
            __file__,
            "-v",
            "--cov=src.content_analysis.engagement_analyzer",
            "--cov-report=term-missing",
        ])
    )