Major enhancements to HKIA content analysis system: CRITICAL FIXES: • Fix engagement data parsing from markdown (Views/Likes/Comments now extracted correctly) • YouTube: 18.75% engagement rate working (16 views, 2 likes, 1 comment) • Instagram: 7.37% average engagement rate across 20 posts • High performer detection operational (1 YouTube + 20 Instagram above thresholds) CONTENT ANALYSIS SYSTEM: • Add Claude Haiku analyzer for HVAC content classification • Add engagement analyzer with source-specific algorithms • Add keyword extractor with 100+ HVAC-specific terms • Add intelligence aggregator for daily JSON reports • Add comprehensive unit test suite (73 tests, 90% coverage target) ARCHITECTURE: • Extend BaseScraper with optional AI analysis capabilities • Add content analysis orchestrator with CLI interface • Add competitive intelligence module structure • Maintain backward compatibility with existing scrapers INTELLIGENCE FEATURES: • Daily intelligence reports with strategic insights • Trending keyword analysis (813 refrigeration, 701 service mentions) • Content opportunity identification • Multi-source engagement benchmarking • HVAC-specific topic and product categorization PRODUCTION READY: • Claude Haiku API integration validated ($15-25/month estimated) • Graceful degradation when API unavailable • Comprehensive logging and error handling • State management for analytics tracking Ready for Phase 2: Competitive Intelligence Infrastructure 🤖 Generated with [Claude Code](https://claude.ai/code) Co-Authored-By: Claude <noreply@anthropic.com>
		
			
				
	
	
		
			380 lines
		
	
	
		
			No EOL
		
	
	
		
			14 KiB
		
	
	
	
		
			Python
		
	
	
	
	
	
			
		
		
	
	
			380 lines
		
	
	
		
			No EOL
		
	
	
		
			14 KiB
		
	
	
	
		
			Python
		
	
	
	
	
	
| #!/usr/bin/env python3
 | |
| """
 | |
| Comprehensive Unit Tests for Engagement Analyzer
 | |
| 
 | |
| Tests engagement metrics calculation, trending content identification,
 | |
| virality scoring, and source-specific analysis.
 | |
| """
 | |
| 
 | |
| import pytest
 | |
| from unittest.mock import Mock, patch
 | |
| from datetime import datetime, timedelta
 | |
| from pathlib import Path
 | |
| import sys
 | |
| 
 | |
| # Add the directory above this test folder to sys.path so the `src` package imports below resolve
 | |
| if str(Path(__file__).parent.parent) not in sys.path:
 | |
|     sys.path.insert(0, str(Path(__file__).parent.parent))
 | |
| 
 | |
| from src.content_analysis.engagement_analyzer import (
 | |
|     EngagementAnalyzer, 
 | |
|     EngagementMetrics,
 | |
|     TrendingContent
 | |
| )
 | |
| 
 | |
| 
 | |
class TestEngagementAnalyzer:
    """Unit tests for ``EngagementAnalyzer``.

    Covers per-item engagement rate and total engagement helpers,
    per-source aggregate analysis, trending-content identification,
    virality/velocity helpers, and handling of empty or malformed input.

    NOTE(review): several tests use hard-coded 2025-08 upload dates without
    freezing the clock (only the velocity test patches ``datetime``). If the
    analyzer weighs recency, those results may drift over time -- confirm.
    """

    # ------------------------------------------------------------------
    # Fixtures
    # ------------------------------------------------------------------

    @pytest.fixture
    def analyzer(self):
        """Return a fresh ``EngagementAnalyzer`` built with default arguments."""
        return EngagementAnalyzer()

    @pytest.fixture
    def sample_youtube_items(self):
        """Return three YouTube items with views, likes, comments and upload dates."""
        return [
            {
                'id': 'video1',
                'title': 'HVAC Troubleshooting Guide',
                'source': 'youtube',
                'views': 10000,
                'likes': 500,
                'comments': 50,
                'upload_date': '2025-08-27',
            },
            {
                'id': 'video2',
                'title': 'Heat Pump Installation',
                'source': 'youtube',
                'views': 5000,
                'likes': 200,
                'comments': 20,
                'upload_date': '2025-08-26',
            },
            {
                'id': 'video3',
                'title': 'AC Repair Tips',
                'source': 'youtube',
                'views': 1000,
                'likes': 30,
                'comments': 5,
                'upload_date': '2025-08-25',
            },
        ]

    @pytest.fixture
    def sample_instagram_items(self):
        """Return two Instagram items with likes and comments but no views."""
        return [
            {
                'id': 'post1',
                'title': 'HVAC tools showcase',
                'source': 'instagram',
                'likes': 150,
                'comments': 25,
                'upload_date': '2025-08-27',
            },
            {
                'id': 'post2',
                'title': 'Before and after AC install',
                'source': 'instagram',
                'likes': 80,
                'comments': 10,
                'upload_date': '2025-08-26',
            },
        ]

    # ------------------------------------------------------------------
    # Per-item helpers
    # ------------------------------------------------------------------

    def test_calculate_engagement_rate_youtube(self, analyzer):
        """YouTube rate is expected to be (likes + comments) / views, 0 when undefined."""
        # Normal case: (50 + 10) / 1000
        item = {'views': 1000, 'likes': 50, 'comments': 10}
        rate = analyzer._calculate_engagement_rate(item, 'youtube')
        assert rate == pytest.approx(0.06)

        # Zero views must not divide by zero
        item = {'views': 0, 'likes': 50, 'comments': 10}
        rate = analyzer._calculate_engagement_rate(item, 'youtube')
        assert rate == 0

        # Missing likes/comments count as no engagement
        item = {'views': 1000}
        rate = analyzer._calculate_engagement_rate(item, 'youtube')
        assert rate == 0

    def test_calculate_engagement_rate_instagram(self, analyzer):
        """Instagram rate prefers views; falls back to comments / likes without them."""
        # With views: (100 + 20) / 1000
        item = {'views': 1000, 'likes': 100, 'comments': 20}
        rate = analyzer._calculate_engagement_rate(item, 'instagram')
        assert rate == pytest.approx(0.12)

        # Without views: comments / likes fallback, 20 / 100
        item = {'likes': 100, 'comments': 20}
        rate = analyzer._calculate_engagement_rate(item, 'instagram')
        assert rate == pytest.approx(0.2)

        # Only comments: no likes to divide by, so the fallback yields 0
        item = {'comments': 10}
        rate = analyzer._calculate_engagement_rate(item, 'instagram')
        assert rate == 0.0

    def test_get_total_engagement(self, analyzer):
        """Total engagement is expected to be likes + comments, 0 for an empty item."""
        # YouTube
        item = {'likes': 50, 'comments': 10}
        total = analyzer._get_total_engagement(item, 'youtube')
        assert total == 60

        # Instagram
        item = {'likes': 100, 'comments': 25}
        total = analyzer._get_total_engagement(item, 'instagram')
        assert total == 125

        # Missing data
        item = {}
        total = analyzer._get_total_engagement(item, 'youtube')
        assert total == 0

    # ------------------------------------------------------------------
    # Per-source aggregate analysis
    # ------------------------------------------------------------------

    def test_analyze_source_engagement_youtube(self, analyzer, sample_youtube_items):
        """Check result keys, totals, average rate and high-performer count for YouTube."""
        result = analyzer.analyze_source_engagement(sample_youtube_items, 'youtube')

        # Verify structure
        expected_keys = (
            'total_items',
            'avg_engagement_rate',
            'median_engagement_rate',
            'total_engagement',
            'trending_count',
            'high_performers',
            'trending_content',
        )
        for key in expected_keys:
            assert key in result

        # Verify calculations
        assert result['total_items'] == 3
        assert result['total_engagement'] == 805  # 550 + 220 + 35

        # video1: (500+50)/10000 = 0.055, video2: (200+20)/5000 = 0.044,
        # video3: (30+5)/1000 = 0.035
        expected_avg = (0.055 + 0.044 + 0.035) / 3
        assert abs(result['avg_engagement_rate'] - expected_avg) < 0.001

        # Check high performers (threshold 0.05 for YouTube)
        assert result['high_performers'] == 1  # Only video1 above 0.05

    def test_analyze_source_engagement_instagram(self, analyzer, sample_instagram_items):
        """Check totals and average rate for Instagram items that carry no views."""
        result = analyzer.analyze_source_engagement(sample_instagram_items, 'instagram')

        assert result['total_items'] == 2
        assert result['total_engagement'] == 265  # 175 + 90

        # The fixture posts have no views, so the comments / likes fallback is
        # expected. Use exact fractions rather than hand-truncated decimals so
        # the tolerance below is centred on the true value.
        expected_avg = (25 / 150 + 10 / 80) / 2
        assert abs(result['avg_engagement_rate'] - expected_avg) < 0.001

    def test_identify_trending_content(self, analyzer, sample_youtube_items):
        """Trending detection returns at least one entry with the expected keys."""
        trending = analyzer.identify_trending_content(sample_youtube_items, 'youtube')

        # Should identify high-engagement content
        assert len(trending) > 0

        # Check trending content structure on the first entry
        item = trending[0]
        for key in ('content_id', 'source', 'title', 'engagement_score', 'trend_type'):
            assert key in item

    # ------------------------------------------------------------------
    # Virality and velocity
    # ------------------------------------------------------------------

    def test_calculate_virality_score(self, analyzer):
        """Virality score is positive for high engagement and never negative."""
        # High engagement content
        item = {
            'views': 10000,
            'likes': 800,
            'comments': 200,
            'upload_date': '2025-08-27',
        }
        score = analyzer._calculate_virality_score(item, 'youtube')
        assert score > 0

        # Low engagement content
        item = {
            'views': 100,
            'likes': 5,
            'comments': 1,
            'upload_date': '2025-08-27',
        }
        score = analyzer._calculate_virality_score(item, 'youtube')
        assert score >= 0

    def test_get_engagement_velocity(self, analyzer):
        """Velocity is expected to be views per day since upload, with 'now' frozen."""
        cases = [
            # Recent content: 5000 views / 1 day
            ({'views': 5000, 'upload_date': '2025-08-27'}, 5000),
            # Older content: 1000 views / 3 days (rounded)
            ({'views': 1000, 'upload_date': '2025-08-25'}, 333.33),
        ]

        for item, expected in cases:
            # Patch the name as looked up inside the analyzer module; keep the
            # real strptime so upload dates still parse.
            with patch('src.content_analysis.engagement_analyzer.datetime') as mock_datetime:
                mock_datetime.now.return_value = datetime(2025, 8, 28)
                mock_datetime.strptime = datetime.strptime

                velocity = analyzer._get_engagement_velocity(item)

            assert velocity == pytest.approx(expected)

    # ------------------------------------------------------------------
    # Degenerate and malformed input
    # ------------------------------------------------------------------

    def test_empty_content_list(self, analyzer):
        """An empty item list yields zeroed metrics and no trending content."""
        result = analyzer.analyze_source_engagement([], 'youtube')

        assert result['total_items'] == 0
        assert result['avg_engagement_rate'] == 0
        assert result['median_engagement_rate'] == 0
        assert result['total_engagement'] == 0
        assert result['trending_count'] == 0
        assert result['high_performers'] == 0
        assert result['trending_content'] == []

    def test_missing_engagement_data(self, analyzer):
        """Items without engagement fields are counted but contribute nothing."""
        items = [
            {'id': 'test1', 'title': 'Test', 'source': 'youtube'},  # No engagement data
            {'id': 'test2', 'title': 'Test 2', 'source': 'youtube', 'views': 0},  # Zero views
        ]

        result = analyzer.analyze_source_engagement(items, 'youtube')

        assert result['total_items'] == 2
        assert result['avg_engagement_rate'] == 0
        assert result['total_engagement'] == 0

    def test_engagement_thresholds_configuration(self, analyzer):
        """Threshold tables for YouTube and Instagram expose the expected keys."""
        # Check YouTube thresholds
        youtube_thresholds = analyzer.engagement_thresholds['youtube']
        assert 'high_engagement_rate' in youtube_thresholds
        assert 'viral_threshold' in youtube_thresholds
        assert 'view_velocity_threshold' in youtube_thresholds

        # Check Instagram thresholds
        instagram_thresholds = analyzer.engagement_thresholds['instagram']
        assert 'high_engagement_rate' in instagram_thresholds
        assert 'viral_threshold' in instagram_thresholds

    def test_wordpress_engagement_analysis(self, analyzer):
        """A WordPress post with only comments reports them as total engagement."""
        items = [
            {
                'id': 'post1',
                'title': 'HVAC Blog Post',
                'source': 'wordpress',
                'comments': 15,
                'upload_date': '2025-08-27',
            }
        ]

        result = analyzer.analyze_source_engagement(items, 'wordpress')
        assert result['total_items'] == 1
        # Only comments are supplied, so they are the whole engagement total
        assert result['total_engagement'] == 15

    def test_podcast_engagement_analysis(self, analyzer):
        """A podcast episode without engagement fields reports zero engagement."""
        items = [
            {
                'id': 'episode1',
                'title': 'HVAC Podcast Episode',
                'source': 'podcast',
                'upload_date': '2025-08-27',
            }
        ]

        result = analyzer.analyze_source_engagement(items, 'podcast')
        assert result['total_items'] == 1
        # No likes/comments/views supplied for the episode
        assert result['total_engagement'] == 0

    def test_edge_case_numeric_conversions(self, analyzer):
        """String-formatted numbers and ``None`` values are handled without error."""
        # String numeric values, including a thousands separator: (50+10)/1000
        item = {'views': '1,000', 'likes': '50', 'comments': '10'}
        rate = analyzer._calculate_engagement_rate(item, 'youtube')
        assert rate == pytest.approx(0.06)

        # None values
        item = {'views': None, 'likes': None, 'comments': None}
        rate = analyzer._calculate_engagement_rate(item, 'youtube')
        assert rate == 0

    def test_trending_content_types(self, analyzer):
        """Every trending entry carries one of the known ``trend_type`` labels."""
        # High engagement, recent
        viral_item = {
            'id': 'viral1',
            'title': 'Viral HVAC Video',
            'views': 100000,
            'likes': 5000,
            'comments': 500,
            'upload_date': '2025-08-27',
        }

        # Steady growth
        steady_item = {
            'id': 'steady1',
            'title': 'Steady HVAC Content',
            'views': 10000,
            'likes': 300,
            'comments': 30,
            'upload_date': '2025-08-25',
        }

        items = [viral_item, steady_item]
        trending = analyzer.identify_trending_content(items, 'youtube')

        # Should identify trending content with proper classification
        assert len(trending) > 0

        # A 'viral' label might not always trigger depending on thresholds, so
        # only the set of allowed labels is checked, not which one was chosen.
        allowed_types = ('viral', 'steady_growth', 'spike')
        for item in trending:
            assert item['trend_type'] in allowed_types
 | |
| 
 | |
| 
 | |
if __name__ == "__main__":
    # Run this module's tests with coverage for the analyzer, and pass pytest's
    # return code to the shell so a direct run fails when any test fails.
    sys.exit(
        pytest.main(
            [
                __file__,
                "-v",
                "--cov=src.content_analysis.engagement_analyzer",
                "--cov-report=term-missing",
            ]
        )
    )