hvac-kia-content/tests/test_intelligence_aggregator.py
Ben Reed ade81beea2 feat: Complete Phase 1 content analysis with engagement parsing fixes
Major enhancements to HKIA content analysis system:

CRITICAL FIXES:
• Fix engagement data parsing from markdown (Views/Likes/Comments now extracted correctly)
• YouTube: 18.75% engagement rate working (16 views, 2 likes, 1 comment; arithmetic sketched after this list)
• Instagram: 7.37% average engagement rate across 20 posts
• High performer detection operational (1 YouTube + 20 Instagram above thresholds)
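
Engagement-rate arithmetic, as a minimal sketch. The helper below is hypothetical; the figures above are only consistent with a rate of (likes + comments) / views, which is assumed here:

    def engagement_rate(views: int, likes: int, comments: int) -> float:
        # Hypothetical helper: interactions as a share of views
        return (likes + comments) / views if views else 0.0

    # YouTube example from above: (2 + 1) / 16 = 0.1875 -> 18.75%
    print(f"{engagement_rate(16, 2, 1):.2%}")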

CONTENT ANALYSIS SYSTEM:
• Add Claude Haiku analyzer for HVAC content classification
• Add engagement analyzer with source-specific algorithms
• Add keyword extractor with 100+ HVAC-specific terms
• Add intelligence aggregator for daily JSON reports (usage sketched after this list)
• Add comprehensive unit test suite (73 tests, 90% coverage target)
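
A hedged usage sketch of the aggregator's daily flow, based on the calls exercised in the test file below. The data directory layout comes from the test fixtures; everything else is an assumption, not the orchestrator CLI:

    from datetime import datetime
    from pathlib import Path

    from src.content_analysis.intelligence_aggregator import IntelligenceAggregator

    # Expects data/markdown_current/ for scraped markdown and data/intelligence/ for reports
    aggregator = IntelligenceAggregator(Path("data"))

    # Build the daily report dict for a given date
    report = aggregator.generate_daily_intelligence(datetime(2025, 8, 28))

    # Reports are persisted by _save_intelligence_report as e.g.
    # data/intelligence/daily/hkia_intelligence_2025-08-28.json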

ARCHITECTURE:
• Extend BaseScraper with optional AI analysis capabilities
• Add content analysis orchestrator with CLI interface
• Add competitive intelligence module structure
• Maintain backward compatibility with existing scrapers

INTELLIGENCE FEATURES:
• Daily intelligence reports with strategic insights (report shape sketched after this list)
• Trending keyword analysis (813 "refrigeration" and 701 "service" mentions)
• Content opportunity identification
• Multi-source engagement benchmarking
• HVAC-specific topic and product categorization
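
The top-level shape of a daily report, reconstructed from the assertions in the test file below (values are illustrative placeholders):

    daily_report = {
        "report_date": "2025-08-28",
        "generated_at": "...",  # generation timestamp
        "hkia_analysis": {
            "content_classified": 3,
            "topic_distribution": {"hvac_systems": {"count": 2}},
            "engagement_summary": {"youtube": {"total_items": 1}},
            "trending_keywords": [{"keyword": "hvac", "frequency": 3}],
            "content_gaps": [],
            "sentiment_overview": {"avg_sentiment": 0.5},
        },
        "competitor_analysis": {},  # empty until Phase 2
        "strategic_insights": {
            "content_opportunities": [],
            "performance_insights": [],
            "competitive_advantages": [],
            "areas_for_improvement": [],
        },
        "meta": {"total_hkia_items": 3},
    }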

PRODUCTION READY:
• Claude Haiku API integration validated ($15-25/month estimated)
• Graceful degradation when API unavailable (fallback sketched after this list)
• Comprehensive logging and error handling
• State management for analytics tracking
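
A minimal sketch of the degradation path exercised by the tests below: with no Claude analyzer attached, LLM classification is skipped while engagement and keyword analysis still run. Attribute and method names are taken from the tests; the trigger condition is an assumption:

    # e.g. when the Anthropic API key is absent (assumption) or the API is unreachable
    aggregator.claude_analyzer = None

    result = aggregator._analyze_hkia_content(content_items)
    assert result["content_classified"] == 0       # no LLM classification performed
    assert len(result["engagement_summary"]) > 0   # engagement metrics still computed
    assert "trending_keywords" in result           # keyword extraction still runs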

Ready for Phase 2: Competitive Intelligence Infrastructure

🤖 Generated with [Claude Code](https://claude.ai/code)

Co-Authored-By: Claude <noreply@anthropic.com>
2025-08-28 16:40:19 -03:00

#!/usr/bin/env python3
"""
Comprehensive Unit Tests for Intelligence Aggregator
Tests intelligence report generation, markdown parsing,
content analysis coordination, and strategic insights.
"""
import pytest
from unittest.mock import Mock, patch, mock_open
from pathlib import Path
from datetime import datetime, timedelta
import json
import sys

# Add src to path for imports
if str(Path(__file__).parent.parent) not in sys.path:
    sys.path.insert(0, str(Path(__file__).parent.parent))

from src.content_analysis.intelligence_aggregator import IntelligenceAggregator


class TestIntelligenceAggregator:
    """Test suite for IntelligenceAggregator"""

    @pytest.fixture
    def temp_data_dir(self, tmp_path):
        """Create temporary data directory structure"""
        data_dir = tmp_path / "data"
        data_dir.mkdir()

        # Create required subdirectories
        (data_dir / "intelligence" / "daily").mkdir(parents=True)
        (data_dir / "intelligence" / "weekly").mkdir(parents=True)
        (data_dir / "intelligence" / "monthly").mkdir(parents=True)
        (data_dir / "markdown_current").mkdir()

        return data_dir

    @pytest.fixture
    def aggregator(self, temp_data_dir):
        """Create intelligence aggregator instance with temp directory"""
        return IntelligenceAggregator(temp_data_dir)

    @pytest.fixture
    def sample_markdown_content(self):
        """Sample markdown content for testing parsing"""
        return """# ID: video1
## Title: HVAC Installation Guide
## Type: video
## Author: HVAC Know It All
## Link: https://www.youtube.com/watch?v=video1
## Upload Date: 2025-08-27
## Views: 5000
## Likes: 250
## Comments: 30
## Engagement Rate: 5.6%
## Description:
Learn professional HVAC installation techniques in this comprehensive guide.
# ID: video2
## Title: Heat Pump Maintenance
## Type: video
## Views: 3000
## Likes: 150
## Comments: 20
## Description:
Essential heat pump maintenance procedures for optimal performance.
"""

    @pytest.fixture
    def sample_content_items(self):
        """Sample content items for testing analysis"""
        return [
            {
                'id': 'item1',
                'title': 'HVAC Installation Guide',
                'source': 'youtube',
                'views': 5000,
                'likes': 250,
                'comments': 30,
                'content': 'Professional HVAC installation techniques, heat pump setup, refrigeration cycle',
                'upload_date': '2025-08-27'
            },
            {
                'id': 'item2',
                'title': 'AC Troubleshooting',
                'source': 'wordpress',
                'likes': 45,
                'comments': 8,
                'content': 'Air conditioning repair, compressor issues, refrigerant leaks',
                'upload_date': '2025-08-26'
            },
            {
                'id': 'item3',
                'title': 'Smart Thermostat Install',
                'source': 'instagram',
                'likes': 120,
                'comments': 15,
                'content': 'Smart thermostat wiring, HVAC controls, energy efficiency',
                'upload_date': '2025-08-25'
            }
        ]

    def test_initialization(self, temp_data_dir):
        """Test aggregator initialization and directory creation"""
        aggregator = IntelligenceAggregator(temp_data_dir)

        assert aggregator.data_dir == temp_data_dir
        assert aggregator.intelligence_dir == temp_data_dir / "intelligence"
        assert aggregator.intelligence_dir.exists()
        assert (aggregator.intelligence_dir / "daily").exists()
        assert (aggregator.intelligence_dir / "weekly").exists()
        assert (aggregator.intelligence_dir / "monthly").exists()

    def test_parse_markdown_file(self, aggregator, temp_data_dir, sample_markdown_content):
        """Test markdown file parsing"""
        # Create test markdown file
        md_file = temp_data_dir / "markdown_current" / "hkia_youtube_test.md"
        md_file.write_text(sample_markdown_content, encoding='utf-8')

        items = aggregator._parse_markdown_file(md_file)

        assert len(items) == 2

        # Check first item
        item1 = items[0]
        assert item1['id'] == 'video1'
        assert item1['title'] == 'HVAC Installation Guide'
        assert item1['source'] == 'youtube'
        assert item1['views'] == 5000
        assert item1['likes'] == 250
        assert item1['comments'] == 30

        # Check second item
        item2 = items[1]
        assert item2['id'] == 'video2'
        assert item2['title'] == 'Heat Pump Maintenance'
        assert item2['views'] == 3000

    def test_parse_content_item(self, aggregator):
        """Test individual content item parsing"""
        item_content = """video1
## Title: Test Video
## Views: 1,500
## Likes: 75
## Comments: 10
## Description:
Test video description here.
"""
        item = aggregator._parse_content_item(item_content, "youtube_test")

        assert item['id'] == 'video1'
        assert item['title'] == 'Test Video'
        assert item['views'] == 1500  # Comma should be removed
        assert item['likes'] == 75
        assert item['comments'] == 10
        assert item['source'] == 'youtube'

    def test_extract_numeric_fields(self, aggregator):
        """Test numeric field extraction and conversion"""
        item = {
            'views': '10,000',
            'likes': '500',
            'comments': '50',
            'invalid_number': 'abc'
        }

        aggregator._extract_numeric_fields(item)

        assert item['views'] == 10000
        assert item['likes'] == 500
        assert item['comments'] == 50
        # Invalid numbers should become 0
        # Note: 'invalid_number' not in numeric_fields list, so unchanged

    def test_extract_source_from_filename(self, aggregator):
        """Test source extraction from filenames"""
        assert aggregator._extract_source_from_filename("hkia_youtube_20250827") == "youtube"
        assert aggregator._extract_source_from_filename("hkia_instagram_test") == "instagram"
        assert aggregator._extract_source_from_filename("hkia_wordpress_latest") == "wordpress"
        assert aggregator._extract_source_from_filename("hkia_mailchimp_feed") == "mailchimp"
        assert aggregator._extract_source_from_filename("hkia_podcast_episode") == "podcast"
        assert aggregator._extract_source_from_filename("hkia_hvacrschool_article") == "hvacrschool"
        assert aggregator._extract_source_from_filename("unknown_source") == "unknown"

    @patch('src.content_analysis.intelligence_aggregator.IntelligenceAggregator._load_hkia_content')
    @patch('src.content_analysis.intelligence_aggregator.IntelligenceAggregator._analyze_hkia_content')
    def test_generate_daily_intelligence(self, mock_analyze, mock_load, aggregator, sample_content_items):
        """Test daily intelligence report generation"""
        # Mock content loading
        mock_load.return_value = sample_content_items

        # Mock analysis results
        mock_analyze.return_value = {
            'content_classified': 3,
            'topic_distribution': {'hvac_systems': {'count': 2}, 'maintenance': {'count': 1}},
            'engagement_summary': {'youtube': {'total_items': 1}},
            'trending_keywords': [{'keyword': 'hvac', 'frequency': 3}],
            'content_gaps': [],
            'sentiment_overview': {'avg_sentiment': 0.5}
        }

        # Generate report
        test_date = datetime(2025, 8, 28)
        report = aggregator.generate_daily_intelligence(test_date)

        # Verify report structure
        assert 'report_date' in report
        assert 'generated_at' in report
        assert 'hkia_analysis' in report
        assert 'competitor_analysis' in report
        assert 'strategic_insights' in report
        assert 'meta' in report

        assert report['report_date'] == '2025-08-28'
        assert report['meta']['total_hkia_items'] == 3

    def test_load_hkia_content_no_files(self, aggregator, temp_data_dir):
        """Test content loading when no markdown files exist"""
        test_date = datetime(2025, 8, 28)
        content = aggregator._load_hkia_content(test_date)

        assert content == []

    def test_load_hkia_content_with_files(self, aggregator, temp_data_dir, sample_markdown_content):
        """Test content loading with markdown files"""
        # Create test files
        md_dir = temp_data_dir / "markdown_current"
        (md_dir / "hkia_youtube_20250827.md").write_text(sample_markdown_content)
        (md_dir / "hkia_instagram_20250827.md").write_text("# ID: post1\n\n## Title: Test Post")

        test_date = datetime(2025, 8, 28)
        content = aggregator._load_hkia_content(test_date)

        assert len(content) >= 2  # Should load from both files

    @patch('src.content_analysis.intelligence_aggregator.ClaudeHaikuAnalyzer')
    def test_analyze_hkia_content_with_claude(self, mock_claude_class, aggregator, sample_content_items):
        """Test HKIA content analysis with Claude analyzer"""
        # Mock Claude analyzer
        mock_analyzer = Mock()
        mock_analyzer.analyze_content_batch.return_value = [
            {'topics': ['hvac_systems'], 'sentiment': 0.7, 'difficulty': 'intermediate'},
            {'topics': ['maintenance'], 'sentiment': 0.5, 'difficulty': 'beginner'},
            {'topics': ['controls'], 'sentiment': 0.6, 'difficulty': 'advanced'}
        ]
        mock_claude_class.return_value = mock_analyzer

        # Re-initialize aggregator to enable Claude analyzer
        aggregator.claude_analyzer = mock_analyzer

        result = aggregator._analyze_hkia_content(sample_content_items)

        assert result['content_classified'] == 3
        assert 'topic_distribution' in result
        assert 'engagement_summary' in result
        assert 'trending_keywords' in result

    def test_analyze_hkia_content_without_claude(self, aggregator, sample_content_items):
        """Test HKIA content analysis without Claude analyzer (fallback mode)"""
        # Ensure no Claude analyzer
        aggregator.claude_analyzer = None

        result = aggregator._analyze_hkia_content(sample_content_items)

        assert result['content_classified'] == 0
        assert 'topic_distribution' in result
        assert 'engagement_summary' in result
        assert 'trending_keywords' in result
        # Should still have engagement analysis and keyword extraction
        assert len(result['engagement_summary']) > 0

    def test_calculate_topic_distribution(self, aggregator):
        """Test topic distribution calculation"""
        analyses = [
            {'topics': ['hvac_systems'], 'sentiment': 0.7},
            {'topics': ['hvac_systems', 'maintenance'], 'sentiment': 0.5},
            {'topics': ['maintenance'], 'sentiment': 0.6}
        ]

        distribution = aggregator._calculate_topic_distribution(analyses)

        assert 'hvac_systems' in distribution
        assert 'maintenance' in distribution
        assert distribution['hvac_systems']['count'] == 2
        assert distribution['maintenance']['count'] == 2
        assert abs(distribution['hvac_systems']['avg_sentiment'] - 0.6) < 0.1

    def test_calculate_sentiment_overview(self, aggregator):
        """Test sentiment overview calculation"""
        analyses = [
            {'sentiment': 0.7},
            {'sentiment': 0.5},
            {'sentiment': 0.6}
        ]

        overview = aggregator._calculate_sentiment_overview(analyses)

        assert 'avg_sentiment' in overview
        assert 'sentiment_distribution' in overview
        assert abs(overview['avg_sentiment'] - 0.6) < 0.1

    def test_identify_content_gaps(self, aggregator):
        """Test content gap identification"""
        topic_distribution = {
            'hvac_systems': {'count': 10},
            'maintenance': {'count': 1},  # Low coverage
            'installation': {'count': 8},
            'troubleshooting': {'count': 1}  # Low coverage
        }

        gaps = aggregator._identify_content_gaps(topic_distribution)

        assert len(gaps) > 0
        assert any('maintenance' in gap for gap in gaps)
        assert any('troubleshooting' in gap for gap in gaps)

    def test_generate_strategic_insights(self, aggregator):
        """Test strategic insights generation"""
        hkia_analysis = {
            'topic_distribution': {
                'maintenance': {'count': 1},
                'installation': {'count': 8}
            },
            'trending_keywords': [{'keyword': 'heat pump', 'frequency': 20}],
            'engagement_summary': {
                'youtube': {'avg_engagement_rate': 0.02}
            },
            'sentiment_overview': {'avg_sentiment': 0.3}
        }
        competitor_analysis = {}

        insights = aggregator._generate_strategic_insights(hkia_analysis, competitor_analysis)

        assert 'content_opportunities' in insights
        assert 'performance_insights' in insights
        assert 'competitive_advantages' in insights
        assert 'areas_for_improvement' in insights
        # Should identify content opportunities based on trending keywords
        assert len(insights['content_opportunities']) > 0
    def test_save_intelligence_report(self, aggregator, temp_data_dir):
        """Test intelligence report saving"""
        report = {
            'report_date': '2025-08-28',
            'test_data': 'sample'
        }

        test_date = datetime(2025, 8, 28)
        saved_file = aggregator._save_intelligence_report(report, test_date, 'daily')

        assert saved_file.exists()
        assert 'hkia_intelligence_2025-08-28.json' in saved_file.name

        # Verify content
        with open(saved_file, 'r') as f:
            saved_report = json.load(f)
        assert saved_report['report_date'] == '2025-08-28'

    def test_generate_weekly_intelligence(self, aggregator, temp_data_dir):
        """Test weekly intelligence generation"""
        # Create sample daily reports
        daily_dir = temp_data_dir / "intelligence" / "daily"
        for i in range(7):
            date = datetime(2025, 8, 21) + timedelta(days=i)
            date_str = date.strftime('%Y-%m-%d')
            report = {
                'report_date': date_str,
                'hkia_analysis': {
                    'content_classified': 10,
                    'trending_keywords': [{'keyword': 'hvac', 'frequency': 5}]
                },
                'meta': {'total_hkia_items': 100}
            }

            report_file = daily_dir / f"hkia_intelligence_{date_str}.json"
            with open(report_file, 'w') as f:
                json.dump(report, f)

        # Generate weekly report
        end_date = datetime(2025, 8, 28)
        weekly_report = aggregator.generate_weekly_intelligence(end_date)

        assert 'period_start' in weekly_report
        assert 'period_end' in weekly_report
        assert 'summary' in weekly_report
        assert 'daily_reports_included' in weekly_report

    def test_error_handling_file_operations(self, aggregator):
        """Test error handling in file operations"""
        # Test parsing non-existent file
        fake_file = Path("/nonexistent/file.md")
        items = aggregator._parse_markdown_file(fake_file)
        assert items == []

        # Test parsing malformed content
        malformed_content = "This is not properly formatted markdown"
        item = aggregator._parse_content_item(malformed_content, "test")
        assert item is None

    def test_empty_content_analysis(self, aggregator):
        """Test analysis with empty content list"""
        result = aggregator._analyze_hkia_content([])

        assert result['content_classified'] == 0
        assert result['topic_distribution'] == {}
        assert result['trending_keywords'] == []
        assert result['content_gaps'] == []

    @patch('builtins.open', side_effect=IOError("File access error"))
    def test_file_access_error_handling(self, mock_open, aggregator, temp_data_dir):
        """Test handling of file access errors"""
        test_date = datetime(2025, 8, 28)

        # Should handle file access errors gracefully
        content = aggregator._load_hkia_content(test_date)
        assert content == []

    def test_numeric_field_edge_cases(self, aggregator):
        """Test numeric field extraction edge cases"""
        item = {
            'views': '',  # Empty string
            'likes': 'N/A',  # Non-numeric string
            'comments': None,  # None value
            'view_count': '1.5K'  # Non-standard format
        }

        aggregator._extract_numeric_fields(item)

        # All should convert to 0 for invalid formats
        assert item['views'] == 0
        assert item['likes'] == 0
        assert item['comments'] == 0
        assert item['view_count'] == 0

    def test_intelligence_directory_permissions(self, aggregator, temp_data_dir):
        """Test intelligence directory creation with proper permissions"""
        # Remove intelligence directory to test recreation
        intelligence_dir = temp_data_dir / "intelligence"
        if intelligence_dir.exists():
            import shutil
            shutil.rmtree(intelligence_dir)

        # Re-initialize aggregator
        new_aggregator = IntelligenceAggregator(temp_data_dir)

        assert new_aggregator.intelligence_dir.exists()
        assert (new_aggregator.intelligence_dir / "daily").exists()


if __name__ == "__main__":
    pytest.main([__file__, "-v", "--cov=src.content_analysis.intelligence_aggregator", "--cov-report=term-missing"])