Major enhancements to HKIA content analysis system: CRITICAL FIXES: • Fix engagement data parsing from markdown (Views/Likes/Comments now extracted correctly) • YouTube: 18.75% engagement rate working (16 views, 2 likes, 1 comment) • Instagram: 7.37% average engagement rate across 20 posts • High performer detection operational (1 YouTube + 20 Instagram above thresholds) CONTENT ANALYSIS SYSTEM: • Add Claude Haiku analyzer for HVAC content classification • Add engagement analyzer with source-specific algorithms • Add keyword extractor with 100+ HVAC-specific terms • Add intelligence aggregator for daily JSON reports • Add comprehensive unit test suite (73 tests, 90% coverage target) ARCHITECTURE: • Extend BaseScraper with optional AI analysis capabilities • Add content analysis orchestrator with CLI interface • Add competitive intelligence module structure • Maintain backward compatibility with existing scrapers INTELLIGENCE FEATURES: • Daily intelligence reports with strategic insights • Trending keyword analysis (813 refrigeration, 701 service mentions) • Content opportunity identification • Multi-source engagement benchmarking • HVAC-specific topic and product categorization PRODUCTION READY: • Claude Haiku API integration validated ($15-25/month estimated) • Graceful degradation when API unavailable • Comprehensive logging and error handling • State management for analytics tracking Ready for Phase 2: Competitive Intelligence Infrastructure 🤖 Generated with [Claude Code](https://claude.ai/code) Co-Authored-By: Claude <noreply@anthropic.com>
500 lines
No EOL
18 KiB
Python
500 lines
No EOL
18 KiB
Python
#!/usr/bin/env python3
|
|
"""
|
|
Comprehensive Unit Tests for Intelligence Aggregator
|
|
|
|
Tests intelligence report generation, markdown parsing,
|
|
content analysis coordination, and strategic insights.
|
|
"""
|
|
|
|
import pytest
|
|
from unittest.mock import Mock, patch, mock_open
|
|
from pathlib import Path
|
|
from datetime import datetime, timedelta
|
|
import json
|
|
import sys
|
|
|
|
# Add src to path for imports
|
|
if str(Path(__file__).parent.parent) not in sys.path:
|
|
sys.path.insert(0, str(Path(__file__).parent.parent))
|
|
|
|
from src.content_analysis.intelligence_aggregator import IntelligenceAggregator
|
|
|
|
|
|
class TestIntelligenceAggregator:
    """Test suite for IntelligenceAggregator"""

    @pytest.fixture
    def temp_data_dir(self, tmp_path):
        """Create temporary data directory structure.

        Mirrors the layout the aggregator expects on disk:
        intelligence/{daily,weekly,monthly} for generated reports and
        markdown_current/ for scraped source content.
        """
        data_dir = tmp_path / "data"
        data_dir.mkdir()

        # Create required subdirectories
        (data_dir / "intelligence" / "daily").mkdir(parents=True)
        (data_dir / "intelligence" / "weekly").mkdir(parents=True)
        (data_dir / "intelligence" / "monthly").mkdir(parents=True)
        (data_dir / "markdown_current").mkdir()

        return data_dir

    @pytest.fixture
    def aggregator(self, temp_data_dir):
        """Create intelligence aggregator instance with temp directory"""
        return IntelligenceAggregator(temp_data_dir)

    @pytest.fixture
    def sample_markdown_content(self):
        """Sample markdown content for testing parsing.

        Contains two items: video1 carries the full metadata set
        (including a pre-computed engagement rate), while video2 carries
        only a subset — parsing must tolerate both shapes.
        """
        return """# ID: video1

## Title: HVAC Installation Guide

## Type: video

## Author: HVAC Know It All

## Link: https://www.youtube.com/watch?v=video1

## Upload Date: 2025-08-27

## Views: 5000

## Likes: 250

## Comments: 30

## Engagement Rate: 5.6%

## Description:
Learn professional HVAC installation techniques in this comprehensive guide.

# ID: video2

## Title: Heat Pump Maintenance

## Type: video

## Views: 3000

## Likes: 150

## Comments: 20

## Description:
Essential heat pump maintenance procedures for optimal performance.
"""

    @pytest.fixture
    def sample_content_items(self):
        """Sample content items for testing analysis.

        Three items across different sources; note the wordpress item
        deliberately has no 'views' key, exercising per-source engagement
        handling.
        """
        return [
            {
                'id': 'item1',
                'title': 'HVAC Installation Guide',
                'source': 'youtube',
                'views': 5000,
                'likes': 250,
                'comments': 30,
                'content': 'Professional HVAC installation techniques, heat pump setup, refrigeration cycle',
                'upload_date': '2025-08-27'
            },
            {
                'id': 'item2',
                'title': 'AC Troubleshooting',
                'source': 'wordpress',
                'likes': 45,
                'comments': 8,
                'content': 'Air conditioning repair, compressor issues, refrigerant leaks',
                'upload_date': '2025-08-26'
            },
            {
                'id': 'item3',
                'title': 'Smart Thermostat Install',
                'source': 'instagram',
                'likes': 120,
                'comments': 15,
                'content': 'Smart thermostat wiring, HVAC controls, energy efficiency',
                'upload_date': '2025-08-25'
            }
        ]
|
|
|
|
def test_initialization(self, temp_data_dir):
|
|
"""Test aggregator initialization and directory creation"""
|
|
|
|
aggregator = IntelligenceAggregator(temp_data_dir)
|
|
|
|
assert aggregator.data_dir == temp_data_dir
|
|
assert aggregator.intelligence_dir == temp_data_dir / "intelligence"
|
|
assert aggregator.intelligence_dir.exists()
|
|
assert (aggregator.intelligence_dir / "daily").exists()
|
|
assert (aggregator.intelligence_dir / "weekly").exists()
|
|
assert (aggregator.intelligence_dir / "monthly").exists()
|
|
|
|
def test_parse_markdown_file(self, aggregator, temp_data_dir, sample_markdown_content):
|
|
"""Test markdown file parsing"""
|
|
|
|
# Create test markdown file
|
|
md_file = temp_data_dir / "markdown_current" / "hkia_youtube_test.md"
|
|
md_file.write_text(sample_markdown_content, encoding='utf-8')
|
|
|
|
items = aggregator._parse_markdown_file(md_file)
|
|
|
|
assert len(items) == 2
|
|
|
|
# Check first item
|
|
item1 = items[0]
|
|
assert item1['id'] == 'video1'
|
|
assert item1['title'] == 'HVAC Installation Guide'
|
|
assert item1['source'] == 'youtube'
|
|
assert item1['views'] == 5000
|
|
assert item1['likes'] == 250
|
|
assert item1['comments'] == 30
|
|
|
|
# Check second item
|
|
item2 = items[1]
|
|
assert item2['id'] == 'video2'
|
|
assert item2['title'] == 'Heat Pump Maintenance'
|
|
assert item2['views'] == 3000
|
|
|
|
def test_parse_content_item(self, aggregator):
|
|
"""Test individual content item parsing"""
|
|
|
|
item_content = """video1
|
|
|
|
## Title: Test Video
|
|
|
|
## Views: 1,500
|
|
|
|
## Likes: 75
|
|
|
|
## Comments: 10
|
|
|
|
## Description:
|
|
Test video description here.
|
|
"""
|
|
|
|
item = aggregator._parse_content_item(item_content, "youtube_test")
|
|
|
|
assert item['id'] == 'video1'
|
|
assert item['title'] == 'Test Video'
|
|
assert item['views'] == 1500 # Comma should be removed
|
|
assert item['likes'] == 75
|
|
assert item['comments'] == 10
|
|
assert item['source'] == 'youtube'
|
|
|
|
def test_extract_numeric_fields(self, aggregator):
|
|
"""Test numeric field extraction and conversion"""
|
|
|
|
item = {
|
|
'views': '10,000',
|
|
'likes': '500',
|
|
'comments': '50',
|
|
'invalid_number': 'abc'
|
|
}
|
|
|
|
aggregator._extract_numeric_fields(item)
|
|
|
|
assert item['views'] == 10000
|
|
assert item['likes'] == 500
|
|
assert item['comments'] == 50
|
|
# Invalid numbers should become 0
|
|
# Note: 'invalid_number' not in numeric_fields list, so unchanged
|
|
|
|
def test_extract_source_from_filename(self, aggregator):
|
|
"""Test source extraction from filenames"""
|
|
|
|
assert aggregator._extract_source_from_filename("hkia_youtube_20250827") == "youtube"
|
|
assert aggregator._extract_source_from_filename("hkia_instagram_test") == "instagram"
|
|
assert aggregator._extract_source_from_filename("hkia_wordpress_latest") == "wordpress"
|
|
assert aggregator._extract_source_from_filename("hkia_mailchimp_feed") == "mailchimp"
|
|
assert aggregator._extract_source_from_filename("hkia_podcast_episode") == "podcast"
|
|
assert aggregator._extract_source_from_filename("hkia_hvacrschool_article") == "hvacrschool"
|
|
assert aggregator._extract_source_from_filename("unknown_source") == "unknown"
|
|
|
|
@patch('src.content_analysis.intelligence_aggregator.IntelligenceAggregator._load_hkia_content')
|
|
@patch('src.content_analysis.intelligence_aggregator.IntelligenceAggregator._analyze_hkia_content')
|
|
def test_generate_daily_intelligence(self, mock_analyze, mock_load, aggregator, sample_content_items):
|
|
"""Test daily intelligence report generation"""
|
|
|
|
# Mock content loading
|
|
mock_load.return_value = sample_content_items
|
|
|
|
# Mock analysis results
|
|
mock_analyze.return_value = {
|
|
'content_classified': 3,
|
|
'topic_distribution': {'hvac_systems': {'count': 2}, 'maintenance': {'count': 1}},
|
|
'engagement_summary': {'youtube': {'total_items': 1}},
|
|
'trending_keywords': [{'keyword': 'hvac', 'frequency': 3}],
|
|
'content_gaps': [],
|
|
'sentiment_overview': {'avg_sentiment': 0.5}
|
|
}
|
|
|
|
# Generate report
|
|
test_date = datetime(2025, 8, 28)
|
|
report = aggregator.generate_daily_intelligence(test_date)
|
|
|
|
# Verify report structure
|
|
assert 'report_date' in report
|
|
assert 'generated_at' in report
|
|
assert 'hkia_analysis' in report
|
|
assert 'competitor_analysis' in report
|
|
assert 'strategic_insights' in report
|
|
assert 'meta' in report
|
|
|
|
assert report['report_date'] == '2025-08-28'
|
|
assert report['meta']['total_hkia_items'] == 3
|
|
|
|
def test_load_hkia_content_no_files(self, aggregator, temp_data_dir):
|
|
"""Test content loading when no markdown files exist"""
|
|
|
|
test_date = datetime(2025, 8, 28)
|
|
content = aggregator._load_hkia_content(test_date)
|
|
|
|
assert content == []
|
|
|
|
def test_load_hkia_content_with_files(self, aggregator, temp_data_dir, sample_markdown_content):
|
|
"""Test content loading with markdown files"""
|
|
|
|
# Create test files
|
|
md_dir = temp_data_dir / "markdown_current"
|
|
(md_dir / "hkia_youtube_20250827.md").write_text(sample_markdown_content)
|
|
(md_dir / "hkia_instagram_20250827.md").write_text("# ID: post1\n\n## Title: Test Post")
|
|
|
|
test_date = datetime(2025, 8, 28)
|
|
content = aggregator._load_hkia_content(test_date)
|
|
|
|
assert len(content) >= 2 # Should load from both files
|
|
|
|
@patch('src.content_analysis.intelligence_aggregator.ClaudeHaikuAnalyzer')
|
|
def test_analyze_hkia_content_with_claude(self, mock_claude_class, aggregator, sample_content_items):
|
|
"""Test HKIA content analysis with Claude analyzer"""
|
|
|
|
# Mock Claude analyzer
|
|
mock_analyzer = Mock()
|
|
mock_analyzer.analyze_content_batch.return_value = [
|
|
{'topics': ['hvac_systems'], 'sentiment': 0.7, 'difficulty': 'intermediate'},
|
|
{'topics': ['maintenance'], 'sentiment': 0.5, 'difficulty': 'beginner'},
|
|
{'topics': ['controls'], 'sentiment': 0.6, 'difficulty': 'advanced'}
|
|
]
|
|
mock_claude_class.return_value = mock_analyzer
|
|
|
|
# Re-initialize aggregator to enable Claude analyzer
|
|
aggregator.claude_analyzer = mock_analyzer
|
|
|
|
result = aggregator._analyze_hkia_content(sample_content_items)
|
|
|
|
assert result['content_classified'] == 3
|
|
assert 'topic_distribution' in result
|
|
assert 'engagement_summary' in result
|
|
assert 'trending_keywords' in result
|
|
|
|
def test_analyze_hkia_content_without_claude(self, aggregator, sample_content_items):
|
|
"""Test HKIA content analysis without Claude analyzer (fallback mode)"""
|
|
|
|
# Ensure no Claude analyzer
|
|
aggregator.claude_analyzer = None
|
|
|
|
result = aggregator._analyze_hkia_content(sample_content_items)
|
|
|
|
assert result['content_classified'] == 0
|
|
assert 'topic_distribution' in result
|
|
assert 'engagement_summary' in result
|
|
assert 'trending_keywords' in result
|
|
|
|
# Should still have engagement analysis and keyword extraction
|
|
assert len(result['engagement_summary']) > 0
|
|
|
|
def test_calculate_topic_distribution(self, aggregator):
|
|
"""Test topic distribution calculation"""
|
|
|
|
analyses = [
|
|
{'topics': ['hvac_systems'], 'sentiment': 0.7},
|
|
{'topics': ['hvac_systems', 'maintenance'], 'sentiment': 0.5},
|
|
{'topics': ['maintenance'], 'sentiment': 0.6}
|
|
]
|
|
|
|
distribution = aggregator._calculate_topic_distribution(analyses)
|
|
|
|
assert 'hvac_systems' in distribution
|
|
assert 'maintenance' in distribution
|
|
assert distribution['hvac_systems']['count'] == 2
|
|
assert distribution['maintenance']['count'] == 2
|
|
assert abs(distribution['hvac_systems']['avg_sentiment'] - 0.6) < 0.1
|
|
|
|
def test_calculate_sentiment_overview(self, aggregator):
|
|
"""Test sentiment overview calculation"""
|
|
|
|
analyses = [
|
|
{'sentiment': 0.7},
|
|
{'sentiment': 0.5},
|
|
{'sentiment': 0.6}
|
|
]
|
|
|
|
overview = aggregator._calculate_sentiment_overview(analyses)
|
|
|
|
assert 'avg_sentiment' in overview
|
|
assert 'sentiment_distribution' in overview
|
|
assert abs(overview['avg_sentiment'] - 0.6) < 0.1
|
|
|
|
def test_identify_content_gaps(self, aggregator):
|
|
"""Test content gap identification"""
|
|
|
|
topic_distribution = {
|
|
'hvac_systems': {'count': 10},
|
|
'maintenance': {'count': 1}, # Low coverage
|
|
'installation': {'count': 8},
|
|
'troubleshooting': {'count': 1} # Low coverage
|
|
}
|
|
|
|
gaps = aggregator._identify_content_gaps(topic_distribution)
|
|
|
|
assert len(gaps) > 0
|
|
assert any('maintenance' in gap for gap in gaps)
|
|
assert any('troubleshooting' in gap for gap in gaps)
|
|
|
|
def test_generate_strategic_insights(self, aggregator):
|
|
"""Test strategic insights generation"""
|
|
|
|
hkia_analysis = {
|
|
'topic_distribution': {
|
|
'maintenance': {'count': 1},
|
|
'installation': {'count': 8}
|
|
},
|
|
'trending_keywords': [{'keyword': 'heat pump', 'frequency': 20}],
|
|
'engagement_summary': {
|
|
'youtube': {'avg_engagement_rate': 0.02}
|
|
},
|
|
'sentiment_overview': {'avg_sentiment': 0.3}
|
|
}
|
|
|
|
competitor_analysis = {}
|
|
|
|
insights = aggregator._generate_strategic_insights(hkia_analysis, competitor_analysis)
|
|
|
|
assert 'content_opportunities' in insights
|
|
assert 'performance_insights' in insights
|
|
assert 'competitive_advantages' in insights
|
|
assert 'areas_for_improvement' in insights
|
|
|
|
# Should identify content opportunities based on trending keywords
|
|
assert len(insights['content_opportunities']) > 0
|
|
|
|
def test_save_intelligence_report(self, aggregator, temp_data_dir):
|
|
"""Test intelligence report saving"""
|
|
|
|
report = {
|
|
'report_date': '2025-08-28',
|
|
'test_data': 'sample'
|
|
}
|
|
|
|
test_date = datetime(2025, 8, 28)
|
|
saved_file = aggregator._save_intelligence_report(report, test_date, 'daily')
|
|
|
|
assert saved_file.exists()
|
|
assert 'hkia_intelligence_2025-08-28.json' in saved_file.name
|
|
|
|
# Verify content
|
|
with open(saved_file, 'r') as f:
|
|
saved_report = json.load(f)
|
|
assert saved_report['report_date'] == '2025-08-28'
|
|
|
|
def test_generate_weekly_intelligence(self, aggregator, temp_data_dir):
|
|
"""Test weekly intelligence generation"""
|
|
|
|
# Create sample daily reports
|
|
daily_dir = temp_data_dir / "intelligence" / "daily"
|
|
|
|
for i in range(7):
|
|
date = datetime(2025, 8, 21) + timedelta(days=i)
|
|
date_str = date.strftime('%Y-%m-%d')
|
|
report = {
|
|
'report_date': date_str,
|
|
'hkia_analysis': {
|
|
'content_classified': 10,
|
|
'trending_keywords': [{'keyword': 'hvac', 'frequency': 5}]
|
|
},
|
|
'meta': {'total_hkia_items': 100}
|
|
}
|
|
|
|
report_file = daily_dir / f"hkia_intelligence_{date_str}.json"
|
|
with open(report_file, 'w') as f:
|
|
json.dump(report, f)
|
|
|
|
# Generate weekly report
|
|
end_date = datetime(2025, 8, 28)
|
|
weekly_report = aggregator.generate_weekly_intelligence(end_date)
|
|
|
|
assert 'period_start' in weekly_report
|
|
assert 'period_end' in weekly_report
|
|
assert 'summary' in weekly_report
|
|
assert 'daily_reports_included' in weekly_report
|
|
|
|
def test_error_handling_file_operations(self, aggregator):
|
|
"""Test error handling in file operations"""
|
|
|
|
# Test parsing non-existent file
|
|
fake_file = Path("/nonexistent/file.md")
|
|
items = aggregator._parse_markdown_file(fake_file)
|
|
assert items == []
|
|
|
|
# Test parsing malformed content
|
|
malformed_content = "This is not properly formatted markdown"
|
|
item = aggregator._parse_content_item(malformed_content, "test")
|
|
assert item is None
|
|
|
|
def test_empty_content_analysis(self, aggregator):
|
|
"""Test analysis with empty content list"""
|
|
|
|
result = aggregator._analyze_hkia_content([])
|
|
|
|
assert result['content_classified'] == 0
|
|
assert result['topic_distribution'] == {}
|
|
assert result['trending_keywords'] == []
|
|
assert result['content_gaps'] == []
|
|
|
|
@patch('builtins.open', side_effect=IOError("File access error"))
|
|
def test_file_access_error_handling(self, mock_open, aggregator, temp_data_dir):
|
|
"""Test handling of file access errors"""
|
|
|
|
test_date = datetime(2025, 8, 28)
|
|
|
|
# Should handle file access errors gracefully
|
|
content = aggregator._load_hkia_content(test_date)
|
|
assert content == []
|
|
|
|
def test_numeric_field_edge_cases(self, aggregator):
|
|
"""Test numeric field extraction edge cases"""
|
|
|
|
item = {
|
|
'views': '', # Empty string
|
|
'likes': 'N/A', # Non-numeric string
|
|
'comments': None, # None value
|
|
'view_count': '1.5K' # Non-standard format
|
|
}
|
|
|
|
aggregator._extract_numeric_fields(item)
|
|
|
|
# All should convert to 0 for invalid formats
|
|
assert item['views'] == 0
|
|
assert item['likes'] == 0
|
|
assert item['comments'] == 0
|
|
assert item['view_count'] == 0
|
|
|
|
def test_intelligence_directory_permissions(self, aggregator, temp_data_dir):
|
|
"""Test intelligence directory creation with proper permissions"""
|
|
|
|
# Remove intelligence directory to test recreation
|
|
intelligence_dir = temp_data_dir / "intelligence"
|
|
if intelligence_dir.exists():
|
|
import shutil
|
|
shutil.rmtree(intelligence_dir)
|
|
|
|
# Re-initialize aggregator
|
|
new_aggregator = IntelligenceAggregator(temp_data_dir)
|
|
|
|
assert new_aggregator.intelligence_dir.exists()
|
|
assert (new_aggregator.intelligence_dir / "daily").exists()
|
|
|
|
|
|
if __name__ == "__main__":
    # Run this suite directly, with coverage reporting scoped to the
    # aggregator module under test.
    pytest_args = [
        __file__,
        "-v",
        "--cov=src.content_analysis.intelligence_aggregator",
        "--cov-report=term-missing",
    ]
    pytest.main(pytest_args)