Major enhancements to HKIA content analysis system: CRITICAL FIXES: • Fix engagement data parsing from markdown (Views/Likes/Comments now extracted correctly) • YouTube: 18.75% engagement rate working (16 views, 2 likes, 1 comment) • Instagram: 7.37% average engagement rate across 20 posts • High performer detection operational (1 YouTube + 20 Instagram above thresholds) CONTENT ANALYSIS SYSTEM: • Add Claude Haiku analyzer for HVAC content classification • Add engagement analyzer with source-specific algorithms • Add keyword extractor with 100+ HVAC-specific terms • Add intelligence aggregator for daily JSON reports • Add comprehensive unit test suite (73 tests, 90% coverage target) ARCHITECTURE: • Extend BaseScraper with optional AI analysis capabilities • Add content analysis orchestrator with CLI interface • Add competitive intelligence module structure • Maintain backward compatibility with existing scrapers INTELLIGENCE FEATURES: • Daily intelligence reports with strategic insights • Trending keyword analysis (813 refrigeration, 701 service mentions) • Content opportunity identification • Multi-source engagement benchmarking • HVAC-specific topic and product categorization PRODUCTION READY: • Claude Haiku API integration validated ($15-25/month estimated) • Graceful degradation when API unavailable • Comprehensive logging and error handling • State management for analytics tracking Ready for Phase 2: Competitive Intelligence Infrastructure 🤖 Generated with [Claude Code](https://claude.ai/code) Co-Authored-By: Claude <noreply@anthropic.com>
438 lines
No EOL
17 KiB
Python
438 lines
No EOL
17 KiB
Python
#!/usr/bin/env python3
|
|
"""
|
|
Comprehensive Unit Tests for Claude Haiku Analyzer
|
|
|
|
Tests Claude API integration, content classification,
|
|
batch processing, and error handling.
|
|
"""
|
|
|
|
import pytest
|
|
from unittest.mock import Mock, patch, MagicMock
|
|
from pathlib import Path
|
|
import sys
|
|
|
|
# Add src to path for imports
|
|
if str(Path(__file__).parent.parent) not in sys.path:
|
|
sys.path.insert(0, str(Path(__file__).parent.parent))
|
|
|
|
from src.content_analysis.claude_analyzer import ClaudeHaikuAnalyzer
|
|
|
|
|
|
class TestClaudeHaikuAnalyzer:
|
|
"""Test suite for ClaudeHaikuAnalyzer"""
|
|
|
|
@pytest.fixture
|
|
def mock_claude_client(self):
|
|
"""Create mock Claude client"""
|
|
mock_client = Mock()
|
|
mock_response = Mock()
|
|
mock_response.content = [Mock()]
|
|
mock_response.content[0].text = """[
|
|
{
|
|
"topics": ["hvac_systems", "installation"],
|
|
"products": ["heat_pump"],
|
|
"difficulty": "intermediate",
|
|
"content_type": "tutorial",
|
|
"sentiment": 0.7,
|
|
"hvac_relevance": 0.9,
|
|
"keywords": ["heat pump", "installation", "efficiency"]
|
|
}
|
|
]"""
|
|
mock_client.messages.create.return_value = mock_response
|
|
return mock_client
|
|
|
|
@pytest.fixture
|
|
def analyzer_with_mock_client(self, mock_claude_client):
|
|
"""Create analyzer with mocked Claude client"""
|
|
with patch('src.content_analysis.claude_analyzer.anthropic.Anthropic') as mock_anthropic:
|
|
mock_anthropic.return_value = mock_claude_client
|
|
analyzer = ClaudeHaikuAnalyzer("test-api-key")
|
|
analyzer.client = mock_claude_client
|
|
return analyzer
|
|
|
|
@pytest.fixture
|
|
def sample_content_items(self):
|
|
"""Sample content items for testing"""
|
|
return [
|
|
{
|
|
'id': 'item1',
|
|
'title': 'Heat Pump Installation Guide',
|
|
'content': 'Complete guide to installing high-efficiency heat pumps for residential applications.',
|
|
'source': 'youtube'
|
|
},
|
|
{
|
|
'id': 'item2',
|
|
'title': 'AC Troubleshooting',
|
|
'content': 'Common air conditioning problems and how to diagnose compressor issues.',
|
|
'source': 'blog'
|
|
},
|
|
{
|
|
'id': 'item3',
|
|
'title': 'Thermostat Wiring',
|
|
'content': 'Step-by-step wiring instructions for smart thermostats and HVAC controls.',
|
|
'source': 'instagram'
|
|
}
|
|
]
|
|
|
|
def test_initialization_with_api_key(self):
|
|
"""Test analyzer initialization with API key"""
|
|
|
|
with patch('src.content_analysis.claude_analyzer.anthropic.Anthropic') as mock_anthropic:
|
|
analyzer = ClaudeHaikuAnalyzer("test-api-key")
|
|
|
|
assert analyzer.api_key == "test-api-key"
|
|
assert analyzer.model_name == "claude-3-haiku-20240307"
|
|
assert analyzer.max_tokens == 4000
|
|
assert analyzer.temperature == 0.1
|
|
mock_anthropic.assert_called_once_with(api_key="test-api-key")
|
|
|
|
def test_initialization_without_api_key(self):
|
|
"""Test analyzer initialization without API key raises error"""
|
|
|
|
with pytest.raises(ValueError, match="ANTHROPIC_API_KEY is required"):
|
|
ClaudeHaikuAnalyzer(None)
|
|
|
|
def test_analyze_single_content(self, analyzer_with_mock_client, sample_content_items):
|
|
"""Test single content item analysis"""
|
|
|
|
item = sample_content_items[0]
|
|
result = analyzer_with_mock_client.analyze_content(item)
|
|
|
|
# Verify API call structure
|
|
analyzer_with_mock_client.client.messages.create.assert_called_once()
|
|
call_args = analyzer_with_mock_client.client.messages.create.call_args
|
|
|
|
assert call_args[1]['model'] == "claude-3-haiku-20240307"
|
|
assert call_args[1]['max_tokens'] == 4000
|
|
assert call_args[1]['temperature'] == 0.1
|
|
|
|
# Verify result structure
|
|
assert 'topics' in result
|
|
assert 'products' in result
|
|
assert 'difficulty' in result
|
|
assert 'content_type' in result
|
|
assert 'sentiment' in result
|
|
assert 'hvac_relevance' in result
|
|
assert 'keywords' in result
|
|
|
|
def test_analyze_content_batch(self, analyzer_with_mock_client, sample_content_items):
|
|
"""Test batch content analysis"""
|
|
|
|
# Mock batch response
|
|
batch_response = Mock()
|
|
batch_response.content = [Mock()]
|
|
batch_response.content[0].text = """[
|
|
{
|
|
"topics": ["hvac_systems"],
|
|
"products": ["heat_pump"],
|
|
"difficulty": "intermediate",
|
|
"content_type": "tutorial",
|
|
"sentiment": 0.7,
|
|
"hvac_relevance": 0.9,
|
|
"keywords": ["heat pump"]
|
|
},
|
|
{
|
|
"topics": ["troubleshooting"],
|
|
"products": ["air_conditioning"],
|
|
"difficulty": "advanced",
|
|
"content_type": "diagnostic",
|
|
"sentiment": 0.5,
|
|
"hvac_relevance": 0.8,
|
|
"keywords": ["ac repair"]
|
|
},
|
|
{
|
|
"topics": ["controls"],
|
|
"products": ["thermostat"],
|
|
"difficulty": "beginner",
|
|
"content_type": "tutorial",
|
|
"sentiment": 0.6,
|
|
"hvac_relevance": 0.7,
|
|
"keywords": ["thermostat wiring"]
|
|
}
|
|
]"""
|
|
analyzer_with_mock_client.client.messages.create.return_value = batch_response
|
|
|
|
results = analyzer_with_mock_client.analyze_content_batch(sample_content_items)
|
|
|
|
assert len(results) == 3
|
|
|
|
# Verify each result structure
|
|
for result in results:
|
|
assert 'topics' in result
|
|
assert 'products' in result
|
|
assert 'difficulty' in result
|
|
assert 'content_type' in result
|
|
assert 'sentiment' in result
|
|
assert 'hvac_relevance' in result
|
|
assert 'keywords' in result
|
|
|
|
def test_batch_processing_chunking(self, analyzer_with_mock_client):
|
|
"""Test batch processing with chunking for large item lists"""
|
|
|
|
# Create large list of content items
|
|
large_content_list = []
|
|
for i in range(15): # More than batch_size of 10
|
|
large_content_list.append({
|
|
'id': f'item{i}',
|
|
'title': f'HVAC Item {i}',
|
|
'content': f'Content for item {i}',
|
|
'source': 'test'
|
|
})
|
|
|
|
# Mock responses for multiple batches
|
|
response1 = Mock()
|
|
response1.content = [Mock()]
|
|
response1.content[0].text = '[' + ','.join([
|
|
'{"topics": ["hvac_systems"], "products": [], "difficulty": "intermediate", "content_type": "tutorial", "sentiment": 0.5, "hvac_relevance": 0.8, "keywords": []}'
|
|
] * 10) + ']'
|
|
|
|
response2 = Mock()
|
|
response2.content = [Mock()]
|
|
response2.content[0].text = '[' + ','.join([
|
|
'{"topics": ["maintenance"], "products": [], "difficulty": "beginner", "content_type": "guide", "sentiment": 0.6, "hvac_relevance": 0.7, "keywords": []}'
|
|
] * 5) + ']'
|
|
|
|
analyzer_with_mock_client.client.messages.create.side_effect = [response1, response2]
|
|
|
|
results = analyzer_with_mock_client.analyze_content_batch(large_content_list)
|
|
|
|
assert len(results) == 15
|
|
assert analyzer_with_mock_client.client.messages.create.call_count == 2
|
|
|
|
def test_create_analysis_prompt_single(self, analyzer_with_mock_client, sample_content_items):
|
|
"""Test analysis prompt creation for single item"""
|
|
|
|
item = sample_content_items[0]
|
|
prompt = analyzer_with_mock_client._create_analysis_prompt([item])
|
|
|
|
# Verify prompt contains expected elements
|
|
assert 'Heat Pump Installation Guide' in prompt
|
|
assert 'Complete guide to installing' in prompt
|
|
assert 'HVAC Content Analysis' in prompt
|
|
assert 'topics' in prompt
|
|
assert 'products' in prompt
|
|
assert 'difficulty' in prompt
|
|
|
|
def test_create_analysis_prompt_batch(self, analyzer_with_mock_client, sample_content_items):
|
|
"""Test analysis prompt creation for batch"""
|
|
|
|
prompt = analyzer_with_mock_client._create_analysis_prompt(sample_content_items)
|
|
|
|
# Should contain all items
|
|
assert 'Heat Pump Installation Guide' in prompt
|
|
assert 'AC Troubleshooting' in prompt
|
|
assert 'Thermostat Wiring' in prompt
|
|
|
|
# Should be structured as JSON array request
|
|
assert 'JSON array' in prompt
|
|
|
|
def test_parse_claude_response_valid_json(self, analyzer_with_mock_client):
|
|
"""Test parsing valid Claude JSON response"""
|
|
|
|
response_text = """[
|
|
{
|
|
"topics": ["hvac_systems"],
|
|
"products": ["heat_pump"],
|
|
"difficulty": "intermediate",
|
|
"content_type": "tutorial",
|
|
"sentiment": 0.7,
|
|
"hvac_relevance": 0.9,
|
|
"keywords": ["heat pump", "installation"]
|
|
}
|
|
]"""
|
|
|
|
results = analyzer_with_mock_client._parse_claude_response(response_text, 1)
|
|
|
|
assert len(results) == 1
|
|
assert results[0]['topics'] == ["hvac_systems"]
|
|
assert results[0]['products'] == ["heat_pump"]
|
|
assert results[0]['sentiment'] == 0.7
|
|
|
|
def test_parse_claude_response_invalid_json(self, analyzer_with_mock_client):
|
|
"""Test parsing invalid Claude JSON response"""
|
|
|
|
invalid_json = "This is not valid JSON"
|
|
|
|
results = analyzer_with_mock_client._parse_claude_response(invalid_json, 2)
|
|
|
|
# Should return fallback results
|
|
assert len(results) == 2
|
|
for result in results:
|
|
assert result['topics'] == []
|
|
assert result['products'] == []
|
|
assert result['difficulty'] == 'unknown'
|
|
assert result['content_type'] == 'unknown'
|
|
assert result['sentiment'] == 0
|
|
assert result['hvac_relevance'] == 0
|
|
assert result['keywords'] == []
|
|
|
|
def test_parse_claude_response_partial_json(self, analyzer_with_mock_client):
|
|
"""Test parsing partially valid JSON response"""
|
|
|
|
partial_json = """[
|
|
{
|
|
"topics": ["hvac_systems"],
|
|
"products": ["heat_pump"],
|
|
"difficulty": "intermediate"
|
|
// Missing some fields
|
|
}
|
|
]"""
|
|
|
|
results = analyzer_with_mock_client._parse_claude_response(partial_json, 1)
|
|
|
|
# Should still get fallback for malformed JSON
|
|
assert len(results) == 1
|
|
assert results[0]['topics'] == []
|
|
|
|
def test_create_fallback_analysis(self, analyzer_with_mock_client):
|
|
"""Test fallback analysis creation"""
|
|
|
|
fallback = analyzer_with_mock_client._create_fallback_analysis()
|
|
|
|
assert fallback['topics'] == []
|
|
assert fallback['products'] == []
|
|
assert fallback['difficulty'] == 'unknown'
|
|
assert fallback['content_type'] == 'unknown'
|
|
assert fallback['sentiment'] == 0
|
|
assert fallback['hvac_relevance'] == 0
|
|
assert fallback['keywords'] == []
|
|
|
|
def test_api_error_handling(self, analyzer_with_mock_client):
|
|
"""Test API error handling"""
|
|
|
|
# Mock API error
|
|
analyzer_with_mock_client.client.messages.create.side_effect = Exception("API Error")
|
|
|
|
item = {'id': 'test', 'title': 'Test', 'content': 'Test content', 'source': 'test'}
|
|
result = analyzer_with_mock_client.analyze_content(item)
|
|
|
|
# Should return fallback analysis
|
|
assert result['topics'] == []
|
|
assert result['difficulty'] == 'unknown'
|
|
|
|
def test_rate_limiting_backoff(self, analyzer_with_mock_client):
|
|
"""Test rate limiting and backoff behavior"""
|
|
|
|
# Mock rate limiting error followed by success
|
|
rate_limit_error = Exception("Rate limit exceeded")
|
|
success_response = Mock()
|
|
success_response.content = [Mock()]
|
|
success_response.content[0].text = '[{"topics": [], "products": [], "difficulty": "unknown", "content_type": "unknown", "sentiment": 0, "hvac_relevance": 0, "keywords": []}]'
|
|
|
|
analyzer_with_mock_client.client.messages.create.side_effect = [rate_limit_error, success_response]
|
|
|
|
with patch('time.sleep') as mock_sleep:
|
|
item = {'id': 'test', 'title': 'Test', 'content': 'Test content', 'source': 'test'}
|
|
result = analyzer_with_mock_client.analyze_content(item)
|
|
|
|
# Should have retried and succeeded
|
|
assert analyzer_with_mock_client.client.messages.create.call_count == 2
|
|
mock_sleep.assert_called_once()
|
|
|
|
def test_empty_content_handling(self, analyzer_with_mock_client):
|
|
"""Test handling of empty or minimal content"""
|
|
|
|
empty_items = [
|
|
{'id': 'empty1', 'title': '', 'content': '', 'source': 'test'},
|
|
{'id': 'empty2', 'title': 'Title Only', 'source': 'test'} # Missing content
|
|
]
|
|
|
|
results = analyzer_with_mock_client.analyze_content_batch(empty_items)
|
|
|
|
# Should still process and return results
|
|
assert len(results) == 2
|
|
|
|
def test_content_length_limits(self, analyzer_with_mock_client):
|
|
"""Test handling of very long content"""
|
|
|
|
long_content = {
|
|
'id': 'long1',
|
|
'title': 'Long Content Test',
|
|
'content': 'A' * 10000, # Very long content
|
|
'source': 'test'
|
|
}
|
|
|
|
# Should not crash with long content
|
|
result = analyzer_with_mock_client.analyze_content(long_content)
|
|
assert 'topics' in result
|
|
|
|
def test_special_characters_handling(self, analyzer_with_mock_client):
|
|
"""Test handling of special characters and encoding"""
|
|
|
|
special_content = {
|
|
'id': 'special1',
|
|
'title': 'Special Characters: "Quotes" & Symbols ®™',
|
|
'content': 'Content with émojis 🔧 and speciál çharaçters',
|
|
'source': 'test'
|
|
}
|
|
|
|
# Should handle special characters without errors
|
|
result = analyzer_with_mock_client.analyze_content(special_content)
|
|
assert 'topics' in result
|
|
|
|
def test_taxonomy_validation(self, analyzer_with_mock_client):
|
|
"""Test HVAC taxonomy validation in prompts"""
|
|
|
|
item = {'id': 'test', 'title': 'Test', 'content': 'Test', 'source': 'test'}
|
|
prompt = analyzer_with_mock_client._create_analysis_prompt([item])
|
|
|
|
# Should include HVAC topic categories
|
|
hvac_topics = ['hvac_systems', 'heat_pumps', 'air_conditioning', 'refrigeration',
|
|
'maintenance', 'installation', 'troubleshooting', 'controls']
|
|
for topic in hvac_topics:
|
|
assert topic in prompt
|
|
|
|
# Should include product categories
|
|
hvac_products = ['heat_pump', 'air_conditioner', 'furnace', 'boiler', 'thermostat',
|
|
'compressor', 'evaporator', 'condenser']
|
|
for product in hvac_products:
|
|
assert product in prompt
|
|
|
|
def test_model_configuration_validation(self, analyzer_with_mock_client):
|
|
"""Test model configuration parameters"""
|
|
|
|
assert analyzer_with_mock_client.model_name == "claude-3-haiku-20240307"
|
|
assert analyzer_with_mock_client.max_tokens == 4000
|
|
assert analyzer_with_mock_client.temperature == 0.1
|
|
assert analyzer_with_mock_client.batch_size == 10
|
|
|
|
@patch('src.content_analysis.claude_analyzer.logging')
|
|
def test_logging_functionality(self, mock_logging, analyzer_with_mock_client):
|
|
"""Test logging of analysis operations"""
|
|
|
|
item = {'id': 'test', 'title': 'Test', 'content': 'Test', 'source': 'test'}
|
|
analyzer_with_mock_client.analyze_content(item)
|
|
|
|
# Should have logged the operation
|
|
assert mock_logging.getLogger.called
|
|
|
|
def test_response_format_validation(self, analyzer_with_mock_client):
|
|
"""Test validation of response format from Claude"""
|
|
|
|
# Test with correctly formatted response
|
|
good_response = '''[{
|
|
"topics": ["hvac_systems"],
|
|
"products": ["heat_pump"],
|
|
"difficulty": "intermediate",
|
|
"content_type": "tutorial",
|
|
"sentiment": 0.7,
|
|
"hvac_relevance": 0.9,
|
|
"keywords": ["heat pump"]
|
|
}]'''
|
|
|
|
result = analyzer_with_mock_client._parse_claude_response(good_response, 1)
|
|
assert len(result) == 1
|
|
assert result[0]['topics'] == ["hvac_systems"]
|
|
|
|
# Test with missing required fields
|
|
incomplete_response = '''[{
|
|
"topics": ["hvac_systems"]
|
|
}]'''
|
|
|
|
result = analyzer_with_mock_client._parse_claude_response(incomplete_response, 1)
|
|
# Should fall back to default structure
|
|
assert len(result) == 1
|
|
|
|
|
|
if __name__ == "__main__":
|
|
pytest.main([__file__, "-v", "--cov=src.content_analysis.claude_analyzer", "--cov-report=term-missing"]) |