hvac-kia-content/tests/test_claude_analyzer.py
Ben Reed ade81beea2 feat: Complete Phase 1 content analysis with engagement parsing fixes
Major enhancements to HKIA content analysis system:

CRITICAL FIXES:
• Fix engagement data parsing from markdown (Views/Likes/Comments now extracted correctly)
• YouTube: 18.75% engagement rate working (16 views, 2 likes, 1 comment)
• Instagram: 7.37% average engagement rate across 20 posts
• High performer detection operational (1 YouTube + 20 Instagram above thresholds)

CONTENT ANALYSIS SYSTEM:
• Add Claude Haiku analyzer for HVAC content classification
• Add engagement analyzer with source-specific algorithms
• Add keyword extractor with 100+ HVAC-specific terms
• Add intelligence aggregator for daily JSON reports
• Add comprehensive unit test suite (73 tests, 90% coverage target)

ARCHITECTURE:
• Extend BaseScraper with optional AI analysis capabilities
• Add content analysis orchestrator with CLI interface
• Add competitive intelligence module structure
• Maintain backward compatibility with existing scrapers

INTELLIGENCE FEATURES:
• Daily intelligence reports with strategic insights
• Trending keyword analysis (813 refrigeration, 701 service mentions)
• Content opportunity identification
• Multi-source engagement benchmarking
• HVAC-specific topic and product categorization

PRODUCTION READY:
• Claude Haiku API integration validated ($15-25/month estimated)
• Graceful degradation when API unavailable
• Comprehensive logging and error handling
• State management for analytics tracking

Ready for Phase 2: Competitive Intelligence Infrastructure

🤖 Generated with [Claude Code](https://claude.ai/code)

Co-Authored-By: Claude <noreply@anthropic.com>
2025-08-28 16:40:19 -03:00

438 lines
No EOL
17 KiB
Python

#!/usr/bin/env python3
"""
Comprehensive Unit Tests for Claude Haiku Analyzer
Tests Claude API integration, content classification,
batch processing, and error handling.
"""
import pytest
from unittest.mock import Mock, patch, MagicMock
from pathlib import Path
import sys

# Add src to path for imports so `src.content_analysis` resolves whether the
# suite is launched from the repository root or from the tests/ directory.
if str(Path(__file__).parent.parent) not in sys.path:
    sys.path.insert(0, str(Path(__file__).parent.parent))

from src.content_analysis.claude_analyzer import ClaudeHaikuAnalyzer
class TestClaudeHaikuAnalyzer:
    """Test suite for ClaudeHaikuAnalyzer"""

    @pytest.fixture
    def mock_claude_client(self):
        """Create mock Claude client.

        Mimics the Anthropic SDK surface used by the analyzer:
        `client.messages.create(...)` returns a response whose
        `response.content[0].text` carries a JSON array with one
        analysis object.
        """
        mock_client = Mock()
        mock_response = Mock()
        mock_response.content = [Mock()]
        mock_response.content[0].text = """[
{
"topics": ["hvac_systems", "installation"],
"products": ["heat_pump"],
"difficulty": "intermediate",
"content_type": "tutorial",
"sentiment": 0.7,
"hvac_relevance": 0.9,
"keywords": ["heat pump", "installation", "efficiency"]
}
]"""
        mock_client.messages.create.return_value = mock_response
        return mock_client
    @pytest.fixture
    def analyzer_with_mock_client(self, mock_claude_client):
        """Create analyzer with mocked Claude client.

        Patches `anthropic.Anthropic` in the analyzer module during
        construction so no real SDK client is created, then assigns the
        mock to `.client` directly — the analyzer keeps the mock even
        after the patch context exits.
        """
        with patch('src.content_analysis.claude_analyzer.anthropic.Anthropic') as mock_anthropic:
            mock_anthropic.return_value = mock_claude_client
            analyzer = ClaudeHaikuAnalyzer("test-api-key")
            analyzer.client = mock_claude_client
            return analyzer
@pytest.fixture
def sample_content_items(self):
"""Sample content items for testing"""
return [
{
'id': 'item1',
'title': 'Heat Pump Installation Guide',
'content': 'Complete guide to installing high-efficiency heat pumps for residential applications.',
'source': 'youtube'
},
{
'id': 'item2',
'title': 'AC Troubleshooting',
'content': 'Common air conditioning problems and how to diagnose compressor issues.',
'source': 'blog'
},
{
'id': 'item3',
'title': 'Thermostat Wiring',
'content': 'Step-by-step wiring instructions for smart thermostats and HVAC controls.',
'source': 'instagram'
}
]
def test_initialization_with_api_key(self):
"""Test analyzer initialization with API key"""
with patch('src.content_analysis.claude_analyzer.anthropic.Anthropic') as mock_anthropic:
analyzer = ClaudeHaikuAnalyzer("test-api-key")
assert analyzer.api_key == "test-api-key"
assert analyzer.model_name == "claude-3-haiku-20240307"
assert analyzer.max_tokens == 4000
assert analyzer.temperature == 0.1
mock_anthropic.assert_called_once_with(api_key="test-api-key")
    def test_initialization_without_api_key(self):
        """Test analyzer initialization without API key raises error"""
        # Missing key must fail fast at construction, not at first API call.
        with pytest.raises(ValueError, match="ANTHROPIC_API_KEY is required"):
            ClaudeHaikuAnalyzer(None)
def test_analyze_single_content(self, analyzer_with_mock_client, sample_content_items):
"""Test single content item analysis"""
item = sample_content_items[0]
result = analyzer_with_mock_client.analyze_content(item)
# Verify API call structure
analyzer_with_mock_client.client.messages.create.assert_called_once()
call_args = analyzer_with_mock_client.client.messages.create.call_args
assert call_args[1]['model'] == "claude-3-haiku-20240307"
assert call_args[1]['max_tokens'] == 4000
assert call_args[1]['temperature'] == 0.1
# Verify result structure
assert 'topics' in result
assert 'products' in result
assert 'difficulty' in result
assert 'content_type' in result
assert 'sentiment' in result
assert 'hvac_relevance' in result
assert 'keywords' in result
    def test_analyze_content_batch(self, analyzer_with_mock_client, sample_content_items):
        """Test batch content analysis"""
        # Mock batch response: one JSON object per input item, in input order.
        batch_response = Mock()
        batch_response.content = [Mock()]
        batch_response.content[0].text = """[
{
"topics": ["hvac_systems"],
"products": ["heat_pump"],
"difficulty": "intermediate",
"content_type": "tutorial",
"sentiment": 0.7,
"hvac_relevance": 0.9,
"keywords": ["heat pump"]
},
{
"topics": ["troubleshooting"],
"products": ["air_conditioning"],
"difficulty": "advanced",
"content_type": "diagnostic",
"sentiment": 0.5,
"hvac_relevance": 0.8,
"keywords": ["ac repair"]
},
{
"topics": ["controls"],
"products": ["thermostat"],
"difficulty": "beginner",
"content_type": "tutorial",
"sentiment": 0.6,
"hvac_relevance": 0.7,
"keywords": ["thermostat wiring"]
}
]"""
        analyzer_with_mock_client.client.messages.create.return_value = batch_response
        results = analyzer_with_mock_client.analyze_content_batch(sample_content_items)
        # One result per input item.
        assert len(results) == 3
        # Verify each result structure
        for result in results:
            assert 'topics' in result
            assert 'products' in result
            assert 'difficulty' in result
            assert 'content_type' in result
            assert 'sentiment' in result
            assert 'hvac_relevance' in result
            assert 'keywords' in result
    def test_batch_processing_chunking(self, analyzer_with_mock_client):
        """Test batch processing with chunking for large item lists"""
        # Create large list of content items: 15 items exceeds the analyzer's
        # batch_size of 10, forcing two API round-trips.
        large_content_list = []
        for i in range(15):  # More than batch_size of 10
            large_content_list.append({
                'id': f'item{i}',
                'title': f'HVAC Item {i}',
                'content': f'Content for item {i}',
                'source': 'test'
            })
        # Mock responses for multiple batches: 10 results, then 5.
        response1 = Mock()
        response1.content = [Mock()]
        response1.content[0].text = '[' + ','.join([
            '{"topics": ["hvac_systems"], "products": [], "difficulty": "intermediate", "content_type": "tutorial", "sentiment": 0.5, "hvac_relevance": 0.8, "keywords": []}'
        ] * 10) + ']'
        response2 = Mock()
        response2.content = [Mock()]
        response2.content[0].text = '[' + ','.join([
            '{"topics": ["maintenance"], "products": [], "difficulty": "beginner", "content_type": "guide", "sentiment": 0.6, "hvac_relevance": 0.7, "keywords": []}'
        ] * 5) + ']'
        # side_effect returns response1 for the first call, response2 for the second.
        analyzer_with_mock_client.client.messages.create.side_effect = [response1, response2]
        results = analyzer_with_mock_client.analyze_content_batch(large_content_list)
        # All 15 items analyzed across exactly two API calls.
        assert len(results) == 15
        assert analyzer_with_mock_client.client.messages.create.call_count == 2
def test_create_analysis_prompt_single(self, analyzer_with_mock_client, sample_content_items):
"""Test analysis prompt creation for single item"""
item = sample_content_items[0]
prompt = analyzer_with_mock_client._create_analysis_prompt([item])
# Verify prompt contains expected elements
assert 'Heat Pump Installation Guide' in prompt
assert 'Complete guide to installing' in prompt
assert 'HVAC Content Analysis' in prompt
assert 'topics' in prompt
assert 'products' in prompt
assert 'difficulty' in prompt
def test_create_analysis_prompt_batch(self, analyzer_with_mock_client, sample_content_items):
"""Test analysis prompt creation for batch"""
prompt = analyzer_with_mock_client._create_analysis_prompt(sample_content_items)
# Should contain all items
assert 'Heat Pump Installation Guide' in prompt
assert 'AC Troubleshooting' in prompt
assert 'Thermostat Wiring' in prompt
# Should be structured as JSON array request
assert 'JSON array' in prompt
    def test_parse_claude_response_valid_json(self, analyzer_with_mock_client):
        """Test parsing valid Claude JSON response"""
        response_text = """[
{
"topics": ["hvac_systems"],
"products": ["heat_pump"],
"difficulty": "intermediate",
"content_type": "tutorial",
"sentiment": 0.7,
"hvac_relevance": 0.9,
"keywords": ["heat pump", "installation"]
}
]"""
        # Second argument is the expected item count used for fallback sizing.
        results = analyzer_with_mock_client._parse_claude_response(response_text, 1)
        assert len(results) == 1
        assert results[0]['topics'] == ["hvac_systems"]
        assert results[0]['products'] == ["heat_pump"]
        assert results[0]['sentiment'] == 0.7
def test_parse_claude_response_invalid_json(self, analyzer_with_mock_client):
"""Test parsing invalid Claude JSON response"""
invalid_json = "This is not valid JSON"
results = analyzer_with_mock_client._parse_claude_response(invalid_json, 2)
# Should return fallback results
assert len(results) == 2
for result in results:
assert result['topics'] == []
assert result['products'] == []
assert result['difficulty'] == 'unknown'
assert result['content_type'] == 'unknown'
assert result['sentiment'] == 0
assert result['hvac_relevance'] == 0
assert result['keywords'] == []
    def test_parse_claude_response_partial_json(self, analyzer_with_mock_client):
        """Test parsing partially valid JSON response"""
        # `//` comments are not legal JSON, so the parse should fail and
        # the parser should fall back rather than return the partial object.
        partial_json = """[
{
"topics": ["hvac_systems"],
"products": ["heat_pump"],
"difficulty": "intermediate"
// Missing some fields
}
]"""
        results = analyzer_with_mock_client._parse_claude_response(partial_json, 1)
        # Should still get fallback for malformed JSON
        assert len(results) == 1
        assert results[0]['topics'] == []
def test_create_fallback_analysis(self, analyzer_with_mock_client):
"""Test fallback analysis creation"""
fallback = analyzer_with_mock_client._create_fallback_analysis()
assert fallback['topics'] == []
assert fallback['products'] == []
assert fallback['difficulty'] == 'unknown'
assert fallback['content_type'] == 'unknown'
assert fallback['sentiment'] == 0
assert fallback['hvac_relevance'] == 0
assert fallback['keywords'] == []
def test_api_error_handling(self, analyzer_with_mock_client):
"""Test API error handling"""
# Mock API error
analyzer_with_mock_client.client.messages.create.side_effect = Exception("API Error")
item = {'id': 'test', 'title': 'Test', 'content': 'Test content', 'source': 'test'}
result = analyzer_with_mock_client.analyze_content(item)
# Should return fallback analysis
assert result['topics'] == []
assert result['difficulty'] == 'unknown'
    def test_rate_limiting_backoff(self, analyzer_with_mock_client):
        """Test rate limiting and backoff behavior"""
        # Mock rate limiting error followed by success: side_effect raises on
        # the first call, returns success_response on the second.
        rate_limit_error = Exception("Rate limit exceeded")
        success_response = Mock()
        success_response.content = [Mock()]
        success_response.content[0].text = '[{"topics": [], "products": [], "difficulty": "unknown", "content_type": "unknown", "sentiment": 0, "hvac_relevance": 0, "keywords": []}]'
        analyzer_with_mock_client.client.messages.create.side_effect = [rate_limit_error, success_response]
        # Patch the global time.sleep so the backoff delay doesn't slow the test.
        with patch('time.sleep') as mock_sleep:
            item = {'id': 'test', 'title': 'Test', 'content': 'Test content', 'source': 'test'}
            result = analyzer_with_mock_client.analyze_content(item)
            # Should have retried and succeeded, sleeping exactly once between attempts.
            assert analyzer_with_mock_client.client.messages.create.call_count == 2
            mock_sleep.assert_called_once()
def test_empty_content_handling(self, analyzer_with_mock_client):
"""Test handling of empty or minimal content"""
empty_items = [
{'id': 'empty1', 'title': '', 'content': '', 'source': 'test'},
{'id': 'empty2', 'title': 'Title Only', 'source': 'test'} # Missing content
]
results = analyzer_with_mock_client.analyze_content_batch(empty_items)
# Should still process and return results
assert len(results) == 2
def test_content_length_limits(self, analyzer_with_mock_client):
"""Test handling of very long content"""
long_content = {
'id': 'long1',
'title': 'Long Content Test',
'content': 'A' * 10000, # Very long content
'source': 'test'
}
# Should not crash with long content
result = analyzer_with_mock_client.analyze_content(long_content)
assert 'topics' in result
    def test_special_characters_handling(self, analyzer_with_mock_client):
        """Test handling of special characters and encoding"""
        # Mixed quotes, symbols, accented characters, and an emoji — these
        # must survive prompt construction without encoding errors.
        special_content = {
            'id': 'special1',
            'title': 'Special Characters: "Quotes" & Symbols ®™',
            'content': 'Content with émojis 🔧 and speciál çharaçters',
            'source': 'test'
        }
        # Should handle special characters without errors
        result = analyzer_with_mock_client.analyze_content(special_content)
        assert 'topics' in result
def test_taxonomy_validation(self, analyzer_with_mock_client):
"""Test HVAC taxonomy validation in prompts"""
item = {'id': 'test', 'title': 'Test', 'content': 'Test', 'source': 'test'}
prompt = analyzer_with_mock_client._create_analysis_prompt([item])
# Should include HVAC topic categories
hvac_topics = ['hvac_systems', 'heat_pumps', 'air_conditioning', 'refrigeration',
'maintenance', 'installation', 'troubleshooting', 'controls']
for topic in hvac_topics:
assert topic in prompt
# Should include product categories
hvac_products = ['heat_pump', 'air_conditioner', 'furnace', 'boiler', 'thermostat',
'compressor', 'evaporator', 'condenser']
for product in hvac_products:
assert product in prompt
def test_model_configuration_validation(self, analyzer_with_mock_client):
"""Test model configuration parameters"""
assert analyzer_with_mock_client.model_name == "claude-3-haiku-20240307"
assert analyzer_with_mock_client.max_tokens == 4000
assert analyzer_with_mock_client.temperature == 0.1
assert analyzer_with_mock_client.batch_size == 10
    @patch('src.content_analysis.claude_analyzer.logging')
    def test_logging_functionality(self, mock_logging, analyzer_with_mock_client):
        """Test logging of analysis operations"""
        # NOTE(review): this patches the module's `logging` reference after
        # import; if claude_analyzer resolves its logger at import time, the
        # mock only observes getLogger calls made during analyze_content —
        # confirm against the analyzer's logger setup.
        item = {'id': 'test', 'title': 'Test', 'content': 'Test', 'source': 'test'}
        analyzer_with_mock_client.analyze_content(item)
        # Should have logged the operation
        assert mock_logging.getLogger.called
    def test_response_format_validation(self, analyzer_with_mock_client):
        """Test validation of response format from Claude"""
        # Test with correctly formatted response: all fields present.
        good_response = '''[{
"topics": ["hvac_systems"],
"products": ["heat_pump"],
"difficulty": "intermediate",
"content_type": "tutorial",
"sentiment": 0.7,
"hvac_relevance": 0.9,
"keywords": ["heap pump"]
}]'''
        result = analyzer_with_mock_client._parse_claude_response(good_response, 1)
        assert len(result) == 1
        assert result[0]['topics'] == ["hvac_systems"]
        # Test with missing required fields: valid JSON but incomplete object.
        incomplete_response = '''[{
"topics": ["hvac_systems"]
}]'''
        result = analyzer_with_mock_client._parse_claude_response(incomplete_response, 1)
        # Should fall back to default structure
        assert len(result) == 1
if __name__ == "__main__":
    # Allow running this file directly with verbose output and coverage reporting.
    pytest.main([__file__, "-v", "--cov=src.content_analysis.claude_analyzer", "--cov-report=term-missing"])