Major enhancements to HKIA content analysis system: CRITICAL FIXES: • Fix engagement data parsing from markdown (Views/Likes/Comments now extracted correctly) • YouTube: 18.75% engagement rate working (16 views, 2 likes, 1 comment) • Instagram: 7.37% average engagement rate across 20 posts • High performer detection operational (1 YouTube + 20 Instagram above thresholds) CONTENT ANALYSIS SYSTEM: • Add Claude Haiku analyzer for HVAC content classification • Add engagement analyzer with source-specific algorithms • Add keyword extractor with 100+ HVAC-specific terms • Add intelligence aggregator for daily JSON reports • Add comprehensive unit test suite (73 tests, 90% coverage target) ARCHITECTURE: • Extend BaseScraper with optional AI analysis capabilities • Add content analysis orchestrator with CLI interface • Add competitive intelligence module structure • Maintain backward compatibility with existing scrapers INTELLIGENCE FEATURES: • Daily intelligence reports with strategic insights • Trending keyword analysis (813 refrigeration, 701 service mentions) • Content opportunity identification • Multi-source engagement benchmarking • HVAC-specific topic and product categorization PRODUCTION READY: • Claude Haiku API integration validated ($15-25/month estimated) • Graceful degradation when API unavailable • Comprehensive logging and error handling • State management for analytics tracking Ready for Phase 2: Competitive Intelligence Infrastructure 🤖 Generated with [Claude Code](https://claude.ai/code) Co-Authored-By: Claude <noreply@anthropic.com>
		
			
				
	
	
		
			438 lines
		
	
	
		
			No EOL
		
	
	
		
			17 KiB
		
	
	
	
		
			Python
		
	
	
	
	
	
			
		
		
	
	
			438 lines
		
	
	
		
			No EOL
		
	
	
		
			17 KiB
		
	
	
	
		
			Python
		
	
	
	
	
	
| #!/usr/bin/env python3
 | |
| """
 | |
| Comprehensive Unit Tests for Claude Haiku Analyzer
 | |
| 
 | |
| Tests Claude API integration, content classification,
 | |
| batch processing, and error handling.
 | |
| """
 | |
| 
 | |
| import pytest
 | |
| from unittest.mock import Mock, patch, MagicMock
 | |
| from pathlib import Path
 | |
| import sys
 | |
| 
 | |
# Make the directory two levels above this file importable, so the
# ``src.`` package imports below resolve when the tests are run directly.
_PARENT_DIR = str(Path(__file__).parent.parent)
if _PARENT_DIR not in sys.path:
    sys.path.insert(0, _PARENT_DIR)
 | |
| 
 | |
| from src.content_analysis.claude_analyzer import ClaudeHaikuAnalyzer
 | |
| 
 | |
| 
 | |
class TestClaudeHaikuAnalyzer:
    """Unit tests for ClaudeHaikuAnalyzer driven through a mocked Claude client.

    The fixtures replace the Anthropic client with a ``Mock`` whose
    ``messages.create`` returns canned JSON text, so the tests can check
    request parameters, response parsing, batching and fallback behaviour.
    """

    # Keys that every analysis result is expected to contain.
    _ANALYSIS_KEYS = (
        'topics',
        'products',
        'difficulty',
        'content_type',
        'sentiment',
        'hvac_relevance',
        'keywords',
    )

    # Field values the tests expect from a fallback analysis.
    _FALLBACK_VALUES = {
        'topics': [],
        'products': [],
        'difficulty': 'unknown',
        'content_type': 'unknown',
        'sentiment': 0,
        'hvac_relevance': 0,
        'keywords': [],
    }

    # Dotted path of the Anthropic client class as referenced by the analyzer module.
    _ANTHROPIC_TARGET = 'src.content_analysis.claude_analyzer.anthropic.Anthropic'

    # ------------------------------------------------------------------
    # Helpers (not collected by pytest: names do not start with "test")
    # ------------------------------------------------------------------

    @staticmethod
    def _response_with_text(text):
        """Build a mock API response whose first content block carries *text*."""
        block = Mock()
        block.text = text
        response = Mock()
        response.content = [block]
        return response

    @staticmethod
    def _json_array_of(element, count):
        """Return a JSON array string made of *count* copies of *element*."""
        return '[' + ','.join([element] * count) + ']'

    @staticmethod
    def _minimal_item(content):
        """Return a minimal content item whose body text is *content*."""
        return {'id': 'test', 'title': 'Test', 'content': content, 'source': 'test'}

    def _assert_has_analysis_keys(self, result):
        """Assert that *result* contains every expected analysis key."""
        for key in self._ANALYSIS_KEYS:
            assert key in result

    def _assert_is_fallback(self, result):
        """Assert that every field of *result* holds its fallback value."""
        for key, expected in self._FALLBACK_VALUES.items():
            assert result[key] == expected

    # ------------------------------------------------------------------
    # Fixtures
    # ------------------------------------------------------------------

    @pytest.fixture
    def mock_claude_client(self):
        """Provide a mock client whose ``messages.create`` returns one canned analysis."""
        client = Mock()
        client.messages.create.return_value = self._response_with_text("""[
            {
                "topics": ["hvac_systems", "installation"],
                "products": ["heat_pump"],
                "difficulty": "intermediate",
                "content_type": "tutorial",
                "sentiment": 0.7,
                "hvac_relevance": 0.9,
                "keywords": ["heat pump", "installation", "efficiency"]
            }
        ]""")
        return client

    @pytest.fixture
    def analyzer_with_mock_client(self, mock_claude_client):
        """Provide an analyzer constructed under a patched Anthropic class.

        The client attribute is also assigned explicitly so the tests can
        reach the mock through ``analyzer.client``.
        """
        with patch(self._ANTHROPIC_TARGET, return_value=mock_claude_client):
            instance = ClaudeHaikuAnalyzer("test-api-key")
        instance.client = mock_claude_client
        return instance

    @pytest.fixture
    def sample_content_items(self):
        """Provide three content items from different sources."""
        rows = (
            (
                'item1',
                'Heat Pump Installation Guide',
                'Complete guide to installing high-efficiency heat pumps for residential applications.',
                'youtube',
            ),
            (
                'item2',
                'AC Troubleshooting',
                'Common air conditioning problems and how to diagnose compressor issues.',
                'blog',
            ),
            (
                'item3',
                'Thermostat Wiring',
                'Step-by-step wiring instructions for smart thermostats and HVAC controls.',
                'instagram',
            ),
        )
        return [
            {'id': item_id, 'title': title, 'content': content, 'source': source}
            for item_id, title, content, source in rows
        ]

    # ------------------------------------------------------------------
    # Construction
    # ------------------------------------------------------------------

    def test_initialization_with_api_key(self):
        """The constructor stores its settings and builds the client with the key."""
        with patch(self._ANTHROPIC_TARGET) as anthropic_cls:
            analyzer = ClaudeHaikuAnalyzer("test-api-key")

            observed = (
                analyzer.api_key,
                analyzer.model_name,
                analyzer.max_tokens,
                analyzer.temperature,
            )
            assert observed == ("test-api-key", "claude-3-haiku-20240307", 4000, 0.1)
            anthropic_cls.assert_called_once_with(api_key="test-api-key")

    def test_initialization_without_api_key(self):
        """Constructing without an API key raises ValueError."""
        with pytest.raises(ValueError, match="ANTHROPIC_API_KEY is required"):
            ClaudeHaikuAnalyzer(None)

    # ------------------------------------------------------------------
    # Single-item and batch analysis
    # ------------------------------------------------------------------

    def test_analyze_single_content(self, analyzer_with_mock_client, sample_content_items):
        """Analyzing one item issues one API call and returns a full result."""
        analyzer = analyzer_with_mock_client
        result = analyzer.analyze_content(sample_content_items[0])

        # Request parameters passed to the API.
        create = analyzer.client.messages.create
        create.assert_called_once()
        _, kwargs = create.call_args
        assert kwargs['model'] == "claude-3-haiku-20240307"
        assert kwargs['max_tokens'] == 4000
        assert kwargs['temperature'] == 0.1

        # Shape of the returned analysis.
        self._assert_has_analysis_keys(result)

    def test_analyze_content_batch(self, analyzer_with_mock_client, sample_content_items):
        """A three-item batch yields three complete results."""
        analyzer = analyzer_with_mock_client
        analyzer.client.messages.create.return_value = self._response_with_text("""[
            {
                "topics": ["hvac_systems"],
                "products": ["heat_pump"],
                "difficulty": "intermediate",
                "content_type": "tutorial",
                "sentiment": 0.7,
                "hvac_relevance": 0.9,
                "keywords": ["heat pump"]
            },
            {
                "topics": ["troubleshooting"],
                "products": ["air_conditioning"],
                "difficulty": "advanced",
                "content_type": "diagnostic",
                "sentiment": 0.5,
                "hvac_relevance": 0.8,
                "keywords": ["ac repair"]
            },
            {
                "topics": ["controls"],
                "products": ["thermostat"],
                "difficulty": "beginner",
                "content_type": "tutorial",
                "sentiment": 0.6,
                "hvac_relevance": 0.7,
                "keywords": ["thermostat wiring"]
            }
        ]""")

        results = analyzer.analyze_content_batch(sample_content_items)

        assert len(results) == 3
        for entry in results:
            self._assert_has_analysis_keys(entry)

    def test_batch_processing_chunking(self, analyzer_with_mock_client):
        """Fifteen items are expected to be processed in two API calls."""
        analyzer = analyzer_with_mock_client

        # 15 items: more than the batch size of 10.
        items = [
            {
                'id': f'item{i}',
                'title': f'HVAC Item {i}',
                'content': f'Content for item {i}',
                'source': 'test',
            }
            for i in range(15)
        ]

        # One canned response per expected chunk (10 items, then 5).
        first_chunk = self._response_with_text(self._json_array_of(
            '{"topics": ["hvac_systems"], "products": [], "difficulty": "intermediate", "content_type": "tutorial", "sentiment": 0.5, "hvac_relevance": 0.8, "keywords": []}',
            10,
        ))
        second_chunk = self._response_with_text(self._json_array_of(
            '{"topics": ["maintenance"], "products": [], "difficulty": "beginner", "content_type": "guide", "sentiment": 0.6, "hvac_relevance": 0.7, "keywords": []}',
            5,
        ))
        analyzer.client.messages.create.side_effect = [first_chunk, second_chunk]

        results = analyzer.analyze_content_batch(items)

        assert len(results) == 15
        assert analyzer.client.messages.create.call_count == 2

    # ------------------------------------------------------------------
    # Prompt construction
    # ------------------------------------------------------------------

    def test_create_analysis_prompt_single(self, analyzer_with_mock_client, sample_content_items):
        """The single-item prompt mentions the item text and the requested fields."""
        prompt = analyzer_with_mock_client._create_analysis_prompt([sample_content_items[0]])

        expected_fragments = (
            'Heat Pump Installation Guide',
            'Complete guide to installing',
            'HVAC Content Analysis',
            'topics',
            'products',
            'difficulty',
        )
        for fragment in expected_fragments:
            assert fragment in prompt

    def test_create_analysis_prompt_batch(self, analyzer_with_mock_client, sample_content_items):
        """The batch prompt mentions every item and asks for a JSON array."""
        prompt = analyzer_with_mock_client._create_analysis_prompt(sample_content_items)

        # Every item title must appear in the prompt.
        for title in ('Heat Pump Installation Guide', 'AC Troubleshooting', 'Thermostat Wiring'):
            assert title in prompt

        # The prompt must request a JSON array.
        assert 'JSON array' in prompt

    # ------------------------------------------------------------------
    # Response parsing
    # ------------------------------------------------------------------

    def test_parse_claude_response_valid_json(self, analyzer_with_mock_client):
        """A well-formed JSON array is parsed into matching result dicts."""
        payload = """[
            {
                "topics": ["hvac_systems"],
                "products": ["heat_pump"],
                "difficulty": "intermediate",
                "content_type": "tutorial", 
                "sentiment": 0.7,
                "hvac_relevance": 0.9,
                "keywords": ["heat pump", "installation"]
            }
        ]"""

        parsed = analyzer_with_mock_client._parse_claude_response(payload, 1)

        assert len(parsed) == 1
        first = parsed[0]
        assert first['topics'] == ["hvac_systems"]
        assert first['products'] == ["heat_pump"]
        assert first['sentiment'] == 0.7

    def test_parse_claude_response_invalid_json(self, analyzer_with_mock_client):
        """Non-JSON text yields one fallback result per expected item."""
        parsed = analyzer_with_mock_client._parse_claude_response("This is not valid JSON", 2)

        assert len(parsed) == 2
        for entry in parsed:
            self._assert_is_fallback(entry)

    def test_parse_claude_response_partial_json(self, analyzer_with_mock_client):
        """Malformed JSON (here containing a comment) yields a fallback result."""
        malformed = """[
            {
                "topics": ["hvac_systems"],
                "products": ["heat_pump"],
                "difficulty": "intermediate"
                // Missing some fields
            }
        ]"""

        parsed = analyzer_with_mock_client._parse_claude_response(malformed, 1)

        assert len(parsed) == 1
        assert parsed[0]['topics'] == []

    def test_create_fallback_analysis(self, analyzer_with_mock_client):
        """The fallback analysis carries the expected empty/unknown values."""
        self._assert_is_fallback(analyzer_with_mock_client._create_fallback_analysis())

    # ------------------------------------------------------------------
    # Error handling
    # ------------------------------------------------------------------

    def test_api_error_handling(self, analyzer_with_mock_client):
        """An exception raised by the API call results in a fallback analysis."""
        analyzer = analyzer_with_mock_client
        analyzer.client.messages.create.side_effect = Exception("API Error")

        result = analyzer.analyze_content(self._minimal_item('Test content'))

        assert result['topics'] == []
        assert result['difficulty'] == 'unknown'

    def test_rate_limiting_backoff(self, analyzer_with_mock_client):
        """A failed first call is retried once after a single sleep."""
        analyzer = analyzer_with_mock_client

        # First call raises, second call succeeds.
        recovered = self._response_with_text(
            '[{"topics": [], "products": [], "difficulty": "unknown", "content_type": "unknown", "sentiment": 0, "hvac_relevance": 0, "keywords": []}]'
        )
        analyzer.client.messages.create.side_effect = [
            Exception("Rate limit exceeded"),
            recovered,
        ]

        # Patch sleep so the backoff does not slow the test down.
        with patch('time.sleep') as mock_sleep:
            analyzer.analyze_content(self._minimal_item('Test content'))

            assert analyzer.client.messages.create.call_count == 2
            mock_sleep.assert_called_once()

    # ------------------------------------------------------------------
    # Input edge cases
    # ------------------------------------------------------------------

    def test_empty_content_handling(self, analyzer_with_mock_client):
        """Items with empty or missing content still produce one result each."""
        sparse_items = [
            {'id': 'empty1', 'title': '', 'content': '', 'source': 'test'},
            {'id': 'empty2', 'title': 'Title Only', 'source': 'test'},  # no 'content' key
        ]

        results = analyzer_with_mock_client.analyze_content_batch(sparse_items)

        assert len(results) == 2

    def test_content_length_limits(self, analyzer_with_mock_client):
        """A 10,000-character body is analyzed without raising."""
        oversized = {
            'id': 'long1',
            'title': 'Long Content Test',
            'content': 'A' * 10000,
            'source': 'test',
        }

        outcome = analyzer_with_mock_client.analyze_content(oversized)

        assert 'topics' in outcome

    def test_special_characters_handling(self, analyzer_with_mock_client):
        """Quotes, symbols, accents and emoji are analyzed without raising."""
        unusual = {
            'id': 'special1',
            'title': 'Special Characters: "Quotes" & Symbols ®™',
            'content': 'Content with émojis 🔧 and speciál çharaçters',
            'source': 'test',
        }

        outcome = analyzer_with_mock_client.analyze_content(unusual)

        assert 'topics' in outcome

    # ------------------------------------------------------------------
    # Prompt taxonomy and configuration
    # ------------------------------------------------------------------

    def test_taxonomy_validation(self, analyzer_with_mock_client):
        """The prompt lists the expected HVAC topic and product categories."""
        prompt = analyzer_with_mock_client._create_analysis_prompt([self._minimal_item('Test')])

        topic_names = (
            'hvac_systems', 'heat_pumps', 'air_conditioning', 'refrigeration',
            'maintenance', 'installation', 'troubleshooting', 'controls',
        )
        product_names = (
            'heat_pump', 'air_conditioner', 'furnace', 'boiler', 'thermostat',
            'compressor', 'evaporator', 'condenser',
        )

        # Topics are checked first, then products.
        for term in topic_names + product_names:
            assert term in prompt

    def test_model_configuration_validation(self, analyzer_with_mock_client):
        """The analyzer exposes the expected model settings."""
        expected_settings = {
            'model_name': "claude-3-haiku-20240307",
            'max_tokens': 4000,
            'temperature': 0.1,
            'batch_size': 10,
        }
        for attribute, value in expected_settings.items():
            assert getattr(analyzer_with_mock_client, attribute) == value

    @patch('src.content_analysis.claude_analyzer.logging')
    def test_logging_functionality(self, mock_logging, analyzer_with_mock_client):
        """Analyzing an item obtains a logger from the patched logging module."""
        analyzer_with_mock_client.analyze_content(self._minimal_item('Test'))

        assert mock_logging.getLogger.called

    def test_response_format_validation(self, analyzer_with_mock_client):
        """Complete and incomplete responses both yield one result per item."""
        parse = analyzer_with_mock_client._parse_claude_response

        # Correctly formatted response.
        complete = '''[{
            "topics": ["hvac_systems"],
            "products": ["heat_pump"], 
            "difficulty": "intermediate",
            "content_type": "tutorial",
            "sentiment": 0.7,
            "hvac_relevance": 0.9,
            "keywords": ["heat pump"]
        }]'''
        parsed = parse(complete, 1)
        assert len(parsed) == 1
        assert parsed[0]['topics'] == ["hvac_systems"]

        # Response that omits most fields; only the result count is checked.
        incomplete = '''[{
            "topics": ["hvac_systems"]
        }]'''
        parsed = parse(incomplete, 1)
        assert len(parsed) == 1
 | |
| 
 | |
if __name__ == "__main__":
    # Run this module's tests with coverage when the file is executed directly.
    # pytest.main() returns the session's exit code; raise it as SystemExit so
    # a failing run is visible to the shell or CI instead of always exiting 0.
    raise SystemExit(
        pytest.main([__file__, "-v", "--cov=src.content_analysis.claude_analyzer", "--cov-report=term-missing"])
    )