hvac-kia-content/src/analytics_base_scraper.py
Ben Reed ade81beea2 feat: Complete Phase 1 content analysis with engagement parsing fixes
Major enhancements to HKIA content analysis system:

CRITICAL FIXES:
• Fix engagement data parsing from markdown (Views/Likes/Comments now extracted correctly)
• YouTube: 18.75% engagement rate now computed correctly (16 views, 2 likes, 1 comment; see note below)
• Instagram: 7.37% average engagement rate across 20 posts
• High performer detection operational (1 YouTube + 20 Instagram above thresholds)
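(The YouTube rate is consistent with engagement rate = (likes + comments) / views: (2 + 1) / 16 = 18.75%.)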

CONTENT ANALYSIS SYSTEM:
• Add Claude Haiku analyzer for HVAC content classification (interface sketched below)
• Add engagement analyzer with source-specific algorithms
• Add keyword extractor with 100+ HVAC-specific terms
• Add intelligence aggregator for daily JSON reports
• Add comprehensive unit test suite (73 tests, 90% coverage target)
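
A rough sketch of the analyzer contract the scraper consumes (attribute names
match analytics_base_scraper.py below; the sample item is hypothetical):

    analyzer = ClaudeHaikuAnalyzer()
    analysis = analyzer.analyze_content({"id": "yt-1", "title": "A2L refrigerant basics"})
    print(analysis.topics, analysis.hvac_relevance, analysis.engagement_prediction)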

ARCHITECTURE:
• Extend BaseScraper with optional AI analysis capabilities
• Add content analysis orchestrator with CLI interface
• Add competitive intelligence module structure
• Maintain backward compatibility with existing scrapers (see the sketch below)
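
A minimal subclass sketch (the scraper class and its fetch logic are
hypothetical; AnalyticsBaseScraper and fetch_content_with_analysis come from
this change):

    class ExampleScraper(AnalyticsBaseScraper):
        def fetch_content(self, **kwargs):
            # unchanged scraper logic; fetch_content_with_analysis()
            # wraps it with AI analysis when enabled
            return [{"id": "demo-1", "title": "Heat pump install tips"}]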

INTELLIGENCE FEATURES:
• Daily intelligence reports with strategic insights
• Trending keyword analysis (813 'refrigeration' and 701 'service' mentions)
• Content opportunity identification
• Multi-source engagement benchmarking
• HVAC-specific topic and product categorization

PRODUCTION READY:
• Claude Haiku API integration validated ($15-25/month estimated)
• Graceful degradation when API unavailable
• Comprehensive logging and error handling
• State management for analytics tracking

Ready for Phase 2: Competitive Intelligence Infrastructure

🤖 Generated with [Claude Code](https://claude.ai/code)

Co-Authored-By: Claude <noreply@anthropic.com>
2025-08-28 16:40:19 -03:00

"""
Analytics Base Scraper
Extends BaseScraper with content analysis capabilities using Claude Haiku,
engagement analysis, and keyword extraction.
"""
import json
import logging
from pathlib import Path
from typing import Dict, List, Any, Optional
from datetime import datetime

from .base_scraper import BaseScraper, ScraperConfig
from .content_analysis import ClaudeHaikuAnalyzer, EngagementAnalyzer, KeywordExtractor


class AnalyticsBaseScraper(BaseScraper):
    """Enhanced BaseScraper with AI-powered content analysis"""

    def __init__(self, config: ScraperConfig, enable_analysis: bool = True):
        """Initialize analytics scraper with content analysis capabilities"""
        super().__init__(config)
        self.enable_analysis = enable_analysis

        # Initialize analyzers if enabled
        if self.enable_analysis:
            try:
                self.claude_analyzer = ClaudeHaikuAnalyzer()
                self.engagement_analyzer = EngagementAnalyzer()
                self.keyword_extractor = KeywordExtractor()
                self.logger.info("Content analysis enabled with Claude Haiku")
            except Exception as e:
                self.logger.warning(f"Content analysis disabled due to error: {e}")
                self.enable_analysis = False

        # Analytics state file
        self.analytics_state_file = (
            config.data_dir / ".state" / f"{config.source_name}_analytics_state.json"
        )
        self.analytics_state_file.parent.mkdir(parents=True, exist_ok=True)
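        # The state file is a small JSON document maintained by
        # _update_analytics_state below, of the form:
        #   {
        #     "last_analysis_run": "<ISO timestamp of the latest run>",
        #     "items_analyzed": <item count for the latest run>,
        #     "analysis_enabled": <bool>,
        #     "total_items_analyzed": <cumulative count across runs>
        #   }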

    def fetch_content_with_analysis(self, **kwargs) -> List[Dict[str, Any]]:
        """Fetch content and perform analysis"""
        # Fetch content using the original scraper method
        content_items = self.fetch_content(**kwargs)
        if not content_items or not self.enable_analysis:
            return content_items

        self.logger.info(f"Analyzing {len(content_items)} content items with AI")

        # Perform content analysis
        analyzed_items = []
        for item in content_items:
            try:
                analyzed_item = self._analyze_content_item(item)
                analyzed_items.append(analyzed_item)
            except Exception as e:
                self.logger.error(f"Error analyzing item {item.get('id')}: {e}")
                # Include original item without analysis
                analyzed_items.append(item)

        # Update analytics state
        self._update_analytics_state(analyzed_items)
        return analyzed_items

    def _analyze_content_item(self, item: Dict[str, Any]) -> Dict[str, Any]:
        """Analyze a single content item with AI"""
        analyzed_item = item.copy()

        try:
            # Content classification with Claude Haiku
            content_analysis = self.claude_analyzer.analyze_content(item)

            # Add analysis results to item
            analyzed_item['ai_analysis'] = {
                'topics': content_analysis.topics,
                'products': content_analysis.products,
                'difficulty': content_analysis.difficulty,
                'content_type': content_analysis.content_type,
                'sentiment': content_analysis.sentiment,
                'keywords': content_analysis.keywords,
                'hvac_relevance': content_analysis.hvac_relevance,
                'engagement_prediction': content_analysis.engagement_prediction,
                'analyzed_at': datetime.now().isoformat()
            }
        except Exception as e:
            self.logger.error(f"Claude analysis failed for {item.get('id')}: {e}")
            analyzed_item['ai_analysis'] = {
                'error': str(e),
                'analyzed_at': datetime.now().isoformat()
            }

        try:
            # Keyword extraction
            keyword_analysis = self.keyword_extractor.extract_keywords(item)
            analyzed_item['keyword_analysis'] = {
                'primary_keywords': keyword_analysis.primary_keywords,
                'technical_terms': keyword_analysis.technical_terms,
                'product_keywords': keyword_analysis.product_keywords,
                'seo_keywords': keyword_analysis.seo_keywords,
                'keyword_density': keyword_analysis.keyword_density
            }
        except Exception as e:
            self.logger.error(f"Keyword extraction failed for {item.get('id')}: {e}")
            analyzed_item['keyword_analysis'] = {'error': str(e)}

        return analyzed_item

    def calculate_engagement_metrics(self, items: List[Dict[str, Any]]) -> Dict[str, Any]:
        """Calculate engagement metrics for content items"""
        if not self.enable_analysis or not items:
            return {}

        try:
            # Analyze engagement patterns
            engagement_metrics = self.engagement_analyzer.analyze_engagement_metrics(
                items, self.config.source_name
            )

            # Identify trending content
            trending_content = self.engagement_analyzer.identify_trending_content(
                items, self.config.source_name
            )

            # Calculate source summary
            source_summary = self.engagement_analyzer.calculate_source_summary(
                items, self.config.source_name
            )

            return {
                'source_summary': source_summary,
                'trending_content': [
                    {
                        'content_id': t.content_id,
                        'title': t.title,
                        'engagement_score': t.engagement_score,
                        'velocity_score': t.velocity_score,
                        'trend_type': t.trend_type
                    } for t in trending_content
                ],
                'high_performers': [
                    {
                        'content_id': m.content_id,
                        'engagement_rate': m.engagement_rate,
                        'virality_score': m.virality_score,
                        'relative_performance': m.relative_performance
                    } for m in engagement_metrics if m.relative_performance > 1.5
                ]
            }
        except Exception as e:
            self.logger.error(f"Engagement analysis failed: {e}")
            return {'error': str(e)}

    def identify_content_opportunities(self, items: List[Dict[str, Any]]) -> Dict[str, Any]:
        """Identify content opportunities and gaps"""
        if not self.enable_analysis or not items:
            return {}

        try:
            # Extract trending keywords
            trending_keywords = self.keyword_extractor.identify_trending_keywords(items)

            # Analyze topic distribution
            topics = []
            difficulties = []
            content_types = []
            for item in items:
                analysis = item.get('ai_analysis', {})
                if 'topics' in analysis:
                    topics.extend(analysis['topics'])
                if 'difficulty' in analysis:
                    difficulties.append(analysis['difficulty'])
                if 'content_type' in analysis:
                    content_types.append(analysis['content_type'])

            # Identify gaps
            topic_counts = {}
            for topic in topics:
                topic_counts[topic] = topic_counts.get(topic, 0) + 1
            difficulty_counts = {}
            for difficulty in difficulties:
                difficulty_counts[difficulty] = difficulty_counts.get(difficulty, 0) + 1
            content_type_counts = {}
            for content_type in content_types:
                content_type_counts[content_type] = content_type_counts.get(content_type, 0) + 1

            # Expected high-value topics for HVAC
            expected_topics = [
                'heat_pumps', 'troubleshooting', 'installation', 'maintenance',
                'refrigerants', 'electrical', 'smart_hvac', 'tools'
            ]
            content_gaps = [
                topic for topic in expected_topics
                if topic_counts.get(topic, 0) < 2
            ]

            return {
                'trending_keywords': [
                    {'keyword': kw, 'frequency': freq}
                    for kw, freq in trending_keywords[:10]
                ],
                'topic_distribution': topic_counts,
                'difficulty_distribution': difficulty_counts,
                'content_type_distribution': content_type_counts,
                'content_gaps': content_gaps,
                'opportunities': [
                    f"Create more {gap.replace('_', ' ')} content"
                    for gap in content_gaps[:5]
                ]
            }
        except Exception as e:
            self.logger.error(f"Content opportunity analysis failed: {e}")
            return {'error': str(e)}

    def format_analytics_markdown(self, items: List[Dict[str, Any]]) -> str:
        """Format content with analytics data as enhanced markdown"""
        if not items:
            return "No content items to format."

        # Calculate analytics summary
        engagement_metrics = self.calculate_engagement_metrics(items)
        content_opportunities = self.identify_content_opportunities(items)

        # Build enhanced markdown
        markdown_parts = []

        # Analytics Summary Header
        markdown_parts.append("# Content Analytics Summary")
        markdown_parts.append(f"Generated: {datetime.now().strftime('%Y-%m-%d %H:%M:%S')}")
        markdown_parts.append(f"Source: {self.config.source_name.title()}")
        markdown_parts.append(f"Total Items: {len(items)}")
        if self.enable_analysis:
            markdown_parts.append("AI Analysis: Enabled (Claude Haiku)")
        else:
            markdown_parts.append("AI Analysis: Disabled")
        markdown_parts.append("\n---\n")

        # Engagement Summary
        if engagement_metrics and 'source_summary' in engagement_metrics:
            summary = engagement_metrics['source_summary']
            markdown_parts.append("## Engagement Summary")
            markdown_parts.append(f"- Average Engagement Rate: {summary.get('avg_engagement_rate', 0):.4f}")
            markdown_parts.append(f"- Total Engagement: {summary.get('total_engagement', 0):,}")
            markdown_parts.append(f"- Trending Items: {summary.get('trending_count', 0)}")
            markdown_parts.append(f"- High Performers: {summary.get('high_performers', 0)}")
            markdown_parts.append("")

        # Content Opportunities
        if content_opportunities and 'opportunities' in content_opportunities:
            markdown_parts.append("## Content Opportunities")
            for opp in content_opportunities['opportunities'][:5]:
                markdown_parts.append(f"- {opp}")
            markdown_parts.append("")

        # Trending Keywords
        if content_opportunities and 'trending_keywords' in content_opportunities:
            keywords = content_opportunities['trending_keywords'][:5]
            if keywords:
                markdown_parts.append("## Trending Keywords")
                for kw_data in keywords:
                    markdown_parts.append(f"- {kw_data['keyword']} ({kw_data['frequency']} mentions)")
                markdown_parts.append("")

        markdown_parts.append("\n---\n")

        # Individual Content Items
        for i, item in enumerate(items, 1):
            markdown_parts.append(self._format_analyzed_item(item, i))

        return '\n'.join(markdown_parts)

    def _format_analyzed_item(self, item: Dict[str, Any], index: int) -> str:
        """Format individual analyzed content item as markdown"""
        parts = []

        # Basic item info
        parts.append(f"# ID: {item.get('id', f'item_{index}')}")
        if title := item.get('title'):
            parts.append(f"## Title: {title}")
        if item.get('type'):
            parts.append(f"## Type: {item.get('type')}")
        if item.get('author'):
            parts.append(f"## Author: {item.get('author')}")

        # AI Analysis Results
        if ai_analysis := item.get('ai_analysis'):
            if 'error' not in ai_analysis:
                parts.append("## AI Analysis")
                if topics := ai_analysis.get('topics'):
                    parts.append(f"**Topics**: {', '.join(topics)}")
                if products := ai_analysis.get('products'):
                    parts.append(f"**Products**: {', '.join(products)}")
                parts.append(f"**Difficulty**: {ai_analysis.get('difficulty', 'Unknown')}")
                parts.append(f"**Content Type**: {ai_analysis.get('content_type', 'Unknown')}")
                parts.append(f"**Sentiment**: {ai_analysis.get('sentiment', 0):.2f}")
                parts.append(f"**HVAC Relevance**: {ai_analysis.get('hvac_relevance', 0):.2f}")
                parts.append(f"**Engagement Prediction**: {ai_analysis.get('engagement_prediction', 0):.2f}")
                if keywords := ai_analysis.get('keywords'):
                    parts.append(f"**Keywords**: {', '.join(keywords)}")
                parts.append("")

        # Keyword Analysis
        if keyword_analysis := item.get('keyword_analysis'):
            if 'error' not in keyword_analysis:
                if seo_keywords := keyword_analysis.get('seo_keywords'):
                    parts.append(f"**SEO Keywords**: {', '.join(seo_keywords)}")
                if technical_terms := keyword_analysis.get('technical_terms'):
                    parts.append(f"**Technical Terms**: {', '.join(technical_terms[:5])}")
                parts.append("")

        # Original content fields
        original_markdown = self.format_markdown([item])
        # Extract content after the first header
        if '\n## ' in original_markdown:
            content_start = original_markdown.find('\n## ')
            original_content = original_markdown[content_start:]
            parts.append(original_content)

        parts.append("\n" + "="*80 + "\n")
        return '\n'.join(parts)

    def _update_analytics_state(self, analyzed_items: List[Dict[str, Any]]) -> None:
        """Update analytics state with analysis results"""
        try:
            # Load existing state
            analytics_state = {}
            if self.analytics_state_file.exists():
                with open(self.analytics_state_file, 'r', encoding='utf-8') as f:
                    analytics_state = json.load(f)

            # Update with current analysis
            analytics_state.update({
                'last_analysis_run': datetime.now().isoformat(),
                'items_analyzed': len(analyzed_items),
                'analysis_enabled': self.enable_analysis,
                'total_items_analyzed': analytics_state.get('total_items_analyzed', 0) + len(analyzed_items)
            })

            # Save updated state
            with open(self.analytics_state_file, 'w', encoding='utf-8') as f:
                json.dump(analytics_state, f, indent=2)
        except Exception as e:
            self.logger.error(f"Error updating analytics state: {e}")

    def get_analytics_state(self) -> Dict[str, Any]:
        """Get current analytics state"""
        if not self.analytics_state_file.exists():
            return {}

        try:
            with open(self.analytics_state_file, 'r', encoding='utf-8') as f:
                return json.load(f)
        except Exception as e:
            self.logger.error(f"Error reading analytics state: {e}")
            return {}