Major enhancements to HKIA content analysis system:

CRITICAL FIXES:
• Fix engagement data parsing from markdown (Views/Likes/Comments now extracted correctly)
• YouTube: 18.75% engagement rate working (16 views, 2 likes, 1 comment)
• Instagram: 7.37% average engagement rate across 20 posts
• High performer detection operational (1 YouTube + 20 Instagram above thresholds)

CONTENT ANALYSIS SYSTEM:
• Add Claude Haiku analyzer for HVAC content classification
• Add engagement analyzer with source-specific algorithms
• Add keyword extractor with 100+ HVAC-specific terms
• Add intelligence aggregator for daily JSON reports
• Add comprehensive unit test suite (73 tests, 90% coverage target)

ARCHITECTURE:
• Extend BaseScraper with optional AI analysis capabilities
• Add content analysis orchestrator with CLI interface
• Add competitive intelligence module structure
• Maintain backward compatibility with existing scrapers

INTELLIGENCE FEATURES:
• Daily intelligence reports with strategic insights
• Trending keyword analysis (813 refrigeration, 701 service mentions)
• Content opportunity identification
• Multi-source engagement benchmarking
• HVAC-specific topic and product categorization

PRODUCTION READY:
• Claude Haiku API integration validated ($15-25/month estimated)
• Graceful degradation when API unavailable
• Comprehensive logging and error handling
• State management for analytics tracking

Ready for Phase 2: Competitive Intelligence Infrastructure

🤖 Generated with [Claude Code](https://claude.ai/code)

Co-Authored-By: Claude <noreply@anthropic.com>
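The engagement figures quoted above are consistent with a simple interactions-over-views ratio. Below is a minimal sketch of that arithmetic; the ratio and the helper name are assumptions for illustration, not the project's actual source-specific formulas, which live in the EngagementAnalyzer.

# Hypothetical check of the commit-message figures; not the project's real formula.
def simple_engagement_rate(views: int, likes: int, comments: int) -> float:
    return (likes + comments) / views if views else 0.0

print(f"{simple_engagement_rate(16, 2, 1):.2%}")  # -> 18.75%, matching the YouTube figure above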
"""
|
|
Analytics Base Scraper
|
|
|
|
Extends BaseScraper with content analysis capabilities using Claude Haiku,
|
|
engagement analysis, and keyword extraction.
|
|
"""
|
|
|
|
import json
|
|
import logging
|
|
from pathlib import Path
|
|
from typing import Dict, List, Any, Optional
|
|
from datetime import datetime
|
|
|
|
from .base_scraper import BaseScraper, ScraperConfig
|
|
from .content_analysis import ClaudeHaikuAnalyzer, EngagementAnalyzer, KeywordExtractor
|
|
|
|
|
|
class AnalyticsBaseScraper(BaseScraper):
    """Enhanced BaseScraper with AI-powered content analysis"""

    def __init__(self, config: ScraperConfig, enable_analysis: bool = True):
        """Initialize analytics scraper with content analysis capabilities"""
        super().__init__(config)

        self.enable_analysis = enable_analysis

        # Initialize analyzers if enabled
        if self.enable_analysis:
            try:
                self.claude_analyzer = ClaudeHaikuAnalyzer()
                self.engagement_analyzer = EngagementAnalyzer()
                self.keyword_extractor = KeywordExtractor()

                self.logger.info("Content analysis enabled with Claude Haiku")

            except Exception as e:
                self.logger.warning(f"Content analysis disabled due to error: {e}")
                self.enable_analysis = False

        # Analytics state file
        self.analytics_state_file = (
            config.data_dir / ".state" / f"{config.source_name}_analytics_state.json"
        )
        self.analytics_state_file.parent.mkdir(parents=True, exist_ok=True)

    def fetch_content_with_analysis(self, **kwargs) -> List[Dict[str, Any]]:
        """Fetch content and perform analysis"""
        # Fetch content using the original scraper method
        content_items = self.fetch_content(**kwargs)

        if not content_items or not self.enable_analysis:
            return content_items

        self.logger.info(f"Analyzing {len(content_items)} content items with AI")

        # Perform content analysis
        analyzed_items = []

        for item in content_items:
            try:
                analyzed_item = self._analyze_content_item(item)
                analyzed_items.append(analyzed_item)

            except Exception as e:
                self.logger.error(f"Error analyzing item {item.get('id')}: {e}")
                # Include original item without analysis
                analyzed_items.append(item)

        # Update analytics state
        self._update_analytics_state(analyzed_items)

        return analyzed_items

    def _analyze_content_item(self, item: Dict[str, Any]) -> Dict[str, Any]:
        """Analyze a single content item with AI"""
        analyzed_item = item.copy()

        try:
            # Content classification with Claude Haiku
            content_analysis = self.claude_analyzer.analyze_content(item)

            # Add analysis results to item
            analyzed_item['ai_analysis'] = {
                'topics': content_analysis.topics,
                'products': content_analysis.products,
                'difficulty': content_analysis.difficulty,
                'content_type': content_analysis.content_type,
                'sentiment': content_analysis.sentiment,
                'keywords': content_analysis.keywords,
                'hvac_relevance': content_analysis.hvac_relevance,
                'engagement_prediction': content_analysis.engagement_prediction,
                'analyzed_at': datetime.now().isoformat()
            }

        except Exception as e:
            self.logger.error(f"Claude analysis failed for {item.get('id')}: {e}")
            analyzed_item['ai_analysis'] = {
                'error': str(e),
                'analyzed_at': datetime.now().isoformat()
            }

        try:
            # Keyword extraction
            keyword_analysis = self.keyword_extractor.extract_keywords(item)

            analyzed_item['keyword_analysis'] = {
                'primary_keywords': keyword_analysis.primary_keywords,
                'technical_terms': keyword_analysis.technical_terms,
                'product_keywords': keyword_analysis.product_keywords,
                'seo_keywords': keyword_analysis.seo_keywords,
                'keyword_density': keyword_analysis.keyword_density
            }

        except Exception as e:
            self.logger.error(f"Keyword extraction failed for {item.get('id')}: {e}")
            analyzed_item['keyword_analysis'] = {'error': str(e)}

        return analyzed_item

    def calculate_engagement_metrics(self, items: List[Dict[str, Any]]) -> Dict[str, Any]:
        """Calculate engagement metrics for content items"""
        if not self.enable_analysis or not items:
            return {}

        try:
            # Analyze engagement patterns
            engagement_metrics = self.engagement_analyzer.analyze_engagement_metrics(
                items, self.config.source_name
            )

            # Identify trending content
            trending_content = self.engagement_analyzer.identify_trending_content(
                items, self.config.source_name
            )

            # Calculate source summary
            source_summary = self.engagement_analyzer.calculate_source_summary(
                items, self.config.source_name
            )

            return {
                'source_summary': source_summary,
                'trending_content': [
                    {
                        'content_id': t.content_id,
                        'title': t.title,
                        'engagement_score': t.engagement_score,
                        'velocity_score': t.velocity_score,
                        'trend_type': t.trend_type
                    } for t in trending_content
                ],
                'high_performers': [
                    {
                        'content_id': m.content_id,
                        'engagement_rate': m.engagement_rate,
                        'virality_score': m.virality_score,
                        'relative_performance': m.relative_performance
                    } for m in engagement_metrics if m.relative_performance > 1.5
                ]
            }

        except Exception as e:
            self.logger.error(f"Engagement analysis failed: {e}")
            return {'error': str(e)}

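    # Illustrative shape of the dict returned above (key names match the return
    # statement; the values shown are sketches, not real data):
    # {
    #     'source_summary': {...},        # per-source totals from EngagementAnalyzer
    #     'trending_content': [
    #         {'content_id': '...', 'title': '...', 'engagement_score': 0.0,
    #          'velocity_score': 0.0, 'trend_type': '...'}
    #     ],
    #     'high_performers': [            # only items with relative_performance > 1.5
    #         {'content_id': '...', 'engagement_rate': 0.1875,
    #          'virality_score': 0.0, 'relative_performance': 1.8}
    #     ]
    # }
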
    def identify_content_opportunities(self, items: List[Dict[str, Any]]) -> Dict[str, Any]:
        """Identify content opportunities and gaps"""
        if not self.enable_analysis or not items:
            return {}

        try:
            # Extract trending keywords
            trending_keywords = self.keyword_extractor.identify_trending_keywords(items)

            # Analyze topic distribution
            topics = []
            difficulties = []
            content_types = []

            for item in items:
                analysis = item.get('ai_analysis', {})
                if 'topics' in analysis:
                    topics.extend(analysis['topics'])
                if 'difficulty' in analysis:
                    difficulties.append(analysis['difficulty'])
                if 'content_type' in analysis:
                    content_types.append(analysis['content_type'])

            # Identify gaps
            topic_counts = {}
            for topic in topics:
                topic_counts[topic] = topic_counts.get(topic, 0) + 1

            difficulty_counts = {}
            for difficulty in difficulties:
                difficulty_counts[difficulty] = difficulty_counts.get(difficulty, 0) + 1

            content_type_counts = {}
            for content_type in content_types:
                content_type_counts[content_type] = content_type_counts.get(content_type, 0) + 1

            # Expected high-value topics for HVAC
            expected_topics = [
                'heat_pumps', 'troubleshooting', 'installation', 'maintenance',
                'refrigerants', 'electrical', 'smart_hvac', 'tools'
            ]

            content_gaps = [
                topic for topic in expected_topics
                if topic_counts.get(topic, 0) < 2
            ]

            return {
                'trending_keywords': [
                    {'keyword': kw, 'frequency': freq}
                    for kw, freq in trending_keywords[:10]
                ],
                'topic_distribution': topic_counts,
                'difficulty_distribution': difficulty_counts,
                'content_type_distribution': content_type_counts,
                'content_gaps': content_gaps,
                'opportunities': [
                    f"Create more {gap.replace('_', ' ')} content"
                    for gap in content_gaps[:5]
                ]
            }

        except Exception as e:
            self.logger.error(f"Content opportunity analysis failed: {e}")
            return {'error': str(e)}

    def format_analytics_markdown(self, items: List[Dict[str, Any]]) -> str:
        """Format content with analytics data as enhanced markdown"""
        if not items:
            return "No content items to format."

        # Calculate analytics summary
        engagement_metrics = self.calculate_engagement_metrics(items)
        content_opportunities = self.identify_content_opportunities(items)

        # Build enhanced markdown
        markdown_parts = []

        # Analytics Summary Header
        markdown_parts.append("# Content Analytics Summary")
        markdown_parts.append(f"Generated: {datetime.now().strftime('%Y-%m-%d %H:%M:%S')}")
        markdown_parts.append(f"Source: {self.config.source_name.title()}")
        markdown_parts.append(f"Total Items: {len(items)}")

        if self.enable_analysis:
            markdown_parts.append("AI Analysis: Enabled (Claude Haiku)")
        else:
            markdown_parts.append("AI Analysis: Disabled")

        markdown_parts.append("\n---\n")

        # Engagement Summary
        if engagement_metrics and 'source_summary' in engagement_metrics:
            summary = engagement_metrics['source_summary']
            markdown_parts.append("## Engagement Summary")
            markdown_parts.append(f"- Average Engagement Rate: {summary.get('avg_engagement_rate', 0):.4f}")
            markdown_parts.append(f"- Total Engagement: {summary.get('total_engagement', 0):,}")
            markdown_parts.append(f"- Trending Items: {summary.get('trending_count', 0)}")
            markdown_parts.append(f"- High Performers: {summary.get('high_performers', 0)}")
            markdown_parts.append("")

        # Content Opportunities
        if content_opportunities and 'opportunities' in content_opportunities:
            markdown_parts.append("## Content Opportunities")
            for opp in content_opportunities['opportunities'][:5]:
                markdown_parts.append(f"- {opp}")
            markdown_parts.append("")

        # Trending Keywords
        if content_opportunities and 'trending_keywords' in content_opportunities:
            keywords = content_opportunities['trending_keywords'][:5]
            if keywords:
                markdown_parts.append("## Trending Keywords")
                for kw_data in keywords:
                    markdown_parts.append(f"- {kw_data['keyword']} ({kw_data['frequency']} mentions)")
                markdown_parts.append("")

        markdown_parts.append("\n---\n")

        # Individual Content Items
        for i, item in enumerate(items, 1):
            markdown_parts.append(self._format_analyzed_item(item, i))

        return '\n'.join(markdown_parts)

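    # Example of the summary header produced above (values purely illustrative):
    #
    #   # Content Analytics Summary
    #   Generated: 2025-01-01 12:00:00
    #   Source: Youtube
    #   Total Items: 16
    #   AI Analysis: Enabled (Claude Haiku)
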
    def _format_analyzed_item(self, item: Dict[str, Any], index: int) -> str:
        """Format individual analyzed content item as markdown"""
        parts = []

        # Basic item info
        parts.append(f"# ID: {item.get('id', f'item_{index}')}")

        if title := item.get('title'):
            parts.append(f"## Title: {title}")

        if item.get('type'):
            parts.append(f"## Type: {item.get('type')}")

        if item.get('author'):
            parts.append(f"## Author: {item.get('author')}")

        # AI Analysis Results
        if ai_analysis := item.get('ai_analysis'):
            if 'error' not in ai_analysis:
                parts.append("## AI Analysis")

                if topics := ai_analysis.get('topics'):
                    parts.append(f"**Topics**: {', '.join(topics)}")

                if products := ai_analysis.get('products'):
                    parts.append(f"**Products**: {', '.join(products)}")

                parts.append(f"**Difficulty**: {ai_analysis.get('difficulty', 'Unknown')}")
                parts.append(f"**Content Type**: {ai_analysis.get('content_type', 'Unknown')}")
                parts.append(f"**Sentiment**: {ai_analysis.get('sentiment', 0):.2f}")
                parts.append(f"**HVAC Relevance**: {ai_analysis.get('hvac_relevance', 0):.2f}")
                parts.append(f"**Engagement Prediction**: {ai_analysis.get('engagement_prediction', 0):.2f}")

                if keywords := ai_analysis.get('keywords'):
                    parts.append(f"**Keywords**: {', '.join(keywords)}")

                parts.append("")

        # Keyword Analysis
        if keyword_analysis := item.get('keyword_analysis'):
            if 'error' not in keyword_analysis:
                if seo_keywords := keyword_analysis.get('seo_keywords'):
                    parts.append(f"**SEO Keywords**: {', '.join(seo_keywords)}")

                if technical_terms := keyword_analysis.get('technical_terms'):
                    parts.append(f"**Technical Terms**: {', '.join(technical_terms[:5])}")

                parts.append("")

        # Original content fields
        original_markdown = self.format_markdown([item])

        # Extract content after the first header
        if '\n## ' in original_markdown:
            content_start = original_markdown.find('\n## ')
            original_content = original_markdown[content_start:]
            parts.append(original_content)

        parts.append("\n" + "=" * 80 + "\n")

        return '\n'.join(parts)

    def _update_analytics_state(self, analyzed_items: List[Dict[str, Any]]) -> None:
        """Update analytics state with analysis results"""
        try:
            # Load existing state
            analytics_state = {}
            if self.analytics_state_file.exists():
                with open(self.analytics_state_file, 'r', encoding='utf-8') as f:
                    analytics_state = json.load(f)

            # Update with current analysis
            analytics_state.update({
                'last_analysis_run': datetime.now().isoformat(),
                'items_analyzed': len(analyzed_items),
                'analysis_enabled': self.enable_analysis,
                'total_items_analyzed': analytics_state.get('total_items_analyzed', 0) + len(analyzed_items)
            })

            # Save updated state
            with open(self.analytics_state_file, 'w', encoding='utf-8') as f:
                json.dump(analytics_state, f, indent=2)

        except Exception as e:
            self.logger.error(f"Error updating analytics state: {e}")

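    # Sketch of the resulting state file (key names match the update above;
    # the values shown are illustrative):
    # {
    #   "last_analysis_run": "2025-01-01T12:00:00",
    #   "items_analyzed": 20,
    #   "analysis_enabled": true,
    #   "total_items_analyzed": 120
    # }
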
    def get_analytics_state(self) -> Dict[str, Any]:
        """Get current analytics state"""
        if not self.analytics_state_file.exists():
            return {}

        try:
            with open(self.analytics_state_file, 'r', encoding='utf-8') as f:
                return json.load(f)

        except Exception as e:
            self.logger.error(f"Error reading analytics state: {e}")
            return {}
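

# ---------------------------------------------------------------------------
# Usage sketch (not part of the module): a concrete scraper would subclass
# AnalyticsBaseScraper, implement fetch_content(), and then call the analysis
# entry points. The subclass name and config arguments below are hypothetical
# placeholders, not definitions from this codebase.
#
#   config = ScraperConfig(source_name="youtube", data_dir=Path("data"))
#   scraper = MyYouTubeScraper(config, enable_analysis=True)
#   items = scraper.fetch_content_with_analysis()
#   print(scraper.format_analytics_markdown(items))
# ---------------------------------------------------------------------------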