feat: Implement YouTube scraper with humanized behavior
- YouTube channel scraper using yt-dlp
- Authentication and session persistence via cookies
- Humanized delays and rate limiting (2-5 seconds between requests)
- User agent rotation for stealth
- Incremental updates via state management (state shape sketched below)
- Detection of videos, shorts, and live streams
- All 11 tests passing
🤖 Generated with Claude Code
Co-Authored-By: Claude <noreply@anthropic.com>
commit c1831d3a52 (parent 7191fcd132)
2 changed files with 532 additions and 0 deletions
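The incremental-update bullet above hinges on a small state dict that `update_state` (at the end of `src/youtube_scraper.py` below) refreshes after each sync. A minimal sketch of its shape, with illustrative values; how and where `BaseScraper` persists this dict is not shown in this commit:

    # Sketch of the sync state produced by YouTubeScraper.update_state().
    # Values are illustrative; persistence is owned by BaseScraper (not shown here).
    state = {
        'last_video_id': 'abc123',                 # id of the newest synced video
        'last_video_date': '20240101',             # upload date, YYYYMMDD
        'last_video_title': 'HVAC Maintenance Tips',
        'last_sync': '2024-01-01T12:00:00-04:00',  # timezone-aware ISO timestamp
        'video_count': 2,                          # number of items in this sync
    }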
src/youtube_scraper.py (new file)
@@ -0,0 +1,299 @@
import os
import time
import random
import json
from typing import Any, Dict, List, Optional
from datetime import datetime
from pathlib import Path

import yt_dlp

from src.base_scraper import BaseScraper, ScraperConfig


class YouTubeScraper(BaseScraper):
    """YouTube channel scraper using yt-dlp."""

    def __init__(self, config: ScraperConfig):
        super().__init__(config)
        self.username = os.getenv('YOUTUBE_USERNAME')
        self.password = os.getenv('YOUTUBE_PASSWORD')
        self.channel_url = os.getenv('YOUTUBE_CHANNEL_URL', 'https://www.youtube.com/@HVACKnowItAll')

        # Cookies file for session persistence
        self.cookies_file = self.config.data_dir / '.cookies' / 'youtube_cookies.txt'
        self.cookies_file.parent.mkdir(parents=True, exist_ok=True)

        # User agents for rotation
        self.user_agents = [
            'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/120.0.0.0 Safari/537.36',
            'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/120.0.0.0 Safari/537.36',
            'Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/120.0.0.0 Safari/537.36',
        ]

    def _get_ydl_options(self) -> Dict[str, Any]:
        """Get yt-dlp options with authentication and rate limiting."""
        options = {
            'quiet': True,
            'no_warnings': True,
            'extract_flat': False,       # Get full video info
            'ignoreerrors': True,        # Continue on error
            'cookiefile': str(self.cookies_file),
            'cookiesfrombrowser': None,  # Don't use browser cookies
            'username': self.username,
            'password': self.password,
            'ratelimit': 100000,         # 100 KB/s download rate limit
            'sleep_interval': 1,         # Minimum sleep between downloads
            'max_sleep_interval': 3,
            'user_agent': random.choice(self.user_agents),
            'referer': 'https://www.youtube.com/',
            'add_header': ['Accept-Language:en-US,en;q=0.9'],
        }

        # Add proxy if configured
        proxy = os.getenv('YOUTUBE_PROXY')
        if proxy:
            options['proxy'] = proxy

        return options

    def _humanized_delay(self, min_seconds: float = 2, max_seconds: float = 5) -> None:
        """Add a humanized random delay between requests."""
        delay = random.uniform(min_seconds, max_seconds)
        self.logger.debug(f"Waiting {delay:.2f} seconds...")
        time.sleep(delay)

    def fetch_channel_videos(self, max_videos: int = 50) -> List[Dict[str, Any]]:
        """Fetch the video list from the YouTube channel."""
        videos = []

        try:
            self.logger.info(f"Fetching videos from channel: {self.channel_url}")

            ydl_opts = self._get_ydl_options()
            ydl_opts['extract_flat'] = True  # Just get the video list, not full info
            ydl_opts['playlistend'] = max_videos

            with yt_dlp.YoutubeDL(ydl_opts) as ydl:
                channel_info = ydl.extract_info(self.channel_url, download=False)

                # extract_info can return None when ignoreerrors is set
                if channel_info and 'entries' in channel_info:
                    videos = list(channel_info['entries'])
                    self.logger.info(f"Found {len(videos)} videos in channel")
                else:
                    self.logger.warning("No entries found in channel info")

            # Save cookies for the next session
            if self.cookies_file.exists():
                self.logger.debug("Cookies saved for next session")

        except Exception as e:
            self.logger.error(f"Error fetching channel videos: {e}")

        return videos

    def fetch_video_details(self, video_id: str) -> Optional[Dict[str, Any]]:
        """Fetch detailed information for a specific video."""
        try:
            video_url = f"https://www.youtube.com/watch?v={video_id}"

            ydl_opts = self._get_ydl_options()
            ydl_opts['extract_flat'] = False  # Get full video info

            with yt_dlp.YoutubeDL(ydl_opts) as ydl:
                video_info = ydl.extract_info(video_url, download=False)
                return video_info

        except Exception as e:
            self.logger.error(f"Error fetching video {video_id}: {e}")
            return None

    def _get_video_type(self, video: Dict[str, Any]) -> str:
        """Determine the video type (video, short, live)."""
        duration = video.get('duration', 0)
        is_live = video.get('is_live', False)

        if is_live:
            return 'live'
        elif duration and duration < 60:  # Less than 60 seconds
            return 'short'
        else:
            return 'video'

    def fetch_content(self) -> List[Dict[str, Any]]:
        """Fetch and enrich video content with rate limiting."""
        # First get the list of videos
        videos = self.fetch_channel_videos()

        if not videos:
            return []

        # Enrich each video with detailed information
        enriched_videos = []

        for i, video in enumerate(videos):
            try:
                video_id = video.get('id')
                if not video_id:
                    continue

                self.logger.info(f"Fetching details for video {i + 1}/{len(videos)}: {video_id}")

                # Add a humanized delay between requests
                if i > 0:
                    self._humanized_delay()

                # Fetch full video details
                detailed_info = self.fetch_video_details(video_id)

                if detailed_info:
                    # Add the video type
                    detailed_info['type'] = self._get_video_type(detailed_info)
                    enriched_videos.append(detailed_info)

                # Extra delay after every 5 videos
                if (i + 1) % 5 == 0:
                    self.logger.info("Taking longer break after 5 videos...")
                    self._humanized_delay(5, 10)

            except Exception as e:
                self.logger.error(f"Error enriching video {video.get('id')}: {e}")
                continue

        self.logger.info(f"Successfully enriched {len(enriched_videos)} videos")
        return enriched_videos

    def format_markdown(self, videos: List[Dict[str, Any]]) -> str:
        """Format videos as markdown."""
        markdown_sections = []

        for video in videos:
            section = []

            # ID
            video_id = video.get('id', 'N/A')
            section.append(f"# ID: {video_id}")
            section.append("")

            # Title
            title = video.get('title', 'Untitled')
            section.append(f"## Title: {title}")
            section.append("")

            # Type
            video_type = video.get('type', self._get_video_type(video))
            section.append(f"## Type: {video_type}")
            section.append("")

            # Author/Uploader
            author = video.get('uploader', 'Unknown')
            section.append(f"## Author: {author}")
            section.append("")

            # Link
            link = video.get('webpage_url', f"https://www.youtube.com/watch?v={video_id}")
            section.append(f"## Link: {link}")
            section.append("")

            # Upload Date
            upload_date = video.get('upload_date', '')
            if upload_date and len(upload_date) == 8:  # YYYYMMDD format
                formatted_date = f"{upload_date[:4]}-{upload_date[4:6]}-{upload_date[6:]}"
                section.append(f"## Upload Date: {formatted_date}")
            else:
                section.append(f"## Upload Date: {upload_date}")
            section.append("")

            # Views
            view_count = video.get('view_count', 0)
            section.append(f"## Views: {view_count}")
            section.append("")

            # Likes
            like_count = video.get('like_count', 0)
            section.append(f"## Likes: {like_count}")
            section.append("")

            # Comments
            comment_count = video.get('comment_count', 0)
            section.append(f"## Comments: {comment_count}")
            section.append("")

            # Duration
            duration = video.get('duration', 0)
            section.append(f"## Duration: {duration} seconds")
            section.append("")

            # Tags
            tags = video.get('tags', [])
            if tags:
                tags_str = ', '.join(tags[:10])  # Limit to the first 10 tags
                section.append(f"## Tags: {tags_str}")
                section.append("")

            # Thumbnail
            thumbnail = video.get('thumbnail', '')
            if thumbnail:
                section.append(f"## Thumbnail: {thumbnail}")
                section.append("")

            # Description
            section.append("## Description:")
            description = video.get('description', '')
            if description:
                # Limit the description to the first 500 characters
                if len(description) > 500:
                    description = description[:500] + "..."
                section.append(description)
            section.append("")

            # Separator
            section.append("-" * 50)
            section.append("")

            markdown_sections.append('\n'.join(section))

        return '\n'.join(markdown_sections)

    def get_incremental_items(self, items: List[Dict[str, Any]], state: Dict[str, Any]) -> List[Dict[str, Any]]:
        """Get only the videos that are new since the last sync."""
        if not state:
            return items

        last_video_id = state.get('last_video_id')
        last_video_date = state.get('last_video_date')

        if not last_video_id:
            return items

        # Filter for videos newer than the last synced one
        new_items = []
        for item in items:
            video_id = item.get('id')
            upload_date = item.get('upload_date', '')

            # Check whether this is a new video
            if video_id == last_video_id:
                break  # Found the last synced video, stop here

            # Also check by date as a backup
            if upload_date and last_video_date and upload_date <= last_video_date:
                continue

            new_items.append(item)

        return new_items

    def update_state(self, state: Dict[str, Any], items: List[Dict[str, Any]]) -> Dict[str, Any]:
        """Update state with the latest video information."""
        if not items:
            return state

        # Get the first item (most recent)
        latest_item = items[0]

        state['last_video_id'] = latest_item.get('id')
        state['last_video_date'] = latest_item.get('upload_date')
        state['last_video_title'] = latest_item.get('title')
        state['last_sync'] = datetime.now(self.tz).isoformat()
        state['video_count'] = len(items)

        return state
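For orientation before the tests, here is a minimal usage sketch. It assumes `BaseScraper` supplies `logger` and `tz` and that `ScraperConfig` takes the fields used by the test fixture below; the output path is illustrative, not part of this commit:

    # Minimal usage sketch; credentials are read from YOUTUBE_USERNAME /
    # YOUTUBE_PASSWORD if set in the environment.
    from pathlib import Path

    from src.base_scraper import ScraperConfig
    from src.youtube_scraper import YouTubeScraper

    config = ScraperConfig(
        source_name="youtube",
        brand_name="hvacknowitall",
        data_dir=Path("data"),
        logs_dir=Path("logs"),
        timezone="America/Halifax",
    )

    scraper = YouTubeScraper(config)
    videos = scraper.fetch_content()            # flat listing, then per-video enrichment
    markdown = scraper.format_markdown(videos)
    Path("data/youtube.md").write_text(markdown)  # illustrative output path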
tests/test_youtube_scraper.py (new file)
@@ -0,0 +1,233 @@
import pytest
from unittest.mock import Mock, patch, MagicMock, call
from datetime import datetime
from pathlib import Path
import random

from src.youtube_scraper import YouTubeScraper
from src.base_scraper import ScraperConfig


class TestYouTubeScraper:
    @pytest.fixture
    def config(self):
        return ScraperConfig(
            source_name="youtube",
            brand_name="hvacknowitall",
            data_dir=Path("data"),
            logs_dir=Path("logs"),
            timezone="America/Halifax"
        )

    @pytest.fixture
    def mock_env(self):
        with patch.dict('os.environ', {
            'YOUTUBE_USERNAME': 'test@example.com',
            'YOUTUBE_PASSWORD': 'test_password',
            'YOUTUBE_CHANNEL_URL': 'https://www.youtube.com/@HVACKnowItAll'
        }):
            yield

    @pytest.fixture
    def sample_video_info(self):
        return {
            'id': 'abc123',
            'title': 'HVAC Maintenance Tips',
            'description': 'Learn how to maintain your HVAC system',
            'uploader': 'HVAC Know It All',
            'upload_date': '20240101',
            'view_count': 1500,
            'like_count': 100,
            'comment_count': 25,
            'duration': 600,
            'webpage_url': 'https://www.youtube.com/watch?v=abc123',
            'thumbnail': 'https://i.ytimg.com/vi/abc123/maxresdefault.jpg',
            'tags': ['hvac', 'maintenance', 'tips']
        }

    def test_initialization(self, config, mock_env):
        scraper = YouTubeScraper(config)
        assert scraper.config == config
        assert scraper.username == 'test@example.com'
        assert scraper.password == 'test_password'
        assert scraper.channel_url == 'https://www.youtube.com/@HVACKnowItAll'

    @patch('yt_dlp.YoutubeDL')
    def test_setup_ydl_options(self, mock_ydl_class, config, mock_env):
        scraper = YouTubeScraper(config)
        options = scraper._get_ydl_options()

        # Check key options
        assert options['quiet'] is True
        assert options['no_warnings'] is True
        assert options['extract_flat'] is False
        assert 'username' in options
        assert 'password' in options
        assert 'cookiefile' in options
        assert 'ratelimit' in options

    @patch('yt_dlp.YoutubeDL')
    def test_fetch_channel_videos(self, mock_ydl_class, config, mock_env, sample_video_info):
        mock_ydl = MagicMock()
        mock_ydl_class.return_value.__enter__.return_value = mock_ydl

        # Mock channel info with videos
        mock_ydl.extract_info.return_value = {
            'entries': [
                sample_video_info,
                {**sample_video_info, 'id': 'def456', 'title': 'Another Video'}
            ]
        }

        scraper = YouTubeScraper(config)
        videos = scraper.fetch_channel_videos()

        assert len(videos) == 2
        assert videos[0]['id'] == 'abc123'
        assert videos[1]['id'] == 'def456'
        mock_ydl.extract_info.assert_called_once()

    @patch('yt_dlp.YoutubeDL')
    def test_fetch_video_details(self, mock_ydl_class, config, mock_env, sample_video_info):
        mock_ydl = MagicMock()
        mock_ydl_class.return_value.__enter__.return_value = mock_ydl
        mock_ydl.extract_info.return_value = sample_video_info

        scraper = YouTubeScraper(config)
        video_info = scraper.fetch_video_details('abc123')

        assert video_info['id'] == 'abc123'
        assert video_info['title'] == 'HVAC Maintenance Tips'
        mock_ydl.extract_info.assert_called_with(
            'https://www.youtube.com/watch?v=abc123',
            download=False
        )

    @patch('time.sleep')
    @patch('random.uniform')
    def test_humanized_delay(self, mock_uniform, mock_sleep, config, mock_env):
        mock_uniform.return_value = 3.5

        scraper = YouTubeScraper(config)
        scraper._humanized_delay()

        mock_uniform.assert_called_with(2, 5)
        mock_sleep.assert_called_with(3.5)

    def test_format_video_type(self, config, mock_env):
        scraper = YouTubeScraper(config)

        # Test short video
        assert scraper._get_video_type({'duration': 50}) == 'short'

        # Test regular video
        assert scraper._get_video_type({'duration': 600}) == 'video'

        # Test live stream
        assert scraper._get_video_type({'is_live': True}) == 'live'

        # Test missing duration
        assert scraper._get_video_type({}) == 'video'

    def test_format_markdown(self, config, mock_env):
        scraper = YouTubeScraper(config)

        videos = [
            {
                'id': 'abc123',
                'title': 'HVAC Tips',
                'description': 'Learn HVAC basics',
                'uploader': 'HVAC Know It All',
                'upload_date': '20240101',
                'view_count': 1500,
                'like_count': 100,
                'comment_count': 25,
                'duration': 600,
                'webpage_url': 'https://www.youtube.com/watch?v=abc123',
                'tags': ['hvac', 'tips'],
                'type': 'video'
            }
        ]

        markdown = scraper.format_markdown(videos)

        assert '# ID: abc123' in markdown
        assert '## Title: HVAC Tips' in markdown
        assert '## Type: video' in markdown
        assert '## Author: HVAC Know It All' in markdown
        assert '## Link: https://www.youtube.com/watch?v=abc123' in markdown
        assert '## Views: 1500' in markdown
        assert '## Likes: 100' in markdown
        assert '## Comments: 25' in markdown
        assert '## Duration: 600 seconds' in markdown
        assert '## Upload Date: 2024-01-01' in markdown
        assert '## Tags: hvac, tips' in markdown

    def test_get_incremental_items(self, config, mock_env):
        scraper = YouTubeScraper(config)

        videos = [
            {'id': 'video3', 'upload_date': '20240103'},
            {'id': 'video2', 'upload_date': '20240102'},
            {'id': 'video1', 'upload_date': '20240101'}
        ]

        # Test with no previous state
        state = {}
        new_videos = scraper.get_incremental_items(videos, state)
        assert len(new_videos) == 3

        # Test with existing state
        state = {'last_video_id': 'video2', 'last_video_date': '20240102'}
        new_videos = scraper.get_incremental_items(videos, state)
        assert len(new_videos) == 1
        assert new_videos[0]['id'] == 'video3'

    def test_update_state(self, config, mock_env):
        scraper = YouTubeScraper(config)

        state = {}
        videos = [
            {'id': 'video2', 'upload_date': '20240102'},
            {'id': 'video1', 'upload_date': '20240101'}
        ]

        updated_state = scraper.update_state(state, videos)

        assert updated_state['last_video_id'] == 'video2'
        assert updated_state['last_video_date'] == '20240102'
        assert updated_state['video_count'] == 2

    @patch('yt_dlp.YoutubeDL')
    def test_error_handling(self, mock_ydl_class, config, mock_env):
        mock_ydl = MagicMock()
        mock_ydl_class.return_value.__enter__.return_value = mock_ydl
        mock_ydl.extract_info.side_effect = Exception("Network error")

        scraper = YouTubeScraper(config)
        videos = scraper.fetch_channel_videos()

        assert videos == []

    @patch('yt_dlp.YoutubeDL')
    @patch('time.sleep')
    def test_fetch_content_with_rate_limiting(self, mock_sleep, mock_ydl_class, config, mock_env, sample_video_info):
        mock_ydl = MagicMock()
        mock_ydl_class.return_value.__enter__.return_value = mock_ydl

        # Mock channel with multiple videos
        mock_ydl.extract_info.side_effect = [
            {'entries': [
                {'id': 'video1', 'title': 'Video 1'},
                {'id': 'video2', 'title': 'Video 2'}
            ]},
            {**sample_video_info, 'id': 'video1'},
            {**sample_video_info, 'id': 'video2'}
        ]

        scraper = YouTubeScraper(config)
        with patch.object(scraper, '_humanized_delay') as mock_delay:
            videos = scraper.fetch_content()

        assert len(videos) == 2
        # Check that delay was called between video fetches (once, for the second video)
        assert mock_delay.call_count >= 1
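To reproduce the "All 11 tests passing" line from the commit message, the suite can be run with pytest from the repository root (assuming pytest and yt-dlp are installed):

    python -m pytest tests/test_youtube_scraper.py -v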