- YouTube channel scraper using yt-dlp
- Authentication and session persistence via cookies
- Humanized delays and rate limiting (2-5 seconds between requests)
- User agent rotation for stealth
- Incremental updates via state management
- Support for videos, shorts, and live streams detection
- All 11 tests passing
🤖 Generated with Claude Code
Co-Authored-By: Claude <noreply@anthropic.com>
233 lines
No EOL
8.3 KiB
Python
233 lines
No EOL
8.3 KiB
Python
import random
from datetime import datetime
from pathlib import Path
from unittest.mock import Mock, patch, MagicMock, call

import pytest

from src.base_scraper import ScraperConfig
from src.youtube_scraper import YouTubeScraper


class TestYouTubeScraper:
    """Unit tests for ``YouTubeScraper``.

    ``yt_dlp.YoutubeDL``, ``time.sleep`` and ``random.uniform`` are patched
    wherever a test would otherwise reach them, and credentials are injected
    through a patched ``os.environ``.
    """

    @pytest.fixture
    def config(self):
        """Return the ``ScraperConfig`` shared by every test in this class."""
        return ScraperConfig(
            source_name="youtube",
            brand_name="hvacknowitall",
            data_dir=Path("data"),
            logs_dir=Path("logs"),
            timezone="America/Halifax",
        )

    @pytest.fixture
    def mock_env(self):
        """Patch ``os.environ`` with test credentials for the test's duration."""
        fake_env = {
            'YOUTUBE_USERNAME': 'test@example.com',
            'YOUTUBE_PASSWORD': 'test_password',
            'YOUTUBE_CHANNEL_URL': 'https://www.youtube.com/@HVACKnowItAll',
        }
        # Yield inside the context manager so the patch stays active while
        # the test body runs and is undone afterwards.
        with patch.dict('os.environ', fake_env):
            yield

    @pytest.fixture
    def sample_video_info(self):
        """Return a representative video-info dict used as mocked extractor output."""
        return {
            'id': 'abc123',
            'title': 'HVAC Maintenance Tips',
            'description': 'Learn how to maintain your HVAC system',
            'uploader': 'HVAC Know It All',
            'upload_date': '20240101',
            'view_count': 1500,
            'like_count': 100,
            'comment_count': 25,
            'duration': 600,
            'webpage_url': 'https://www.youtube.com/watch?v=abc123',
            'thumbnail': 'https://i.ytimg.com/vi/abc123/maxresdefault.jpg',
            'tags': ['hvac', 'maintenance', 'tips'],
        }

    def test_initialization(self, config, mock_env):
        """The scraper keeps its config and exposes the patched environment values."""
        scraper = YouTubeScraper(config)

        assert scraper.config == config
        assert scraper.username == 'test@example.com'
        assert scraper.password == 'test_password'
        assert scraper.channel_url == 'https://www.youtube.com/@HVACKnowItAll'

    @patch('yt_dlp.YoutubeDL')
    def test_setup_ydl_options(self, mock_ydl_class, config, mock_env):
        """``_get_ydl_options`` returns the expected flags and auth/rate-limit keys."""
        scraper = YouTubeScraper(config)
        options = scraper._get_ydl_options()

        # Identity checks pin real booleans rather than merely truthy/falsy
        # values such as 1 or 0.
        assert options['quiet'] is True
        assert options['no_warnings'] is True
        assert options['extract_flat'] is False
        for required_key in ('username', 'password', 'cookiefile', 'ratelimit'):
            assert required_key in options

    @patch('yt_dlp.YoutubeDL')
    def test_fetch_channel_videos(self, mock_ydl_class, config, mock_env, sample_video_info):
        """``fetch_channel_videos`` returns the mocked entries in order, with one extract call."""
        mock_ydl = MagicMock()
        mock_ydl_class.return_value.__enter__.return_value = mock_ydl

        # Mock channel info with videos
        second_video = {**sample_video_info, 'id': 'def456', 'title': 'Another Video'}
        mock_ydl.extract_info.return_value = {
            'entries': [sample_video_info, second_video],
        }

        scraper = YouTubeScraper(config)
        videos = scraper.fetch_channel_videos()

        assert len(videos) == 2
        assert videos[0]['id'] == 'abc123'
        assert videos[1]['id'] == 'def456'
        mock_ydl.extract_info.assert_called_once()

    @patch('yt_dlp.YoutubeDL')
    def test_fetch_video_details(self, mock_ydl_class, config, mock_env, sample_video_info):
        """``fetch_video_details`` requests the watch URL with ``download=False``."""
        mock_ydl = MagicMock()
        mock_ydl_class.return_value.__enter__.return_value = mock_ydl
        mock_ydl.extract_info.return_value = sample_video_info

        scraper = YouTubeScraper(config)
        video_info = scraper.fetch_video_details('abc123')

        assert video_info['id'] == 'abc123'
        assert video_info['title'] == 'HVAC Maintenance Tips'
        mock_ydl.extract_info.assert_called_with(
            'https://www.youtube.com/watch?v=abc123',
            download=False,
        )

    # Decorators apply bottom-up, so ``random.uniform`` is the first mock argument.
    @patch('time.sleep')
    @patch('random.uniform')
    def test_humanized_delay(self, mock_uniform, mock_sleep, config, mock_env):
        """``_humanized_delay`` draws from ``uniform(2, 5)`` and sleeps for that value."""
        mock_uniform.return_value = 3.5

        scraper = YouTubeScraper(config)
        scraper._humanized_delay()

        mock_uniform.assert_called_with(2, 5)
        mock_sleep.assert_called_with(3.5)

    def test_format_video_type(self, config, mock_env):
        """``_get_video_type`` classifies short, regular, live and duration-less items."""
        scraper = YouTubeScraper(config)

        # Test short video
        assert scraper._get_video_type({'duration': 50}) == 'short'

        # Test regular video
        assert scraper._get_video_type({'duration': 600}) == 'video'

        # Test live stream
        assert scraper._get_video_type({'is_live': True}) == 'live'

        # Test missing duration
        assert scraper._get_video_type({}) == 'video'

    def test_format_markdown(self, config, mock_env):
        """``format_markdown`` emits one labelled heading line per video field."""
        scraper = YouTubeScraper(config)

        videos = [
            {
                'id': 'abc123',
                'title': 'HVAC Tips',
                'description': 'Learn HVAC basics',
                'uploader': 'HVAC Know It All',
                'upload_date': '20240101',
                'view_count': 1500,
                'like_count': 100,
                'comment_count': 25,
                'duration': 600,
                'webpage_url': 'https://www.youtube.com/watch?v=abc123',
                'tags': ['hvac', 'tips'],
                'type': 'video',
            }
        ]

        markdown = scraper.format_markdown(videos)

        assert '# ID: abc123' in markdown
        assert '## Title: HVAC Tips' in markdown
        assert '## Type: video' in markdown
        assert '## Author: HVAC Know It All' in markdown
        assert '## Link: https://www.youtube.com/watch?v=abc123' in markdown
        assert '## Views: 1500' in markdown
        assert '## Likes: 100' in markdown
        assert '## Comments: 25' in markdown
        assert '## Duration: 600 seconds' in markdown
        assert '## Upload Date: 2024-01-01' in markdown
        assert '## Tags: hvac, tips' in markdown

    def test_get_incremental_items(self, config, mock_env):
        """``get_incremental_items`` returns only videos newer than the saved state."""
        scraper = YouTubeScraper(config)

        videos = [
            {'id': 'video3', 'upload_date': '20240103'},
            {'id': 'video2', 'upload_date': '20240102'},
            {'id': 'video1', 'upload_date': '20240101'},
        ]

        # Test with no previous state
        state = {}
        new_videos = scraper.get_incremental_items(videos, state)
        assert len(new_videos) == 3

        # Test with existing state
        state = {'last_video_id': 'video2', 'last_video_date': '20240102'}
        new_videos = scraper.get_incremental_items(videos, state)
        assert len(new_videos) == 1
        assert new_videos[0]['id'] == 'video3'

    def test_update_state(self, config, mock_env):
        """``update_state`` records the first video's id/date and the video count."""
        scraper = YouTubeScraper(config)

        state = {}
        videos = [
            {'id': 'video2', 'upload_date': '20240102'},
            {'id': 'video1', 'upload_date': '20240101'},
        ]

        updated_state = scraper.update_state(state, videos)

        assert updated_state['last_video_id'] == 'video2'
        assert updated_state['last_video_date'] == '20240102'
        assert updated_state['video_count'] == 2

    @patch('yt_dlp.YoutubeDL')
    def test_error_handling(self, mock_ydl_class, config, mock_env):
        """``fetch_channel_videos`` returns an empty list when extraction raises."""
        mock_ydl = MagicMock()
        mock_ydl_class.return_value.__enter__.return_value = mock_ydl
        mock_ydl.extract_info.side_effect = Exception("Network error")

        scraper = YouTubeScraper(config)
        videos = scraper.fetch_channel_videos()

        assert videos == []

    # ``time.sleep`` is patched only to keep the test fast; the mock itself is
    # not inspected.
    @patch('yt_dlp.YoutubeDL')
    @patch('time.sleep')
    def test_fetch_content_with_rate_limiting(self, mock_sleep, mock_ydl_class, config, mock_env, sample_video_info):
        """``fetch_content`` returns every video and invokes the humanized delay."""
        mock_ydl = MagicMock()
        mock_ydl_class.return_value.__enter__.return_value = mock_ydl

        # Mock channel with multiple videos: the first call yields the channel
        # listing, the following calls yield per-video details.
        mock_ydl.extract_info.side_effect = [
            {
                'entries': [
                    {'id': 'video1', 'title': 'Video 1'},
                    {'id': 'video2', 'title': 'Video 2'},
                ]
            },
            {**sample_video_info, 'id': 'video1'},
            {**sample_video_info, 'id': 'video2'},
        ]

        scraper = YouTubeScraper(config)
        with patch.object(scraper, '_humanized_delay') as mock_delay:
            videos = scraper.fetch_content()

        assert len(videos) == 2
        # Check that delay was called between video fetches (once for second video)
        assert mock_delay.call_count >= 1