- Set up UV environment with all required packages
- Created comprehensive project structure
- Implemented abstract BaseScraper class with TDD
- Added documentation (project spec, implementation plan, status)
- Configured .env for credentials (not committed)
- All base scraper tests passing (9/9)
🤖 Generated with Claude Code
Co-Authored-By: Claude <noreply@anthropic.com>
175 lines
No EOL
6.6 KiB
Python
175 lines
No EOL
6.6 KiB
Python
import pytest
|
|
from unittest.mock import Mock, patch, MagicMock
|
|
from datetime import datetime
|
|
import json
|
|
from pathlib import Path
|
|
from src.base_scraper import BaseScraper, ScraperConfig
|
|
|
|
|
|
class TestBaseScraper:
    """Unit tests for ``ScraperConfig`` and the shared ``BaseScraper`` logic.

    ``BaseScraper`` cannot be instantiated directly (see
    ``test_abstract_methods_must_be_implemented``), so the other tests
    temporarily clear its ``__abstractmethods__`` while constructing it.
    File access, the clock and the markdown converter are replaced with
    mocks inside the individual tests.
    """

    @staticmethod
    def _make_config(source_name="test"):
        """Build the ``ScraperConfig`` used throughout these tests.

        Args:
            source_name: Value passed as ``source_name``; every other field
                is fixed.

        Returns:
            A ``ScraperConfig`` pointing at the relative ``data`` and
            ``logs`` directories.
        """
        return ScraperConfig(
            source_name=source_name,
            brand_name="hvacknowitall",
            data_dir=Path("data"),
            logs_dir=Path("logs"),
            timezone="America/Halifax",
        )

    @staticmethod
    def _instantiable():
        """Return a patcher that empties ``BaseScraper.__abstractmethods__``.

        While the patcher is active, ``BaseScraper(config)`` no longer raises
        ``TypeError``; the original attribute is restored on exit.
        """
        return patch.object(BaseScraper, '__abstractmethods__', set())

    def test_scraper_config_initialization(self):
        """Every constructor argument is readable back as an attribute."""
        config = self._make_config(source_name="test_source")

        assert config.source_name == "test_source"
        assert config.brand_name == "hvacknowitall"
        assert config.data_dir == Path("data")
        assert config.logs_dir == Path("logs")
        assert config.timezone == "America/Halifax"

    def test_base_scraper_initialization(self):
        """A new scraper keeps its config and derives state file and logger."""
        config = self._make_config()

        with self._instantiable():
            scraper = BaseScraper(config)

        assert scraper.config == config
        assert scraper.state_file == Path("data") / ".state" / "test_state.json"
        assert scraper.logger is not None

    def test_load_state_creates_new_when_missing(self):
        """``load_state`` returns an empty dict when no path exists."""
        config = self._make_config()
        nothing_exists = patch('pathlib.Path.exists', return_value=False)

        with self._instantiable(), nothing_exists:
            scraper = BaseScraper(config)
            state = scraper.load_state()

        assert state == {}

    def test_load_state_reads_existing_file(self):
        """``load_state`` returns the JSON content read from the state file."""
        config = self._make_config()
        expected_state = {"last_id": "123", "last_update": "2024-01-01"}

        file_exists = patch('pathlib.Path.exists', return_value=True)
        fake_open = patch('builtins.open', create=True)

        with self._instantiable(), file_exists, fake_open as mock_open:
            # The object yielded by ``with open(...)`` serves the JSON text.
            fake_file = mock_open.return_value.__enter__.return_value
            fake_file.read.return_value = json.dumps(expected_state)

            scraper = BaseScraper(config)
            state = scraper.load_state()

        assert state == expected_state

    def test_save_state(self):
        """``save_state`` writes JSON that decodes back to the given state."""
        config = self._make_config()
        state_to_save = {"last_id": "456", "last_update": "2024-01-02"}

        with self._instantiable():
            scraper = BaseScraper(config)

        with patch('builtins.open', create=True) as mock_open:
            # Collect every chunk handed to ``write``; the payload may arrive
            # in several pieces, so the chunks are joined before decoding.
            written_data = []
            mock_file = MagicMock()
            mock_file.write.side_effect = written_data.append
            mock_open.return_value.__enter__.return_value = mock_file

            scraper.save_state(state_to_save)

        # Check that something was written
        assert written_data
        # Parse the written JSON
        written_json = ''.join(written_data)
        assert json.loads(written_json) == state_to_save

    def test_generate_filename(self):
        """The filename is ``<brand>_<source>_<timestamp>.md``."""
        config = self._make_config()
        frozen_clock = patch('src.base_scraper.datetime')

        with self._instantiable(), frozen_clock as mock_datetime:
            mock_dt = MagicMock()
            # Opaque stub value: only its placement in the name is checked.
            mock_dt.strftime.return_value = "2024-15-01-T143045"
            mock_datetime.now.return_value = mock_dt

            scraper = BaseScraper(config)
            filename = scraper.generate_filename()

        assert filename == "hvacknowitall_test_2024-15-01-T143045.md"

    def test_archive_current_file(self):
        """Archiving moves the single file reported by ``Path.glob``."""
        config = self._make_config()
        fake_glob = patch('pathlib.Path.glob')
        fake_move = patch('shutil.move')

        with self._instantiable(), fake_glob as mock_glob, fake_move as mock_move:
            mock_glob.return_value = [
                Path("data/markdown_current/hvacknowitall_test_old.md")
            ]

            scraper = BaseScraper(config)
            scraper.archive_current_file()

        mock_move.assert_called_once()

    def test_convert_to_markdown(self):
        """``convert_to_markdown`` returns the converter's ``text_content``."""
        config = self._make_config()
        fake_markitdown = patch('src.base_scraper.MarkItDown')

        with self._instantiable(), fake_markitdown as mock_markitdown:
            mock_converter = MagicMock()
            mock_markitdown.return_value = mock_converter
            mock_result = MagicMock()
            mock_result.text_content = "# Converted Content"
            mock_converter.convert_stream.return_value = mock_result

            scraper = BaseScraper(config)
            result = scraper.convert_to_markdown("<html><body>Test</body></html>")

        assert result == "# Converted Content"

    def test_abstract_methods_must_be_implemented(self):
        """Instantiating the unpatched ``BaseScraper`` raises ``TypeError``."""
        config = self._make_config()

        with pytest.raises(TypeError):
            BaseScraper(config)