hvac-kia-content/tests/test_base_scraper.py
Ben Reed f9a8e719a7 Initial commit: Project foundation with base scraper and tests
- Set up UV environment with all required packages
- Created comprehensive project structure
- Implemented abstract BaseScraper class with TDD
- Added documentation (project spec, implementation plan, status)
- Configured .env for credentials (not committed)
- All base scraper tests passing (9/9)

🤖 Generated with Claude Code

Co-Authored-By: Claude <noreply@anthropic.com>
2025-08-18 12:15:17 -03:00

175 lines
No EOL
6.6 KiB
Python

import pytest
from unittest.mock import Mock, patch, MagicMock
from datetime import datetime
import json
from pathlib import Path
from src.base_scraper import BaseScraper, ScraperConfig
class TestBaseScraper:
    """Unit tests for ``ScraperConfig`` and the abstract ``BaseScraper``.

    The last test in this class expects that instantiating ``BaseScraper``
    directly raises ``TypeError``; the other scraper tests therefore build
    instances through ``_make_scraper``, which temporarily clears the
    class's ``__abstractmethods__`` set.
    """

    @staticmethod
    def _make_config(source_name="test"):
        """Return the ``ScraperConfig`` shared by the tests in this class.

        Args:
            source_name: Value for the config's ``source_name`` field.
        """
        return ScraperConfig(
            source_name=source_name,
            brand_name="hvacknowitall",
            data_dir=Path("data"),
            logs_dir=Path("logs"),
            timezone="America/Halifax",
        )

    @staticmethod
    def _make_scraper(config):
        """Instantiate ``BaseScraper`` with its abstract-method check disabled.

        The patch is only needed while the constructor runs; the returned
        instance stays usable after ``__abstractmethods__`` is restored.
        """
        with patch.object(BaseScraper, '__abstractmethods__', set()):
            return BaseScraper(config)

    def test_scraper_config_initialization(self):
        """ScraperConfig exposes each constructor argument as an attribute."""
        # Built explicitly (not via _make_config) because the constructor
        # itself is the unit under test here.
        config = ScraperConfig(
            source_name="test_source",
            brand_name="hvacknowitall",
            data_dir=Path("data"),
            logs_dir=Path("logs"),
            timezone="America/Halifax",
        )

        assert config.source_name == "test_source"
        assert config.brand_name == "hvacknowitall"
        assert config.data_dir == Path("data")
        assert config.logs_dir == Path("logs")
        assert config.timezone == "America/Halifax"

    def test_base_scraper_initialization(self):
        """A new scraper keeps its config and derives state file and logger."""
        config = self._make_config()

        scraper = self._make_scraper(config)

        assert scraper.config == config
        assert scraper.state_file == Path("data") / ".state" / "test_state.json"
        assert scraper.logger is not None

    def test_load_state_creates_new_when_missing(self):
        """load_state returns an empty dict when the state file is absent."""
        config = self._make_config()

        # Path.exists is patched before construction so the constructor and
        # load_state both see "nothing exists".
        with patch('pathlib.Path.exists', return_value=False):
            scraper = self._make_scraper(config)
            state = scraper.load_state()

        assert state == {}

    def test_load_state_reads_existing_file(self):
        """load_state returns the JSON content of an existing state file."""
        config = self._make_config()
        expected_state = {"last_id": "123", "last_update": "2024-01-01"}

        with patch('pathlib.Path.exists', return_value=True):
            with patch('builtins.open', create=True) as mock_open:
                # Whatever is read from the opened file is the serialized
                # expected state.
                file_handle = mock_open.return_value.__enter__.return_value
                file_handle.read.return_value = json.dumps(expected_state)

                scraper = self._make_scraper(config)
                state = scraper.load_state()

        assert state == expected_state

    def test_save_state(self):
        """save_state writes the given state as JSON to the opened file."""
        config = self._make_config()
        state_to_save = {"last_id": "456", "last_update": "2024-01-02"}
        scraper = self._make_scraper(config)

        # Collect every chunk passed to write(); the JSON may arrive in
        # several write calls, so the chunks are joined before parsing.
        written_data = []
        with patch('builtins.open', create=True) as mock_open:
            mock_file = MagicMock()
            mock_file.write.side_effect = written_data.append
            mock_open.return_value.__enter__.return_value = mock_file

            scraper.save_state(state_to_save)

        assert written_data, "save_state did not write anything"
        assert json.loads(''.join(written_data)) == state_to_save

    def test_generate_filename(self):
        """generate_filename combines brand, source and formatted timestamp."""
        config = self._make_config()

        with patch('src.base_scraper.datetime') as mock_datetime:
            # The timestamp is an opaque stub: the test only checks that
            # whatever strftime returns is embedded verbatim in the name.
            mock_dt = MagicMock()
            mock_dt.strftime.return_value = "2024-15-01-T143045"
            mock_datetime.now.return_value = mock_dt

            scraper = self._make_scraper(config)
            filename = scraper.generate_filename()

        assert filename == "hvacknowitall_test_2024-15-01-T143045.md"

    def test_archive_current_file(self):
        """archive_current_file moves the single file reported by glob."""
        config = self._make_config()
        existing_file = Path("data/markdown_current/hvacknowitall_test_old.md")

        with patch('pathlib.Path.glob') as mock_glob, patch('shutil.move') as mock_move:
            mock_glob.return_value = [existing_file]

            scraper = self._make_scraper(config)
            scraper.archive_current_file()

            mock_move.assert_called_once()

    def test_convert_to_markdown(self):
        """convert_to_markdown returns the converter result's text_content."""
        config = self._make_config()

        with patch('src.base_scraper.MarkItDown') as mock_markitdown:
            mock_result = MagicMock()
            mock_result.text_content = "# Converted Content"
            mock_converter = MagicMock()
            mock_converter.convert_stream.return_value = mock_result
            mock_markitdown.return_value = mock_converter

            scraper = self._make_scraper(config)
            result = scraper.convert_to_markdown("<html><body>Test</body></html>")

        assert result == "# Converted Content"

    def test_abstract_methods_must_be_implemented(self):
        """Instantiating BaseScraper directly raises TypeError."""
        config = self._make_config()

        # No __abstractmethods__ patch here: the failure is the expectation.
        with pytest.raises(TypeError):
            BaseScraper(config)