"""Unit tests for ``BaseScraper`` and ``ScraperConfig`` from ``src.base_scraper``."""

import json
from datetime import datetime
from pathlib import Path
from unittest.mock import Mock, patch, MagicMock

import pytest

from src.base_scraper import BaseScraper, ScraperConfig


def _make_config(source_name="test"):
    """Return a ``ScraperConfig`` filled with the fixture values the tests share.

    Args:
        source_name: Value passed as ``source_name``; every other field uses
            the fixed values below.
    """
    return ScraperConfig(
        source_name=source_name,
        brand_name="hvacknowitall",
        data_dir=Path("data"),
        logs_dir=Path("logs"),
        timezone="America/Halifax",
    )


def _instantiable():
    """Return a patcher that replaces ``BaseScraper.__abstractmethods__`` with an empty set.

    The tests enter this patch before calling ``BaseScraper(config)`` directly;
    without it, instantiation is expected to raise ``TypeError`` (see
    ``test_abstract_methods_must_be_implemented``).
    """
    return patch.object(BaseScraper, "__abstractmethods__", set())


class TestBaseScraper:
    """Tests covering configuration, state persistence, file naming, archiving
    and markdown conversion on ``BaseScraper``."""

    def test_scraper_config_initialization(self):
        """Every keyword given to ``ScraperConfig`` is readable back as an attribute."""
        expected_fields = {
            "source_name": "test_source",
            "brand_name": "hvacknowitall",
            "data_dir": Path("data"),
            "logs_dir": Path("logs"),
            "timezone": "America/Halifax",
        }

        config = ScraperConfig(**expected_fields)

        for field_name, expected_value in expected_fields.items():
            assert getattr(config, field_name) == expected_value

    def test_base_scraper_initialization(self):
        """A new scraper keeps its config, derives the state file path and has a logger."""
        config = _make_config()

        with _instantiable():
            scraper = BaseScraper(config)

            assert scraper.config == config
            assert scraper.state_file == Path("data", ".state", "test_state.json")
            assert scraper.logger is not None

    def test_load_state_creates_new_when_missing(self):
        """``load_state`` returns an empty dict when ``Path.exists`` reports False."""
        config = _make_config()

        with _instantiable(), patch("pathlib.Path.exists", return_value=False):
            scraper = BaseScraper(config)
            loaded = scraper.load_state()

            assert loaded == {}

    def test_load_state_reads_existing_file(self):
        """``load_state`` returns the JSON content served by the mocked ``open``."""
        config = _make_config()
        expected_state = {"last_id": "123", "last_update": "2024-01-01"}
        serialized = json.dumps(expected_state)

        with _instantiable(), \
                patch("pathlib.Path.exists", return_value=True), \
                patch("builtins.open", create=True) as fake_open:
            # The object yielded by ``with open(...)`` hands back the JSON text.
            file_handle = fake_open.return_value.__enter__.return_value
            file_handle.read.return_value = serialized

            scraper = BaseScraper(config)
            loaded = scraper.load_state()

            assert loaded == expected_state

    def test_save_state(self):
        """``save_state`` writes JSON that decodes back to the state it was given."""
        config = _make_config()
        state_to_save = {"last_id": "456", "last_update": "2024-01-02"}

        with _instantiable():
            # The scraper is built before ``open`` is patched, as in the
            # original ordering of this test.
            scraper = BaseScraper(config)

            with patch("builtins.open", create=True) as fake_open:
                # Collect every chunk handed to ``write`` so the full payload
                # can be reassembled afterwards.
                chunks = []
                fake_file = MagicMock()
                fake_file.write.side_effect = lambda data: chunks.append(data)
                fake_open.return_value.__enter__.return_value = fake_file

                scraper.save_state(state_to_save)

                # Something must have been written at all.
                assert len(chunks) > 0

                # The concatenated chunks must form the saved state as JSON.
                payload = "".join(chunks)
                assert json.loads(payload) == state_to_save

    def test_generate_filename(self):
        """``generate_filename`` joins brand, source and the formatted timestamp."""
        config = _make_config()

        with _instantiable(), patch("src.base_scraper.datetime") as fake_datetime:
            # Whatever ``datetime.now(...)`` returns formats to a fixed stamp.
            fake_now = fake_datetime.now.return_value
            fake_now.strftime.return_value = "2024-15-01-T143045"

            scraper = BaseScraper(config)
            produced = scraper.generate_filename()

            assert produced == "hvacknowitall_test_2024-15-01-T143045.md"

    def test_archive_current_file(self):
        """``archive_current_file`` calls ``shutil.move`` once for the single globbed file."""
        config = _make_config()
        existing_files = [Path("data/markdown_current/hvacknowitall_test_old.md")]

        with _instantiable(), \
                patch("pathlib.Path.glob", return_value=existing_files), \
                patch("shutil.move") as fake_move:
            scraper = BaseScraper(config)
            scraper.archive_current_file()

            fake_move.assert_called_once()

    def test_convert_to_markdown(self):
        """``convert_to_markdown`` returns ``text_content`` of the converter's result."""
        config = _make_config()

        with _instantiable(), patch("src.base_scraper.MarkItDown") as fake_markitdown:
            # MarkItDown() -> converter; converter.convert_stream(...) -> result.
            converter = fake_markitdown.return_value
            converter.convert_stream.return_value.text_content = "# Converted Content"

            scraper = BaseScraper(config)
            converted = scraper.convert_to_markdown("Test")

            assert converted == "# Converted Content"

    def test_abstract_methods_must_be_implemented(self):
        """Instantiating ``BaseScraper`` without the patch raises ``TypeError``."""
        config = _make_config()

        with pytest.raises(TypeError):
            BaseScraper(config)