#!/usr/bin/env python3
"""
Verify the processing logic doesn't have bugs
"""

import re
from typing import Optional

# Flags applied to every removal pattern (kept as in the simulated scraper).
_REMOVAL_FLAGS = re.MULTILINE | re.IGNORECASE

# Simulate the cleaning patterns from the scraper.
#
# Each pattern is replaced with the empty string.  A r'\n{3,}' entry used to
# live at the end of this list; because every entry is substituted with '',
# it deleted paragraph breaks outright (gluing paragraphs together) and made
# the dedicated "collapse to a double newline" step in _clean_content a
# no-op.  Newline normalisation is therefore done only in _clean_content.
# NOTE(review): if the scraper's own list still contains r'\n{3,}', it has
# the same defect -- confirm against the scraper source.
# NOTE(review): the hvacknowitall.com pattern is not anchored to the end of
# the URL, so it would also strip the prefix of a longer URL on that domain;
# left unchanged here -- confirm whether that is intended.
_PATTERNS_TO_REMOVE = [
    re.compile(pattern, _REMOVAL_FLAGS)
    for pattern in (
        # Header patterns
        r'VIEW THIS EMAIL IN BROWSER[^\n]*\n?',
        r'\(\*\|ARCHIVE\|\*\)[^\n]*\n?',
        r'https://hvacknowitall\.com/?\n?',
        # Footer patterns
        r'Newsletter produced by Teal Maker[^\n]*\n?',
        r'https://tealmaker\.com[^\n]*\n?',
        r'https://open\.spotify\.com[^\n]*\n?',
        r'https://www\.instagram\.com[^\n]*\n?',
        r'https://www\.youtube\.com[^\n]*\n?',
        r'https://www\.facebook\.com[^\n]*\n?',
        r'https://x\.com[^\n]*\n?',
        r'https://www\.linkedin\.com[^\n]*\n?',
        r'Copyright \(C\)[^\n]*\n?',
        r'\*\|CURRENT_YEAR\|\*[^\n]*\n?',
        r'\*\|LIST:COMPANY\|\*[^\n]*\n?',
        r'\*\|IFNOT:ARCHIVE_PAGE\|\*[^\n]*\*\|END:IF\|\*\n?',
        r'\*\|LIST:DESCRIPTION\|\*[^\n]*\n?',
        r'\*\|LIST_ADDRESS\|\*[^\n]*\n?',
        r'Our mailing address is:[^\n]*\n?',
        r'Want to change how you receive these emails\?[^\n]*\n?',
        r'You can update your preferences[^\n]*\n?',
        r'\(\*\|UPDATE_PROFILE\|\*\)[^\n]*\n?',
        r'or unsubscribe[^\n]*\n?',
        r'\(\*\|UNSUB\|\*\)[^\n]*\n?',
    )
]

# Three or more consecutive newlines, i.e. more than one blank line.
_EXCESS_NEWLINES = re.compile(r'\n{3,}')


def _clean_content(content: Optional[str]) -> Optional[str]:
    """Strip newsletter header/footer boilerplate from *content*.

    Args:
        content: Raw newsletter text, or a falsy value (``None`` / ``""``).

    Returns:
        The falsy input unchanged, otherwise the text with every match of
        the removal patterns deleted, runs of three or more newlines
        collapsed to exactly two, and surrounding whitespace trimmed.
    """
    if not content:
        return content

    cleaned = content
    for pattern in _PATTERNS_TO_REMOVE:
        cleaned = pattern.sub('', cleaned)

    # Clean up multiple newlines (replace with double newline)
    cleaned = _EXCESS_NEWLINES.sub('\n\n', cleaned)

    # Trim whitespace
    return cleaned.strip()


def test_clean_content():
    """Test the _clean_content method with various inputs.

    Prints the input and output of every case and asserts that the output
    equals the expected cleaned text, so a regression fails loudly instead
    of only changing what is printed.
    """
    # Each case is (input, expected output, description).
    test_cases = [
        # Empty content
        ("", "", "Empty content should return empty"),

        # None content
        (None, None, "None content should return None"),

        # Typical newsletter content
        (
            """VIEW THIS EMAIL IN BROWSER (*|ARCHIVE|*)
https://hvacknowitall.com/

7 August, 2025

I know what you're thinking - "Is this guy seriously talking about heating maintenance while I'm still sweating through AC calls?"

Yes, I am.

This week's blog articles provide the complete blueprint.""",
            "7 August, 2025\n\n"
            "I know what you're thinking - \"Is this guy seriously talking about "
            "heating maintenance while I'm still sweating through AC calls?\"\n\n"
            "Yes, I am.\n\n"
            "This week's blog articles provide the complete blueprint.",
            "Real newsletter content should be mostly preserved",
        ),

        # Only header/footer content
        (
            """VIEW THIS EMAIL IN BROWSER (*|ARCHIVE|*)
https://hvacknowitall.com/

Newsletter produced by Teal Maker
https://tealmaker.com""",
            "",
            "Only header/footer should be cleaned to empty or near-empty",
        ),

        # Mixed content: removing the URL line leaves three newlines in a
        # row, which must collapse to one paragraph break, not to nothing.
        (
            """Some real content here about HVAC systems.

https://hvacknowitall.com/

More real content about heating and cooling.""",
            "Some real content here about HVAC systems.\n\n"
            "More real content about heating and cooling.",
            "Mixed content should preserve the real parts",
        ),

        # Excess blank lines in otherwise clean content
        (
            "First paragraph.\n\n\n\n\nSecond paragraph.",
            "First paragraph.\n\nSecond paragraph.",
            "Extra blank lines should collapse to a single paragraph break",
        ),
    ]

    print("Testing _clean_content method:")
    print("=" * 60)

    for i, (test_input, expected, description) in enumerate(test_cases, 1):
        print(f"\nTest {i}: {description}")
        print(f"Input: {repr(test_input)}")

        result = _clean_content(test_input)
        print(f"Output: {repr(result)}")
        print(f"Output length: {len(result) if result else 0}")

        assert result == expected, (
            f"Test {i} ({description}): "
            f"expected {expected!r}, got {result!r}"
        )


if __name__ == "__main__":
    test_clean_content()