Major Changes: - Updated all code references from hvacknowitall/hvacnkowitall to hkia - Renamed all existing markdown files to use hkia_ prefix - Updated configuration files, scrapers, and production scripts - Modified systemd service descriptions to use HKIA - Changed NAS sync path to /mnt/nas/hkia Files Updated: - 20+ source files updated with new naming convention - 34 markdown files renamed to hkia_* format - All ScraperConfig brand_name parameters now use 'hkia' - Documentation updated to reflect new naming Rationale: - Shorter, cleaner filenames - Consistent branding across all outputs - Easier to type and reference - Maintains same functionality with improved naming Next Steps: - Deploy updated services to production - Update any external references to old naming - Monitor scrapers to ensure proper operation 🤖 Generated with [Claude Code](https://claude.ai/code) Co-Authored-By: Claude <noreply@anthropic.com>
107 lines
No EOL
3.4 KiB
Python
107 lines
No EOL
3.4 KiB
Python
#!/usr/bin/env python3
|
|
"""
|
|
Verify the processing logic doesn't have bugs
|
|
"""
|
|
|
|
import re
|
|
|
|
def test_clean_content():
    """Exercise a local copy of the newsletter ``_clean_content`` logic.

    A stand-in cleaning function is built from a list of boilerplate regexes,
    run against representative newsletter snippets, and every input/output
    pair is printed for inspection.  Each result is also compared with its
    expected value, so a regression fails the test instead of merely changing
    the printed output.

    Raises:
        AssertionError: If any cleaned result differs from its expected value.
    """
    # Simulate the cleaning patterns from the scraper
    patterns_to_remove = [
        # Header patterns
        r'VIEW THIS EMAIL IN BROWSER[^\n]*\n?',
        r'\(\*\|ARCHIVE\|\*\)[^\n]*\n?',
        r'https://hvacknowitall\.com/?\n?',

        # Footer patterns
        r'Newsletter produced by Teal Maker[^\n]*\n?',
        r'https://tealmaker\.com[^\n]*\n?',
        r'https://open\.spotify\.com[^\n]*\n?',
        r'https://www\.instagram\.com[^\n]*\n?',
        r'https://www\.youtube\.com[^\n]*\n?',
        r'https://www\.facebook\.com[^\n]*\n?',
        r'https://x\.com[^\n]*\n?',
        r'https://www\.linkedin\.com[^\n]*\n?',
        r'Copyright \(C\)[^\n]*\n?',
        r'\*\|CURRENT_YEAR\|\*[^\n]*\n?',
        r'\*\|LIST:COMPANY\|\*[^\n]*\n?',
        r'\*\|IFNOT:ARCHIVE_PAGE\|\*[^\n]*\*\|END:IF\|\*\n?',
        r'\*\|LIST:DESCRIPTION\|\*[^\n]*\n?',
        r'\*\|LIST_ADDRESS\|\*[^\n]*\n?',
        r'Our mailing address is:[^\n]*\n?',
        r'Want to change how you receive these emails\?[^\n]*\n?',
        r'You can update your preferences[^\n]*\n?',
        r'\(\*\|UPDATE_PROFILE\|\*\)[^\n]*\n?',
        r'or unsubscribe[^\n]*\n?',
        r'\(\*\|UNSUB\|\*\)[^\n]*\n?',
    ]
    # r'\n{3,}' must NOT be in the removal list: substituting it with '' glues
    # neighbouring paragraphs together and leaves the collapse step below with
    # nothing to do.  Blank-line runs are handled only by that collapse step.
    # NOTE(review): if the real scraper still lists r'\n{3,}' among its
    # removal patterns it has the same paragraph-gluing bug -- confirm.

    # Compile once; the same flags are applied to every boilerplate pattern.
    boilerplate_regexes = [
        re.compile(pattern, re.MULTILINE | re.IGNORECASE)
        for pattern in patterns_to_remove
    ]

    def _clean_content(content):
        """Strip boilerplate, collapse blank-line runs, and trim whitespace.

        Falsy input (``""`` or ``None``) is returned unchanged.
        """
        if not content:
            return content

        cleaned = content
        for regex in boilerplate_regexes:
            cleaned = regex.sub('', cleaned)

        # Clean up multiple newlines (replace with double newline)
        cleaned = re.sub(r'\n{3,}', '\n\n', cleaned)

        # Trim whitespace
        return cleaned.strip()

    # Test cases: (input, expected output, description)
    test_cases = [
        # Empty content
        ("", "", "Empty content should return empty"),

        # None content
        (None, None, "None content should return None"),

        # Typical newsletter content
        (
            """VIEW THIS EMAIL IN BROWSER (*|ARCHIVE|*)
https://hvacknowitall.com/

7 August, 2025

I know what you're thinking - "Is this guy seriously talking about heating maintenance while I'm still sweating through AC calls?"

Yes, I am.

This week's blog articles provide the complete blueprint.""",
            """7 August, 2025

I know what you're thinking - "Is this guy seriously talking about heating maintenance while I'm still sweating through AC calls?"

Yes, I am.

This week's blog articles provide the complete blueprint.""",
            "Real newsletter content should be mostly preserved",
        ),

        # Only header/footer content
        (
            """VIEW THIS EMAIL IN BROWSER (*|ARCHIVE|*)
https://hvacknowitall.com/

Newsletter produced by Teal Maker
https://tealmaker.com""",
            "",
            "Only header/footer should be cleaned to empty or near-empty",
        ),

        # Mixed content: removing the URL line leaves three newlines in a
        # row, which must collapse to one blank line rather than vanish.
        (
            """Some real content here about HVAC systems.

https://hvacknowitall.com/

More real content about heating and cooling.""",
            """Some real content here about HVAC systems.

More real content about heating and cooling.""",
            "Mixed content should preserve the real parts",
        ),
    ]

    print("Testing _clean_content method:")
    print("=" * 60)

    for i, (test_input, expected, description) in enumerate(test_cases, 1):
        print(f"\nTest {i}: {description}")
        print(f"Input: {repr(test_input)}")

        result = _clean_content(test_input)
        print(f"Output: {repr(result)}")
        print(f"Output length: {len(result) if result else 0}")

        assert result == expected, (
            f"Test {i} ({description}): expected {expected!r}, got {result!r}"
        )
|
|
|
|
# Script entry point: run the cleaning checks only when this file is
# executed directly, not when it is imported.
if __name__ == "__main__":
    test_clean_content()