hvac-kia-content/verify_processing.py
Ben Reed daab901e35 refactor: Update naming convention from hvacknowitall to hkia
Major Changes:
- Updated all code references from hvacknowitall/hvacnkowitall to hkia
- Renamed all existing markdown files to use hkia_ prefix
- Updated configuration files, scrapers, and production scripts
- Modified systemd service descriptions to use HKIA
- Changed NAS sync path to /mnt/nas/hkia

Files Updated:
- 20+ source files updated with new naming convention
- 34 markdown files renamed to hkia_* format
- All ScraperConfig brand_name parameters now use 'hkia'
- Documentation updated to reflect new naming

Rationale:
- Shorter, cleaner filenames
- Consistent branding across all outputs
- Easier to type and reference
- Maintains same functionality with improved naming

Next Steps:
- Deploy updated services to production
- Update any external references to old naming
- Monitor scrapers to ensure proper operation

🤖 Generated with [Claude Code](https://claude.ai/code)

Co-Authored-By: Claude <noreply@anthropic.com>
2025-08-19 13:35:23 -03:00

107 lines
No EOL
3.4 KiB
Python

#!/usr/bin/env python3
"""
Verify the processing logic doesn't have bugs
"""
import re
# Boilerplate stripped from newsletter bodies; simulates the cleaning patterns
# used by the scraper.  Most patterns also consume the rest of their line and
# its trailing newline so no empty line is left behind.
#
# Runs of blank lines are deliberately NOT listed here: deleting them outright
# would glue neighbouring paragraphs together ("a\n\n\nb" -> "ab").  They are
# collapsed to a single blank line in _clean_content instead.
# NOTE(review): if the scraper's own pattern list contains r'\n{3,}', it has
# the same paragraph-merging bug -- verify against the scraper.
_PATTERNS_TO_REMOVE = (
    # Header patterns
    r'VIEW THIS EMAIL IN BROWSER[^\n]*\n?',
    r'\(\*\|ARCHIVE\|\*\)[^\n]*\n?',
    r'https://hvacknowitall\.com/?\n?',
    # Footer patterns
    r'Newsletter produced by Teal Maker[^\n]*\n?',
    r'https://tealmaker\.com[^\n]*\n?',
    r'https://open\.spotify\.com[^\n]*\n?',
    r'https://www\.instagram\.com[^\n]*\n?',
    r'https://www\.youtube\.com[^\n]*\n?',
    r'https://www\.facebook\.com[^\n]*\n?',
    r'https://x\.com[^\n]*\n?',
    r'https://www\.linkedin\.com[^\n]*\n?',
    r'Copyright \(C\)[^\n]*\n?',
    r'\*\|CURRENT_YEAR\|\*[^\n]*\n?',
    r'\*\|LIST:COMPANY\|\*[^\n]*\n?',
    r'\*\|IFNOT:ARCHIVE_PAGE\|\*[^\n]*\*\|END:IF\|\*\n?',
    r'\*\|LIST:DESCRIPTION\|\*[^\n]*\n?',
    r'\*\|LIST_ADDRESS\|\*[^\n]*\n?',
    r'Our mailing address is:[^\n]*\n?',
    r'Want to change how you receive these emails\?[^\n]*\n?',
    r'You can update your preferences[^\n]*\n?',
    r'\(\*\|UPDATE_PROFILE\|\*\)[^\n]*\n?',
    r'or unsubscribe[^\n]*\n?',
    r'\(\*\|UNSUB\|\*\)[^\n]*\n?',
)

# Compiled once, with the same flags the cleaning loop has always used.
_BOILERPLATE_RES = tuple(
    re.compile(pattern, re.MULTILINE | re.IGNORECASE)
    for pattern in _PATTERNS_TO_REMOVE
)


def _clean_content(content):
    """Return *content* with newsletter header/footer boilerplate removed.

    Falsy input (``None`` or ``""``) is returned unchanged.  Otherwise every
    match of every boilerplate pattern is deleted (case-insensitively), runs
    of three or more newlines are collapsed to exactly two, and leading and
    trailing whitespace is stripped.
    """
    if not content:
        return content
    cleaned = content
    for boilerplate_re in _BOILERPLATE_RES:
        cleaned = boilerplate_re.sub('', cleaned)
    # Collapse (rather than delete) blank-line runs so paragraphs stay apart.
    cleaned = re.sub(r'\n{3,}', '\n\n', cleaned)
    return cleaned.strip()


def test_clean_content():
    """Check _clean_content against representative newsletter inputs.

    For each case the input, the cleaned output and its length are printed
    for inspection, and the output is compared with the expected text.

    Raises:
        AssertionError: if any cleaned output differs from its expected value.
    """
    header = (
        "VIEW THIS EMAIL IN BROWSER (*|ARCHIVE|*)\n"
        "https://hvacknowitall.com/\n"
    )
    newsletter_body = (
        "7 August, 2025\n"
        "I know what you're thinking - \"Is this guy seriously talking about "
        "heating maintenance while I'm still sweating through AC calls?\"\n"
        "Yes, I am.\n"
        "This week's blog articles provide the complete blueprint."
    )

    # (input, expected output, description)
    test_cases = [
        # Empty content
        ("", "", "Empty content should return empty"),
        # None content
        (None, None, "None content should return None"),
        # Typical newsletter content
        (
            header + newsletter_body,
            newsletter_body,
            "Real newsletter content should be mostly preserved",
        ),
        # Only header/footer content
        (
            header
            + "Newsletter produced by Teal Maker\n"
            + "https://tealmaker.com",
            "",
            "Only header/footer should be cleaned to empty or near-empty",
        ),
        # Mixed content
        (
            "Some real content here about HVAC systems.\n"
            "https://hvacknowitall.com/\n"
            "More real content about heating and cooling.",
            "Some real content here about HVAC systems.\n"
            "More real content about heating and cooling.",
            "Mixed content should preserve the real parts",
        ),
        # Regression: blank-line runs must be collapsed, not deleted
        (
            "First paragraph.\n\n\n\nSecond paragraph.",
            "First paragraph.\n\nSecond paragraph.",
            "Runs of blank lines should collapse to one blank line",
        ),
    ]

    print("Testing _clean_content method:")
    print("=" * 60)
    for i, (test_input, expected, description) in enumerate(test_cases, 1):
        print(f"\nTest {i}: {description}")
        print(f"Input: {repr(test_input)}")
        result = _clean_content(test_input)
        print(f"Output: {repr(result)}")
        print(f"Output length: {len(result) if result else 0}")
        assert result == expected, (
            f"Test {i} ({description}): "
            f"expected {expected!r}, got {result!r}"
        )
# Run the checks only when this file is executed directly as a script,
# not when it is imported as a module.
if __name__ == "__main__":
    test_clean_content()