Major improvements:
- Add CumulativeMarkdownManager for intelligent content merging
- Implement YouTube Data API v3 integration with caption support
- Add MailChimp API integration with content cleaning
- Create single source-of-truth files that grow with updates
- Smart merging: updates existing entries with better data
- Properly combines backlog + incremental daily updates

Features:
- 179/444 YouTube videos now have captions (40.3%)
- MailChimp content cleaned of headers/footers
- All sources consolidated to single files
- Archive management with timestamped versions
- Test suite and documentation included

🤖 Generated with [Claude Code](https://claude.ai/code)

Co-Authored-By: Claude <noreply@anthropic.com>
#!/usr/bin/env python3
"""
Consolidate multiple markdown files per source into single current files.

Combines backlog data and incremental updates into one source of truth.
Follows project specification naming: hvacnkowitall_<source>_<dateTime>.md
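for example: hvacnkowitall_YouTube_2025-08-15T093000.md (timestamp in Atlantic time)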
"""

import sys
from pathlib import Path

sys.path.insert(0, str(Path(__file__).parent))

from datetime import datetime
import pytz
import re
from typing import Dict, List, Set
import logging

# Ensure the log directory exists before the FileHandler is created at import time
Path('logs').mkdir(parents=True, exist_ok=True)

# Set up logging
logging.basicConfig(
    level=logging.INFO,
    format='%(asctime)s - %(name)s - %(levelname)s - %(message)s',
    handlers=[
        logging.FileHandler('logs/consolidation.log'),
        logging.StreamHandler()
    ]
)
logger = logging.getLogger('consolidator')


def get_atlantic_timestamp() -> str:
    """Get current timestamp in Atlantic timezone."""
    tz = pytz.timezone('America/Halifax')
    return datetime.now(tz).strftime('%Y-%m-%dT%H%M%S')


def parse_markdown_sections(content: str) -> List[Dict]:
    """Parse markdown content into sections by ID."""
    sections = []

    # Split by ID headers
    parts = content.split('# ID: ')

    for part in parts[1:]:  # Skip any text before the first ID header
        if not part.strip():
            continue

        lines = part.strip().split('\n')
        section_id = lines[0].strip()

        # Rebuild the full section content, including its header line
        section_content = f"# ID: {section_id}\n" + '\n'.join(lines[1:])

        sections.append({
            'id': section_id,
            'content': section_content
        })

    return sections


def consolidate_source_files(source_name: str) -> bool:
    """Consolidate all files for a specific source into one current file."""
    logger.info(f"Consolidating {source_name} files...")

    current_dir = Path('data/markdown_current')
    archives_dir = Path('data/markdown_archives')

    # Find all files for this source
    pattern = f"hvacnkowitall_{source_name}_*.md"
    current_files = list(current_dir.glob(pattern))

    # Also check for files with different naming (like captions files)
    alt_patterns = [
        f"*{source_name}*.md",
        f"hvacnkowitall_{source_name.lower()}_*.md"
    ]

    for alt_pattern in alt_patterns:
        current_files.extend(current_dir.glob(alt_pattern))

    # Remove duplicates
    current_files = list(set(current_files))

    if not current_files:
        logger.warning(f"No files found for source: {source_name}")
        return False

    logger.info(f"Found {len(current_files)} files for {source_name}: {[f.name for f in current_files]}")

    # Track unique sections by ID
    sections_by_id: Dict[str, Dict] = {}
    all_sections = []

    # Process each file
    for file_path in current_files:
        logger.info(f"Processing {file_path.name}...")

        try:
            content = file_path.read_text(encoding='utf-8')
            sections = parse_markdown_sections(content)

            logger.info(f"  Found {len(sections)} sections")

            # Add sections, preferring newer data
            for section in sections:
                section_id = section['id']

                # If we haven't seen this ID, add it
                if section_id not in sections_by_id:
                    sections_by_id[section_id] = section
                    all_sections.append(section)
                else:
                    # Check if this version has more content (like captions)
                    old_content = sections_by_id[section_id]['content']
                    new_content = section['content']

                    # Prefer content with captions/more detail
                    if ('Caption Status:' in new_content and 'Caption Status:' not in old_content) or \
                            len(new_content) > len(old_content):
                        logger.info(f"  Updating section {section_id} with more detailed content")
                        # Update in place
                        for i, existing in enumerate(all_sections):
                            if existing['id'] == section_id:
                                all_sections[i] = section
                                sections_by_id[section_id] = section
                                break

        except Exception as e:
            logger.error(f"Error processing {file_path}: {e}")
            continue

    if not all_sections:
        logger.warning(f"No sections found for {source_name}")
        return False

    # Create consolidated content
    consolidated_content = []

    # Sort sections by ID for consistency
    all_sections.sort(key=lambda x: x['id'])

    for section in all_sections:
        consolidated_content.append(section['content'])
        consolidated_content.append("")  # Add separator

    # Generate new filename following project specification
    timestamp = get_atlantic_timestamp()
    new_filename = f"hvacnkowitall_{source_name}_{timestamp}.md"
    new_file_path = current_dir / new_filename

    # Save consolidated file
    final_content = '\n'.join(consolidated_content)
    new_file_path.write_text(final_content, encoding='utf-8')

    logger.info(f"Created consolidated file: {new_filename}")
    logger.info(f"  Total sections: {len(all_sections)}")
    logger.info(f"  File size: {len(final_content):,} characters")

    # Archive old files
    archive_source_dir = archives_dir / source_name
    archive_source_dir.mkdir(parents=True, exist_ok=True)

    archived_count = 0
    for old_file in current_files:
        if old_file.name != new_filename:  # Don't archive the new file
            try:
                archive_path = archive_source_dir / old_file.name
                old_file.rename(archive_path)
                archived_count += 1
                logger.info(f"  Archived: {old_file.name}")
            except Exception as e:
                logger.error(f"Error archiving {old_file.name}: {e}")

    logger.info(f"Archived {archived_count} old files for {source_name}")

    # Create a copy in the archives as well
    archive_current_path = archive_source_dir / new_filename
    archive_current_path.write_text(final_content, encoding='utf-8')

    return True


def main():
    """Main consolidation function."""
    logger.info("=" * 60)
    logger.info("CONSOLIDATING CURRENT MARKDOWN FILES")
    logger.info("=" * 60)

    # Create directories if needed
    Path('data/markdown_current').mkdir(parents=True, exist_ok=True)
    Path('data/markdown_archives').mkdir(parents=True, exist_ok=True)
    Path('logs').mkdir(parents=True, exist_ok=True)

    # Define sources to consolidate
    sources = ['YouTube', 'MailChimp', 'Instagram', 'TikTok', 'Podcast']

    consolidated = []
    failed = []

    for source in sources:
        logger.info(f"\n{'-' * 40}")
        try:
            if consolidate_source_files(source):
                consolidated.append(source)
            else:
                failed.append(source)
        except Exception as e:
            logger.error(f"Failed to consolidate {source}: {e}")
            failed.append(source)

    logger.info(f"\n{'=' * 60}")
    logger.info("CONSOLIDATION SUMMARY")
    logger.info(f"{'=' * 60}")
    logger.info(f"Successfully consolidated: {consolidated}")
    logger.info(f"Failed/No data: {failed}")

    # List final current files
    current_files = list(Path('data/markdown_current').glob('*.md'))
    logger.info(f"\nFinal current files:")
    for file in sorted(current_files):
        size = file.stat().st_size
        logger.info(f"  {file.name} ({size:,} bytes)")


if __name__ == "__main__":
    main()