#!/usr/bin/env python3
"""
Consolidate multiple markdown files per source into single current files.
Combines backlog data and incremental updates into one source of truth.
Follows project specification naming: hvacnkowitall_<source>_<timestamp>.md
"""

import sys
from pathlib import Path

sys.path.insert(0, str(Path(__file__).parent))

from datetime import datetime
import pytz
from typing import Dict, List
import logging

# Ensure the log directory exists before the FileHandler tries to open its file
Path('logs').mkdir(parents=True, exist_ok=True)

# Set up logging
logging.basicConfig(
    level=logging.INFO,
    format='%(asctime)s - %(name)s - %(levelname)s - %(message)s',
    handlers=[
        logging.FileHandler('logs/consolidation.log'),
        logging.StreamHandler()
    ]
)
logger = logging.getLogger('consolidator')


def get_atlantic_timestamp() -> str:
    """Get current timestamp in Atlantic timezone."""
    tz = pytz.timezone('America/Halifax')
    return datetime.now(tz).strftime('%Y-%m-%dT%H%M%S')


def parse_markdown_sections(content: str) -> List[Dict]:
    """Parse markdown content into sections by ID."""
    sections = []

    # Split by ID headers
    parts = content.split('# ID: ')

    for part in parts[1:]:  # Skip first empty part
        if not part.strip():
            continue

        lines = part.strip().split('\n')
        section_id = lines[0].strip()

        # Get the full section content
        section_content = f"# ID: {section_id}\n" + '\n'.join(lines[1:])

        sections.append({
            'id': section_id,
            'content': section_content
        })

    return sections


def consolidate_source_files(source_name: str) -> bool:
    """Consolidate all files for a specific source into one current file."""
    logger.info(f"Consolidating {source_name} files...")

    current_dir = Path('data/markdown_current')
    archives_dir = Path('data/markdown_archives')

    # Find all files for this source
    pattern = f"hvacnkowitall_{source_name}_*.md"
    current_files = list(current_dir.glob(pattern))

    # Also check for files with different naming (like captions files)
    alt_patterns = [
        f"*{source_name}*.md",
        f"hvacnkowitall_{source_name.lower()}_*.md"
    ]
    for alt_pattern in alt_patterns:
        current_files.extend(current_dir.glob(alt_pattern))

    # Remove duplicates
    current_files = list(set(current_files))

    if not current_files:
        logger.warning(f"No files found for source: {source_name}")
        return False

    logger.info(f"Found {len(current_files)} files for {source_name}: {[f.name for f in current_files]}")

    # Track unique sections by ID
    sections_by_id: Dict[str, Dict] = {}
    all_sections = []

    # Process each file
    for file_path in current_files:
        logger.info(f"Processing {file_path.name}...")

        try:
            content = file_path.read_text(encoding='utf-8')
            sections = parse_markdown_sections(content)
            logger.info(f"  Found {len(sections)} sections")

            # Add sections, preferring newer data
            for section in sections:
                section_id = section['id']

                # If we haven't seen this ID, add it
                if section_id not in sections_by_id:
                    sections_by_id[section_id] = section
                    all_sections.append(section)
                else:
                    # Check if this version has more content (like captions)
                    old_content = sections_by_id[section_id]['content']
                    new_content = section['content']

                    # Prefer content with captions/more detail
                    if ('Caption Status:' in new_content and 'Caption Status:' not in old_content) or \
                       len(new_content) > len(old_content):
                        logger.info(f"  Updating section {section_id} with more detailed content")
                        # Update in place
                        for i, existing in enumerate(all_sections):
                            if existing['id'] == section_id:
                                all_sections[i] = section
                                sections_by_id[section_id] = section
                                break

        except Exception as e:
            logger.error(f"Error processing {file_path}: {e}")
            continue

    if not all_sections:
        logger.warning(f"No sections found for {source_name}")
        return False

    # Create consolidated content
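    # Sections are written out in ID order with a blank line between them, so the
    # consolidated file keeps the same "# ID: ..." layout that
    # parse_markdown_sections() expects on the next run.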
    consolidated_content = []

    # Sort sections by ID for consistency
    all_sections.sort(key=lambda x: x['id'])

    for section in all_sections:
        consolidated_content.append(section['content'])
        consolidated_content.append("")  # Add separator

    # Generate new filename following project specification
    timestamp = get_atlantic_timestamp()
    new_filename = f"hvacnkowitall_{source_name}_{timestamp}.md"
    new_file_path = current_dir / new_filename

    # Save consolidated file
    final_content = '\n'.join(consolidated_content)
    new_file_path.write_text(final_content, encoding='utf-8')

    logger.info(f"Created consolidated file: {new_filename}")
    logger.info(f"  Total sections: {len(all_sections)}")
    logger.info(f"  File size: {len(final_content):,} characters")

    # Archive old files
    archive_source_dir = archives_dir / source_name
    archive_source_dir.mkdir(parents=True, exist_ok=True)

    archived_count = 0
    for old_file in current_files:
        if old_file.name != new_filename:  # Don't archive the new file
            try:
                archive_path = archive_source_dir / old_file.name
                old_file.rename(archive_path)
                archived_count += 1
                logger.info(f"  Archived: {old_file.name}")
            except Exception as e:
                logger.error(f"Error archiving {old_file.name}: {e}")

    logger.info(f"Archived {archived_count} old files for {source_name}")

    # Create copy in archives as well
    archive_current_path = archive_source_dir / new_filename
    archive_current_path.write_text(final_content, encoding='utf-8')

    return True


def main():
    """Main consolidation function."""
    logger.info("=" * 60)
    logger.info("CONSOLIDATING CURRENT MARKDOWN FILES")
    logger.info("=" * 60)

    # Create directories if needed
    Path('data/markdown_current').mkdir(parents=True, exist_ok=True)
    Path('data/markdown_archives').mkdir(parents=True, exist_ok=True)
    Path('logs').mkdir(parents=True, exist_ok=True)

    # Define sources to consolidate
    sources = ['YouTube', 'MailChimp', 'Instagram', 'TikTok', 'Podcast']

    consolidated = []
    failed = []

    for source in sources:
        logger.info(f"\n{'-' * 40}")
        try:
            if consolidate_source_files(source):
                consolidated.append(source)
            else:
                failed.append(source)
        except Exception as e:
            logger.error(f"Failed to consolidate {source}: {e}")
            failed.append(source)

    logger.info(f"\n{'=' * 60}")
    logger.info("CONSOLIDATION SUMMARY")
    logger.info(f"{'=' * 60}")
    logger.info(f"Successfully consolidated: {consolidated}")
    logger.info(f"Failed/No data: {failed}")

    # List final current files
    current_files = list(Path('data/markdown_current').glob('*.md'))
    logger.info("\nFinal current files:")
    for file in sorted(current_files):
        size = file.stat().st_size
        logger.info(f"  {file.name} ({size:,} bytes)")


if __name__ == "__main__":
    main()
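
# Usage note (a sketch of the assumed project layout, not part of the original
# spec): run this script from the project root so the relative data/ and logs/
# paths resolve. For each source (YouTube, MailChimp, Instagram, TikTok,
# Podcast) it merges all matching files in data/markdown_current/ into a single
# hvacnkowitall_<source>_<timestamp>.md, moves the superseded files into
# data/markdown_archives/<source>/, and keeps a copy of the new consolidated
# file there as well.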