import { JSDOM } from 'jsdom';
import fs from 'fs/promises';
import path from 'path';

/** Switches controlling which parts of an HTML document are stripped. */
export interface HtmlDumpOptions {
  /** Remove every `<script>` element. */
  removeScripts?: boolean;
  /** Remove every `<style>` element and every `<link rel="stylesheet">`. */
  removeStyles?: boolean;
  /** Remove every attribute whose name starts with `on` (e.g. `onclick`). */
  removeInlineHandlers?: boolean;
  /** Remove every comment node. */
  removeComments?: boolean;
  /** Remove every attribute whose name starts with `data-`. */
  removeDataAttributes?: boolean;
  /** CSS selectors whose matching elements are copied to the output untouched. */
  preserveSelectors?: string[];
  /** CSS selectors whose matching elements are dropped from the output entirely. */
  excludeSelectors?: string[];
}

/** Counters describing what a single reduction run removed. */
export interface ReductionStats {
  /** `length` of the input string. */
  originalSize: number;
  /** Number of elements removed by the script pass. */
  removedScripts: number;
  /** Number of elements removed by the style pass. */
  removedStyles: number;
  /** Number of `on*` attributes removed. */
  removedHandlers: number;
  /** Number of comment nodes removed. */
  removedComments: number;
  /** Number of `data-*` attributes removed. */
  removedDataAttrs: number;
  /** `length` of the reduced output string. */
  finalSize: number;
  /** `(originalSize - finalSize) / originalSize * 100`; 0 when the input is empty. */
  reductionPercentage: number;
}

/**
 * Parses an HTML string with jsdom and strips scripts, styles, inline event
 * handlers, comments and `data-*` attributes according to {@link HtmlDumpOptions}.
 */
export class HtmlDumpParser {
  /**
   * Numeric value of `NodeFilter.SHOW_COMMENT` as defined by the DOM standard.
   * The literal is used because jsdom exposes `NodeFilter` on its window
   * object, not as a Node.js global.
   */
  private static readonly SHOW_COMMENT = 0x80;

  /** Every removal pass is enabled by default; nothing is preserved or excluded. */
  private defaultOptions: HtmlDumpOptions = {
    removeScripts: true,
    removeStyles: true,
    removeInlineHandlers: true,
    removeComments: true,
    removeDataAttributes: true,
    preserveSelectors: [],
    excludeSelectors: [],
  };

  /**
   * Reduces an HTML document held in memory.
   *
   * Order of operations: preserved elements are swapped for placeholders,
   * excluded elements are removed, the enabled removal passes run, and finally
   * the preserved markup is put back into the serialized result.
   *
   * @param htmlContent - Raw HTML to parse.
   * @param options - Overrides merged on top of the defaults.
   * @returns The reduced markup and the statistics for this run.
   */
  async parseAndReduce(
    htmlContent: string,
    options: HtmlDumpOptions = {}
  ): Promise<{ content: string; reductionStats: ReductionStats }> {
    const opts = { ...this.defaultOptions, ...options };
    const originalSize = htmlContent.length;
    const dom = new JSDOM(htmlContent);
    const document = dom.window.document;

    const stats: ReductionStats = {
      originalSize,
      removedScripts: 0,
      removedStyles: 0,
      removedHandlers: 0,
      removedComments: 0,
      removedDataAttrs: 0,
      finalSize: 0,
      reductionPercentage: 0,
    };

    // Swap preserved elements for opaque placeholders so the removal passes
    // below cannot touch them. The placeholder is built from a counter only:
    // embedding the selector would break the later lookup, because characters
    // such as `>` or `&` are entity-escaped when the document is serialized.
    const preservedContent = new Map<string, string>();
    let preservedCount = 0;
    for (const selector of opts.preserveSelectors ?? []) {
      document.querySelectorAll(selector).forEach((el: Element) => {
        // Already carried away inside a previously preserved ancestor.
        if (!el.isConnected) {
          return;
        }
        const placeholder = `__PRESERVED_${preservedCount}__`;
        preservedCount += 1;
        preservedContent.set(placeholder, el.outerHTML);
        // A text node is inserted directly instead of assigning `outerHTML`,
        // so the placeholder is never run through the HTML parser.
        el.replaceWith(document.createTextNode(placeholder));
      });
    }

    for (const selector of opts.excludeSelectors ?? []) {
      document.querySelectorAll(selector).forEach((el: Element) => el.remove());
    }

    if (opts.removeScripts) {
      stats.removedScripts = this.removeElements(document, 'script');
    }
    if (opts.removeStyles) {
      stats.removedStyles = this.removeElements(document, 'style, link[rel="stylesheet"]');
    }
    if (opts.removeInlineHandlers) {
      stats.removedHandlers = this.removeInlineHandlers(document);
    }
    if (opts.removeComments) {
      stats.removedComments = this.removeComments(document);
    }
    if (opts.removeDataAttributes) {
      stats.removedDataAttrs = this.removeDataAttributes(document);
    }

    // NOTE(review): only the root element is serialized here, so a doctype
    // present in the input is not part of the output — confirm that is intended.
    let reducedHtml = document.documentElement.outerHTML;

    // Restore newest-first: markup captured later may itself contain an earlier
    // placeholder (an outer element preserved after one of its descendants),
    // and that inner placeholder must still be visible when its turn comes.
    const captured = Array.from(preservedContent.entries()).reverse();
    for (const [placeholder, markup] of captured) {
      // A replacer function keeps `$&`, `$1`, `$$` etc. inside the preserved
      // markup literal; a plain string replacement would interpret them.
      reducedHtml = reducedHtml.replace(placeholder, () => markup);
    }

    stats.finalSize = reducedHtml.length;
    // Guard the division so an empty input reports 0 instead of NaN/-Infinity.
    stats.reductionPercentage =
      originalSize === 0 ? 0 : ((originalSize - stats.finalSize) / originalSize) * 100;

    return { content: reducedHtml, reductionStats: stats };
  }

  /**
   * Reads `inputPath` as UTF-8, reduces it, and writes the result to
   * `outputPath`, creating the output directory if needed.
   *
   * @param inputPath - HTML file to read.
   * @param outputPath - Destination file for the reduced markup.
   * @param options - Overrides forwarded to {@link HtmlDumpParser.parseAndReduce}.
   * @returns The statistics for this run.
   */
  async processFile(
    inputPath: string,
    outputPath: string,
    options: HtmlDumpOptions = {}
  ): Promise<ReductionStats> {
    const content = await fs.readFile(inputPath, 'utf8');
    const { content: reducedContent, reductionStats } = await this.parseAndReduce(content, options);

    await fs.mkdir(path.dirname(outputPath), { recursive: true });
    await fs.writeFile(outputPath, reducedContent);

    return reductionStats;
  }

  /** Removes every element matching `selector`; returns how many were removed. */
  private removeElements(document: Document, selector: string): number {
    const elements = document.querySelectorAll(selector);
    elements.forEach((el: Element) => el.remove());
    return elements.length;
  }

  /** Removes every attribute whose name starts with `on`; returns the count. */
  private removeInlineHandlers(document: Document): number {
    return this.removeAttributesByPrefix(document, 'on');
  }

  /** Removes every comment node in the document; returns the count. */
  private removeComments(document: Document): number {
    const iterator = document.createNodeIterator(document, HtmlDumpParser.SHOW_COMMENT);

    // Collect first, detach afterwards, so the tree is not mutated mid-walk.
    const comments: Node[] = [];
    let node: Node | null = iterator.nextNode();
    while (node !== null) {
      comments.push(node);
      node = iterator.nextNode();
    }

    for (const comment of comments) {
      comment.parentNode?.removeChild(comment);
    }
    return comments.length;
  }

  /** Removes every attribute whose name starts with `data-`; returns the count. */
  private removeDataAttributes(document: Document): number {
    return this.removeAttributesByPrefix(document, 'data-');
  }

  /**
   * Removes, from every element in the document, each attribute whose name
   * starts with `prefix`; returns the number of attributes removed.
   */
  private removeAttributesByPrefix(document: Document, prefix: string): number {
    let count = 0;
    document.querySelectorAll('*').forEach((el: Element) => {
      // Snapshot the names: `el.attributes` is live and shrinks as we remove.
      const names = Array.from(el.attributes, (attr) => attr.name);
      for (const name of names) {
        if (name.startsWith(prefix)) {
          el.removeAttribute(name);
          count += 1;
        }
      }
    });
    return count;
  }
}