#!/usr/bin/env python3
"""strip_page.py - HTML slimming script, v2.

Shrinks a saved HTML page so it is cheap to read and diff:

* every embedded base64 image (unquoted, double-quoted and single-quoted
  ``url(...)`` forms, in ``<style>`` blocks and ``style`` attributes, plus
  ``<img src="data:image/...">``) is replaced with a tiny placeholder;
* ``<style>`` blocks of at least ``STRIP_THRESHOLD_STYLE_CHARS`` characters
  are moved to ``<stem>.styles.css``;
* CSS custom properties are collected into ``<stem>.variables.css``;
* ``<script>`` elements and ``href="data:..."`` links are removed.
"""
import re
import os
import sys
from pathlib import Path

STRIP_THRESHOLD_STYLE_CHARS = 50000
BASE64_PLACEHOLDER = 'data:image/png;base64,iVBORw0KGgo='

# A custom-property declaration: "--name: value [!important];".
_CSS_VAR_RE = re.compile(r'(--[\w-]+)\s*:\s*([^;{]+?)(?:\s*!important)?;')
# Group 3 is the CSS text between <style ...> and </style>.
_STYLE_BLOCK_RE = re.compile(r'(<style)([^>]*?)>(.*?)</style>', re.DOTALL)
_STYLE_ATTR_RE = re.compile(r'style="([^"]+)"')
# url(...) holding an image data URI; exactly one of the three groups is set
# (double-quoted, single-quoted, unquoted).
_URL_DATA_RE = re.compile(
    r'''url\((?:"(data:image[^"]+)"|'(data:image[^']+)'|(data:image[^\s)]+))\)'''
)
# Group 1 is everything up to and including src=", group 2 the data URI.
_IMG_DATA_RE = re.compile(r'(<img[^>]*?src=")(data:image/[^"]+)"', re.IGNORECASE)
_SCRIPT_RE = re.compile(r'<script[^>]*>.*?</script>', re.DOTALL)
_SCRIPT_SELF_CLOSED_RE = re.compile(r'<script[^>]*/>\s*')
_DATA_HREF_RE = re.compile(r'href="data:[^"]+"')


def extract_css_variables(style_text):
    """Return the CSS custom-property declarations found in *style_text*.

    Each item is an indented ``--name: value;`` line ready to be placed in a
    ``:root`` rule. A trailing ``!important`` is dropped. Declarations are
    returned in source order and may contain duplicates.
    """
    return [
        f'  {m.group(1)}: {m.group(2).strip()};'
        for m in _CSS_VAR_RE.finditer(style_text)
    ]


def _replace_base64_images(html, stats):
    """Swap every embedded base64 image in *html* for BASE64_PLACEHOLDER.

    Updates ``stats['base64_replaced']`` and ``stats['base64_saved_chars']``
    in place and returns the rewritten markup.
    """
    def record(data):
        stats['base64_replaced'] += 1
        stats['base64_saved_chars'] += len(data)

    def swap_url(m):
        double_quoted, single_quoted, bare = m.groups()
        data = double_quoted or single_quoted or bare
        # Already a placeholder (e.g. the script is re-run on its own
        # output): leave untouched so nothing is counted twice.
        if data == BASE64_PLACEHOLDER:
            return m.group(0)
        record(data)
        # Keep the original quoting style: adding double quotes to an
        # unquoted url() inside style="..." would terminate the attribute.
        if double_quoted is not None:
            return f'url("{BASE64_PLACEHOLDER}")'
        if single_quoted is not None:
            return f"url('{BASE64_PLACEHOLDER}')"
        return f'url({BASE64_PLACEHOLDER})'

    def swap_img(m):
        if m.group(2) == BASE64_PLACEHOLDER:
            return m.group(0)
        record(m.group(2))
        return f'{m.group(1)}{BASE64_PLACEHOLDER}"'

    # One pass per pattern: a replacement is never re-scanned, so each image
    # is counted exactly once.
    html = _URL_DATA_RE.sub(swap_url, html)
    return _IMG_DATA_RE.sub(swap_img, html)


def _extract_large_styles(html, stats, css_name):
    """Cut oversized ``<style>`` blocks out of *html*.

    Blocks shorter than STRIP_THRESHOLD_STYLE_CHARS are left alone. The first
    extracted block is replaced in place by a ``<link>`` to *css_name*, so
    the page keeps its styling even when it has no plain ``<head>`` tag;
    later extracted blocks are simply removed.

    Returns ``(new_html, [(style_id, css_text), ...])`` and updates
    ``stats['styles_extracted']`` / ``stats['styles_saved_chars']``.
    """
    extracted = []

    def pull(m):
        content = m.group(3)
        if len(content) < STRIP_THRESHOLD_STYLE_CHARS:
            return m.group(0)
        stats['styles_extracted'] += 1
        stats['styles_saved_chars'] += len(content)
        style_id = f'extracted-style-{stats["styles_extracted"]}'
        extracted.append((style_id, content))
        if len(extracted) == 1:
            return f'<link rel="stylesheet" href="{css_name}">'
        return ''

    return _STYLE_BLOCK_RE.sub(pull, html), extracted


def _unique_variables(declarations):
    """Drop repeated custom properties, keeping the first declaration of each."""
    seen = set()
    unique = []
    for decl in declarations:
        key = decl.strip().split(':')[0]
        if key.startswith('--') and key not in seen:
            seen.add(key)
            unique.append(decl)
    return unique


def _format_report(input_name, original_size, new_html_size, css_size,
                   vars_size, stats, html_name, css_name, vars_name):
    """Build the human-readable summary printed at the end of a run.

    All sizes are character counts; they are shown divided by 1024 as "KB".
    """
    rule = '=' * 54
    saved = original_size - new_html_size - css_size
    # An empty input file must not crash the report.
    ratio = saved / original_size * 100 if original_size else 0.0
    return '\n'.join([
        '',
        '✅ 脱脂完成!',
        rule,
        f'📄 输入: {input_name}',
        f'   原始大小: {original_size/1024:.1f} KB ({original_size:,} 字符)',
        rule,
        '📊 处理结果:',
        f'   base64 图片替换: {stats["base64_replaced"]} 个',
        f'   base64 节省: {stats["base64_saved_chars"]/1024:.1f} KB',
        f'   style 块提取: {stats["styles_extracted"]} 个',
        f'   style 节省: {stats["styles_saved_chars"]/1024:.1f} KB',
        f'   CSS 变量提取: {stats["css_vars_extracted"]} 个',
        f'   script 移除: {stats["scripts_removed"]} 个',
        f'   data href 修复: {stats["data_hrefs_fixed"]} 个',
        rule,
        '📦 输出文件:',
        f'   HTML (脱脂): {html_name} ({new_html_size/1024:.1f} KB)',
        f'   CSS (提取样式): {css_name} ({css_size/1024:.1f} KB)',
        f'   CSS (变量): {vars_name} ({vars_size/1024:.1f} KB)',
        rule,
        f'💾 HTML 体积压缩: {new_html_size/1024:.1f} KB '
        f'(原始 {original_size/1024:.1f} KB)',
        f'   节省比例: {ratio:.1f}%',
        '',
    ])


def strip_html(input_path, output_dir=None):
    """Slim down the HTML file at *input_path* and write the results.

    Args:
        input_path: Path of the HTML file to process. It is read as UTF-8;
            undecodable bytes are replaced.
        output_dir: Directory for the output files. Defaults to the input
            file's directory; it is created if missing.

    Writes ``<stem>.stripped.html`` always, ``<stem>.styles.css`` when at
    least one style block was extracted, and ``<stem>.variables.css`` when
    at least one CSS custom property was found. Prints a summary.

    Returns:
        The statistics dict (keys ``base64_replaced``, ``base64_saved_chars``,
        ``styles_extracted``, ``styles_saved_chars``, ``css_vars_extracted``,
        ``scripts_removed``, ``data_hrefs_fixed``).
    """
    input_path = Path(input_path)
    output_dir = input_path.parent if output_dir is None else Path(output_dir)
    output_dir.mkdir(parents=True, exist_ok=True)

    stem = input_path.stem
    html_out = output_dir / f'{stem}.stripped.html'
    css_out = output_dir / f'{stem}.styles.css'
    vars_out = output_dir / f'{stem}.variables.css'

    with open(input_path, 'r', encoding='utf-8', errors='replace') as f:
        html = f.read()
    original_size = len(html)

    stats = {
        'base64_replaced': 0,
        'base64_saved_chars': 0,
        'styles_extracted': 0,
        'styles_saved_chars': 0,
        'css_vars_extracted': 0,
        'scripts_removed': 0,
        'data_hrefs_fixed': 0,
    }

    # 0. Collect CSS variables before anything is rewritten or removed,
    #    from <style> blocks first and then from style="..." attributes.
    all_vars = []
    for m in _STYLE_BLOCK_RE.finditer(html):
        all_vars.extend(extract_css_variables(m.group(3)))
    for m in _STYLE_ATTR_RE.finditer(html):
        all_vars.extend(extract_css_variables(m.group(1)))

    # 1. Replace every embedded base64 image.
    html = _replace_base64_images(html, stats)

    # 2. Move very large inline <style> blocks out of the page.
    html, extracted_styles = _extract_large_styles(html, stats, css_out.name)

    # 3. Write the external stylesheet.
    css_size = 0
    if extracted_styles:
        css_parts = ['/* === Extracted inline styles === */', '']
        for style_id, content in extracted_styles:
            css_parts.append(f'/* --- {style_id} --- */')
            css_parts.append(content)
            css_parts.append('')
        css_text = '\n'.join(css_parts)
        css_out.write_text(css_text, encoding='utf-8')
        css_size = len(css_text)

    # 4. Write the variables file.
    vars_size = 0
    unique_vars = _unique_variables(all_vars)
    if unique_vars:
        vars_text = (
            '/* === Extracted CSS Variables === */\n:root {\n'
            + '\n'.join(unique_vars)
            + '\n}\n'
        )
        vars_out.write_text(vars_text, encoding='utf-8')
        vars_size = len(vars_text)
    stats['css_vars_extracted'] = len(unique_vars)

    # 5. Remove scripts: paired elements first, then self-closed tags.
    html, stats['scripts_removed'] = _SCRIPT_RE.subn('', html)
    html = _SCRIPT_SELF_CLOSED_RE.sub('', html)

    # 6. Neutralise data: hrefs.
    html, stats['data_hrefs_fixed'] = _DATA_HREF_RE.subn(
        'href="#data-link-removed"', html
    )

    # 7. Write the slimmed HTML and report.
    html_out.write_text(html, encoding='utf-8')

    print(_format_report(
        input_path.name, original_size, len(html), css_size, vars_size,
        stats, html_out.name, css_out.name, vars_out.name,
    ))
    return stats


if __name__ == '__main__':
    if len(sys.argv) < 2:
        print('用法: python3 strip_page.py <input.html> [output_dir]')
        sys.exit(1)
    strip_html(sys.argv[1], sys.argv[2] if len(sys.argv) > 2 else None)