206 lines
8.1 KiB
Python
206 lines
8.1 KiB
Python
#!/usr/bin/env python3
|
||
"""
|
||
strip_page.py - HTML 脱脂脚本 v2
|
||
修复:全面处理所有格式的 base64 图片(无引号/双引号/单引号/url()/style属性)
|
||
"""
|
||
|
||
import re
|
||
import os
|
||
import sys
|
||
from pathlib import Path
|
||
|
||
# Size cut-off, in characters of <style> body text: blocks at or above this
# length are moved to the external stylesheet, shorter ones stay inline.
STRIP_THRESHOLD_STYLE_CHARS = 50000
|
||
# Short data URI substituted for every inline base64 image payload.
# NOTE(review): the payload looks like the base64 of the 8-byte PNG signature
# only, i.e. not a decodable image - confirm that a broken-image stub is intended.
BASE64_PLACEHOLDER = 'data:image/png;base64,iVBORw0KGgo='
|
||
|
||
|
||
def extract_css_variables(style_text):
    """Collect CSS custom-property declarations found in *style_text*.

    Args:
        style_text: CSS source, either the body of a ``<style>`` block or the
            value of a ``style`` attribute.

    Returns:
        A list of normalised declaration strings of the form
        ``' --name: value;'`` in order of appearance. Duplicates are kept and
        a trailing ``!important`` is dropped from the value.
    """
    # A declaration ends at ';', right before the '}' that closes its rule, or
    # at the end of the text. The last two matter: the final declaration of a
    # rule and most style="..." attributes carry no trailing semicolon.
    # '{' and '}' are excluded from the value so that BEM-style selectors such
    # as ".card--wide:hover {" are never mistaken for declarations.
    declaration = re.compile(
        r'(--[\w-]+)\s*:\s*([^;{}]+?)(?:\s*!important)?\s*(?:;|(?=\})|$)'
    )
    return [
        f' {found.group(1)}: {found.group(2).strip()};'
        for found in declaration.finditer(style_text)
    ]
|
||
|
||
|
||
# url(...) wrapping a data:image URI, in every quoting style found in CSS and
# in HTML-serialised style attributes. Exactly one named group matches.
_DATA_IMAGE_URL_RE = re.compile(
    r'url\(\s*(?:'
    r'"(?P<dq>data:image[^"]+)"'
    r"|'(?P<sq>data:image[^']+)'"
    r'|&quot;(?P<ent>data:image[^"]+?)&quot;'
    r'|(?P<bare>data:image[^\s)]+)'
    r')\s*\)',
    flags=re.IGNORECASE,
)

# <img ... src="data:image/..."> with a double- or single-quoted value.
_IMG_DATA_SRC_RE = re.compile(
    r'(?P<head><img\b[^>]*?\bsrc\s*=\s*)(?:'
    r'"(?P<dq>data:image/[^"]+)"'
    r"|'(?P<sq>data:image/[^']+)'"
    r')',
    flags=re.IGNORECASE,
)

# Whole <style> element; group 1 is the CSS text.
_STYLE_BLOCK_RE = re.compile(
    r'<style\b[^>]*>(.*?)</style\s*>', flags=re.DOTALL | re.IGNORECASE
)

# Delimiter to put back around the placeholder for each quoting style.
_URL_QUOTES = {'dq': '"', 'sq': "'", 'ent': '&quot;', 'bare': ''}


def _matched_group(match, names):
    """Return ``(name, text)`` for the one group in *names* that took part in *match*."""
    for name in names:
        text = match.group(name)
        if text is not None:
            return name, text
    raise ValueError('no alternative matched')


def _replace_data_images(markup, stats):
    """Swap every inline data:image payload in *markup* for BASE64_PLACEHOLDER.

    Works in a single pass per pattern so that a placeholder written by one
    substitution is never matched (and counted) again by a later one, and
    keeps the original quoting so that url(...) inside a style="..."
    attribute does not gain double quotes that would terminate the attribute.
    Updates ``base64_replaced`` / ``base64_saved_chars`` in *stats* and
    returns the rewritten markup.
    """
    def record(data):
        stats['base64_replaced'] += 1
        # Net saving: the placeholder itself still occupies space.
        stats['base64_saved_chars'] += len(data) - len(BASE64_PLACEHOLDER)

    def swap_css_url(match):
        kind, data = _matched_group(match, _URL_QUOTES)
        if data == BASE64_PLACEHOLDER:
            # Already stripped (e.g. the script is re-run on its own output).
            return match.group(0)
        record(data)
        quote = _URL_QUOTES[kind]
        return f'url({quote}{BASE64_PLACEHOLDER}{quote})'

    def swap_img_src(match):
        kind, data = _matched_group(match, ('dq', 'sq'))
        if data == BASE64_PLACEHOLDER:
            return match.group(0)
        record(data)
        quote = _URL_QUOTES[kind]
        return f'{match.group("head")}{quote}{BASE64_PLACEHOLDER}{quote}'

    markup = _DATA_IMAGE_URL_RE.sub(swap_css_url, markup)
    return _IMG_DATA_SRC_RE.sub(swap_img_src, markup)


def _extract_large_styles(markup, stem, stats, all_vars):
    """Pull <style> blocks of at least STRIP_THRESHOLD_STYLE_CHARS out of *markup*.

    The first extracted block is replaced by a <link> to ``{stem}.styles.css``
    (plus a marker comment) at the position the block occupied; later blocks
    are replaced by the marker comment only, since they all end up in the
    same file. Returns ``(markup, [(style_id, css_text), ...])``.

    NOTE: attributes of the original <style> tag (for example ``media``) are
    not carried over to the external stylesheet.
    """
    extracted = []

    def externalise(match):
        content = match.group(1)
        if len(content) < STRIP_THRESHOLD_STYLE_CHARS:
            return match.group(0)

        stats['styles_extracted'] += 1
        stats['styles_saved_chars'] += len(content)
        all_vars.extend(extract_css_variables(content))

        style_id = f'extracted-style-{stats["styles_extracted"]}'
        extracted.append((style_id, content))
        marker = f'<!-- {style_id} moved to external CSS -->'
        if len(extracted) == 1:
            link = f'<link rel="stylesheet" id="{style_id}" href="{stem}.styles.css">'
            return link + marker
        return marker

    markup = _STYLE_BLOCK_RE.sub(externalise, markup)
    return markup, extracted


def strip_html(input_path, output_dir=None):
    """Write a slimmed-down copy of an HTML file and print a size report.

    Steps, in order: collect CSS custom properties, replace inline base64
    images with BASE64_PLACEHOLDER, move oversized <style> blocks to an
    external stylesheet, remove every <script> element (external ones
    included), and neutralise ``href="data:..."`` links.

    Args:
        input_path: Path of the HTML file to read (decoded as UTF-8, invalid
            bytes replaced).
        output_dir: Directory for the output files; created if missing.
            Defaults to the directory containing *input_path*.

    Outputs (named after the input file's stem):
        ``<stem>.stripped.html``  always written.
        ``<stem>.styles.css``     written only if a <style> block was extracted.
        ``<stem>.variables.css``  written only if CSS variables were found.

    Returns:
        The dict of counters that is also shown in the printed report.
    """
    input_path = Path(input_path)
    output_dir = input_path.parent if output_dir is None else Path(output_dir)
    output_dir.mkdir(parents=True, exist_ok=True)

    stem = input_path.stem
    html_out = output_dir / f'{stem}.stripped.html'
    css_out = output_dir / f'{stem}.styles.css'
    vars_out = output_dir / f'{stem}.variables.css'

    html = input_path.read_text(encoding='utf-8', errors='replace')

    original_size = len(html)
    stats = {
        'base64_replaced': 0,
        'base64_saved_chars': 0,
        'styles_extracted': 0,
        'styles_saved_chars': 0,
        'css_vars_extracted': 0,
        'scripts_removed': 0,
        'data_hrefs_fixed': 0,
    }

    # -- 0. Scan all CSS variables (before any replacement) --
    all_vars = []
    for block in _STYLE_BLOCK_RE.finditer(html):
        all_vars.extend(extract_css_variables(block.group(1)))
    # Inline style attributes can declare variables too.
    for attr in re.finditer(r'style="([^"]+)"', html):
        all_vars.extend(extract_css_variables(attr.group(1)))

    # -- 1. Replace every base64 image --
    html = _replace_data_images(html, stats)

    # -- 2. Move very large <style> blocks out of the document --
    html, extracted_styles = _extract_large_styles(html, stem, stats, all_vars)

    # -- 3. Write the external CSS --
    # Sizes are taken only from files written in this run, so a stale file
    # left behind by an earlier run never leaks into the report.
    css_size = 0
    if extracted_styles:
        css_parts = ['/* === Extracted inline styles === */', '']
        for style_id, content in extracted_styles:
            css_parts.extend((f'/* --- {style_id} --- */', content, ''))
        css_out.write_text('\n'.join(css_parts), encoding='utf-8')
        css_size = css_out.stat().st_size

    # -- 4. Write the variables file (first declaration of each name wins) --
    vars_size = 0
    if all_vars:
        first_by_name = {}
        for declaration in all_vars:
            key = declaration.strip().split(':')[0]
            if key.startswith('--'):
                first_by_name.setdefault(key, declaration)
        unique_vars = list(first_by_name.values())
        vars_out.write_text(
            '/* === Extracted CSS Variables === */\n:root {\n' +
            '\n'.join(unique_vars) +
            '\n}\n',
            encoding='utf-8'
        )
        stats['css_vars_extracted'] = len(unique_vars)
        vars_size = vars_out.stat().st_size

    # -- 5. Remove scripts (paired tags first, then self-closing leftovers) --
    html, paired = re.subn(
        r'<script\b[^>]*>.*?</script\s*>', '', html,
        flags=re.DOTALL | re.IGNORECASE,
    )
    html, self_closed = re.subn(r'<script\b[^>]*/>\s*', '', html, flags=re.IGNORECASE)
    stats['scripts_removed'] = paired + self_closed

    # -- 6. Neutralise data: hrefs --
    html, stats['data_hrefs_fixed'] = re.subn(
        r'href="(data:[^"]+)"', 'href="#data-link-removed"', html
    )

    # -- 7. Write the stripped HTML --
    html = re.sub(r'<!--\s*-->', '', html)
    html_out.write_text(html, encoding='utf-8')

    new_html_size = len(html)
    saved = original_size - new_html_size - css_size
    # An empty input file would otherwise divide by zero.
    saved_pct = saved / original_size * 100 if original_size else 0.0

    print(f'''
✅ 脱脂完成!
{'='*54}
📄 输入: {input_path.name}
原始大小: {original_size/1024:.1f} KB ({original_size:,} 字符)
{'='*54}
📊 处理结果:
base64 图片替换: {stats["base64_replaced"]} 个
base64 节省: {stats["base64_saved_chars"]/1024:.1f} KB
style 块提取: {stats["styles_extracted"]} 个
style 节省: {stats["styles_saved_chars"]/1024:.1f} KB
CSS 变量提取: {stats["css_vars_extracted"]} 个
script 移除: {stats["scripts_removed"]} 个
data href 修复: {stats["data_hrefs_fixed"]} 个
{'='*54}
📦 输出文件:
HTML (脱脂): {html_out.name} ({new_html_size/1024:.1f} KB)
CSS (提取样式): {css_out.name} ({css_size/1024:.1f} KB)
CSS (变量): {vars_out.name} ({vars_size/1024:.1f} KB)
{'='*54}
💾 HTML 体积压缩: {new_html_size/1024:.1f} KB (原始 {original_size/1024:.1f} KB)
节省比例: {saved_pct:.1f}%
''')

    return stats
|
||
|
||
|
||
if __name__ == '__main__':
    # Command-line entry: first argument is the HTML file, the optional
    # second argument is the output directory.
    cli_args = sys.argv[1:]
    if not cli_args:
        print('用法: python3 strip_page.py <input.html> [output_dir]')
        sys.exit(1)
    target_dir = cli_args[1] if len(cli_args) > 1 else None
    strip_html(cli_args[0], target_dir)
|