#!/usr/bin/env python3
"""
strip_page.py - HTML 脱脂脚本 v2
修复:全面处理所有格式的 base64 图片(无引号/双引号/单引号/url()/style属性)
"""
import re
import os
import sys
from pathlib import Path
# Size cutoff (in characters) for inline style content: strip_html leaves
# content shorter than this in place and only extracts larger content.
STRIP_THRESHOLD_STYLE_CHARS = 50000
# Short data URI substituted for every embedded base64 image that
# strip_html replaces.
BASE64_PLACEHOLDER = 'data:image/png;base64,iVBORw0KGgo='
def extract_css_variables(style_text):
    """Collect CSS custom-property declarations found in ``style_text``.

    Args:
        style_text: Raw CSS text, e.g. the body of a style element or the
            value of a ``style`` attribute.

    Returns:
        A list of strings, one per declaration found, in order of
        appearance. Each is formatted as ``--name: value;`` with a leading
        space, the value stripped of surrounding whitespace and of any
        trailing ``!important``. Duplicates are not removed.
    """
    # A declaration ends at ';', just before a closing '}', or at the end of
    # the text. Requiring ';' alone would drop the last declaration of a
    # rule or of a style attribute, where the trailing semicolon is optional.
    # '{' and '}' are excluded from the value so selectors such as
    # '.block--mod:hover {' are never mistaken for declarations.
    pattern = (
        r'(--[\w-]+)\s*:\s*'
        r'([^;{}]+?)'
        r'(?:\s*!important)?'
        r'\s*(?:;|(?=\})|$)'
    )
    return [
        f' {m.group(1)}: {m.group(2).strip()};'
        for m in re.finditer(pattern, style_text)
    ]
def strip_html(input_path, output_dir=None):
input_path = Path(input_path)
if output_dir is None:
output_dir = input_path.parent
output_dir = Path(output_dir)
output_dir.mkdir(parents=True, exist_ok=True)
stem = input_path.stem
html_out = output_dir / f'{stem}.stripped.html'
css_out = output_dir / f'{stem}.styles.css'
vars_out = output_dir / f'{stem}.variables.css'
with open(input_path, 'r', encoding='utf-8', errors='replace') as f:
html = f.read()
original_size = len(html)
stats = {
'base64_replaced': 0,
'base64_saved_chars': 0,
'styles_extracted': 0,
'styles_saved_chars': 0,
'css_vars_extracted': 0,
'scripts_removed': 0,
'data_hrefs_fixed': 0,
}
# ── 0. 扫描所有 CSS 变量(先于任何替换)────────────
all_vars = []
for m in re.finditer(r'', html, flags=re.DOTALL):
all_vars.extend(extract_css_variables(m.group(1)))
# 也从 style 属性中提取
for m in re.finditer(r'style="([^"]+)"', html):
all_vars.extend(extract_css_variables(m.group(1)))
# ── 1. 全面替换所有 base64 图片 ─────────────────
# 格式1: url(data:image/...) 无引号
def repl_url_nq(m):
stats['base64_replaced'] += 1
stats['base64_saved_chars'] += len(m.group(1))
return f'url("{BASE64_PLACEHOLDER}")'
n = len(re.findall(r'url\((data:image[^\s\)]+)\)', html))
html = re.sub(r'url\((data:image[^\s\)]+)\)', repl_url_nq, html)
# 格式2: url("data:image/...") 双引号
def repl_url_dq(m):
stats['base64_replaced'] += 1
stats['base64_saved_chars'] += len(m.group(1))
return f'url("{BASE64_PLACEHOLDER}")'
html = re.sub(r'url\("(data:image[^"]+)"\)', repl_url_dq, html)
# 格式3: url('data:image/...') 单引号
def repl_url_sq(m):
stats['base64_replaced'] += 1
stats['base64_saved_chars'] += len(m.group(1))
return f"url('{BASE64_PLACEHOLDER}')"
html = re.sub(r"url\('(data:image[^']+)'\)", repl_url_sq, html)
# 格式4: style="background-image:url(data:...)" style属性中无引号
def repl_style_nq(m):
prefix = m.group(1)
data = m.group(2)
stats['base64_replaced'] += 1
stats['base64_saved_chars'] += len(data)
return f'{prefix}url("{BASE64_PLACEHOLDER}")'
html = re.sub(r'(background-image\s*:\s*)url\((data:image[^\s\)]+)\)', repl_style_nq, html, flags=re.IGNORECASE)
# 格式5: style="background-image:url("data:...")" style属性中有引号
def repl_style_dq(m):
prefix = m.group(1)
data = m.group(2)
stats['base64_replaced'] += 1
stats['base64_saved_chars'] += len(data)
return f'{prefix}url("{BASE64_PLACEHOLDER}")'
html = re.sub(r'(background-image\s*:\s*)url\("(data:image[^"]+)"\)', repl_style_dq, html, flags=re.IGNORECASE)
# 格式6:
def repl_img(m):
attrs = m.group(1)
stats['base64_replaced'] += 1
stats['base64_saved_chars'] += len(m.group(2))
return f'
]*?)src="(data:image/[^"]+)"', repl_img, html, flags=re.IGNORECASE)
# ── 2. 处理极大的内联 style ──────────────────────
extracted_styles = []
def process_style(m):
tag = m.group(1)
attrs = m.group(2)
content = m.group(3)
if len(content) < STRIP_THRESHOLD_STYLE_CHARS:
return m.group(0)
stats['styles_extracted'] += 1
stats['styles_saved_chars'] += len(content)
all_vars.extend(extract_css_variables(content))
style_id = f'extracted-style-{stats["styles_extracted"]}'
html_tag = f''
extracted_styles.append((style_id, content))
return f''
html = re.sub(r'(', process_style, html, flags=re.DOTALL)
# ── 3. 写外置 CSS ────────────────────────────────
if extracted_styles:
css_parts = ['/* === Extracted inline styles === */', '']
for style_id, content in extracted_styles:
css_parts.append(f'/* --- {style_id} --- */')
css_parts.append(content)
css_parts.append('')
css_out.write_text('\n'.join(css_parts), encoding='utf-8')
# ── 4. 写变量文件 ────────────────────────────────
if all_vars:
seen = set()
unique_vars = []
for v in all_vars:
key = v.strip().split(':')[0]
if key not in seen and key.startswith('--'):
seen.add(key)
unique_vars.append(v)
vars_out.write_text(
'/* === Extracted CSS Variables === */\n:root {\n' +
'\n'.join(unique_vars) +
'\n}\n',
encoding='utf-8'
)
stats['css_vars_extracted'] = len(unique_vars)
# ── 5. 移除 inline script ────────────────────────
stats['scripts_removed'] = len(re.findall(r'', html, flags=re.DOTALL))
html = re.sub(r'', '', html, flags=re.DOTALL)
html = re.sub(r'