vr-shopxo-plugin/strip_page.py

206 lines
8.1 KiB
Python
Raw Permalink Blame History

This file contains ambiguous Unicode characters!

This file contains ambiguous Unicode characters that may be confused with others in your current locale. If your use case is intentional and legitimate, you can safely ignore this warning. Use the Escape button to highlight these characters.

#!/usr/bin/env python3
"""
strip_page.py - HTML slimming script v2
Fix: handle base64 images in every form (unquoted / double-quoted / single-quoted url() / style attribute)
"""
import re
import os
import sys
from pathlib import Path
# Minimum length, in characters, of a <style> block's content for strip_html
# to move it into the external stylesheet; shorter blocks are left inline.
STRIP_THRESHOLD_STYLE_CHARS = 50000
# Short data URI that strip_html substitutes for every embedded
# ``data:image`` payload it finds.
BASE64_PLACEHOLDER = 'data:image/png;base64,iVBORw0KGgo='
def extract_css_variables(style_text):
    """Collect CSS custom-property declarations found in *style_text*.

    A declaration is ``--name: value`` terminated by ``;``, by the closing
    ``}`` of its rule, or by the end of the text, so the last declaration of
    a rule or of a ``style`` attribute is found even without a trailing
    semicolon. A trailing ``!important`` is dropped from the value.

    Args:
        style_text: CSS source (a stylesheet or the value of a ``style``
            attribute).

    Returns:
        A list of strings of the form ``' --name: value;'`` in order of
        appearance. Duplicates are kept; the list is empty when nothing
        matches.
    """
    declaration = re.compile(
        r'(--[\w-]+)\s*:\s*'           # property name
        r'([^;{}]+?)'                  # value, never crossing ; { or }
        r'(?:\s*!important)?\s*'       # optional priority, not captured
        r'(?:;|(?=\})|\Z)'             # terminator: ';', rule end, or EOF
    )
    return [
        f' {m.group(1)}: {m.group(2).strip()};'
        for m in declaration.finditer(style_text)
    ]
def _replace_base64_images(html, stats):
    """Return *html* with every embedded ``data:image`` payload replaced.

    Each CSS ``url(...)`` form is handled in a single pass so that a
    replacement is never re-scanned (and re-counted) by a later pattern, and
    the original quoting style is kept so that a ``url()`` living inside a
    double-quoted ``style`` attribute does not gain inner double quotes.
    ``stats['base64_replaced']`` and ``stats['base64_saved_chars']`` are
    updated in place.
    """
    def count(payload):
        stats['base64_replaced'] += 1
        stats['base64_saved_chars'] += len(payload)

    def swap_css_url(m):
        double_q, single_q, entity_q, bare = m.groups()
        payload = double_q or single_q or entity_q or bare
        if payload == BASE64_PLACEHOLDER:
            # Already stripped (e.g. the script is re-run on its own output).
            return m.group(0)
        count(payload)
        if double_q is not None:
            return f'url("{BASE64_PLACEHOLDER}")'
        if single_q is not None:
            return f"url('{BASE64_PLACEHOLDER}')"
        if entity_q is not None:
            return f'url(&quot;{BASE64_PLACEHOLDER}&quot;)'
        # The placeholder contains no whitespace, quotes or parentheses, so
        # it is a valid unquoted url() token.
        return f'url({BASE64_PLACEHOLDER})'

    def swap_img_src(m):
        attrs, double_q, single_q = m.groups()
        payload = double_q if double_q is not None else single_q
        if payload == BASE64_PLACEHOLDER:
            return m.group(0)
        count(payload)
        quote = '"' if double_q is not None else "'"
        return f'<img{attrs}src={quote}{BASE64_PLACEHOLDER}{quote}'

    css_url_pattern = (
        r'url\((?:'
        r'"(data:image[^"]+)"'                # url("data:image/...")
        r"|'(data:image[^']+)'"               # url('data:image/...')
        r'|&quot;(data:image[^"&]+)&quot;'    # url(&quot;...&quot;) in attributes
        r'|(data:image[^\s\)]+)'              # url(data:image/...)
        r')\)'
    )
    html = re.sub(css_url_pattern, swap_css_url, html)

    img_pattern = r'''<img([^>]*?)src=(?:"(data:image/[^"]+)"|'(data:image/[^']+)')'''
    return re.sub(img_pattern, swap_img_src, html, flags=re.IGNORECASE)


def _extract_large_styles(html, stats):
    """Replace oversized ``<style>`` blocks with marker comments.

    Blocks whose content is at least ``STRIP_THRESHOLD_STYLE_CHARS`` long are
    removed from the markup. Returns ``(html, extracted)`` where *extracted*
    is a list of ``(style_id, css_text)`` pairs in document order.
    ``stats['styles_extracted']`` and ``stats['styles_saved_chars']`` are
    updated in place.
    """
    extracted = []

    def swap_style(m):
        content = m.group(1)
        if len(content) < STRIP_THRESHOLD_STYLE_CHARS:
            return m.group(0)
        stats['styles_extracted'] += 1
        stats['styles_saved_chars'] += len(content)
        style_id = f'extracted-style-{stats["styles_extracted"]}'
        extracted.append((style_id, content))
        return f'<!-- {style_id} moved to external CSS -->'

    html = re.sub(r'<style[^>]*>(.*?)</style>', swap_style, html, flags=re.DOTALL)
    return html, extracted


def _dedupe_css_variables(all_vars):
    """Return *all_vars* with only the first declaration of each ``--name``."""
    seen = set()
    unique_vars = []
    for declaration in all_vars:
        key = declaration.strip().partition(':')[0]
        if key.startswith('--') and key not in seen:
            seen.add(key)
            unique_vars.append(declaration)
    return unique_vars


def _link_extracted_stylesheet(html, href, first_marker):
    """Return *html* with a ``<link>`` to the extracted stylesheet *href*.

    The link goes right after the opening ``<head ...>`` tag. When the
    document has no ``<head>``, the marker comment left by the first
    extracted ``<style>`` block (*first_marker*) is replaced instead, so the
    stylesheet is never silently dropped.
    """
    if f'href="{href}"' in html:
        return html
    link = f'<link rel="stylesheet" href="{href}">'
    # A callable replacement keeps any backslash in *href* literal.
    html, inserted = re.subn(
        r'<head\b[^>]*>',
        lambda m: f'{m.group(0)}\n {link}',
        html,
        count=1,
        flags=re.IGNORECASE,
    )
    if not inserted:
        html = html.replace(first_marker, link, 1)
    return html


def strip_html(input_path, output_dir=None):
    """Write a slimmed-down copy of an HTML file and print a summary report.

    Processing steps, in order:

    0. collect CSS custom properties from ``<style>`` blocks and
       double-quoted ``style`` attributes (before anything is rewritten);
    1. replace embedded ``data:image`` payloads with ``BASE64_PLACEHOLDER``;
    2. move ``<style>`` blocks of at least ``STRIP_THRESHOLD_STYLE_CHARS``
       characters out of the markup;
    3. write the moved styles to ``<stem>.styles.css``;
    4. write the de-duplicated custom properties to ``<stem>.variables.css``;
    5. remove ``<script>`` elements;
    6. neutralise ``href="data:..."`` links;
    7. write the result to ``<stem>.stripped.html``.

    Args:
        input_path: Path of the HTML file to process. It is read as UTF-8
            with undecodable bytes replaced.
        output_dir: Directory for the output files; created if missing.
            Defaults to the directory containing *input_path*.

    Returns:
        None. The statistics are printed to standard output.
    """
    input_path = Path(input_path)
    output_dir = input_path.parent if output_dir is None else Path(output_dir)
    output_dir.mkdir(parents=True, exist_ok=True)

    stem = input_path.stem
    html_out = output_dir / f'{stem}.stripped.html'
    css_out = output_dir / f'{stem}.styles.css'
    vars_out = output_dir / f'{stem}.variables.css'

    html = input_path.read_text(encoding='utf-8', errors='replace')
    original_size = len(html)

    stats = {
        'base64_replaced': 0,
        'base64_saved_chars': 0,
        'styles_extracted': 0,
        'styles_saved_chars': 0,
        'css_vars_extracted': 0,
        'scripts_removed': 0,
        'data_hrefs_fixed': 0,
    }

    # -- 0. Scan for CSS variables before any replacement ------------------
    all_vars = []
    for m in re.finditer(r'<style[^>]*>(.*?)</style>', html, flags=re.DOTALL):
        all_vars.extend(extract_css_variables(m.group(1)))
    for m in re.finditer(r'style="([^"]+)"', html):
        all_vars.extend(extract_css_variables(m.group(1)))

    # -- 1. Replace every embedded base64 image ----------------------------
    html = _replace_base64_images(html, stats)

    # -- 2. Pull very large inline <style> blocks out of the markup --------
    html, extracted_styles = _extract_large_styles(html, stats)

    # -- 3. Write the external stylesheet ----------------------------------
    # Sizes are taken from what this run writes, so a stale file left by an
    # earlier run is never counted in the report.
    css_size = 0
    if extracted_styles:
        css_parts = ['/* === Extracted inline styles === */', '']
        for style_id, content in extracted_styles:
            css_parts.append(f'/* --- {style_id} --- */')
            css_parts.append(content)
            css_parts.append('')
        css_text = '\n'.join(css_parts)
        css_out.write_text(css_text, encoding='utf-8')
        css_size = len(css_text)

    # -- 4. Write the variables file ---------------------------------------
    vars_size = 0
    if all_vars:
        unique_vars = _dedupe_css_variables(all_vars)
        vars_text = (
            '/* === Extracted CSS Variables === */\n:root {\n' +
            '\n'.join(unique_vars) +
            '\n}\n'
        )
        vars_out.write_text(vars_text, encoding='utf-8')
        vars_size = len(vars_text)
        stats['css_vars_extracted'] = len(unique_vars)

    # -- 5. Remove <script> elements ---------------------------------------
    html, stats['scripts_removed'] = re.subn(
        r'<script[^>]*>.*?</script>', '', html, flags=re.DOTALL)
    html = re.sub(r'<script[^>]*/>\s*', '', html)

    # -- 6. Neutralise data: hrefs -----------------------------------------
    html, stats['data_hrefs_fixed'] = re.subn(
        r'href="(data:[^"]+)"', 'href="#data-link-removed"', html)

    # -- 7. Write the stripped HTML ----------------------------------------
    html = re.sub(r'<!--\s*-->', '', html)
    if extracted_styles:
        first_marker = f'<!-- {extracted_styles[0][0]} moved to external CSS -->'
        html = _link_extracted_stylesheet(html, f'{stem}.styles.css', first_marker)
    html_out.write_text(html, encoding='utf-8')

    new_html_size = len(html)
    saved = original_size - new_html_size - css_size
    # An empty input file would otherwise divide by zero.
    saved_pct = saved / original_size * 100 if original_size else 0.0

    rule = '=' * 54
    report = [
        '✅ 脱脂完成!',
        rule,
        f'📄 输入: {input_path.name}',
        f'原始大小: {original_size/1024:.1f} KB ({original_size:,} 字符)',
        rule,
        '📊 处理结果:',
        f'base64 图片替换: {stats["base64_replaced"]}',
        f'base64 节省: {stats["base64_saved_chars"]/1024:.1f} KB',
        f'style 块提取: {stats["styles_extracted"]}',
        f'style 节省: {stats["styles_saved_chars"]/1024:.1f} KB',
        f'CSS 变量提取: {stats["css_vars_extracted"]}',
        f'script 移除: {stats["scripts_removed"]}',
        f'data href 修复: {stats["data_hrefs_fixed"]}',
        rule,
        '📦 输出文件:',
        f'HTML (脱脂): {html_out.name} ({new_html_size/1024:.1f} KB)',
        f'CSS (提取样式): {css_out.name} ({css_size/1024:.1f} KB)',
        f'CSS (变量): {vars_out.name} ({vars_size/1024:.1f} KB)',
        rule,
        f'💾 HTML 体积压缩: {new_html_size/1024:.1f} KB (原始 {original_size/1024:.1f} KB)',
        f'节省比例: {saved_pct:.1f}%',
    ]
    print('\n' + '\n'.join(report) + '\n')
if __name__ == '__main__':
    # Command-line entry point: <input.html> is required, [output_dir] is
    # optional and falls back to strip_html's default when omitted.
    cli_args = sys.argv[1:]
    if not cli_args:
        print('用法: python3 strip_page.py <input.html> [output_dir]')
        sys.exit(1)
    target_dir = cli_args[1] if len(cli_args) > 1 else None
    strip_html(cli_args[0], target_dir)