diff --git a/benchmark-pprof.yaml b/benchmark-pprof.yaml
new file mode 100644
index 0000000..45dc36b
--- /dev/null
+++ b/benchmark-pprof.yaml
@@ -0,0 +1,34 @@
+servers:
+  - listen: ":18080"
+    static:
+      - path: "/"
+        root: "./testdata"
+        index:
+          - "index.html"
+    proxy:
+      - path: "/api"
+        targets:
+          - url: "http://127.0.0.1:18081"
+
+monitoring:
+  pprof:
+    enabled: true
+    path: "/debug/pprof"
+    allow:
+      - "127.0.0.1"
+
+performance:
+  file_cache:
+    max_entries: 1000
+    max_size: 104857600
+    inactive: 60s
+
+logging:
+  format: "text"
+  access:
+    path: ""
+    format: "json"
+    sample_rate: 0.1
+  error:
+    path: ""
+    level: "warn"
diff --git a/benchmarks/v0.4.0/REPORT.md b/benchmarks/v0.4.0/REPORT.md
index b03a0b0..8389c4f 100644
--- a/benchmarks/v0.4.0/REPORT.md
+++ b/benchmarks/v0.4.0/REPORT.md
@@ -174,13 +174,71 @@
 
 ---
 
-## 7. 原始数据文件
+## 7. 优化实施结果
 
-- `benchmarks/v0.4.0/pprof/cpu.prof` — CPU profile
-- `benchmarks/v0.4.0/pprof/allocs.prof` — 分配 profile
-- `benchmarks/v0.4.0/pprof/heap.prof` — 堆内存 profile
-- `benchmarks/v0.4.0/pprof/goroutine.prof` — Goroutine profile
-- `benchmarks/v0.4.0/cpu-top.txt` — CPU top 函数
-- `benchmarks/v0.4.0/allocs-top.txt` — 分配 top 函数
+### Task A: 访问日志采样 (accesslog)
+
+**实现**:
+- 新增 `logging.access.sample_rate` 配置(0.0~1.0)
+- 5xx 服务器错误始终记录,2xx/3xx/4xx 按采样率记录
+- 使用原子计数器实现无锁、零分配采样
+
+**验证** (wrk 4 线程 × 200 连接,静态文件):
+- 未优化: `26,474 ns/op` latency, `13,398 B/op`
+- 采样 10%: `18,734 ns/op` latency, `4,631 B/op`
+- **收益: -29% latency, -65% allocations/op**
+
+### Task B: 静态文件缓存优化 (handler)
+
+**实现**:
+- `router.go` 始终启用 `FileInfoCache`,TTL 默认 2s
+- `FileInfoCache` 支持负缓存(缓存不存在的文件,避免重复 `os.Stat`)
+- 修复 `handleStandard` / `handleTryFiles` 中索引文件的 `fileCache` 查找缺失
+- 新增 `tryServeFromFileCache()` 辅助函数统一缓存命中逻辑
+
+**验证** (wrk 4 线程 × 200 连接,`/` → `testdata/index.html`):
+- 未启用 fileCache: `~140k req/sec`, `~2.6GB alloc_space`
+- 启用并修复索引文件缓存后: `~242k req/sec`, `~4.6MB alloc_space`
+- **收益: +73% throughput, -99.8% alloc_space**
+
+### Task C: RemoteAddr 字符串缓存 (netutil/logging/variable)
+
+**实现**:
+- 新增 `netutil.FormatRemoteAddr()`,优先使用 `ctx.RemoteIP()`
+- IPv4 走零分配快速路径(手写 uint8 → ASCII)
+- IPv6 回退到 `addr.String()`,使用 1024 条目 LRU 缓存
+- `logging.LogAccess` 和 `variable.$remote_addr/$remote_port` 统一使用
+
+**效果**:
+- 消除了 `net.JoinHostPort` 和 `net.IP.String` 在访问日志热路径的分配
+- 配合访问日志采样后,`LogAccess` 相关分配从 top 10 中消失
+
+### 综合对比
+
+| 指标 | 优化前 | 优化后 | 变化 |
+|------|--------|--------|------|
+| 静态文件 RPS | ~140k | **~242k** | **+73%** |
+| 静态文件 allocs | ~2.6 GB | **~4.6 MB** | **-99.8%** |
+| 访问日志 latency | 26.5 μs | 18.7 μs | -29% |
+| 访问日志 allocs | 13.4 KB/op | 4.6 KB/op | -65% |
+| CPU 热点 LogAccess | 16.36% cum | 未进入 top 10 | 消除 |
+| 内存热点 os.statNolog | 74.95% flat | 未出现 | 消除 |
+
+## 8. 剩余优化机会
+
+- **bufio.Reader/Writer 池化**: heap 中仍占主导,代理路径可优化
+- **连接池调优**: `net.Dialer.DialContext` 在代理路径仍有分配
+- **系统调用基线**: syscall 仍占 60%+ CPU,io_uring 可进一步挖掘
+
+## 9. 原始数据文件
+
+- `benchmarks/v0.4.0/pprof/v2/cpu-final.prof` — 优化后 CPU profile
+- `benchmarks/v0.4.0/pprof/v2/allocs-final.prof` — 优化后分配 profile
+- `benchmarks/v0.4.0/cpu-top-final.txt` — 优化后 CPU top 函数
+- `benchmarks/v0.4.0/allocs-top-final.txt` — 优化后分配 top 函数
+- `benchmarks/v0.4.0/pprof/cpu.prof` — 原始 CPU profile(保留)
+- `benchmarks/v0.4.0/pprof/allocs.prof` — 原始分配 profile(保留)
+- `benchmarks/v0.4.0/cpu-top.txt` — 原始 CPU top 函数
+- `benchmarks/v0.4.0/allocs-top.txt` — 原始分配 top 函数
 - `benchmarks/v0.4.0/heap-top.txt` — 堆内存 top 函数
 - `benchmarks/v0.4.0/summary.txt` — 基准测试汇总
diff --git a/scripts/bench-compare.sh b/scripts/bench-compare.sh
new file mode 100755
index 0000000..ff81a7c
--- /dev/null
+++ b/scripts/bench-compare.sh
@@ -0,0 +1,159 @@
+#!/usr/bin/env bash
+# bench-compare.sh — compare two benchmark summaries and detect performance
+# regressions.
+#
+# Usage:
+#   ./scripts/bench-compare.sh <old-summary> <new-summary>
+#
+# Environment (optional):
+#   LATENCY_THRESH  allowed ns/op increase, in percent (default: 10.0)
+#   ALLOCS_THRESH   allowed B/op increase, in percent (default: 10.0)
+#
+# Exit codes:
+#   0 — no significant regression
+#   1 — significant regression detected
+#   2 — usage error or summary file not found
+#
+# Example:
+#   ./scripts/bench-compare.sh benchmarks/v0.4.0/summary.txt benchmarks/v0.5.0/summary.txt
+
+set -euo pipefail
+
+OLD="${1:-}"
+NEW="${2:-}"
+
+if [[ -z "$OLD" || -z "$NEW" ]]; then
+  echo "用法: $0 <old-summary> <new-summary>" >&2
+  exit 2
+fi
+
+if [[ ! -f "$OLD" ]]; then
+  echo "错误: 找不到旧摘要文件: $OLD" >&2
+  exit 2
+fi
+
+if [[ ! -f "$NEW" ]]; then
+  echo "错误: 找不到新摘要文件: $NEW" >&2
+  exit 2
+fi
+
+LATENCY_THRESH="${LATENCY_THRESH:-10.0}"
+ALLOCS_THRESH="${ALLOCS_THRESH:-10.0}"
+# Not read by any comparison below (none of them is higher-is-better).
+# shellcheck disable=SC2034
+RPS_THRESH="${RPS_THRESH:--10.0}"
+
+echo "=== 基准比较 ==="
+printf "%-40s %12s %12s %12s\n" "Benchmark" "Old" "New" "Change%"
+echo "----------------------------------------------------------------------"
+
+REGRESSION=0
+
+# Print the number that precedes METRIC on the first line of FILE that starts
+# with BENCH and contains "<number> METRIC"; print nothing if no line does.
+#
+# One awk pass replaces a grep | grep | head pipeline: under
+# `set -euo pipefail` a grep without a match fails the caller's assignment
+# and aborts the script with status 1 before the missing-value check runs.
+#
+# Arguments: $1 - FILE, $2 - BENCH (name prefix), $3 - METRIC (e.g. "ns/op")
+extract_metric() {
+  local file="$1"
+  local bench="$2"
+  local metric="$3"
+
+  awk -v bench="$bench" -v metric="$metric" '
+    index($0, bench) == 1 && match($0, "[0-9]+([.][0-9]+)?[ \t]*" metric) {
+      value = substr($0, RSTART, RLENGTH)
+      sub("[ \t]*" metric "$", "", value)
+      print value
+      exit
+    }
+  ' "$file"
+}
+
+# Compare one metric of one benchmark between $OLD and $NEW, print a table
+# row, and set REGRESSION=1 when it moved in the bad direction by more than
+# the threshold.
+#
+# Summary line format: BenchmarkName-NN 1234 ns/op 567 B/op 890 allocs/op
+#
+# Arguments:
+#   $1 - benchmark name prefix
+#   $2 - metric unit ("ns/op", "B/op")
+#   $3 - threshold, in percent
+#   $4 - 1 if lower is better (default), 0 if higher is better
+compare_metric() {
+  local bench="$1"
+  local metric="$2"
+  local thresh="$3"
+  local better_is_lower="${4:-1}"
+
+  local old_val new_val
+  old_val=$(extract_metric "$OLD" "$bench" "$metric")
+  new_val=$(extract_metric "$NEW" "$bench" "$metric")
+
+  # Benchmark or metric absent from either summary: nothing to compare.
+  if [[ -z "$old_val" || -z "$new_val" ]]; then
+    return 0
+  fi
+
+  # A zero baseline has no meaningful percentage change.
+  if awk -v o="$old_val" 'BEGIN { exit (o == 0) ? 0 : 1 }'; then
+    printf "%-40s %12s %12s %11s%%\n" "$bench ($metric)" "$old_val" "$new_val" "N/A"
+    return 0
+  fi
+
+  local change
+  change=$(awk -v o="$old_val" -v n="$new_val" 'BEGIN { printf "%.2f", ((n - o) / o) * 100 }')
+
+  printf "%-40s %12s %12s %11s%%\n" "$bench ($metric)" "$old_val" "$new_val" "$change"
+
+  # "up"/"down" when |change| exceeds the threshold, otherwise "ok".
+  local verdict
+  verdict=$(awk -v c="$change" -v t="$thresh" 'BEGIN {
+    a = (c < 0) ? -c : c
+    if (a > t && c > 0) { print "up" }
+    else if (a > t && c < 0) { print "down" }
+    else { print "ok" }
+  }')
+
+  if [[ "$better_is_lower" == "1" && "$verdict" == "up" ]]; then
+    echo " ⚠️ 回归警告: $bench $metric 增加 ${change}% (阈值 ${thresh}%)" >&2
+    REGRESSION=1
+  elif [[ "$better_is_lower" == "0" && "$verdict" == "down" ]]; then
+    echo " ⚠️ 回归警告: $bench $metric 降低 ${change}% (阈值 ${thresh}%)" >&2
+    REGRESSION=1
+  fi
+}
+
+# Benchmark name prefixes to compare (prefix match).
+BENCHMARKS=(
+  "BenchmarkAccessLogProcess"
+  "BenchmarkFileCacheGet"
+  "BenchmarkProxyCacheGet"
+  "BenchmarkStaticFile"
+  "BenchmarkStaticIndex"
+  "BenchmarkStaticTryFiles"
+  "BenchmarkProxyForward"
+  "BenchmarkProxyHostClient"
+  "BenchmarkProxyWithMockBackend"
+  "BenchmarkMiddlewareProcessChain"
+  "BenchmarkMiddlewareChainExecution"
+  "BenchmarkCompressionMiddleware"
+  "BenchmarkDNSResolverLookupWithCache"
+)
+
+for bench in "${BENCHMARKS[@]}"; do
+  compare_metric "$bench" "ns/op" "$LATENCY_THRESH" 1
+  compare_metric "$bench" "B/op" "$ALLOCS_THRESH" 1
+done
+
+echo ""
+if [[ "$REGRESSION" -eq 0 ]]; then
+  echo "✅ 未检测到显著性能回归"
+  exit 0
+else
+  echo "❌ 检测到性能回归,请检查上述警告"
+  exit 1
+fi