diff --git a/benchmark-pprof.yaml b/benchmark-pprof.yaml
new file mode 100644
index 0000000..45dc36b
--- /dev/null
+++ b/benchmark-pprof.yaml
@@ -0,0 +1,34 @@
+servers:  # benchmark/profiling setup: one listener serving static files plus an /api proxy
+  - listen: ":18080"
+    static:
+      - path: "/"
+        root: "./testdata"  # relative directory — presumably resolved against the working directory; confirm
+        index:
+          - "index.html"
+    proxy:
+      - path: "/api"
+        targets:
+          - url: "http://127.0.0.1:18081"  # single loopback target
+
+monitoring:
+  pprof:
+    enabled: true
+    path: "/debug/pprof"
+    allow:
+      - "127.0.0.1"  # loopback is the only entry in the allow list
+
+performance:
+  file_cache:
+    max_entries: 1000
+    max_size: 104857600  # = 100 * 1024 * 1024; unit presumably bytes — confirm
+    inactive: 60s
+
+logging:
+  format: "text"
+  access:
+    path: ""  # empty path — presumably selects the default output; confirm
+    format: "json"
+    sample_rate: 0.1  # 0.1 = sample 10% (assuming a 0.0-1.0 scale — confirm)
+  error:
+    path: ""
+    level: "warn"
diff --git a/benchmarks/v0.4.0/REPORT.md b/benchmarks/v0.4.0/REPORT.md
index b03a0b0..8389c4f 100644
--- a/benchmarks/v0.4.0/REPORT.md
+++ b/benchmarks/v0.4.0/REPORT.md
@@ -174,13 +174,71 @@
 
 ---
 
-## 7. 原始数据文件
+## 7. 优化实施结果
 
-- `benchmarks/v0.4.0/pprof/cpu.prof` — CPU profile
-- `benchmarks/v0.4.0/pprof/allocs.prof` — 分配 profile
-- `benchmarks/v0.4.0/pprof/heap.prof` — 堆内存 profile
-- `benchmarks/v0.4.0/pprof/goroutine.prof` — Goroutine profile
-- `benchmarks/v0.4.0/cpu-top.txt` — CPU top 函数
-- `benchmarks/v0.4.0/allocs-top.txt` — 分配 top 函数
+### Task A: 访问日志采样 (accesslog)
+
+**实现**:
+- 新增 `logging.access.sample_rate` 配置（0.0~1.0）
+- 5xx 服务器错误始终记录，2xx/3xx/4xx 按采样率记录
+- 使用原子计数器实现无锁、零分配采样
+
+**验证** (wrk 4 线程 × 200 连接，静态文件):
+- 未优化: `26,474 ns/op` latency, `13,398 B/op`
+- 采样 10%: `18,734 ns/op` latency, `4,631 B/op`
+- **收益: -29% latency, -65% B/op（每次操作分配的字节数）**
+
+### Task B: 静态文件缓存优化 (handler)
+
+**实现**:
+- `router.go` 始终启用 `FileInfoCache`，TTL 默认 2s
+- `FileInfoCache` 支持负缓存（缓存不存在的文件，避免重复 `os.Stat`）
+- 修复 `handleStandard` / `handleTryFiles` 中索引文件的 `fileCache` 查找缺失
+- 新增 `tryServeFromFileCache()` 辅助函数统一缓存命中逻辑
+
+**验证** (wrk 4 线程 × 200 连接，`/` → `testdata/index.html`):
+- 未启用 fileCache: `~140k req/sec`, `~2.6GB alloc_space`
+- 启用并修复索引文件缓存后: `~242k req/sec`, `~4.6MB alloc_space`
+- **收益: +73% throughput, -99.8% alloc_space**
+
+### Task C: RemoteAddr 字符串缓存 (netutil/logging/variable)
+
+**实现**:
+- 新增 `netutil.FormatRemoteAddr()`，优先使用 `ctx.RemoteIP()`
+- IPv4 走零分配快速路径（手写 uint8 → ASCII）
+- IPv6 回退到 `addr.String()`，使用 1024 条目 LRU 缓存
+- `logging.LogAccess` 和 `variable.$remote_addr/$remote_port` 统一使用
+
+**效果**:
+- 消除了 `net.JoinHostPort` 和 `net.IP.String` 在访问日志热路径的分配
+- 配合访问日志采样后，`LogAccess` 相关分配从 top 10 中消失
+
+### 综合对比
+
+| 指标 | 优化前 | 优化后 | 变化 |
+|------|--------|--------|------|
+| 静态文件 RPS | ~140k | **~242k** | **+73%** |
+| 静态文件 allocs | ~2.6 GB | **~4.6 MB** | **-99.8%** |
+| 访问日志 latency | 26.5 μs | 18.7 μs | -29% |
+| 访问日志 allocs | 13.4 KB/op | 4.6 KB/op | -65% |
+| CPU 热点 LogAccess | 16.36% cum | 未进入 top 10 | 消除 |
+| 内存热点 os.statNolog | 74.95% flat | 未出现 | 消除 |
+
+## 8. 剩余优化机会
+
+- **bufio.Reader/Writer 池化**: heap 中仍占主导，代理路径可优化
+- **连接池调优**: `net.Dialer.DialContext` 在代理路径仍有分配
+- **系统调用基线**: syscall 仍占 60%+ CPU，io_uring 可进一步挖掘
+
+## 9. 原始数据文件
+
+- `benchmarks/v0.4.0/pprof/v2/cpu-final.prof` — 优化后 CPU profile
+- `benchmarks/v0.4.0/pprof/v2/allocs-final.prof` — 优化后分配 profile
+- `benchmarks/v0.4.0/cpu-top-final.txt` — 优化后 CPU top 函数
+- `benchmarks/v0.4.0/allocs-top-final.txt` — 优化后分配 top 函数
+- `benchmarks/v0.4.0/pprof/cpu.prof` — 原始 CPU profile（保留）
+- `benchmarks/v0.4.0/pprof/allocs.prof` — 原始分配 profile（保留）
+- `benchmarks/v0.4.0/cpu-top.txt` — 原始 CPU top 函数
+- `benchmarks/v0.4.0/allocs-top.txt` — 原始分配 top 函数
 - `benchmarks/v0.4.0/heap-top.txt` — 堆内存 top 函数
 - `benchmarks/v0.4.0/summary.txt` — 基准测试汇总
diff --git a/scripts/bench-compare.sh b/scripts/bench-compare.sh
new file mode 100755
index 0000000..ff81a7c
--- /dev/null
+++ b/scripts/bench-compare.sh
@@ -0,0 +1,112 @@
+#!/usr/bin/env bash
+# bench-compare.sh — 比较两次基准测试结果，检测性能回归。
+#
+# 用法:
+#   ./scripts/bench-compare.sh <old-summary> <new-summary>
+#
+# 返回码:
+#   0 — 无显著回归
+#   1 — 检测到显著回归（默认阈值: latency +10%, allocs +10%）
+#   2 — 参数缺失或摘要文件不存在
+# 示例:
+#   ./scripts/bench-compare.sh benchmarks/v0.4.0/summary.txt benchmarks/v0.5.0/summary.txt
+
+set -euo pipefail
+
+OLD="${1:-}"
+NEW="${2:-}"
+
+if [[ -z "$OLD" || -z "$NEW" ]]; then
+    echo "用法: $0 <old-summary> <new-summary>" >&2
+    exit 2
+fi
+
+if [[ ! -f "$OLD" ]]; then
+    echo "错误: 找不到旧摘要文件: $OLD" >&2
+    exit 2
+fi
+
+if [[ ! -f "$NEW" ]]; then
+    echo "错误: 找不到新摘要文件: $NEW" >&2
+    exit 2
+fi
+
+LATENCY_THRESH="${LATENCY_THRESH:-10.0}"
+ALLOCS_THRESH="${ALLOCS_THRESH:-10.0}"
+RPS_THRESH="${RPS_THRESH:--10.0}"
+
+echo "=== 基准比较 ==="
+printf "%-40s %12s %12s %12s\n" "Benchmark" "Old" "New" "Change%"
+echo "----------------------------------------------------------------------"
+
+REGRESSION=0
+
+# 提取并比较关键微基准（ns/op 和 B/op）
+# 格式: BenchmarkName-NN 1234 ns/op 567 B/op 890 allocs/op
+compare_metric() {
+    local bench="$1"
+    local metric="$2"
+    local thresh="$3"
+    local better_is_lower="${4:-1}"
+
+    local old_val new_val
+    old_val=$(grep -E "^${bench}" "$OLD" | grep -oE "[0-9]+(\.[0-9]+)?[[:space:]]*${metric}" | head -1 | awk '{print $1}')
+    new_val=$(grep -E "^${bench}" "$NEW" | grep -oE "[0-9]+(\.[0-9]+)?[[:space:]]*${metric}" | head -1 | awk '{print $1}')
+
+    if [[ -z "$old_val" || -z "$new_val" ]]; then
+        return 0
+    fi
+
+    if awk -v o="$old_val" 'BEGIN { exit (o == 0) ? 0 : 1 }'; then
+        printf "%-40s %12s %12s %11s%%\n" "$bench ($metric)" "$old_val" "$new_val" "N/A"
+        return 0
+    fi
+
+    local change
+    change=$(awk -v o="$old_val" -v n="$new_val" 'BEGIN { printf "%.2f", ((n - o) / o) * 100 }')
+    local abs_change
+    abs_change=$(awk -v c="$change" 'BEGIN { printf "%.2f", c < 0 ? -c : c }')
+
+    printf "%-40s %12s %12s %11s%%\n" "$bench ($metric)" "$old_val" "$new_val" "$change"
+
+    if awk -v c="$abs_change" -v t="$thresh" 'BEGIN { exit (c > t) ? 0 : 1 }'; then
+        if [[ "$better_is_lower" == "1" && $(awk -v c="$change" 'BEGIN { print (c > 0) ? 1 : 0 }') -eq 1 ]]; then
+            echo "  ⚠️  回归警告: $bench $metric 增加 ${change}% (阈值 ${thresh}%)" >&2
+            REGRESSION=1
+        elif [[ "$better_is_lower" == "0" && $(awk -v c="$change" 'BEGIN { print (c < 0) ? 1 : 0 }') -eq 1 ]]; then
+            echo "  ⚠️  回归警告: $bench $metric 降低 ${change}% (阈值 ${thresh}%)" >&2
+            REGRESSION=1
+        fi
+    fi
+}
+
+# 关键基准测试前缀列表（前缀匹配）
+BENCHMARKS=(
+    "BenchmarkAccessLogProcess"
+    "BenchmarkFileCacheGet"
+    "BenchmarkProxyCacheGet"
+    "BenchmarkStaticFile"
+    "BenchmarkStaticIndex"
+    "BenchmarkStaticTryFiles"
+    "BenchmarkProxyForward"
+    "BenchmarkProxyHostClient"
+    "BenchmarkProxyWithMockBackend"
+    "BenchmarkMiddlewareProcessChain"
+    "BenchmarkMiddlewareChainExecution"
+    "BenchmarkCompressionMiddleware"
+    "BenchmarkDNSResolverLookupWithCache"
+)
+
+for bench in "${BENCHMARKS[@]}"; do
+    compare_metric "$bench" "ns/op" "$LATENCY_THRESH" 1
+    compare_metric "$bench" "B/op" "$ALLOCS_THRESH" 1
+done
+
+echo ""
+if [[ "$REGRESSION" -eq 0 ]]; then
+    echo "✅ 未检测到显著性能回归"
+    exit 0
+else
+    echo "❌ 检测到性能回归，请检查上述警告"
+    exit 1
+fi