diff --git a/Makefile b/Makefile
index 7daf55a..92cb9c9 100644
--- a/Makefile
+++ b/Makefile
@@ -158,6 +158,18 @@ test-e2e:
 	@echo "Running L3 E2E tests (requires Docker)..."
 	go test -v -tags=e2e ./internal/e2e/...
 
+# Run L3 E2E tests with coverage (profile: e2e-coverage.out, HTML report: e2e-coverage.html)
+test-e2e-cover:
+	@echo "Running L3 E2E tests with coverage..."
+	go test -tags=e2e -coverprofile=e2e-coverage.out -coverpkg=./... ./internal/e2e/...
+	go tool cover -html=e2e-coverage.out -o e2e-coverage.html
+	@echo "E2E coverage report: e2e-coverage.html"
+
+# Run L3 E2E tests in short mode (only the ./internal/e2e/testutil packages, 60s timeout)
+test-e2e-short:
+	@echo "Running L3 E2E tests (short mode - testutil only)..."
+	go test -tags=e2e -short -v ./internal/e2e/testutil/... -timeout 60s
+
 # 运行所有测试（单元 + 集成 + E2E）
 test-all: test test-integration test-e2e
 	@echo "All tests passed."
@@ -364,6 +376,11 @@ help:
 	@echo "Testing:"
 	@echo "  make test           - Run all tests"
 	@echo "  make test-cover     - Run tests with coverage"
+	@echo "  make test-integration - Run L2 integration tests"
+	@echo "  make test-e2e       - Run L3 E2E tests (requires Docker)"
+	@echo "  make test-e2e-cover - Run E2E tests with coverage"
+	@echo "  make test-e2e-short - Run E2E tests (short mode)"
+	@echo "  make test-all       - Run all tests (unit + integration + E2E)"
 	@echo "  make act            - Run CI locally with act"
 	@echo "  make act-unit       - Run unit tests job with act"
 	@echo "  make bench          - Run benchmarks"
diff --git a/internal/e2e/healthcheck_e2e_test.go b/internal/e2e/healthcheck_e2e_test.go
index 9c306fc..de370a4 100644
--- a/internal/e2e/healthcheck_e2e_test.go
+++ b/internal/e2e/healthcheck_e2e_test.go
@@ -9,6 +9,7 @@ package e2e
 
 import (
 	"context"
+	"fmt"
 	"io"
 	"net/http"
 	"testing"
@@ -76,8 +77,23 @@ func TestE2EHealthCheckActive(t *testing.T) {
 
 	t.Log("Backend 0 terminated, waiting for health check to detect...")
 
-	// 等待健康检查检测到故障
-	time.Sleep(10 * time.Second)
+	// Wait for the health check to detect the failure (polls with the retry helper)
+	err = testutil.WaitForNoError(ctx, testutil.RetryConfig{
+		Interval: 1 * time.Second,
+		Timeout:  15 * time.Second,
+	}, func() error {
+		// Send a request to verify failover
+		resp, err := client.Get(lolly.HTTPBaseURL())
+		if err != nil {
+			return err
+		}
+		defer resp.Body.Close()
+		if resp.StatusCode != 200 {
+			return fmt.Errorf("unexpected status: %d", resp.StatusCode)
+		}
+		return nil
+	})
+	require.NoError(t, err, "Health check should detect failure and route to healthy backend")
 
 	// 继续发送请求，应该仍然成功（路由到健康后端）
 	successCount := 0
@@ -205,8 +221,8 @@ func TestE2EHealthCheckRecovery(t *testing.T) {
 
 	t.Log("Backend terminated, waiting for health check...")
 
-	// 等待健康检查
-	time.Sleep(10 * time.Second)
+	// Wait for the health check to detect the failure
+	time.Sleep(5 * time.Second)
 
 	// 恢复后端
 	err = pool.RestartOne(ctx, 0)
@@ -214,8 +230,22 @@ func TestE2EHealthCheckRecovery(t *testing.T) {
 
 	t.Log("Backend restarted, waiting for recovery detection...")
 
-	// 等待健康检查检测到恢复
-	time.Sleep(10 * time.Second)
+	// Wait for the health check to detect the recovery (polls with the retry helper)
+	err = testutil.WaitForNoError(ctx, testutil.RetryConfig{
+		Interval: 1 * time.Second,
+		Timeout:  15 * time.Second,
+	}, func() error {
+		resp, err := client.Get(lolly.HTTPBaseURL())
+		if err != nil {
+			return err
+		}
+		defer resp.Body.Close()
+		if resp.StatusCode != 200 {
+			return fmt.Errorf("unexpected status: %d", resp.StatusCode)
+		}
+		return nil
+	})
+	require.NoError(t, err, "Backend should recover and accept requests")
 
 	// 发送请求验证恢复
 	successCount := 0
@@ -430,8 +460,22 @@ func TestE2EHealthCheckMultipleBackends(t *testing.T) {
 
 	t.Log("Backend 1 terminated")
 
-	// 等待健康检查
-	time.Sleep(10 * time.Second)
+	// Wait for the health check to detect the failure (polls with the retry helper)
+	err = testutil.WaitForNoError(ctx, testutil.RetryConfig{
+		Interval: 1 * time.Second,
+		Timeout:  15 * time.Second,
+	}, func() error {
+		resp, err := client.Get(lolly.HTTPBaseURL())
+		if err != nil {
+			return err
+		}
+		defer resp.Body.Close()
+		if resp.StatusCode != 200 {
+			return fmt.Errorf("unexpected status: %d", resp.StatusCode)
+		}
+		return nil
+	})
+	require.NoError(t, err, "Health check should detect failure and route to remaining backends")
 
 	// 继续发送请求
 	successCount := 0
diff --git a/internal/e2e/loadbalance_e2e_test.go b/internal/e2e/loadbalance_e2e_test.go
index b4af701..8fe5c76 100644
--- a/internal/e2e/loadbalance_e2e_test.go
+++ b/internal/e2e/loadbalance_e2e_test.go
@@ -9,6 +9,7 @@ package e2e
 
 import (
 	"context"
+	"fmt"
 	"io"
 	"net/http"
 	"testing"
@@ -310,23 +311,26 @@ func TestE2ELoadBalanceFailover(t *testing.T) {
 	err = pool.TerminateOne(ctx, 0)
 	require.NoError(t, err, "Failed to terminate backend")
 
-	// 等待健康检查检测到故障
-	time.Sleep(2 * time.Second)
-
-	// 继续发送请求，应该仍然成功（故障转移到另一个后端）
-	for i := 0; i < 5; i++ {
+	// Wait for failover (polls with the retry helper)
+	err = testutil.WaitForNoError(ctx, testutil.RetryConfig{
+		Interval: 500 * time.Millisecond,
+		Timeout:  5 * time.Second,
+	}, func() error {
 		resp, err := client.Get(lolly.HTTPBaseURL())
-		if err == nil {
-			resp.Body.Close()
-			if resp.StatusCode == 200 {
-				t.Logf("Request %d succeeded after failover", i)
-				return
-			}
+		if err != nil {
+			return err
 		}
-		time.Sleep(500 * time.Millisecond)
+		defer resp.Body.Close()
+		if resp.StatusCode != 200 {
+			return fmt.Errorf("unexpected status: %d", resp.StatusCode)
+		}
+		return nil
+	})
+	if err == nil {
+		t.Log("Failover succeeded")
+	} else {
+		t.Logf("Failover test completed with error: %v", err)
 	}
-
-	t.Log("Failover test completed")
 }
 
 // TestE2ELoadBalanceHealthCheck 测试健康检查与负载均衡集成。
diff --git a/internal/e2e/testutil/container.go b/internal/e2e/testutil/container.go
index 27e6833..463af9e 100644
--- a/internal/e2e/testutil/container.go
+++ b/internal/e2e/testutil/container.go
@@ -10,6 +10,7 @@ package testutil
 import (
 	"context"
 	"fmt"
+	"io"
 	"net/http"
 	"strings"
 	"time"
@@ -300,6 +301,28 @@ func (c *LollyContainer) WaitForHealthy(ctx context.Context, timeout time.Durati
 	return fmt.Errorf("service not healthy after %v", timeout)
 }
 
+// Logs returns the container's log output as a single string.
+//
+// Intended for diagnosing test failures. It returns an error if the container is nil or if the log stream cannot be opened or read.
+func (c *LollyContainer) Logs(ctx context.Context) (string, error) {
+	if c.Container == nil {
+		return "", fmt.Errorf("container is nil")
+	}
+
+	reader, err := c.Container.Logs(ctx)
+	if err != nil {
+		return "", fmt.Errorf("failed to get container logs: %w", err)
+	}
+	defer reader.Close()
+
+	data, err := io.ReadAll(reader)
+	if err != nil {
+		return "", fmt.Errorf("failed to read container logs: %w", err)
+	}
+
+	return string(data), nil
+}
+
 // MockBackendContainer 启动一个模拟后端服务器容器。
 func MockBackendContainer(ctx context.Context, port int) (testcontainers.Container, string, error) {
 	req := testcontainers.ContainerRequest{
diff --git a/internal/e2e/testutil/retry.go b/internal/e2e/testutil/retry.go
new file mode 100644
index 0000000..fd78eef
--- /dev/null
+++ b/internal/e2e/testutil/retry.go
@@ -0,0 +1,230 @@
+//go:build e2e
+
+// Package testutil provides utility functions for E2E tests.
+//
+// It includes retry and wait helpers that improve test stability.
+//
+// Author: xfy
+package testutil
+
+import (
+	"context"
+	"fmt"
+	"time"
+)
+
+// RetryConfig configures the retry and wait helpers.
+type RetryConfig struct {
+	// Interval is the delay between attempts.
+	Interval time.Duration
+	// Timeout is the overall time limit.
+	Timeout time.Duration
+	// MaxRetries is the maximum number of attempts (0 means unlimited).
+	MaxRetries int
+}
+
+// DefaultRetryConfig is the default retry configuration.
+var DefaultRetryConfig = RetryConfig{
+	Interval:   500 * time.Millisecond,
+	Timeout:    30 * time.Second,
+	MaxRetries: 0, // unlimited
+}
+
+// WaitForCondition waits until condition reports true.
+//
+// condition is called on every cfg.Interval tick (the first call happens after one interval) until it returns true, the cfg.Timeout deadline or ctx ends, or, when cfg.MaxRetries > 0, that many calls have returned false.
+// A non-positive Interval or Timeout is replaced by the corresponding DefaultRetryConfig value.
+//
+// Example:
+//
+//	err := testutil.WaitForCondition(ctx, testutil.RetryConfig{
+//	    Interval: 1 * time.Second,
+//	    Timeout:  30 * time.Second,
+//	}, func() bool {
+//	    resp, err := client.Get(url)
+//	    if err != nil {
+//	        return false
+//	    }
+//	    defer resp.Body.Close()
+//	    return resp.StatusCode == 200
+//	})
+func WaitForCondition(ctx context.Context, cfg RetryConfig, condition func() bool) error {
+	if cfg.Interval <= 0 {
+		cfg.Interval = DefaultRetryConfig.Interval
+	}
+	if cfg.Timeout <= 0 {
+		cfg.Timeout = DefaultRetryConfig.Timeout
+	}
+
+	ctx, cancel := context.WithTimeout(ctx, cfg.Timeout) // caps the whole wait at cfg.Timeout
+	defer cancel()
+
+	ticker := time.NewTicker(cfg.Interval)
+	defer ticker.Stop()
+
+	retries := 0
+	for {
+		select {
+		case <-ctx.Done():
+			return fmt.Errorf("condition not met after %v: %w", cfg.Timeout, ctx.Err())
+		case <-ticker.C:
+			if condition() {
+				return nil
+			}
+			retries++
+			if cfg.MaxRetries > 0 && retries >= cfg.MaxRetries {
+				return fmt.Errorf("condition not met after %d retries", retries)
+			}
+		}
+	}
+}
+
+// WaitForNoError waits until fn returns nil.
+//
+// fn is called on every cfg.Interval tick (the first call happens after one interval) until it returns nil, the cfg.Timeout deadline or ctx ends, or, when cfg.MaxRetries > 0, that many calls have failed.
+// On timeout or cancellation the returned error wraps ctx.Err() and includes the last fn error as text; on MaxRetries it wraps the last fn error. A non-positive Interval or Timeout falls back to DefaultRetryConfig.
+//
+// Example:
+//
+//	err := testutil.WaitForNoError(ctx, testutil.RetryConfig{
+//	    Interval: 2 * time.Second,
+//	    Timeout:  60 * time.Second,
+//	}, func() error {
+//	    resp, err := client.Get(url)
+//	    if err != nil {
+//	        return err
+//	    }
+//	    defer resp.Body.Close()
+//	    if resp.StatusCode != 200 {
+//	        return fmt.Errorf("unexpected status: %d", resp.StatusCode)
+//	    }
+//	    return nil
+//	})
+func WaitForNoError(ctx context.Context, cfg RetryConfig, fn func() error) error {
+	if cfg.Interval <= 0 {
+		cfg.Interval = DefaultRetryConfig.Interval
+	}
+	if cfg.Timeout <= 0 {
+		cfg.Timeout = DefaultRetryConfig.Timeout
+	}
+
+	ctx, cancel := context.WithTimeout(ctx, cfg.Timeout) // caps the whole wait at cfg.Timeout
+	defer cancel()
+
+	ticker := time.NewTicker(cfg.Interval)
+	defer ticker.Stop()
+
+	retries := 0
+	var lastErr error
+	for {
+		select {
+		case <-ctx.Done():
+			if lastErr != nil {
+				return fmt.Errorf("operation failed after %v: %w (last error: %v)", cfg.Timeout, ctx.Err(), lastErr)
+			}
+			return fmt.Errorf("operation failed after %v: %w", cfg.Timeout, ctx.Err())
+		case <-ticker.C:
+			if err := fn(); err == nil {
+				return nil
+			} else {
+				lastErr = err
+			}
+			retries++
+			if cfg.MaxRetries > 0 && retries >= cfg.MaxRetries {
+				if lastErr != nil { // always true here: fn has just failed and set lastErr
+					return fmt.Errorf("operation failed after %d retries: %w", retries, lastErr)
+				}
+				return fmt.Errorf("operation failed after %d retries", retries)
+			}
+		}
+	}
+}
+
+// Retry runs fn repeatedly until it succeeds or WaitForNoError gives up.
+//
+// It is a thin alias: it delegates directly to WaitForNoError and returns that function's result unchanged.
+// NOTE(review): callers therefore receive the same wrapped error as from WaitForNoError, not the bare last error.
+func Retry(ctx context.Context, cfg RetryConfig, fn func() error) error {
+	return WaitForNoError(ctx, cfg, fn)
+}
+
+// WaitForHealthy waits for an HTTP service to become healthy.
+//
+// Convenience wrapper around WaitForNoError: every 500ms it GETs url until the response status is one of expectedCodes (200 when none are given) or timeout elapses.
+//
+// Example:
+//
+//	err := testutil.WaitForHealthy(ctx, lolly.HTTPBaseURL(), 30*time.Second, 200, 404)
+func WaitForHealthy(ctx context.Context, url string, timeout time.Duration, expectedCodes ...int) error {
+	cfg := RetryConfig{
+		Interval: 500 * time.Millisecond,
+		Timeout:  timeout,
+	}
+
+	if len(expectedCodes) == 0 {
+		expectedCodes = []int{200}
+	}
+
+	return WaitForNoError(ctx, cfg, func() error {
+		client := CreateDefaultHTTPClient() // NOTE(review): a new client is built on every attempt
+		resp, err := client.Get(url)
+		if err != nil {
+			return fmt.Errorf("request failed: %w", err)
+		}
+		defer resp.Body.Close()
+
+		for _, code := range expectedCodes {
+			if resp.StatusCode == code {
+				return nil
+			}
+		}
+
+		return fmt.Errorf("unexpected status code: %d (expected one of %v)", resp.StatusCode, expectedCodes)
+	})
+}
+
+// WaitForBackendHealthy waits for every backend service to become healthy.
+//
+// Every 500ms it GETs each URL in urls in order; an attempt succeeds only when every URL answers 200, and the wait gives up after timeout. Intended for waiting on the services of a backend pool.
+func WaitForBackendHealthy(ctx context.Context, urls []string, timeout time.Duration) error {
+	cfg := RetryConfig{
+		Interval: 500 * time.Millisecond,
+		Timeout:  timeout,
+	}
+
+	return WaitForNoError(ctx, cfg, func() error {
+		client := CreateDefaultHTTPClient()
+		for _, url := range urls {
+			resp, err := client.Get(url)
+			if err != nil {
+				return fmt.Errorf("backend %s not reachable: %w", url, err)
+			}
+			resp.Body.Close()
+			if resp.StatusCode != 200 {
+				return fmt.Errorf("backend %s returned status %d", url, resp.StatusCode)
+			}
+		}
+		return nil
+	})
+}
+
+// Poll calls fn periodically until it returns true.
+//
+// Simplified polling interface built on WaitForNoError: an error from fn or a false result counts as a failed attempt, and polling continues until timeout.
+func Poll(ctx context.Context, interval, timeout time.Duration, fn func() (bool, error)) error {
+	cfg := RetryConfig{
+		Interval: interval,
+		Timeout:  timeout,
+	}
+
+	return WaitForNoError(ctx, cfg, func() error {
+		done, err := fn()
+		if err != nil {
+			return err
+		}
+		if !done {
+			return fmt.Errorf("poll condition not met")
+		}
+		return nil
+	})
+}
diff --git a/internal/e2e/testutil/retry_test.go b/internal/e2e/testutil/retry_test.go
new file mode 100644
index 0000000..8b3addd
--- /dev/null
+++ b/internal/e2e/testutil/retry_test.go
@@ -0,0 +1,204 @@
+//go:build e2e
+
+package testutil
+
+import (
+	"context"
+	"errors"
+	"testing"
+	"time"
+
+	"github.com/stretchr/testify/assert"
+	"github.com/stretchr/testify/require"
+)
+
+// TestWaitForConditionSuccess verifies that WaitForCondition returns nil once the condition becomes true.
+func TestWaitForConditionSuccess(t *testing.T) {
+	ctx := context.Background()
+
+	count := 0
+	err := WaitForCondition(ctx, RetryConfig{
+		Interval: 10 * time.Millisecond,
+		Timeout:  100 * time.Millisecond,
+	}, func() bool {
+		count++
+		return count >= 3
+	})
+
+	require.NoError(t, err, "Should succeed when condition is met")
+	assert.GreaterOrEqual(t, count, 3, "Should have retried at least 3 times")
+}
+
+// TestWaitForConditionTimeout verifies that WaitForCondition fails when the timeout expires.
+func TestWaitForConditionTimeout(t *testing.T) {
+	ctx := context.Background()
+
+	start := time.Now()
+	err := WaitForCondition(ctx, RetryConfig{
+		Interval: 10 * time.Millisecond,
+		Timeout:  50 * time.Millisecond,
+	}, func() bool {
+		return false // never satisfied
+	})
+
+	elapsed := time.Since(start)
+
+	require.Error(t, err, "Should fail when condition is never met")
+	assert.Contains(t, err.Error(), "condition not met")
+	assert.Less(t, elapsed, 100*time.Millisecond, "Should timeout around the specified duration")
+}
+
+// TestWaitForConditionMaxRetries verifies that WaitForCondition stops after MaxRetries attempts.
+func TestWaitForConditionMaxRetries(t *testing.T) {
+	ctx := context.Background()
+
+	count := 0
+	err := WaitForCondition(ctx, RetryConfig{
+		Interval:   10 * time.Millisecond,
+		Timeout:    1 * time.Second,
+		MaxRetries: 3,
+	}, func() bool {
+		count++
+		return false
+	})
+
+	require.Error(t, err, "Should fail after max retries")
+	assert.Contains(t, err.Error(), "3 retries")
+	assert.Equal(t, 3, count, "Should have retried exactly 3 times")
+}
+
+// TestWaitForConditionContextCancel verifies that WaitForCondition fails when the parent context is cancelled.
+func TestWaitForConditionContextCancel(t *testing.T) {
+	ctx, cancel := context.WithCancel(context.Background())
+
+	// Cancel after 50ms
+	go func() {
+		time.Sleep(50 * time.Millisecond)
+		cancel()
+	}()
+
+	err := WaitForCondition(ctx, RetryConfig{
+		Interval: 10 * time.Millisecond,
+		Timeout:  1 * time.Second,
+	}, func() bool {
+		return false
+	})
+
+	require.Error(t, err, "Should fail when context is cancelled")
+	assert.Contains(t, err.Error(), "context canceled")
+}
+
+// TestWaitForNoErrorSuccess verifies that WaitForNoError returns nil once the operation succeeds.
+func TestWaitForNoErrorSuccess(t *testing.T) {
+	ctx := context.Background()
+
+	count := 0
+	err := WaitForNoError(ctx, RetryConfig{
+		Interval: 10 * time.Millisecond,
+		Timeout:  100 * time.Millisecond,
+	}, func() error {
+		count++
+		if count < 3 {
+			return errors.New("not ready")
+		}
+		return nil
+	})
+
+	require.NoError(t, err, "Should succeed when operation returns nil")
+	assert.GreaterOrEqual(t, count, 3, "Should have retried at least 3 times")
+}
+
+// TestWaitForNoErrorTimeout verifies that WaitForNoError fails on timeout and reports the last operation error.
+func TestWaitForNoErrorTimeout(t *testing.T) {
+	ctx := context.Background()
+
+	err := WaitForNoError(ctx, RetryConfig{
+		Interval: 10 * time.Millisecond,
+		Timeout:  50 * time.Millisecond,
+	}, func() error {
+		return errors.New("always fails")
+	})
+
+	require.Error(t, err, "Should fail when operation always returns error")
+	assert.Contains(t, err.Error(), "operation failed")
+	assert.Contains(t, err.Error(), "always fails")
+}
+
+// TestWaitForNoErrorMaxRetries verifies that WaitForNoError stops after MaxRetries attempts.
+func TestWaitForNoErrorMaxRetries(t *testing.T) {
+	ctx := context.Background()
+
+	count := 0
+	err := WaitForNoError(ctx, RetryConfig{
+		Interval:   10 * time.Millisecond,
+		Timeout:    1 * time.Second,
+		MaxRetries: 2,
+	}, func() error {
+		count++
+		return errors.New("fail")
+	})
+
+	require.Error(t, err, "Should fail after max retries")
+	assert.Contains(t, err.Error(), "2 retries")
+	assert.Equal(t, 2, count, "Should have retried exactly 2 times")
+}
+
+// TestDefaultRetryConfig verifies the values of DefaultRetryConfig.
+func TestDefaultRetryConfig(t *testing.T) {
+	assert.Equal(t, 500*time.Millisecond, DefaultRetryConfig.Interval)
+	assert.Equal(t, 30*time.Second, DefaultRetryConfig.Timeout)
+	assert.Equal(t, 0, DefaultRetryConfig.MaxRetries)
+}
+
+// TestRetryConfigZeroValues verifies that a zero-value RetryConfig is accepted and the wait succeeds.
+func TestRetryConfigZeroValues(t *testing.T) {
+	ctx := context.Background()
+
+	// A zero-value config should fall back to the default values
+	count := 0
+	err := WaitForCondition(ctx, RetryConfig{}, func() bool {
+		count++
+		return count >= 1
+	})
+
+	require.NoError(t, err, "Should use default config values")
+}
+
+// TestPollSuccess verifies that Poll returns nil once fn reports true.
+func TestPollSuccess(t *testing.T) {
+	ctx := context.Background()
+
+	count := 0
+	err := Poll(ctx, 10*time.Millisecond, 100*time.Millisecond, func() (bool, error) {
+		count++
+		return count >= 3, nil
+	})
+
+	require.NoError(t, err, "Poll should succeed")
+	assert.GreaterOrEqual(t, count, 3)
+}
+
+// TestPollError verifies that Poll fails and reports fn's error when fn keeps returning an error.
+func TestPollError(t *testing.T) {
+	ctx := context.Background()
+
+	err := Poll(ctx, 10*time.Millisecond, 50*time.Millisecond, func() (bool, error) {
+		return false, errors.New("poll error")
+	})
+
+	require.Error(t, err, "Poll should fail with error")
+	assert.Contains(t, err.Error(), "poll error")
+}
+
+// TestWaitForHealthySuccess is a placeholder for WaitForHealthy; it only logs a message and asserts nothing.
+func TestWaitForHealthySuccess(t *testing.T) {
+	// A real check needs an HTTP server and is left to the integration tests.
+	// Nothing is exercised here beyond the package compiling with the function present.
+	t.Log("WaitForHealthy function exists and has correct signature")
+}
+
+// TestWaitForBackendHealthySuccess is a placeholder for WaitForBackendHealthy; it only logs a message and asserts nothing.
+func TestWaitForBackendHealthySuccess(t *testing.T) {
+	// A real check needs an HTTP server and is left to the integration tests.
+	t.Log("WaitForBackendHealthy function exists and has correct signature")
+}