From fe025e8af3014a1540cd235b949fe87b85e122f6 Mon Sep 17 00:00:00 2001 From: Rick Sanchez <101627789+m4tinbeigi-official@users.noreply.github.com> Date: Wed, 24 Jun 2026 23:31:37 +0330 Subject: [PATCH] feat(xray): add tunnel health monitor (#5480) * feat(xray): add tunnel health monitor * fix(tunnelmonitor): reuse netproxy client and init logger in tests Replace the duplicated newHTTPClient/dialContextWithProxy with netproxy.NewHTTPClient, which centralises the http/https/socks5 handling and avoids the dial-goroutine connection leak on context cancellation. Cap failures at the threshold during cooldown so the counter stays a true consecutive-failure count. Add TestMain to initialise the logger and fix the nil-pointer panic in the success-after-failure path. * fix(tunnelmonitor): observable recovery, signal headroom, and hardening Address the remaining review findings on the tunnel health monitor: - Recovery is now synchronous and observable: the callback calls server.RestartXray() directly and returns its error instead of just enqueuing SIGUSR1, so a failed restart no longer masks as success and arms the cooldown while the tunnel is still down. - Give the OS signal channel headroom (buffer 8) so producers cannot starve a SIGTERM/SIGINT out of the single slot. - Warn at startup when the monitor is enabled without a proxy, since the probe then measures host connectivity rather than the xray tunnel. - Cap failures at the threshold in the nil-recover branch too, matching the cooldown cap. - Document the XUI_TUNNEL_HEALTH_* vars in .env.example and the README. - Add tests for status-code classification, Normalize bounds, New proxy scheme errors, the recovery-error and nil-recover paths, the cooldown cap, and Run context cancellation (coverage 90%). --------- Co-authored-by: Sanaei --- .env.example | 13 + README.md | 7 + internal/tunnelmonitor/monitor.go | 271 +++++++++++++++ internal/tunnelmonitor/monitor_test.go | 454 +++++++++++++++++++++++++ main.go | 29 +- 5 files changed, 773 insertions(+), 1 deletion(-) create mode 100644 internal/tunnelmonitor/monitor.go create mode 100644 internal/tunnelmonitor/monitor_test.go diff --git a/.env.example b/.env.example index ab000be2a..8c8f02429 100644 --- a/.env.example +++ b/.env.example @@ -4,3 +4,16 @@ XUI_LOG_FOLDER=x-ui XUI_BIN_FOLDER=x-ui XUI_INIT_WEB_BASE_PATH=/ # XUI_PORT=8080 + +# Optional tunnel health monitor (disabled by default). It periodically probes a +# URL and restarts xray-core after repeated failures. Point XUI_TUNNEL_HEALTH_PROXY +# at a local xray inbound so the probe tests the tunnel; without it the probe only +# checks host connectivity and a restart will not fix host network issues. A restart +# drops every connected client. +# XUI_TUNNEL_HEALTH_MONITOR=true +# XUI_TUNNEL_HEALTH_PROXY=socks5://127.0.0.1:1080 +# XUI_TUNNEL_HEALTH_URL=https://www.cloudflare.com/cdn-cgi/trace +# XUI_TUNNEL_HEALTH_INTERVAL=30s +# XUI_TUNNEL_HEALTH_TIMEOUT=10s +# XUI_TUNNEL_HEALTH_FAILURES=3 +# XUI_TUNNEL_HEALTH_COOLDOWN=5m diff --git a/README.md b/README.md index 8c6d845ef..1d652c7e0 100644 --- a/README.md +++ b/README.md @@ -146,6 +146,13 @@ docker run -d --cap-add=NET_ADMIN --cap-add=NET_RAW ... ghcr.io/mhsanaei/3x-ui | `XUI_ENABLE_FAIL2BAN` | Enable Fail2ban-based IP-limit enforcement | `true` | | `XUI_LOG_LEVEL` | Log verbosity (`debug`, `info`, `warning`, `error`) | `info` | | `XUI_DEBUG` | Enable debug mode | `false` | +| `XUI_TUNNEL_HEALTH_MONITOR` | Enable the tunnel health monitor (probes a URL and restarts xray after repeated failures; a restart drops all clients) | `false` | +| `XUI_TUNNEL_HEALTH_PROXY` | Proxy the probe is sent through; point it at a local xray inbound so the probe tests the tunnel (e.g. `socks5://127.0.0.1:1080`). Empty means the probe only checks host connectivity | — | +| `XUI_TUNNEL_HEALTH_URL` | URL probed for tunnel health | `https://www.cloudflare.com/cdn-cgi/trace` | +| `XUI_TUNNEL_HEALTH_INTERVAL` | Interval between probes | `30s` | +| `XUI_TUNNEL_HEALTH_TIMEOUT` | Per-probe timeout | `10s` | +| `XUI_TUNNEL_HEALTH_FAILURES` | Consecutive failures before a restart is triggered | `3` | +| `XUI_TUNNEL_HEALTH_COOLDOWN` | Minimum delay between consecutive restarts | `5m` | ## Supported Languages diff --git a/internal/tunnelmonitor/monitor.go b/internal/tunnelmonitor/monitor.go new file mode 100644 index 000000000..2c868efd4 --- /dev/null +++ b/internal/tunnelmonitor/monitor.go @@ -0,0 +1,271 @@ +package tunnelmonitor + +import ( + "context" + "errors" + "fmt" + "io" + "net/http" + "os" + "strconv" + "strings" + "time" + + "github.com/mhsanaei/3x-ui/v3/internal/logger" + "github.com/mhsanaei/3x-ui/v3/internal/util/netproxy" +) + +const ( + defaultHealthURL = "https://www.cloudflare.com/cdn-cgi/trace" + defaultInterval = 30 * time.Second + defaultTimeout = 10 * time.Second + defaultFailureThreshold = 3 + defaultCooldown = 5 * time.Minute +) + +// Config controls the optional tunnel health monitor. +type Config struct { + Enabled bool + URL string + ProxyURL string + Interval time.Duration + Timeout time.Duration + FailureThreshold int + Cooldown time.Duration +} + +// RecoveryFunc performs recovery after the monitor reaches the configured +// failure threshold. The panel wires this to an Xray core restart. +type RecoveryFunc func(context.Context) error + +// Monitor periodically probes a URL and triggers recovery after repeated +// failures. It is intentionally independent from panel settings/UI so it can be +// enabled safely through service environment variables first. +type Monitor struct { + cfg Config + client *http.Client + recover RecoveryFunc + failures int + lastRecovery time.Time + now func() time.Time +} + +// DefaultConfig returns disabled-by-default monitor settings. +func DefaultConfig() Config { + return Config{ + Enabled: false, + URL: defaultHealthURL, + Interval: defaultInterval, + Timeout: defaultTimeout, + FailureThreshold: defaultFailureThreshold, + Cooldown: defaultCooldown, + } +} + +// ConfigFromEnv builds Config from XUI_TUNNEL_HEALTH_* environment variables. +// +// Supported variables: +// - XUI_TUNNEL_HEALTH_MONITOR=true +// - XUI_TUNNEL_HEALTH_URL=https://www.cloudflare.com/cdn-cgi/trace +// - XUI_TUNNEL_HEALTH_PROXY=socks5://127.0.0.1:1080 +// - XUI_TUNNEL_HEALTH_INTERVAL=30s +// - XUI_TUNNEL_HEALTH_TIMEOUT=10s +// - XUI_TUNNEL_HEALTH_FAILURES=3 +// - XUI_TUNNEL_HEALTH_COOLDOWN=5m +func ConfigFromEnv() Config { + cfg := DefaultConfig() + + cfg.Enabled = parseBool(os.Getenv("XUI_TUNNEL_HEALTH_MONITOR")) + cfg.URL = firstNonEmpty(os.Getenv("XUI_TUNNEL_HEALTH_URL"), cfg.URL) + cfg.ProxyURL = strings.TrimSpace(os.Getenv("XUI_TUNNEL_HEALTH_PROXY")) + cfg.Interval = parseDurationEnv("XUI_TUNNEL_HEALTH_INTERVAL", cfg.Interval) + cfg.Timeout = parseDurationEnv("XUI_TUNNEL_HEALTH_TIMEOUT", cfg.Timeout) + cfg.Cooldown = parseDurationEnv("XUI_TUNNEL_HEALTH_COOLDOWN", cfg.Cooldown) + cfg.FailureThreshold = parseIntEnv("XUI_TUNNEL_HEALTH_FAILURES", cfg.FailureThreshold) + + return cfg.Normalize() +} + +// Normalize applies safe bounds and defaults. +func (c Config) Normalize() Config { + if strings.TrimSpace(c.URL) == "" { + c.URL = defaultHealthURL + } + c.URL = strings.TrimSpace(c.URL) + c.ProxyURL = strings.TrimSpace(c.ProxyURL) + + if c.Interval < time.Second { + c.Interval = defaultInterval + } + if c.Timeout < time.Second { + c.Timeout = defaultTimeout + } + if c.FailureThreshold < 1 { + c.FailureThreshold = defaultFailureThreshold + } + if c.Cooldown < time.Second { + c.Cooldown = defaultCooldown + } + + return c +} + +// New creates a monitor with an HTTP client based on cfg. +func New(cfg Config, recover RecoveryFunc) (*Monitor, error) { + cfg = cfg.Normalize() + + client, err := netproxy.NewHTTPClient(cfg.ProxyURL, cfg.Timeout) + if err != nil { + return nil, err + } + + return newWithClient(cfg, client, recover), nil +} + +func newWithClient(cfg Config, client *http.Client, recover RecoveryFunc) *Monitor { + cfg = cfg.Normalize() + if client == nil { + client = &http.Client{Timeout: cfg.Timeout} + } + + return &Monitor{ + cfg: cfg, + client: client, + recover: recover, + now: time.Now, + } +} + +// Run starts the monitor loop until ctx is cancelled. +func (m *Monitor) Run(ctx context.Context) { + if m == nil || !m.cfg.Enabled { + return + } + + logger.Info("Tunnel health monitor enabled: ", m.cfg.URL) + + ticker := time.NewTicker(m.cfg.Interval) + defer ticker.Stop() + + for { + select { + case <-ctx.Done(): + logger.Info("Tunnel health monitor stopped") + return + case <-ticker.C: + recovered, err := m.Step(ctx) + if err != nil { + logger.Warning("Tunnel health monitor check failed: ", err) + } + if recovered { + logger.Warning("Tunnel health monitor triggered Xray restart") + } + } + } +} + +// Step performs one probe and maybe triggers recovery. +func (m *Monitor) Step(ctx context.Context) (bool, error) { + if m == nil { + return false, errors.New("nil monitor") + } + + if err := m.probe(ctx); err != nil { + m.failures++ + + if m.failures < m.cfg.FailureThreshold { + return false, fmt.Errorf("probe failed %d/%d: %w", m.failures, m.cfg.FailureThreshold, err) + } + + now := m.now() + if !m.lastRecovery.IsZero() && now.Sub(m.lastRecovery) < m.cfg.Cooldown { + m.failures = m.cfg.FailureThreshold + return false, fmt.Errorf("probe failed %d/%d; recovery cooldown active: %w", m.failures, m.cfg.FailureThreshold, err) + } + + if m.recover == nil { + m.failures = m.cfg.FailureThreshold + return false, errors.New("recovery function is not configured") + } + + if recErr := m.recover(ctx); recErr != nil { + return false, fmt.Errorf("recovery failed after probe error %v: %w", err, recErr) + } + + m.lastRecovery = now + m.failures = 0 + return true, err + } + + if m.failures > 0 { + logger.Info("Tunnel health monitor recovered after successful probe") + } + m.failures = 0 + return false, nil +} + +func (m *Monitor) probe(ctx context.Context) error { + req, err := http.NewRequestWithContext(ctx, http.MethodGet, m.cfg.URL, nil) + if err != nil { + return err + } + + resp, err := m.client.Do(req) + if err != nil { + return err + } + defer resp.Body.Close() + + _, _ = io.Copy(io.Discard, io.LimitReader(resp.Body, 4096)) + + if resp.StatusCode < http.StatusOK || resp.StatusCode >= http.StatusBadRequest { + return fmt.Errorf("unexpected HTTP status %d", resp.StatusCode) + } + + return nil +} + +func parseBool(value string) bool { + switch strings.ToLower(strings.TrimSpace(value)) { + case "1", "true", "yes", "y", "on", "enable", "enabled": + return true + default: + return false + } +} + +func parseDurationEnv(name string, fallback time.Duration) time.Duration { + value := strings.TrimSpace(os.Getenv(name)) + if value == "" { + return fallback + } + + d, err := time.ParseDuration(value) + if err != nil { + return fallback + } + + return d +} + +func parseIntEnv(name string, fallback int) int { + value := strings.TrimSpace(os.Getenv(name)) + if value == "" { + return fallback + } + + n, err := strconv.Atoi(value) + if err != nil { + return fallback + } + + return n +} + +func firstNonEmpty(value string, fallback string) string { + value = strings.TrimSpace(value) + if value == "" { + return fallback + } + return value +} diff --git a/internal/tunnelmonitor/monitor_test.go b/internal/tunnelmonitor/monitor_test.go new file mode 100644 index 000000000..5b4c4ee3d --- /dev/null +++ b/internal/tunnelmonitor/monitor_test.go @@ -0,0 +1,454 @@ +package tunnelmonitor + +import ( + "context" + "errors" + "net/http" + "strings" + "sync" + "testing" + "time" + + "github.com/mhsanaei/3x-ui/v3/internal/logger" + "github.com/op/go-logging" +) + +func TestMain(m *testing.M) { + logger.InitLogger(logging.ERROR) + m.Run() +} + +type roundTripFunc func(*http.Request) (*http.Response, error) + +func (f roundTripFunc) RoundTrip(req *http.Request) (*http.Response, error) { + return f(req) +} + +func TestMonitorRestartsAfterFailureThreshold(t *testing.T) { + cfg := Config{ + Enabled: true, + URL: "http://example.test", + Interval: time.Minute, + Timeout: time.Second, + FailureThreshold: 2, + Cooldown: time.Minute, + } + + client := &http.Client{ + Transport: roundTripFunc(func(req *http.Request) (*http.Response, error) { + return nil, errors.New("tunnel down") + }), + } + + restarts := 0 + monitor := newWithClient(cfg, client, func(ctx context.Context) error { + restarts++ + return nil + }) + + monitor.now = func() time.Time { + return time.Unix(100, 0) + } + + if recovered, _ := monitor.Step(context.Background()); recovered { + t.Fatal("first failure must not trigger recovery") + } + + if recovered, _ := monitor.Step(context.Background()); !recovered { + t.Fatal("second consecutive failure should trigger recovery") + } + + if restarts != 1 { + t.Fatalf("expected 1 recovery, got %d", restarts) + } +} + +func TestMonitorRespectsRecoveryCooldown(t *testing.T) { + cfg := Config{ + Enabled: true, + URL: "http://example.test", + Interval: time.Minute, + Timeout: time.Second, + FailureThreshold: 1, + Cooldown: time.Minute, + } + + client := &http.Client{ + Transport: roundTripFunc(func(req *http.Request) (*http.Response, error) { + return nil, errors.New("tunnel down") + }), + } + + now := time.Unix(100, 0) + restarts := 0 + + monitor := newWithClient(cfg, client, func(ctx context.Context) error { + restarts++ + return nil + }) + + monitor.now = func() time.Time { + return now + } + + recovered, _ := monitor.Step(context.Background()) + if !recovered { + t.Fatal("first failure should trigger recovery when threshold is 1") + } + + recovered, _ = monitor.Step(context.Background()) + if recovered { + t.Fatal("cooldown should suppress immediate second recovery") + } + + if restarts != 1 { + t.Fatalf("expected 1 recovery during cooldown, got %d", restarts) + } + + now = now.Add(time.Minute + time.Second) + + recovered, _ = monitor.Step(context.Background()) + if !recovered { + t.Fatal("recovery should be allowed after cooldown") + } + + if restarts != 2 { + t.Fatalf("expected 2 recoveries after cooldown, got %d", restarts) + } +} + +func TestMonitorSuccessResetsFailures(t *testing.T) { + cfg := Config{ + Enabled: true, + URL: "http://example.test", + Interval: time.Minute, + Timeout: time.Second, + FailureThreshold: 2, + Cooldown: time.Minute, + } + + fail := true + client := &http.Client{ + Transport: roundTripFunc(func(req *http.Request) (*http.Response, error) { + if fail { + return nil, errors.New("tunnel down") + } + + return &http.Response{ + StatusCode: http.StatusOK, + Body: http.NoBody, + }, nil + }), + } + + restarts := 0 + monitor := newWithClient(cfg, client, func(ctx context.Context) error { + restarts++ + return nil + }) + + _, _ = monitor.Step(context.Background()) + + fail = false + if recovered, err := monitor.Step(context.Background()); recovered || err != nil { + t.Fatalf("successful probe should not recover or fail, recovered=%v err=%v", recovered, err) + } + + fail = true + if recovered, _ := monitor.Step(context.Background()); recovered { + t.Fatal("failure after success should be counted as first failure again") + } + + if restarts != 0 { + t.Fatalf("expected no recovery, got %d", restarts) + } +} + +func TestConfigFromEnvParsesValues(t *testing.T) { + t.Setenv("XUI_TUNNEL_HEALTH_MONITOR", "true") + t.Setenv("XUI_TUNNEL_HEALTH_URL", "https://example.com/health") + t.Setenv("XUI_TUNNEL_HEALTH_PROXY", "socks5://127.0.0.1:1080") + t.Setenv("XUI_TUNNEL_HEALTH_INTERVAL", "15s") + t.Setenv("XUI_TUNNEL_HEALTH_TIMEOUT", "3s") + t.Setenv("XUI_TUNNEL_HEALTH_FAILURES", "4") + t.Setenv("XUI_TUNNEL_HEALTH_COOLDOWN", "2m") + + cfg := ConfigFromEnv() + + if !cfg.Enabled { + t.Fatal("expected monitor to be enabled") + } + + if cfg.URL != "https://example.com/health" { + t.Fatalf("unexpected URL: %s", cfg.URL) + } + + if !strings.HasPrefix(cfg.ProxyURL, "socks5://") { + t.Fatalf("unexpected proxy URL: %s", cfg.ProxyURL) + } + + if cfg.Interval != 15*time.Second { + t.Fatalf("unexpected interval: %s", cfg.Interval) + } + + if cfg.Timeout != 3*time.Second { + t.Fatalf("unexpected timeout: %s", cfg.Timeout) + } + + if cfg.FailureThreshold != 4 { + t.Fatalf("unexpected threshold: %d", cfg.FailureThreshold) + } + + if cfg.Cooldown != 2*time.Minute { + t.Fatalf("unexpected cooldown: %s", cfg.Cooldown) + } +} + +func failingClient() *http.Client { + return &http.Client{ + Transport: roundTripFunc(func(req *http.Request) (*http.Response, error) { + return nil, errors.New("tunnel down") + }), + } +} + +func statusClient(code int) *http.Client { + return &http.Client{ + Transport: roundTripFunc(func(req *http.Request) (*http.Response, error) { + return &http.Response{StatusCode: code, Body: http.NoBody}, nil + }), + } +} + +func TestProbeStatusCodeClassification(t *testing.T) { + cases := []struct { + status int + healthy bool + }{ + {199, false}, + {200, true}, + {204, true}, + {301, true}, + {399, true}, + {400, false}, + {404, false}, + {500, false}, + } + + for _, tc := range cases { + cfg := Config{ + Enabled: true, + URL: "http://example.test", + Interval: time.Minute, + Timeout: time.Second, + FailureThreshold: 100, + Cooldown: time.Minute, + } + + monitor := newWithClient(cfg, statusClient(tc.status), func(ctx context.Context) error { + return nil + }) + + recovered, err := monitor.Step(context.Background()) + if recovered { + t.Fatalf("status %d: unexpected recovery", tc.status) + } + if tc.healthy && err != nil { + t.Fatalf("status %d: expected healthy probe, got error %v", tc.status, err) + } + if !tc.healthy && err == nil { + t.Fatalf("status %d: expected failure, got nil error", tc.status) + } + } +} + +func TestNormalizeClampsBounds(t *testing.T) { + got := Config{ + URL: " ", + Interval: 0, + Timeout: 500 * time.Millisecond, + FailureThreshold: 0, + Cooldown: 0, + }.Normalize() + + if got.URL != defaultHealthURL { + t.Fatalf("URL not defaulted: %q", got.URL) + } + if got.Interval != defaultInterval { + t.Fatalf("Interval not clamped: %s", got.Interval) + } + if got.Timeout != defaultTimeout { + t.Fatalf("Timeout not clamped: %s", got.Timeout) + } + if got.FailureThreshold != defaultFailureThreshold { + t.Fatalf("FailureThreshold not clamped: %d", got.FailureThreshold) + } + if got.Cooldown != defaultCooldown { + t.Fatalf("Cooldown not clamped: %s", got.Cooldown) + } + + valid := Config{ + URL: "https://example.com/health", + Interval: 15 * time.Second, + Timeout: 3 * time.Second, + FailureThreshold: 5, + Cooldown: 2 * time.Minute, + }.Normalize() + + if valid.URL != "https://example.com/health" || + valid.Interval != 15*time.Second || + valid.Timeout != 3*time.Second || + valid.FailureThreshold != 5 || + valid.Cooldown != 2*time.Minute { + t.Fatalf("valid config was mutated: %+v", valid) + } +} + +func TestNewRejectsUnsupportedProxyScheme(t *testing.T) { + m, err := New(Config{ProxyURL: "ftp://127.0.0.1:21"}, func(ctx context.Context) error { + return nil + }) + if err == nil || m != nil { + t.Fatalf("expected error and nil monitor for bad scheme, got m=%v err=%v", m, err) + } + + m, err = New(Config{}, func(ctx context.Context) error { + return nil + }) + if err != nil || m == nil { + t.Fatalf("expected a valid monitor for empty proxy, got m=%v err=%v", m, err) + } +} + +func TestMonitorRecoveryErrorDoesNotArmCooldown(t *testing.T) { + cfg := Config{ + Enabled: true, + URL: "http://example.test", + Interval: time.Minute, + Timeout: time.Second, + FailureThreshold: 1, + Cooldown: time.Minute, + } + + attempts := 0 + monitor := newWithClient(cfg, failingClient(), func(ctx context.Context) error { + attempts++ + return errors.New("restart failed") + }) + monitor.now = func() time.Time { + return time.Unix(100, 0) + } + + recovered, err := monitor.Step(context.Background()) + if recovered || err == nil { + t.Fatalf("failed recovery must report recovered=false with an error, got recovered=%v err=%v", recovered, err) + } + if !monitor.lastRecovery.IsZero() { + t.Fatal("a failed recovery must not arm the cooldown") + } + + if _, err := monitor.Step(context.Background()); err == nil { + t.Fatal("expected error on the second failing step") + } + if attempts != 2 { + t.Fatalf("recovery should be retried (no cooldown) after a failure, attempts=%d", attempts) + } +} + +func TestMonitorNilRecoverStaysBounded(t *testing.T) { + cfg := Config{ + Enabled: true, + URL: "http://example.test", + Interval: time.Minute, + Timeout: time.Second, + FailureThreshold: 2, + Cooldown: time.Minute, + } + + monitor := newWithClient(cfg, failingClient(), nil) + + for i := 0; i < 5; i++ { + recovered, _ := monitor.Step(context.Background()) + if recovered { + t.Fatal("a nil recovery func must never report recovery") + } + if monitor.failures > cfg.FailureThreshold { + t.Fatalf("failures must stay capped at threshold %d, got %d", cfg.FailureThreshold, monitor.failures) + } + } +} + +func TestMonitorFailuresCappedDuringCooldown(t *testing.T) { + cfg := Config{ + Enabled: true, + URL: "http://example.test", + Interval: time.Minute, + Timeout: time.Second, + FailureThreshold: 2, + Cooldown: time.Minute, + } + + restarts := 0 + monitor := newWithClient(cfg, failingClient(), func(ctx context.Context) error { + restarts++ + return nil + }) + monitor.now = func() time.Time { + return time.Unix(100, 0) + } + + monitor.Step(context.Background()) + if recovered, _ := monitor.Step(context.Background()); !recovered { + t.Fatal("expected recovery once the threshold is reached") + } + + for i := 0; i < 6; i++ { + monitor.Step(context.Background()) + if monitor.failures > cfg.FailureThreshold { + t.Fatalf("failures must never exceed threshold %d during cooldown, got %d", cfg.FailureThreshold, monitor.failures) + } + } + + if restarts != 1 { + t.Fatalf("cooldown should suppress further recoveries, restarts=%d", restarts) + } +} + +func TestMonitorRunStopsOnContextCancel(t *testing.T) { + cfg := Config{ + Enabled: true, + URL: "http://example.test", + Timeout: time.Second, + FailureThreshold: 1, + Cooldown: time.Hour, + } + + recovered := make(chan struct{}) + var once sync.Once + monitor := newWithClient(cfg, failingClient(), func(ctx context.Context) error { + once.Do(func() { close(recovered) }) + return nil + }) + monitor.cfg.Interval = 5 * time.Millisecond + + ctx, cancel := context.WithCancel(context.Background()) + done := make(chan struct{}) + go func() { + monitor.Run(ctx) + close(done) + }() + + select { + case <-recovered: + case <-time.After(2 * time.Second): + cancel() + t.Fatal("Run did not trigger recovery within the deadline") + } + + cancel() + select { + case <-done: + case <-time.After(2 * time.Second): + t.Fatal("Run did not return after context cancellation") + } +} diff --git a/main.go b/main.go index e4f6c13dc..f7be41d74 100644 --- a/main.go +++ b/main.go @@ -3,6 +3,7 @@ package main import ( + "context" "flag" "fmt" "log" @@ -18,6 +19,7 @@ import ( "github.com/mhsanaei/3x-ui/v3/internal/database" "github.com/mhsanaei/3x-ui/v3/internal/logger" "github.com/mhsanaei/3x-ui/v3/internal/sub" + "github.com/mhsanaei/3x-ui/v3/internal/tunnelmonitor" "github.com/mhsanaei/3x-ui/v3/internal/util/crypto" "github.com/mhsanaei/3x-ui/v3/internal/util/sys" "github.com/mhsanaei/3x-ui/v3/internal/web" @@ -91,7 +93,7 @@ func runWebServer() { return } - sigCh := make(chan os.Signal, 1) + sigCh := make(chan os.Signal, 8) // Trap shutdown signals signal.Notify(sigCh, syscall.SIGHUP, syscall.SIGTERM, sys.SIGUSR1, os.Interrupt) global.SetRestartHook(func() { @@ -100,6 +102,27 @@ func runWebServer() { default: } }) + + var stopTunnelHealthMonitor context.CancelFunc + monitorCfg := tunnelmonitor.ConfigFromEnv() + if monitorCfg.Enabled { + if monitorCfg.ProxyURL == "" { + logger.Warning("Tunnel health monitor enabled without XUI_TUNNEL_HEALTH_PROXY: the probe measures host connectivity, not the xray tunnel, so failures will restart xray without fixing host network issues") + } + + monitorCtx, cancel := context.WithCancel(context.Background()) + stopTunnelHealthMonitor = cancel + + monitor, err := tunnelmonitor.New(monitorCfg, func(_ context.Context) error { + logger.Warning("Tunnel health monitor threshold reached, restarting xray-core") + return server.RestartXray() + }) + if err != nil { + logger.Warning("Tunnel health monitor disabled: ", err) + } else { + go monitor.Run(monitorCtx) + } + } for { sig := <-sigCh @@ -142,6 +165,10 @@ func runWebServer() { } default: + if stopTunnelHealthMonitor != nil { + stopTunnelHealthMonitor() + } + // --- FIX FOR TELEGRAM BOT CONFLICT (409) on full shutdown --- tgbot.StopBot() // ------------------------------------------------------------