watchcat: add optional failure timer reset
authorDharmik Parmar <redacted>
Mon, 11 May 2026 18:04:08 +0000 (23:34 +0530)
committerHannu Nyman <redacted>
Tue, 23 Jun 2026 06:16:47 +0000 (09:16 +0300)
Add an opt-in reset_failure_timer option for restart_iface and
run_script modes.

When enabled, watchcat starts a fresh failure window after the
recovery action finishes before allowing another recovery action.
The existing behavior remains the default.

Document the intended default and reset_failure_timer timing
behavior in TIMINGS.md and use a safer string comparison for the
reset_failure_timer check.

Signed-off-by: Dharmik Parmar <redacted>
utils/watchcat/Makefile
utils/watchcat/TIMINGS.md [new file with mode: 0644]
utils/watchcat/files/watchcat.config
utils/watchcat/files/watchcat.init
utils/watchcat/files/watchcat.sh
utils/watchcat/test-version.sh [new file with mode: 0755]

index b66a237ac50677751644c60194282e97601ae70f..88fb0bfc7d44b83e4defd4e463958268147fad82 100644 (file)
@@ -9,7 +9,7 @@ include $(TOPDIR)/rules.mk
 
 PKG_NAME:=watchcat
 PKG_VERSION:=1
-PKG_RELEASE:=22
+PKG_RELEASE:=23
 
 PKG_MAINTAINER:=Roger D <rogerdammit@gmail.com>
 PKG_LICENSE:=GPL-2.0
diff --git a/utils/watchcat/TIMINGS.md b/utils/watchcat/TIMINGS.md
new file mode 100644 (file)
index 0000000..960648b
--- /dev/null
@@ -0,0 +1,69 @@
+# watchcat timing notes
+
+This file documents the intended timing for the `restart_iface` and
+`run_script` paths in `watchcat.sh`, especially around the optional
+`reset_failure_timer` flag.
+
+The main point is that repeated recovery attempts during a sustained outage are
+intentional in the default behavior. They are not an accidental side effect.
+
+## Terms
+
+- `failure_period`: how long reachability checks must keep failing before the
+  recovery action is triggered
+- `ping_frequency_interval`: how often reachability is checked
+- `recovery action`: either restarting the interface or running the configured
+  script
+
+## Default behavior
+
+By default, watchcat is meant to keep retrying during a sustained outage.
+
+This is useful for cases like WireGuard or OpenVPN, where the upstream
+internet may recover before the monitored path through the tunnel becomes
+usable again. Since watchcat is probing through the monitored interface, it
+may continue to see failed checks until that interface is restarted again.
+
+In this mode, if failed checks continue and the outage lasts long enough to
+cross multiple trigger windows, multiple recovery attempts are expected.
+
+Example:
+
+- `failure_period=60`
+- failed checks continue throughout the outage
+- the recovery action itself takes 15 seconds
+
+```text
+t=0    outage starts
+t=60   restart #1 starts
+t=75   restart #1 finishes
+t=120  restart #2 starts (60s after restart #1 started)
+```
+
+If connectivity has recovered by the next check, there should be no further
+restart. If failed checks continue and another trigger window is crossed,
+another restart is expected.
+
+## `reset_failure_timer=1`
+
+This mode is more conservative.
+
+Once the recovery action finishes, a fresh failure window starts from that
+point. Time spent inside the recovery action no longer counts toward the next
+trigger.
+
+Example:
+
+- `failure_period=60`
+- failed checks continue throughout the outage
+- the recovery action itself takes 15 seconds
+
+```text
+t=0    outage starts
+t=60   restart #1 starts
+t=75   restart #1 finishes
+t=135  earliest restart #2 (60s after restart #1 finished)
+```
+
+This mode is useful when repeated or closely spaced recovery actions are less
+desirable and a fresh failure window is preferred after each completed action.
index ed6544cc3574581c4cd4b1780bdad02600666f21..0513d411994264142c354bf4b3dc24041368387e 100644 (file)
@@ -3,3 +3,6 @@ config watchcat
        option mode 'ping_reboot'
        option pinghosts '8.8.8.8'
        option forcedelay '30'
+       # For restart_iface and run_script, start a fresh failure window after
+       # each recovery action finishes before allowing another recovery action.
+       # option reset_failure_timer '1'
index 9bfc68d0e3bab37862dd46bc997e1a585ffe3ca6..75a8f012457796dc41e4e47d2e0d9681b2aa3f43 100644 (file)
@@ -41,6 +41,7 @@ config_watchcat() {
        config_get interface "$1" interface
        config_get mmifacename "$1" mmifacename "null"
        config_get_bool unlockbands "$1" unlockbands "0"
+       config_get_bool reset_failure_timer "$1" reset_failure_timer "0"
        config_get addressfamily "$1" addressfamily "any"
        config_get script "$1" script
 
@@ -110,13 +111,19 @@ config_watchcat() {
                ;;
        restart_iface)
                procd_open_instance "watchcat_${1}"
-               procd_set_param command /usr/bin/watchcat.sh "restart_iface" "$period" "$pinghosts" "$pingperiod" "$pingsize" "$interface" "$mmifacename" "$unlockbands" "$addressfamily"
+               procd_set_param command /usr/bin/watchcat.sh \
+                       "restart_iface" "$period" "$pinghosts" "$pingperiod" \
+                       "$pingsize" "$interface" "$mmifacename" "$unlockbands" \
+                       "$addressfamily" "" "$reset_failure_timer"
                procd_set_param respawn "${respawn_threshold:-3600}" "${respawn_timeout:-5}" "${respawn_retry:-5}"
                procd_close_instance
                ;;
        run_script)
                procd_open_instance "watchcat_${1}"
-               procd_set_param command /usr/bin/watchcat.sh "run_script" "$period" "$pinghosts" "$pingperiod" "$pingsize" "$interface" "$addressfamily" "$script"
+               procd_set_param command /usr/bin/watchcat.sh \
+                       "run_script" "$period" "$pinghosts" "$pingperiod" \
+                       "$pingsize" "$interface" "$addressfamily" "$script" \
+                       "$reset_failure_timer"
                procd_set_param respawn "${respawn_threshold:-3600}" "${respawn_timeout:-5}" "${respawn_retry:-5}"
                procd_close_instance
                ;;
index 7770c4381be4cebc42797a90669161b89ce69ebe..a5f40628385a92a655cae72bd3699f4e3f2ef037 100644 (file)
@@ -110,6 +110,11 @@ watchcat_monitor_network() {
        mm_iface_unlock_bands="$7"
        address_family="$8"
        script="$9"
+       reset_failure_timer=""
+       if [ "$#" -gt 9 ]; then
+               shift 9
+               reset_failure_timer="$1"
+       fi
 
        time_now="$(cat /proc/uptime)"
        time_now="${time_now%%.*}"
@@ -177,7 +182,13 @@ watchcat_monitor_network() {
                                fi
                        fi
                        /etc/init.d/watchcat start
-                       # Restart timer cycle.
+                       # Optionally start a fresh failure window after the recovery action
+                       # finishes instead of continuing to count the original outage.
+                       if [ "$reset_failure_timer" = "1" ]; then
+                               time_now="$(cat /proc/uptime)"
+                               time_now="${time_now%%.*}"
+                               time_lastcheck="$time_now"
+                       fi
                        time_lastcheck_withinternet="$time_now"
                }
 
@@ -260,12 +271,47 @@ ping_reboot)
        watchcat_ping "$2" "$3" "$4" "$5" "$6" "$7" "$8"
        ;;
 restart_iface)
-       # args from init script: period pinghosts pingperiod pingsize interface mmifacename unlockbands addressfamily
-       watchcat_monitor_network "$2" "$3" "$4" "$5" "$6" "$7" "$8" "$9" ""
+       shift
+       # args from init script: period pinghosts pingperiod pingsize interface
+       # mmifacename unlockbands addressfamily script reset_failure_timer
+       failure_period="$1"
+       ping_hosts="$2"
+       ping_frequency_interval="$3"
+       ping_size="$4"
+       iface="$5"
+       mm_iface_name="$6"
+       mm_iface_unlock_bands="$7"
+       address_family="$8"
+       script="$9"
+       reset_failure_timer=""
+       if [ "$#" -gt 9 ]; then
+               shift 9
+               reset_failure_timer="$1"
+       fi
+       watchcat_monitor_network "$failure_period" "$ping_hosts" \
+               "$ping_frequency_interval" "$ping_size" "$iface" \
+               "$mm_iface_name" "$mm_iface_unlock_bands" \
+               "$address_family" "$script" "$reset_failure_timer"
        ;;
 run_script)
-       # args from init script: period pinghosts pingperiod pingsize interface addressfamily script
-       watchcat_monitor_network "$2" "$3" "$4" "$5" "$6" "" "" "$7" "$8"
+       shift
+       # args from init script: period pinghosts pingperiod pingsize interface
+       # addressfamily script reset_failure_timer
+       failure_period="$1"
+       ping_hosts="$2"
+       ping_frequency_interval="$3"
+       ping_size="$4"
+       iface="$5"
+       address_family="$6"
+       script="$7"
+       reset_failure_timer=""
+       if [ "$#" -gt 7 ]; then
+               shift 7
+               reset_failure_timer="$1"
+       fi
+       watchcat_monitor_network "$failure_period" "$ping_hosts" \
+               "$ping_frequency_interval" "$ping_size" "$iface" "" "" \
+               "$address_family" "$script" "$reset_failure_timer"
        ;;
 *)
        echo "Error: invalid mode selected: $mode"
diff --git a/utils/watchcat/test-version.sh b/utils/watchcat/test-version.sh
new file mode 100755 (executable)
index 0000000..c52d3c2
--- /dev/null
@@ -0,0 +1,3 @@
+#!/bin/sh
+
+exit 0
git clone https://git.99rst.org/PROJECT