check_interface_error
Check Interface Errors
bash
$CACHE_DIR/<host>_if<index>_<timestamp>.cache
#!/bin/bash
# Exit codes follow the Nagios plugin convention:
#   0 = OK, 1 = WARNING, 2 = CRITICAL, 3 = UNKNOWN

# Directory that holds the per-interface counter snapshots; created on demand.
CACHE_DIR="/var/tmp/snmp_ifcache"
mkdir -p "$CACHE_DIR"
# SNAPSHOT_TOLERANCE=300 # 5 minutes

# Flags controlled by the long options below.
SHOW_ALL=0   # 0: list only WARNING/CRITICAL interfaces, 1: list OK ones too
DEBUG=0      # 1: emit debug data

# The long options are consumed here; every other argument is collected, in
# order, and handed back as the positional parameters.
TEMP_ARGS=()
while (( $# > 0 )); do
  if [[ "$1" == "--all" ]]; then
    SHOW_ALL=1
  elif [[ "$1" == "--debug" ]]; then
    DEBUG=1
  else
    TEMP_ARGS+=("$1")
  fi
  shift
done
set -- "${TEMP_ARGS[@]}"

# Cache files are retained for 5 minutes unless LOOKBACK_MINUTES is set.
age_min=5

# Optional settings start out empty.
TAG=""
LOOKBACK_MINUTES=""
# ---------------------- Argument parsing ----------------------
#######################################
# Print the command-line synopsis and terminate the plugin.
# Arguments: none
# Outputs:   three usage lines on stdout
# Returns:   never returns; exits with status 3 (UNKNOWN)
#######################################
usage() {
  printf '%s\n' \
    "Usage: $0 -H host -u user -a auth_pass -A auth_proto -p priv_pass -X priv_proto [-w warn] [-c crit] [-t tag] [-l lookback_minutes]" \
    " --all Include OK interfaces in output" \
    " --debug Include raw SNMP counter values in output"
  exit 3
}
# Parse the short options into globals.
# The raw argument list is deliberately NOT logged anywhere: it carries the
# SNMPv3 authentication and privacy passphrases, and a log file under /tmp
# would expose them to other local users and grow without bound.
while getopts ":H:u:a:p:A:X:w:c:l:t:" opt; do
  case "$opt" in
    H) HOST="$OPTARG" ;;
    u) SNMP_USER="$OPTARG" ;;
    a) AUTH_PASS="$OPTARG" ;;
    p) PRIV_PASS="$OPTARG" ;;
    A) AUTH_PROTO="$OPTARG" ;;
    X) PRIV_PROTO="$OPTARG" ;;
    w) WARN="$OPTARG" ;;
    c) CRIT="$OPTARG" ;;
    t) TAG="$OPTARG" ;;
    l) LOOKBACK_MINUTES="$OPTARG" ;;
    \?) usage ;;
    :) echo "Option -$OPTARG requires an argument." >&2; usage ;;
  esac
done

# Debug Options
# echo "HOST=$HOST TAG=$TAG LOOKBACK_MINUTES=$LOOKBACK_MINUTES" >> /tmp/nagios_args_debug.log

# All SNMPv3 connection parameters are mandatory.
for var in HOST SNMP_USER AUTH_PASS AUTH_PROTO PRIV_PASS PRIV_PROTO; do
  if [[ -z "${!var}" ]]; then
    echo "Missing required argument: $var"
    usage
  fi
done

# Fall back to the defaults when a threshold is missing or when getopts
# swallowed the following option as its value (value starts with '-').
[[ -z "$WARN" || "$WARN" == -* ]] && WARN=1
[[ -z "$CRIT" || "$CRIT" == -* ]] && CRIT=5
# A lone "-" means "no tag filter".
[[ -z "$TAG" || "$TAG" == "-" ]] && TAG=""

# Thresholds are later compared numerically against percentages, so anything
# that is not a plain non-negative decimal number is rejected up front.
for var in WARN CRIT; do
  if [[ ! "${!var}" =~ ^([0-9]+([.][0-9]*)?|[.][0-9]+)$ ]]; then
    echo "Invalid threshold $var='${!var}': expected a non-negative number"
    usage
  fi
done

# A lookback that is not a whole number of minutes cannot be used to pick a
# snapshot; drop it so the most recent snapshot is used instead.
if [[ -n "$LOOKBACK_MINUTES" && ! "$LOOKBACK_MINUTES" =~ ^[0-9]+$ ]]; then
  LOOKBACK_MINUTES=""
fi
#######################################
# Run an authenticated and encrypted (authPriv) SNMPv3 walk against $HOST.
# Globals:   SNMP_USER, AUTH_PROTO, AUTH_PASS, PRIV_PROTO, PRIV_PASS, HOST (read)
# Arguments: $1 - OID to walk
# Outputs:   whatever snmpwalk prints on stdout; its stderr is discarded
# Returns:   exit status of snmpwalk
#######################################
snmpwalkv3() {
  local oid="$1"
  # -Oqv and -m "" are output/MIB options passed straight through to snmpwalk.
  local -a walk_args=(
    -v3
    -u "$SNMP_USER"
    -a "$AUTH_PROTO" -A "$AUTH_PASS"
    -x "$PRIV_PROTO" -X "$PRIV_PASS"
    -l authPriv
    "$HOST" "$oid"
    -Oqv -m ""
  )
  snmpwalk "${walk_args[@]}" 2>/dev/null
}
#######################################
# Select the cached snapshot to use as comparison baseline.
#
# Candidates are the regular files "$CACHE_DIR/<pattern>_<timestamp>.cache"
# whose <timestamp> part is all digits.
#   * target != 0: the snapshot whose timestamp is closest to target is chosen
#     when it lies within 60 seconds of it; otherwise the most recent snapshot
#     is used as a fallback.
#   * target == 0 (or missing / not a number): the most recent snapshot.
#
# Globals:   CACHE_DIR (read)
# Arguments: $1 - file name prefix, e.g. "<host>_if<index>"
#            $2 - desired snapshot time in epoch seconds, 0 for "most recent"
# Outputs:   path of the chosen snapshot on stdout; nothing at all when no
#            snapshot exists, so callers can test the result with -z
# Returns:   0
#######################################
find_snapshot_file() {
  local pattern="$1"
  local target="${2:-0}"
  local newest_file="" newest_ts=-1
  local nearest_file="" nearest_diff=-1
  local f ts diff

  # Force base 10 so a zero-padded value is not parsed as octal.
  if [[ "$target" =~ ^[0-9]+$ ]]; then
    target=$(( 10#$target ))
  else
    target=0
  fi

  for f in "$CACHE_DIR"/"$pattern"_*.cache; do
    # An unmatched glob stays literal; -f filters that out as well.
    [[ -f "$f" ]] || continue
    ts="${f##*_}"
    ts="${ts%.cache}"
    [[ "$ts" =~ ^[0-9]+$ ]] || continue
    ts=$(( 10#$ts ))

    if (( ts > newest_ts )); then
      newest_ts=$ts
      newest_file="$f"
    fi

    if (( target != 0 )); then
      diff=$(( target > ts ? target - ts : ts - target ))
      # Strict '<' keeps the first candidate (glob order) on ties.
      if (( nearest_diff < 0 || diff < nearest_diff )); then
        nearest_diff=$diff
        nearest_file="$f"
      fi
    fi
  done

  if (( target != 0 && nearest_diff >= 0 && nearest_diff <= 60 )); then
    printf '%s\n' "$nearest_file"
  elif [[ -n "$newest_file" ]]; then
    # Fallback: most recent snapshot that actually exists on disk.
    printf '%s\n' "$newest_file"
  fi
  return 0
}
#######################################
# Delete snapshots of the current interface that are older than the retention
# period. Retention is LOOKBACK_MINUTES + 2 minutes when LOOKBACK_MINUTES is a
# whole number, otherwise age_min minutes (5 when age_min is unset).
# Globals:   CACHE_DIR, HOST, IFINDEX, LOOKBACK_MINUTES, age_min (all read)
# Arguments: none
# Outputs:   none
# Returns:   0
#######################################
purge_old_snapshots() {
  local retention_min="${age_min:-5}"
  local cutoff f ts

  if [[ "$LOOKBACK_MINUTES" =~ ^[0-9]+$ ]]; then
    # 10# forces decimal so values such as "08" are not rejected as octal.
    retention_min=$(( 10#$LOOKBACK_MINUTES + 2 ))
  fi
  cutoff=$(( $(date +%s) - retention_min * 60 ))

  for f in "$CACHE_DIR/${HOST//./_}_if${IFINDEX}_"*.cache; do
    # An unmatched glob stays literal; -f filters that out as well.
    [[ -f "$f" ]] || continue
    # The snapshot age comes from the timestamp embedded in the file name.
    ts="${f##*_}"
    ts="${ts%.cache}"
    [[ "$ts" =~ ^[0-9]+$ ]] || continue
    if (( 10#$ts < cutoff )); then
      rm -f -- "$f"
    fi
  done
  return 0
}
# ---------------------- Get SNMP metadata ----------------------
# Walk five per-interface columns. Rows of the five walks are paired purely by
# their position in the output.
# NOTE(review): the alias column is walked from a different table
# (1.3.6.1.2.1.31.1.1.1) than the other four (1.3.6.1.2.1.2.2.1); if a device
# returns a different row set for it, aliases would be paired with the wrong
# interface — confirm against the monitored devices.
mapfile -t IFINDEXES < <(snmpwalkv3 1.3.6.1.2.1.2.2.1.1)   # ifIndex
mapfile -t IFNAMES < <(snmpwalkv3 1.3.6.1.2.1.2.2.1.2)     # ifDescr
mapfile -t IFALIAS < <(snmpwalkv3 1.3.6.1.2.1.31.1.1.1.18) # ifAlias
mapfile -t IFTYPES < <(snmpwalkv3 1.3.6.1.2.1.2.2.1.3)     # ifType
mapfile -t IFSTATUS < <(snmpwalkv3 1.3.6.1.2.1.2.2.1.8)    # ifOperStatus

now=$(date +%s)
STATUS=0      # worst state seen so far; becomes the plugin exit code
MSG=""        # per-interface detail lines, separated by literal "\n"
PERFDATA=""   # performance data collected per interface

host_key="${HOST//./_}"   # host with dots replaced, as used in cache file names
uint_re='^[0-9]+$'

# Epoch time the comparison baseline should be close to; 0 selects the most
# recent snapshot. Derived from "now" so every interface uses the same target.
target_ts=0
if [[ "$LOOKBACK_MINUTES" =~ $uint_re ]]; then
  target_ts=$(( now - 10#$LOOKBACK_MINUTES * 60 ))
fi

# ---------------------- Per-interface loop ----------------------
displayed_count=0   # interfaces listed in MSG
processed_count=0   # interfaces that were actually compared to a baseline
for i in "${!IFINDEXES[@]}"; do
  IFINDEX="${IFINDEXES[$i]}"
  IFNAME="${IFNAMES[$i]}"
  ALIAS="${IFALIAS[$i]}"
  IFTYPE="${IFTYPES[$i]}"
  OPERSTATUS="${IFSTATUS[$i]}"

  case "$OPERSTATUS" in
    1) OPERLABEL="up" ;;
    2) OPERLABEL="down" ;;
    3) OPERLABEL="testing" ;;
    4) OPERLABEL="unknown" ;;
    5) OPERLABEL="dormant" ;;
    6) OPERLABEL="notPresent" ;;
    7) OPERLABEL="lowerLayerDown" ;;
    *) OPERLABEL="other($OPERSTATUS)" ;;
  esac

  # Debug listing of every interface returned by the device, including its
  # own operational status.
  if (( DEBUG == 1 )); then
    echo "$IFINDEX $IFNAME $ALIAS $IFTYPE $OPERSTATUS"
  fi

  # The index is spliced into OIDs and file names below, so it must be numeric.
  [[ "$IFINDEX" =~ $uint_re ]] || continue

  # Skip loopback (24) and other/Null0 (1) interface types, and anything that
  # is not operationally up (1).
  [[ "$IFTYPE" == "24" || "$IFTYPE" == "1" || "$OPERSTATUS" != "1" ]] && continue

  # With a tag filter, only interfaces whose alias contains the tag are checked.
  [[ -n "$TAG" && "$ALIAS" != *"$TAG"* ]] && continue

  # Fetch the current counters of this interface.
  in_err=$(snmpwalkv3 "1.3.6.1.2.1.2.2.1.14.$IFINDEX")     # ifInErrors
  out_err=$(snmpwalkv3 "1.3.6.1.2.1.2.2.1.20.$IFINDEX")    # ifOutErrors
  in_disc=$(snmpwalkv3 "1.3.6.1.2.1.2.2.1.13.$IFINDEX")    # ifInDiscards
  out_disc=$(snmpwalkv3 "1.3.6.1.2.1.2.2.1.19.$IFINDEX")   # ifOutDiscards
  in_oct=$(snmpwalkv3 "1.3.6.1.2.1.31.1.1.1.6.$IFINDEX")   # ifHCInOctets
  out_oct=$(snmpwalkv3 "1.3.6.1.2.1.31.1.1.1.10.$IFINDEX") # ifHCOutOctets

  # Ignore the interface when any counter is missing or not a plain integer.
  for v in "$in_err" "$out_err" "$in_disc" "$out_disc" "$in_oct" "$out_oct"; do
    [[ "$v" =~ $uint_re ]] || continue 2
  done

  SNAPFILE="$CACHE_DIR/${host_key}_if${IFINDEX}_$now.cache"

  # Pick the baseline BEFORE writing the new snapshot, so the snapshot of this
  # very run is not selected as its own baseline.
  BASEFILE=$(find_snapshot_file "${host_key}_if${IFINDEX}" "$target_ts")
  echo "$now $in_err $out_err $in_disc $out_disc $in_oct $out_oct" > "$SNAPFILE"

  # First run for this interface (or baseline vanished): nothing to compare.
  [[ -n "$BASEFILE" && -r "$BASEFILE" ]] || continue

  # Load the baseline; clear the fields first so a failed read cannot leave
  # values of the previous interface behind.
  ts0="" in_e0="" out_e0="" in_d0="" out_d0="" in_o0="" out_o0=""
  read -r ts0 in_e0 out_e0 in_d0 out_d0 in_o0 out_o0 < "$BASEFILE"
  for v in "$ts0" "$in_e0" "$out_e0" "$in_d0" "$out_d0" "$in_o0" "$out_o0"; do
    # Corrupt or truncated baseline: skip this round, the fresh snapshot
    # written above becomes the next baseline.
    [[ "$v" =~ $uint_re ]] || continue 2
  done

  elapsed=$(( now - ts0 ))
  (( elapsed <= 0 )) && elapsed=1

  delta_err=$(( (in_err + out_err) - (in_e0 + out_e0) ))
  delta_disc=$(( (in_disc + out_disc) - (in_d0 + out_d0) ))
  delta_oct=$(( (in_oct + out_oct) - (in_o0 + out_o0) ))

  # A counter that went backwards means it was reset or wrapped since the
  # baseline; the difference is meaningless, so wait for the next run.
  if (( delta_err < 0 || delta_disc < 0 || delta_oct < 0 )); then
    continue
  fi
  # Avoid dividing by zero when no traffic was counted.
  (( delta_oct == 0 )) && delta_oct=1

  # NOTE(review): the percentages divide error/discard counts by the octet
  # delta; confirm that relating these counters to bytes is intended.
  err_pct=$(awk -v n="$delta_err" -v d="$delta_oct" 'BEGIN { printf "%.2f", (n / d) * 100 }')
  disc_pct=$(awk -v n="$delta_disc" -v d="$delta_oct" 'BEGIN { printf "%.2f", (n / d) * 100 }')
  traffic_bps=$(awk -v o="$delta_oct" -v s="$elapsed" 'BEGIN { printf "%.2f", (8 * o) / s }')

  # Auto-scale the traffic rate to the most appropriate unit.
  traffic_label=$(awk -v bps="$traffic_bps" 'BEGIN {
    if (bps >= 1000000000)   printf "%.2f Gbps", bps / 1000000000
    else if (bps >= 1000000) printf "%.2f Mbps", bps / 1000000
    else if (bps >= 1000)    printf "%.2f kbps", bps / 1000
    else                     printf "%.2f bps", bps
  }')

  age=$(printf "%02dm %02ds" $((elapsed / 60)) $((elapsed % 60)))

  # Classify: CRITICAL wins over WARNING; either percentage reaching the
  # threshold is enough. awk exits 0 when the threshold is reached.
  code=0
  state="OK"
  if awk -v e="$err_pct" -v d="$disc_pct" -v t="$CRIT" 'BEGIN { exit !(e >= t || d >= t) }'; then
    code=2
    state="CRITICAL"
  elif awk -v e="$err_pct" -v d="$disc_pct" -v t="$WARN" 'BEGIN { exit !(e >= t || d >= t) }'; then
    code=1
    state="WARNING"
  fi
  (( code > STATUS )) && STATUS=$code

  # OK interfaces are listed only when --all was given.
  if [[ "$SHOW_ALL" -eq 1 || "$code" -gt 0 ]]; then
    MSG+="$state - $IFNAME ($ALIAS) is $OPERLABEL - Compared to ${age} ago: Errors ${err_pct}%, Discards ${disc_pct}%, Traffic ${traffic_label}\n"
    if (( DEBUG == 1 )); then
      MSG+=" Raw Counters: in_err=$in_e0→$in_err, out_err=$out_e0→$out_err, in_disc=$in_d0→$in_disc, out_disc=$out_d0→$out_disc, in_oct=$in_o0→$in_oct, out_oct=$out_o0→$out_oct, elapsed=${elapsed}s"
    fi
    MSG+="\n"
    displayed_count=$(( displayed_count + 1 ))
  fi

  PERFDATA+="errors_${IFINDEX}=${err_pct}%;$WARN;$CRIT discards_${IFINDEX}=${disc_pct}%;$WARN;$CRIT traffic_${IFINDEX}=${traffic_bps}bps "

  # Count only interfaces that produced a real comparison, so that
  # processed_count == 0 means "nothing matched or no cached data yet".
  processed_count=$(( processed_count + 1 ))

  # Always purge expired snapshots of this interface; done last so the
  # baseline that was just used is still present while it is being read.
  purge_old_snapshots
done
# ---------------------- Final Output ----------------------
# Guard: with a processed_count of zero there is nothing to report, so the
# plugin answers UNKNOWN (exit code 3) right away.
(( processed_count != 0 )) || {
  echo "UNKNOWN - No matching interfaces or cached data yet. Try again after a minute."
  exit 3
}

# Summary line for the worst state recorded in STATUS.
if [[ "$STATUS" == "0" ]]; then
  echo "OK - No Errors or Discard Detected on any interface"
elif [[ "$STATUS" == "1" ]]; then
  echo "WARNING - Error or Discard is more than $WARN% on at least 1 interface"
elif [[ "$STATUS" == "2" ]]; then
  echo "CRITICAL - Error or Discard is more than $CRIT% on at least 1 interface"
fi

# Per-interface details; MSG carries literal "\n" sequences, hence echo -e.
# PERFDATA is not part of this output.
(( displayed_count > 0 )) && echo -e "$MSG"

# Hint shown whenever OK interfaces were left out of the listing.
if (( SHOW_ALL == 0 )); then
  echo "(No OK interfaces displayed — use --all to include OK ones)"
fi
exit $STATUS
Python Script
Last updated