githubEdit

check_interface_error

Check Interface Errors

  • Interface must be UP, not a loopback, and must match alias tag if given

  • If -w or -c aren't passed (or are malformed, e.g. the value starts with -), WARN and CRIT fall back to their defaults (1 and 5 percent)

  • Always save snapshot as:

bash
$CACHE_DIR/<host>_if<index>_<timestamp>.cache
  • If -l <lookback_minutes> is set:

    • Find snapshot within ±60s of now - lookback

    • If not found → skip interface

  • If -l <lookback_minutes> is NOT set:

    • Find the snapshot with the latest timestamp

    • Compare current values to that

  • Cleanup: Purge any snapshot older than the max retention (the -l lookback + 2 minutes, or 5 minutes when -l is not given)

#!/bin/bash

# Exit statuses follow the Nagios plugin convention:
#   0 = OK, 1 = WARNING, 2 = CRITICAL, 3 = UNKNOWN

# Directory holding the per-interface counter snapshots between runs.
CACHE_DIR="/var/tmp/snmp_ifcache"
mkdir -p "$CACHE_DIR"
# SNAPSHOT_TOLERANCE=300  # 5 minutes

SHOW_ALL=0  # 0 = list only WARNING/CRITICAL interfaces, 1 = list OK ones too
DEBUG=0     # 1 = add raw counter values to the output

# getopts (used further down) only understands short options, so the two long
# flags are consumed here and everything else is kept, in order, for getopts.
TEMP_ARGS=()
for arg in "$@"; do
  if [[ "$arg" == "--all" ]]; then
    SHOW_ALL=1
  elif [[ "$arg" == "--debug" ]]; then
    DEBUG=1
  else
    TEMP_ARGS+=("$arg")
  fi
done

# From here on the positional parameters no longer contain the long flags.
set -- "${TEMP_ARGS[@]}"


# Snapshot retention in minutes when no lookback window is requested.
age_min=5
# Optional settings start out empty; getopts fills them in when given.
TAG=""
LOOKBACK_MINUTES=""

# ---------------------- Argument parsing ----------------------
# Print the command-line synopsis on stdout and terminate the plugin with
# status 3 (UNKNOWN in the Nagios return-code scheme).
usage() {
  printf '%s\n' \
    "Usage: $0 -H host -u user -a auth_pass -A auth_proto -p priv_pass -X priv_proto [-w warn] [-c crit] [-t tag] [-l lookback_minutes]" \
    "  --all         Include OK interfaces in output" \
    "  --debug         Include raw SNMP counter values in output"
  exit 3
}

# SECURITY: the raw argument vector is intentionally NOT appended to a log
# file. It carries the SNMPv3 auth/priv passphrases (-a / -p), and a fixed
# path under /tmp is readable by every local user.

# Short options; the leading ':' selects silent error reporting so the
# '\?' and ':' arms below can produce the messages themselves.
while getopts ":H:u:a:p:A:X:w:c:l:t:" opt; do
  case "$opt" in
    H) HOST="$OPTARG" ;;
    u) SNMP_USER="$OPTARG" ;;
    a) AUTH_PASS="$OPTARG" ;;
    p) PRIV_PASS="$OPTARG" ;;
    A) AUTH_PROTO="$OPTARG" ;;
    X) PRIV_PROTO="$OPTARG" ;;
    w) WARN="$OPTARG" ;;
    c) CRIT="$OPTARG" ;;
    t) TAG="$OPTARG" ;;
    l) LOOKBACK_MINUTES="$OPTARG" ;;
    \?) usage ;;
    :) echo "Option -$OPTARG requires an argument." >&2; usage ;;
  esac
done

# Debug Options
# echo "HOST=$HOST TAG=$TAG LOOKBACK_MINUTES=$LOOKBACK_MINUTES" >> /tmp/nagios_args_debug.log

# All SNMPv3 connection parameters are mandatory.
for var in HOST SNMP_USER AUTH_PASS AUTH_PROTO PRIV_PASS PRIV_PROTO; do
  if [[ -z "${!var}" ]]; then
    echo "Missing required argument: $var"
    usage
  fi
done

# Echo $1 when it is a plain non-negative decimal number, otherwise echo the
# fallback $2. Covers the "missing" and "-x swallowed as the value" cases as
# well as any other non-numeric text that would break the threshold
# comparison later on.
_threshold_or_default() {
  if [[ "$1" =~ ^([0-9]+[.]?[0-9]*|[.][0-9]+)$ ]]; then
    printf '%s\n' "$1"
  else
    printf '%s\n' "$2"
  fi
}

WARN=$(_threshold_or_default "$WARN" 1)
CRIT=$(_threshold_or_default "$CRIT" 5)

# A lone "-" is treated as "no tag filter".
if [[ "$TAG" == "-" ]]; then
  TAG=""
fi

# The lookback window must be a whole number of minutes; anything else
# (including a "-" placeholder) means "no lookback".
if [[ ! "$LOOKBACK_MINUTES" =~ ^[0-9]+$ ]]; then
  LOOKBACK_MINUTES=""
fi

# Walk one OID on $HOST over SNMPv3 (authPriv) and print the bare values.
# Globals:   SNMP_USER, AUTH_PROTO, AUTH_PASS, PRIV_PROTO, PRIV_PASS, HOST (read)
# Arguments: $1 - OID to walk
# Outputs:   snmpwalk's stdout (-Oqv output format, MIB loading disabled via
#            -m ""); snmpwalk's stderr is discarded
snmpwalkv3() {
  local oid="$1"
  local -a walk_cmd=(
    snmpwalk -v3
    -u "$SNMP_USER"
    -a "$AUTH_PROTO" -A "$AUTH_PASS"
    -x "$PRIV_PROTO" -X "$PRIV_PASS"
    -l authPriv
    "$HOST" "$oid"
    -Oqv -m ""
  )
  "${walk_cmd[@]}" 2>/dev/null
}

#######################################
# Choose the cached snapshot to use as the comparison baseline.
# Globals:   CACHE_DIR (read)
# Arguments: $1 - snapshot file-name prefix; files are expected to be named
#                 "<prefix>_<epoch>.cache"
#            $2 - target epoch; 0 (or empty) means "use the newest snapshot"
# Outputs:   With a non-zero target: the snapshot whose timestamp is closest
#            to the target when that is within 60 seconds, otherwise the
#            newest snapshot. With target 0: the newest snapshot.
#            Prints nothing when no snapshot exists for the prefix, so the
#            caller can detect "no baseline yet" with an empty-string check.
#######################################
find_snapshot_file() {
  local pattern="$1"
  local target="${2:-0}"

  local f ts diff
  local newest_file="" newest_ts=-1
  local closest_file="" closest_diff=-1

  for f in "$CACHE_DIR"/"$pattern"_*.cache; do
    # An unmatched glob stays literal; this also skips non-regular files.
    [[ -f "$f" ]] || continue
    ts="${f##*_}"
    ts="${ts%.cache}"
    [[ "$ts" =~ ^[0-9]+$ ]] || continue
    # Force base 10 so a timestamp with leading zeros is not read as octal.
    ts=$(( 10#$ts ))

    if (( ts > newest_ts )); then
      newest_ts=$ts
      newest_file="$f"
    fi

    if (( target != 0 )); then
      diff=$(( target > ts ? target - ts : ts - target ))
      if (( closest_diff < 0 || diff < closest_diff )); then
        closest_diff=$diff
        closest_file="$f"
      fi
    fi
  done

  if (( target != 0 && closest_diff >= 0 && closest_diff <= 60 )); then
    printf '%s\n' "$closest_file"
  elif [[ -n "$newest_file" ]]; then
    # Fallback: return most recent
    printf '%s\n' "$newest_file"
  fi
}



#######################################
# Delete the current interface's snapshots that are past their retention.
# Retention is LOOKBACK_MINUTES + 2 minutes when a numeric lookback is set,
# otherwise age_min minutes.
# Globals:   CACHE_DIR, HOST, IFINDEX, LOOKBACK_MINUTES, age_min (all read;
#            none are modified)
# Arguments: none
# Returns:   0, or non-zero if the current time cannot be obtained
#######################################
purge_old_snapshots() {
  local retention_min="${age_min:-5}"
  if [[ "$LOOKBACK_MINUTES" =~ ^[0-9]+$ ]]; then
    retention_min=$(( 10#$LOOKBACK_MINUTES + 2 ))
  fi

  # Declared separately from the assignment so a failing `date` is noticed.
  local now_ts cutoff
  now_ts=$(date +%s) || return
  cutoff=$(( now_ts - retention_min * 60 ))

  local f ts
  for f in "$CACHE_DIR/${HOST//./_}_if${IFINDEX}_"*.cache; do
    # An unmatched glob stays literal; this also skips non-regular files.
    [[ -f "$f" ]] || continue
    ts="${f##*_}"
    ts="${ts%.cache}"
    [[ "$ts" =~ ^[0-9]+$ ]] || continue
    # 10# keeps a timestamp with leading zeros from being parsed as octal.
    if (( 10#$ts < cutoff )); then
      rm -f -- "$f"
    fi
  done
  return 0
}



# ---------------------- Get SNMP metadata ----------------------
# One walk per IF-MIB column. snmpwalkv3 prints one bare value per line, so
# the five arrays are joined purely by position in the loop below.
# NOTE(review): this assumes the ifTable and ifXTable (ifAlias) walks return
# the same rows in the same order — confirm on the target devices.
mapfile -t IFINDEXES < <(snmpwalkv3 1.3.6.1.2.1.2.2.1.1)      # ifIndex
mapfile -t IFNAMES   < <(snmpwalkv3 1.3.6.1.2.1.2.2.1.2)      # ifDescr
mapfile -t IFALIAS   < <(snmpwalkv3 1.3.6.1.2.1.31.1.1.1.18)  # ifAlias
mapfile -t IFTYPES   < <(snmpwalkv3 1.3.6.1.2.1.2.2.1.3)      # ifType
mapfile -t IFSTATUS  < <(snmpwalkv3 1.3.6.1.2.1.2.2.1.8)      # ifOperStatus

now=$(date +%s)
STATUS=0
MSG=""
PERFDATA=""

# The lookback target is identical for every interface, so derive it once
# from $now. 0 tells find_snapshot_file to use the newest snapshot.
target_ts=0
if [[ "$LOOKBACK_MINUTES" =~ ^[0-9]+$ ]]; then
  target_ts=$(( now - 10#$LOOKBACK_MINUTES * 60 ))
fi

# ---------------------- Per-interface loop ----------------------
displayed_count=0   # interfaces that contributed a line to MSG
evaluated_count=0   # interfaces that passed the filters AND had a usable baseline
for i in "${!IFINDEXES[@]}"; do
  IFINDEX="${IFINDEXES[$i]}"   # kept global: purge_old_snapshots reads it
  IFNAME="${IFNAMES[$i]}"
  ALIAS="${IFALIAS[$i]}"
  IFTYPE="${IFTYPES[$i]}"
  OPERSTATUS="${IFSTATUS[$i]}"

  case "$OPERSTATUS" in
    1) OPERLABEL="up" ;;
    2) OPERLABEL="down" ;;
    3) OPERLABEL="testing" ;;
    4) OPERLABEL="unknown" ;;
    5) OPERLABEL="dormant" ;;
    6) OPERLABEL="notPresent" ;;
    7) OPERLABEL="lowerLayerDown" ;;
    *) OPERLABEL="other($OPERSTATUS)" ;;
  esac

  # Show Interface for Debugging purpose (the interface's own operational
  # status, not the plugin's aggregate exit status)
  if (( DEBUG == 1 )); then
    echo "$IFINDEX $IFNAME $ALIAS $IFTYPE $OPERSTATUS"
  fi

  # Skip ifType 24 (softwareLoopback) and 1 (other, e.g. Null0), and anything
  # whose operational status is not up (1)
  [[ "$IFTYPE" == "24" || "$IFTYPE" == "1" || "$OPERSTATUS" != "1" ]] && continue

  # Skip if TAG is set and ALIAS doesn't contain it
  [[ -n "$TAG" && ! "$ALIAS" == *"$TAG"* ]] && continue

  # Fetch counters
  in_err=$(snmpwalkv3 "1.3.6.1.2.1.2.2.1.14.$IFINDEX")     # ifInErrors
  out_err=$(snmpwalkv3 "1.3.6.1.2.1.2.2.1.20.$IFINDEX")    # ifOutErrors
  in_disc=$(snmpwalkv3 "1.3.6.1.2.1.2.2.1.13.$IFINDEX")    # ifInDiscards
  out_disc=$(snmpwalkv3 "1.3.6.1.2.1.2.2.1.19.$IFINDEX")   # ifOutDiscards
  in_oct=$(snmpwalkv3 "1.3.6.1.2.1.31.1.1.1.6.$IFINDEX")   # ifHCInOctets
  out_oct=$(snmpwalkv3 "1.3.6.1.2.1.31.1.1.1.10.$IFINDEX") # ifHCOutOctets

  # Any missing or non-numeric counter: ignore this interface for this run.
  for v in "$in_err" "$out_err" "$in_disc" "$out_disc" "$in_oct" "$out_oct"; do
    [[ "$v" =~ ^[0-9]+$ ]] || continue 2
  done

  cache_key="${HOST//./_}_if${IFINDEX}"
  SNAPFILE="$CACHE_DIR/${cache_key}_$now.cache"

  # Find snapshot to compare. This must happen BEFORE the current snapshot is
  # written, otherwise the current run would be picked as its own baseline.
  BASEFILE=$(find_snapshot_file "$cache_key" "$target_ts")

  # echo "Found snapshot: $BASEFILE" >> /tmp/snap_debug.log

  # Always save the current counters for future runs.
  echo "$now $in_err $out_err $in_disc $out_disc $in_oct $out_oct" > "$SNAPFILE"

  # No usable baseline yet (first run for this interface): nothing to compare.
  # The readability test also covers a returned path that does not exist.
  if [[ -z "$BASEFILE" || ! -r "$BASEFILE" ]]; then
    continue
  fi

  read -r ts0 in_e0 out_e0 in_d0 out_d0 in_o0 out_o0 < "$BASEFILE"

  # A truncated or corrupt snapshot must not reach the arithmetic below.
  for v in "$ts0" "$in_e0" "$out_e0" "$in_d0" "$out_d0" "$in_o0" "$out_o0"; do
    if [[ ! "$v" =~ ^[0-9]+$ ]]; then
      purge_old_snapshots
      continue 2
    fi
  done

  ((evaluated_count++))

  elapsed=$(( now - ts0 ))
  (( elapsed <= 0 )) && elapsed=1

  delta_err=$(( (in_err + out_err) - (in_e0 + out_e0) ))
  delta_disc=$(( (in_disc + out_disc) - (in_d0 + out_d0) ))
  delta_oct=$(( (in_oct + out_oct) - (in_o0 + out_o0) ))
  # Counters that went backwards (device reboot / counter reset) would give
  # negative rates; treat them as "no new errors".
  (( delta_err < 0 )) && delta_err=0
  (( delta_disc < 0 )) && delta_disc=0
  # Avoid division by zero when there was no traffic.
  (( delta_oct <= 0 )) && delta_oct=1

  # Rates are expressed relative to the octet delta, as a percentage.
  # Values are handed to awk with -v rather than spliced into the program text.
  err_pct=$(awk -v n="$delta_err" -v d="$delta_oct" 'BEGIN { printf "%.2f", (n / d) * 100 }')
  disc_pct=$(awk -v n="$delta_disc" -v d="$delta_oct" 'BEGIN { printf "%.2f", (n / d) * 100 }')
  traffic_bps=$(awk -v o="$delta_oct" -v s="$elapsed" 'BEGIN { printf "%.2f", (8 * o) / s }')

  # Auto-scale to most appropriate unit
  traffic_label=$(awk -v bps="$traffic_bps" 'BEGIN {
    if (bps >= 1000000000)   printf "%.2f Gbps", bps / 1000000000
    else if (bps >= 1000000) printf "%.2f Mbps", bps / 1000000
    else if (bps >= 1000)    printf "%.2f kbps", bps / 1000
    else                     printf "%.2f bps", bps
  }')

  age=$(printf "%02dm %02ds" $((elapsed / 60)) $((elapsed % 60)))

  # Threshold comparison done in awk (already required above) so that a
  # missing bc cannot silently turn every result into OK.
  is_crit=$(awk -v e="$err_pct" -v d="$disc_pct" -v t="$CRIT" \
    'BEGIN { print ((e + 0 >= t + 0 || d + 0 >= t + 0) ? 1 : 0) }')
  is_warn=$(awk -v e="$err_pct" -v d="$disc_pct" -v t="$WARN" \
    'BEGIN { print ((e + 0 >= t + 0 || d + 0 >= t + 0) ? 1 : 0) }')
  code=0
  state="OK"
  if (( is_crit == 1 )); then
    code=2
    state="CRITICAL"
  elif (( is_warn == 1 )); then
    code=1
    state="WARNING"
  fi

  # The plugin's exit status is the worst state seen on any interface.
  (( code > STATUS )) && STATUS=$code

  # Only show OK if --all was used
  if [[ "$SHOW_ALL" -eq 1 || "$code" -gt 0 ]]; then
    MSG+="$state - $IFNAME ($ALIAS) is $OPERLABEL - Compared to ${age} ago: Errors ${err_pct}%, Discards ${disc_pct}%, Traffic ${traffic_label}\n"
    if (( DEBUG == 1 )); then
        MSG+="  Raw Counters: in_err=$in_e0→$in_err, out_err=$out_e0→$out_err, in_disc=$in_d0→$in_disc, out_disc=$out_d0→$out_disc, in_oct=$in_o0→$in_oct, out_oct=$out_o0→$out_oct, elapsed=${elapsed}s"
    fi
    MSG+="\n"

    ((displayed_count++))
  fi
 # MSG+="$state - $IFNAME ($ALIAS) is $OPERLABEL - Compared to ${age} ago: Errors ${err_pct}%, Discards ${disc_pct}%, Traffic ${traffic_bps}bps\n"
  PERFDATA+="errors_${IFINDEX}=${err_pct}%;$WARN;$CRIT discards_${IFINDEX}=${disc_pct}%;$WARN;$CRIT traffic_${IFINDEX}=${traffic_bps}bps "

  # Purge old file when lookback is set
  # [[ -n "$LOOKBACK_MINUTES" ]] && purge_old_snapshots

  # Always purge old files (after the baseline has been read)
  purge_old_snapshots

done

# ---------------------- Final Output ----------------------
# Nothing was actually compared: either no interface matched the filters or
# no baseline snapshot existed yet. Reporting OK here would be a false
# all-clear, so report UNKNOWN.
if (( evaluated_count == 0 )); then
  echo "UNKNOWN - No matching interfaces or cached data yet. Try again after a minute."
  exit 3
fi

case $STATUS in
  0) echo "OK - No Errors or Discard Detected on any interface" ;;
  1) echo "WARNING - Error or Discard is more than $WARN% on at least 1 interface" ;;
  2) echo "CRITICAL - Error or Discard is more than $CRIT% on at least 1 interface" ;;
esac

# PERFDATA is collected above but not emitted; append "| $PERFDATA" to the
# status line if graphing tools should receive it.
if (( displayed_count > 0 )); then
    echo -e "${MSG}"
fi
if [[ "$SHOW_ALL" -eq 0 ]]; then
  echo "(No OK interfaces displayed — use --all to include OK ones)"
fi
exit $STATUS

Python Script

  • With timeout function

Last updated