Proxmox Incident Script
The following script helps you determine the real cause of an overloaded server when you initially suspect a network overload. The network is often not the cause - that has been the case for me twice in two years.
#!/usr/bin/env bash
#===============================================================================
# Script : diagnose-host-incident.sh (was: diagnose-net-incident.sh)
# Version : 3.0.0
# Created : 2026-07-05
# Updated : 2026-07-05
#
# Purpose : Forensic collector for a PAST service outage on a Proxmox host or
# LXC, where services stopped responding / transfers timed out but the
# host recovered WITHOUT a reboot. Live counters are useless after
# recovery, so this pulls the systemd/kernel JOURNAL for a window back
# from now, PLUS cumulative, boot-persistent evidence that survives a
# self-recovery across TWO exhaustion domains:
#
# MEMORY / OOM (usual root cause of a whole-host brownout):
# - every kernel OOM-kill in the window, AUTO-RESOLVED to the
# Podman container or LXC it happened in, with that cgroup's
# memory/swap ceilings (automates the manual oom_memcg lookup);
# - cumulative per-cgroup oom_kill tallies from memory.events;
# - PSI memory/io stall pressure + swap-thrash counters (the
# signature of the slow "everything times out" brownout);
# - which containers run UNBOUNDED (no mem_limit) = the risk that
# lets one service OOM the whole host cgroup.
#
# NETWORK (rule it in or out): conntrack utilisation + drop
# counters, TCP listen/accept-queue overflows, ARP/neighbour
# table, softnet backlog drops, Podman network health, NIC.
#
# Host-agnostic: auto-detects Podman and Proxmox context. Run it on
# EACH host involved (backend first, then proxy, then the PVE host).
# NOTE: kernel OOM logs for an LXC appear on the PVE HOST journal, not
# inside the LXC - so run this on pxe to see cgroup OOM kills.
#
# Usage : sudo ./diagnose-host-incident.sh [LOOKBACK_MIN] [BRIDGE] [REPORT_DIR]
# LOOKBACK_MIN - minutes of journal history to pull (default: 90)
# BRIDGE - bridge to inspect for NIC section (default: vmbr0)
# REPORT_DIR - where to write the report (default: cwd)
#
# Precise incident window via env (overrides LOOKBACK_MIN):
# SINCE="2026-07-05 10:15:00" UNTIL="2026-07-05 11:30:00" \
# sudo ./diagnose-host-incident.sh
#
# Examples:
# sudo ./diagnose-host-incident.sh # last 90 min
# sudo ./diagnose-host-incident.sh 180 # last 3 hours
# sudo ./diagnose-host-incident.sh 120 vmbr0 /var/log/incident
#
# Prereqs : Run as root (journal, dmesg, conntrack, cgroup reads need priv).
# Core : ip, journalctl, awk, ss.
# Useful : podman (auto-detected) to resolve OOM'd containers and flag
# unbounded ones; pct (PVE) to resolve LXC targets + ceilings;
# conntrack-tools, ethtool. Degrades gracefully if absent.
#
# Warnings: Strictly READ-ONLY. Changes no config, loads no modules, restarts
# nothing. Safe to re-run on a production host. Sections labelled
# (LIVE) are current-state context, not incident proof; the incident
# evidence is the JOURNAL, OOM FORENSICS, and CUMULATIVE COUNTER
# sections.
#
# Changelog:
# 3.0.0 (2026-07-05) - Renamed diagnose-net-incident.sh -> diagnose-host-
# incident.sh (scope broadened beyond network). Added the
# MEMORY/OOM domain: in-window OOM-kill capture with
# auto-resolution of oom_memcg/libpod cgroup to container
# (podman) or LXC (pct) name + that cgroup's memory/swap
# ceilings; cumulative per-cgroup oom_kill tallies via
# memory.events; PSI (/proc/pressure/*) stall pressure;
# swap-thrash counters from /proc/vmstat (pswpin/pswpout/
# pgmajfault/oom_kill); and an UNBOUNDED-container audit
# (HostConfig.Memory==0) plus per-LXC configured ceilings.
# Reordered priority so memory evidence is read first.
# 2.0.0 (2026-07-05) - Reframed NIC-hardware tool into windowed service-outage
# forensics: journal window, conntrack, TCP listen-queue,
# ARP/neighbour, softnet, Podman net health, failed units.
# Stripped benign Podman veth-rename/loop noise from greps.
# 1.0.0 (2026-07-05) - Initial release as diagnose-nic-pxe.sh (read-only NIC
# hardware/counter/IRQ snapshot behind vmbr0).
#===============================================================================
set -u

#--- arguments + incident window -----------------------------------------------
# Positional arguments (all optional):
#   $1 LOOKBACK_MIN - minutes of history to pull when SINCE is not set in the env
#   $2 BRIDGE       - bridge inspected by the NIC section
#   $3 REPORT_DIR   - directory the report file is written to
LOOKBACK_MIN="${1:-90}"
BRIDGE="${2:-vmbr0}"
REPORT_DIR="${3:-$(pwd)}"

# is_uint VALUE - succeed only when VALUE is a non-empty run of decimal digits.
is_uint() {
  case "${1:-}" in
    '' | *[!0-9]*) return 1 ;;
    *) return 0 ;;
  esac
}

# An explicit SINCE from the environment overrides LOOKBACK_MIN. Otherwise the
# window start is derived from LOOKBACK_MIN. A bad value used to leave SINCE
# empty (date's error was discarded), which made every journal query fail
# silently and the report claim that nothing was found - so refuse to continue.
if [ -z "${SINCE:-}" ]; then
  if ! is_uint "$LOOKBACK_MIN"; then
    printf 'LOOKBACK_MIN must be a whole number of minutes, got: %s\n' "$LOOKBACK_MIN" >&2
    exit 2
  fi
  SINCE="$(date -d "${LOOKBACK_MIN} minutes ago" '+%Y-%m-%d %H:%M:%S' 2>/dev/null)"
  if [ -z "$SINCE" ]; then
    printf 'Cannot compute the window start (date -d "%s minutes ago" failed). Set SINCE explicitly.\n' \
      "$LOOKBACK_MIN" >&2
    exit 2
  fi
fi
UNTIL="${UNTIL:-$(date '+%Y-%m-%d %H:%M:%S')}"

# Report file name: <REPORT_DIR>/host-incident-<short hostname>-<timestamp>.txt
STAMP="$(date +%Y%m%d-%H%M%S)"
HOSTN="$(hostname -s 2>/dev/null || echo unknown)"
REPORT="${REPORT_DIR%/}/host-incident-${HOSTN}-${STAMP}.txt"

# SIGNAL: extended regex of journal lines worth surfacing (matched case-insensitively).
# NOISE : extended regex of bridge/veth chatter that is filtered back out.
SIGNAL='nf_conntrack|table full|out of memory|oom-killer|oom-kill:|Out of memory|Killed process|invoked oom|Memory cgroup out of memory|neighbour: |neighbor table overflow|ntable overflow|TCP: out of memory|too many orphan|SYN flooding|Ran out of|no buffer space|too many open files|EMFILE|Connection timed out|conntrack|netavark|aardvark|link is Down|Link is Down|reset adapter|NETDEV WATCHDOG|Hardware Error|segfault|general protection|traps:|hung_task|blocked for more than'
NOISE='renamed from veth|detected capacity change|entered (blocking|forwarding|disabled) state|entered promiscuous mode|entered allmulticast mode|port [0-9]+\(veth'
#--- helpers -------------------------------------------------------------------
# section TITLE - print a blank line, then TITLE (indented by one space) framed
# above and below by a full-width rule of '=' characters.
section() {
  local rule='==============================================================================='
  printf '\n%s\n %s\n%s\n' "$rule" "$1" "$rule"
}
# have NAME - exit status tells whether NAME resolves to something runnable
# according to `command -v`; nothing is printed either way.
have() {
  command -v "$1" &>/dev/null
}
# note MESSAGE - print MESSAGE on its own line behind a "[note] " tag.
# MESSAGE is treated as plain data, never as a printf format.
note() {
  local msg=$1
  printf '%s %s\n' '[note]' "$msg"
}
# sub TITLE - print a sub-heading line of the form "--- TITLE ---".
sub() {
  local title=$1
  printf '%s %s %s\n' '---' "$title" '---'
}
# resolve_iface BRIDGE [NET_ROOT]
#
# Print the name of the first port enslaved to BRIDGE whose name does not
# contain tap, veth, fwln or fwpr (no trailing newline).
#
# Arguments:
#   BRIDGE   - bridge interface name, e.g. vmbr0
#   NET_ROOT - directory holding one sub-directory per interface
#              (default: /sys/class/net)
# Outputs:  the port name on stdout, or nothing when every port is filtered
#           out or the bridge has no brif directory.
# Returns:  1 when NET_ROOT/BRIDGE is not a directory, otherwise 0.
resolve_iface() {
  local br="$1" root="${2:-/sys/class/net}" port name
  [ -d "${root}/${br}" ] || return 1
  # Glob instead of parsing `ls`; the glob result is sorted, so "first" stays
  # the alphabetically first candidate.
  for port in "${root}/${br}/brif"/*; do
    # An unmatched glob stays literal; skip it (brif entries may be symlinks).
    [ -e "$port" ] || [ -L "$port" ] || continue
    name=${port##*/}
    case "$name" in
      *tap* | *veth* | *fwln* | *fwpr*) continue ;;
    esac
    printf '%s' "$name"
    return 0
  done
  return 0
}
#--- main body (tee'd to report) -----------------------------------------------
# main - write the complete forensic report to stdout.
#
# Globals read : SINCE, UNTIL, BRIDGE, REPORT, SIGNAL, NOISE
# Globals set  : HAS_PODMAN, IS_PVE, IFACE
# Helpers used : section, sub, note, have, resolve_iface
# Arguments    : none
# Outputs      : report text on stdout. Most probe commands have their stderr
#                discarded so a missing tool or unreadable file simply yields
#                an empty section.
# Every command run here only reads state (journal, /proc, /sys, tool queries).
main() {
  local out errs failed events oom_lines l lxcid cid task nm f k found m id ct
  local cnt max util_pct cpu processed dropped squeezed i
  local -a ct_hdr=() ct_row=() ct_sum=()

  # Context detection: drives which optional sections are emitted.
  HAS_PODMAN=0
  if have podman; then HAS_PODMAN=1; fi
  IS_PVE=0
  if have pveversion; then IS_PVE=1; fi

  section "HOST INCIDENT FORENSIC REPORT"
  printf 'Host : %s\n' "$(hostname -f 2>/dev/null || hostname)"
  printf 'Generated : %s\n' "$(date -Is)"
  printf 'Kernel : %s\n' "$(uname -r)"
  printf 'Uptime : %s\n' "$(uptime -p 2>/dev/null || uptime)"
  printf 'Boot time : %s\n' "$(uptime -s 2>/dev/null || echo '?')"
  printf 'Window SINCE: %s\n' "$SINCE"
  printf 'Window UNTIL: %s\n' "$UNTIL"
  printf 'Podman here : %s | Proxmox here: %s\n' \
    "$([ "$HAS_PODMAN" -eq 1 ] && echo yes || echo no)" \
    "$([ "$IS_PVE" -eq 1 ] && echo yes || echo no)"
  printf 'Report : %s\n' "$REPORT"
  note "Kernel OOM kills for an LXC log on the PVE HOST - run on pxe to see them."
  note "Priority read order: 1) JOURNAL 2) OOM FORENSICS 3) MEMORY PRESSURE"
  note " 4) MEMORY LIMITS 5) CONNTRACK 6) LISTEN QUEUES 7) NIC"

  #========================= JOURNAL / OUTAGE EVIDENCE =======================
  section "JOURNAL PERSISTENCE"
  if have journalctl; then
    if [ -d /var/log/journal ]; then
      note "Persistent journal present (/var/log/journal)."
    else
      note "Journal may be volatile (/run) - still intact this boot (no reboot)."
    fi
    journalctl --disk-usage 2>/dev/null
  else
    note "journalctl not available - not a systemd host?"
  fi

  section "INCIDENT KERNEL/JOURNAL LINES IN WINDOW (noise stripped)"
  note "conntrack table full, OOM, SYN flooding, neighbour overflow, resets."
  if have journalctl; then
    out=$(journalctl --since "$SINCE" --until "$UNTIL" --no-pager -o short-iso 2>/dev/null \
      | grep -iE "$SIGNAL" | grep -vEi "$NOISE")
    if [ -n "$out" ]; then
      printf '%s\n' "$out"
    else
      note "No smoking-gun lines in window. Widen LOOKBACK_MIN or set SINCE/UNTIL."
    fi
  fi

  section "SYSTEMD ERRORS + FAILED UNITS IN WINDOW"
  if have journalctl; then
    sub "priority err+ in window (container noise excluded)"
    # Capture first: `tail` always exits 0, so an `|| note` after the pipeline
    # could never report the empty case.
    errs=$(journalctl --since "$SINCE" --until "$UNTIL" -p err --no-pager -o short-iso 2>/dev/null \
      | grep -vEi "$NOISE" | tail -40)
    if [ -n "$errs" ]; then printf '%s\n' "$errs"; else note "none"; fi
  fi
  if have systemctl; then
    sub "currently failed units"
    # Decide on the captured output rather than on systemctl's exit status.
    failed=$(systemctl --failed --no-legend --no-pager 2>/dev/null)
    if [ -n "$failed" ]; then printf '%s\n' "$failed"; else note "none"; fi
  fi

  #========================= MEMORY / OOM (PRIME SUSPECT) ====================
  section "OOM FORENSICS IN WINDOW (auto-resolved to container / LXC)"
  note "Each kernel OOM-kill mapped to the container/LXC it hit, plus that cgroup's ceilings."
  if have journalctl; then
    oom_lines=$(journalctl -k --since "$SINCE" --until "$UNTIL" --no-pager -o short-iso 2>/dev/null \
      | grep -iE 'oom-kill:|Killed process|Memory cgroup out of memory')
    if [ -z "$oom_lines" ]; then
      note "No kernel OOM-kill lines in window here."
      note "If services died and this is an LXC, run on the PVE host (pxe) - that is where the kill logs."
    else
      printf '%s\n' "$oom_lines"
      echo
      sub "resolved targets"
      # Only the "oom-kill:" summary lines are parsed for oom_memcg / task fields.
      printf '%s\n' "$oom_lines" | grep 'oom-kill:' | while IFS= read -r l; do
        lxcid=$(printf '%s' "$l" | sed -nE 's/.*oom_memcg=\/lxc\/([0-9]+).*/\1/p')
        cid=$(printf '%s' "$l" | sed -nE 's/.*libpod-([0-9a-f]{12,})\.scope.*/\1/p')
        task=$(printf '%s' "$l" | sed -nE 's/.*task=([^,]+),.*/\1/p')
        printf ' task=%s' "${task:-?}"
        if [ -n "$lxcid" ]; then printf ' | lxc=%s' "$lxcid"; fi
        if [ -n "$cid" ]; then printf ' | container-id=%s' "${cid:0:12}"; fi
        printf '\n'
        if [ -n "$cid" ]; then
          if have podman; then
            nm=$(podman ps -a --filter "id=${cid:0:12}" --format '{{.Names}} ({{.Image}})' 2>/dev/null)
            if [ -n "$nm" ]; then
              printf ' -> podman container: %s\n' "$nm"
            else
              printf ' -> container id %s not found by podman here.\n' "${cid:0:12}"
            fi
          else
            printf ' -> resolve on the Podman host: podman ps -a --filter id=%s\n' "${cid:0:12}"
          fi
        fi
        if [ -n "$lxcid" ] && have pct; then
          printf ' -> LXC %s: %s\n' "$lxcid" \
            "$(pct config "$lxcid" 2>/dev/null | grep -E '^(hostname|memory|swap):' | tr '\n' ' ')"
        fi
      done
    fi
  fi

  section "CGROUP OOM-KILL TALLIES (CUMULATIVE - survives recovery)"
  note "Every cgroup that has OOM-killed at least once since boot. Non-empty = something exceeds a limit."
  if [ -d /sys/fs/cgroup ]; then
    found=0
    # Process substitution (not a pipe) so that `found` survives the loop.
    while IFS= read -r f; do
      k=$(awk '/^oom_kill /{print $2}' "$f" 2>/dev/null)
      if [ -n "$k" ] && [ "$k" -gt 0 ] 2>/dev/null; then
        printf ' oom_kill=%-4s %s\n' "$k" "${f%/memory.events}"
        found=1
      fi
    done < <(find /sys/fs/cgroup -name memory.events 2>/dev/null)
    if [ "$found" -eq 0 ]; then
      note "No cgroup shows a non-zero oom_kill count since boot."
    fi
  else
    note "cgroup v2 filesystem not found."
  fi

  section "MEMORY PRESSURE (PSI) + SWAP THRASH (cumulative)"
  note "PSI 'total' = microseconds stalled since boot (survives recovery). High = the brownout."
  for f in /proc/pressure/memory /proc/pressure/io; do
    if [ -r "$f" ]; then
      sub "$f"
      cat "$f"
    fi
  done
  sub "swap + oom activity since boot (/proc/vmstat)"
  awk '/^(oom_kill|pswpin|pswpout|pgmajfault) /{printf " %-12s %s\n",$1,$2}' /proc/vmstat 2>/dev/null
  sub "swap state + swappiness now"
  printf ' vm.swappiness = %s\n' "$(sysctl -n vm.swappiness 2>/dev/null)"
  free -h 2>/dev/null | awk 'NR==1 || /Mem:|Swap:/'

  section "MEMORY LIMITS - BOUNDED vs UNBOUNDED"
  note "An UNLIMITED container can OOM the whole host cgroup. Give heavy/AI services a mem_limit."
  if [ "$HAS_PODMAN" -eq 1 ]; then
    sub "podman containers: memory limit (UNLIMITED = risk)"
    podman ps --format '{{.ID}} {{.Names}}' 2>/dev/null | while read -r id nm; do
      m=$(podman inspect --format '{{.HostConfig.Memory}}' "$id" 2>/dev/null)
      # An empty result (inspect failed) is reported the same way as a 0 limit.
      if [ "${m:-0}" = "0" ]; then
        printf ' %-24s UNLIMITED <-- unbounded\n' "$nm"
      else
        # Pass the value with -v instead of splicing it into the awk program,
        # and let awk do the rounding.
        printf ' %-24s limit=%s GiB\n' "$nm" \
          "$(awk -v m="$m" 'BEGIN{printf "%.1f", m/1073741824}')"
      fi
    done
    sub "live usage snapshot (top by memory)"
    podman stats --no-stream --format '{{.Name}}\t{{.MemUsage}}\t{{.MemPerc}}' 2>/dev/null \
      | sort -t$'\t' -k3 -rh | head -15
  fi
  if have pct; then
    sub "LXC guests: configured memory / swap ceilings"
    pct list 2>/dev/null | awk 'NR>1{print $1}' | while read -r ct; do
      printf ' CT %-5s %s\n' "$ct" \
        "$(pct config "$ct" 2>/dev/null | grep -E '^(hostname|memory|swap):' | tr '\n' ' ')"
    done
  fi

  #========================= CONNTRACK ======================================
  section "CONNECTION TRACKING - UTILISATION"
  if [ -r /proc/sys/net/netfilter/nf_conntrack_count ]; then
    cnt=$(cat /proc/sys/net/netfilter/nf_conntrack_count 2>/dev/null)
    max=$(cat /proc/sys/net/netfilter/nf_conntrack_max 2>/dev/null)
    printf 'nf_conntrack_count = %s\n' "$cnt"
    printf 'nf_conntrack_max = %s\n' "$max"
    if [ -n "${cnt:-}" ] && [ -n "${max:-}" ] && [ "$max" -gt 0 ] 2>/dev/null; then
      util_pct=$(( cnt * 100 / max ))
      printf 'utilisation (LIVE) = %s%%\n' "$util_pct"
      if [ "$util_pct" -ge 80 ]; then
        # note prints its argument as data, so a single % is correct here.
        note "HIGH now. If it hit 100% during the outage, this is a cause."
      fi
    fi
  else
    note "nf_conntrack not loaded here (host may not NAT/filter)."
  fi

  section "CONNTRACK DROP COUNTERS (CUMULATIVE - survives recovery)"
  note "insert_failed / drop / early_drop > 0 = the table overflowed at some point since boot."
  if have conntrack; then
    conntrack -S 2>/dev/null | tr ' ' '\n' | grep -E 'insert_failed|drop|early_drop|invalid|error' | grep -vE '=0$' \
      || note "conntrack -S reports no non-zero drop/insert_failed counters."
  elif [ -r /proc/net/stat/nf_conntrack ]; then
    # First line holds the column names; every following line holds hex
    # counters. Sum each column over all rows using bash arithmetic, because
    # awk's strtonum() is a gawk extension and is missing from mawk.
    {
      read -r -a ct_hdr
      while read -r -a ct_row; do
        for i in "${!ct_row[@]}"; do
          ct_sum[i]=$(( ${ct_sum[i]:-0} + 16#${ct_row[i]} ))
        done
      done
    } < /proc/net/stat/nf_conntrack
    for i in "${!ct_hdr[@]}"; do
      case "${ct_hdr[i]}" in
        *insert_failed* | *drop* | *invalid* | *error*)
          printf ' %-16s = %d\n' "${ct_hdr[i]}" "${ct_sum[i]:-0}"
          ;;
      esac
    done
  else
    note "No conntrack CLI and no /proc/net/stat/nf_conntrack."
  fi

  #========================= SOCKET / LISTEN-QUEUE STRESS ====================
  section "TCP LISTEN/ACCEPT-QUEUE STRESS (CUMULATIVE - survives recovery)"
  note "ListenOverflows/ListenDrops/TCPBacklogDrop > 0 = new connections dropped (proxy timeouts)."
  if have nstat; then
    nstat -az 2>/dev/null | grep -iE 'ListenOverflows|ListenDrops|TCPBacklogDrop|SyncookiesSent|TCPReqQFull' | grep -vE ' 0 ' \
      || note "No non-zero listen-queue stress counters."
    sub "retransmits / pruning / orphans"
    nstat -az 2>/dev/null | grep -iE 'RetransSegs|TCPLostRetransmit|PruneCalled|RcvPruned|OfoPruned|TCPAbortOnMemory' | grep -vE ' 0 '
  elif have netstat; then
    netstat -s 2>/dev/null | grep -iE 'listen|overflow|SYNs to LISTEN|pruned|retransmit|out of memory' || note "nothing relevant."
  fi

  section "CURRENT LISTEN QUEUES + BACKLOG LIMITS (LIVE)"
  printf 'net.core.somaxconn = %s\n' "$(sysctl -n net.core.somaxconn 2>/dev/null)"
  printf 'net.ipv4.tcp_max_syn_backlog = %s\n' "$(sysctl -n net.ipv4.tcp_max_syn_backlog 2>/dev/null)"
  if have ss; then
    sub "listening sockets (Recv-Q=backlog used, Send-Q=backlog max)"
    ss -ltnH 2>/dev/null | awk '{printf " %-6s used=%-6s max=%-6s %s\n",$1,$2,$3,$4}' | head -25
    sub "socket summary"
    ss -s 2>/dev/null
  fi

  #========================= ARP / SOFTNET / PODMAN NET =====================
  section "ARP / NEIGHBOUR TABLE (overflow -> host-unreachable timeouts)"
  for k in gc_thresh1 gc_thresh2 gc_thresh3; do
    printf 'ipv4.neigh.default.%-10s = %s\n' "$k" "$(sysctl -n "net.ipv4.neigh.default.$k" 2>/dev/null)"
  done
  if have ip; then
    printf 'current neighbour entries : ipv4=%s ipv6=%s\n' \
      "$(ip -4 neigh show 2>/dev/null | wc -l)" "$(ip -6 neigh show 2>/dev/null | wc -l)"
  fi

  section "PER-CPU SOFTNET DROPS (CUMULATIVE - col2; survives recovery)"
  note "Non-zero 'dropped' = input backlog overflowed -> raise netdev_max_backlog."
  if [ -r /proc/net/softnet_stat ]; then
    # One line per CPU; the first three hex columns are reported. Converted
    # with bash base-16 arithmetic instead of gawk-only strtonum().
    cpu=0
    while read -r processed dropped squeezed _; do
      printf 'cpu%-2d processed=%-12d dropped=%-10d squeezed=%d\n' \
        "$cpu" "$((16#${processed}))" "$((16#${dropped}))" "$((16#${squeezed}))"
      cpu=$((cpu + 1))
    done < /proc/net/softnet_stat
  fi

  if [ "$HAS_PODMAN" -eq 1 ]; then
    section "PODMAN NETWORK + CONTAINER HEALTH"
    sub "networks"
    podman network ls 2>/dev/null
    sub "containers: restart counts + status (highest restarts first)"
    # NOTE(review): confirm that this podman version exposes .RestartCount to
    # `podman ps --format`; any template error is hidden by 2>/dev/null.
    podman ps -a --format '{{.Names}}\t{{.RestartCount}}\t{{.Status}}' 2>/dev/null | sort -t$'\t' -k2 -rn | head -30
    sub "container events in window (die/oom/restart)"
    # Capture first so the empty case is actually reported (tail exits 0).
    events=$(podman events --since "$SINCE" --until "$UNTIL" --stream=false 2>/dev/null \
      | grep -iE 'die|oom|stop|start|restart|health_status' | tail -40)
    if [ -n "$events" ]; then printf '%s\n' "$events"; else note "no events in window."; fi
  fi

  #========================= NIC HARDWARE (demoted) ========================
  section "NIC HARDWARE (secondary)"
  IFACE="$(resolve_iface "$BRIDGE")"
  if [ -z "${IFACE:-}" ]; then
    note "No physical NIC behind '${BRIDGE}' (expected on an LXC using a veth)."
  else
    printf 'Physical NIC behind %s: %s\n' "$BRIDGE" "$IFACE"
    if have ethtool; then
      ethtool -i "$IFACE" 2>/dev/null | grep -E 'driver|firmware'
      ethtool "$IFACE" 2>/dev/null | grep -E 'Speed|Duplex|Link detected'
      sub "non-zero NIC error counters"
      ethtool -S "$IFACE" 2>/dev/null | grep -iE 'err|drop|miss|fifo|over|timeout|reset|nobuf' | grep -vE ':\s*0$' \
        || note "No non-zero NIC error counters."
    fi
    ip -s link show "$IFACE" 2>/dev/null
  fi

  #========================= SYSCTL SNAPSHOT ===============================
  section "RELEVANT SYSCTLS (current)"
  for k in vm.swappiness vm.overcommit_memory \
    net.netfilter.nf_conntrack_max net.core.somaxconn \
    net.ipv4.tcp_max_syn_backlog net.core.netdev_max_backlog \
    net.core.rmem_max net.core.wmem_max \
    net.bridge.bridge-nf-call-iptables fs.file-nr; do
    printf '%-40s = %s\n' "$k" "$(sysctl -n "$k" 2>/dev/null || echo 'n/a')"
  done

  section "END OF REPORT"
  printf 'Saved to: %s\n' "$REPORT"
  printf 'Read: 1) OOM FORENSICS 2) CGROUP OOM TALLIES 3) MEMORY PRESSURE 4) network\n'
}
#--- run: display live + persist -----------------------------------------------
# Make sure the report directory exists before anything is written into it;
# when it cannot be created, say so on stderr and stop with status 1.
if ! mkdir -p "$REPORT_DIR" 2>/dev/null; then
  echo "Cannot create report dir: $REPORT_DIR" >&2
  exit 1
fi
main 2>&1 | tee "$REPORT"

My issue was a runaway AI that quickly overwhelmed the LXC application server baden.
Happy hunting
#enoughsaid