Alerts about your node failures

No Grafana or Prometheus required: a simple Bash script that sends alerts to Telegram.

Install necessary dependencies:

# curl queries the node RPC and the Telegram API; jq parses the JSON responses
sudo apt update && sudo apt install -y curl jq

Save the script to a file, for example:

# No sudo needed: the file lives in your home directory and should stay owned by you
nano "$HOME/story_monitor.sh"

Paste the script content into the file and save.

Script (replace the TELEGRAM_TOKEN and TELEGRAM_CHAT_ID placeholders with your own values):

#!/bin/bash

# === CONFIGURATION ===

# Telegram bot credentials used when posting alerts
TELEGRAM_TOKEN='YOUR_TELEGRAM_BOT_TOKEN'
TELEGRAM_CHAT_ID='YOUR_TELEGRAM_CHAT_ID'

# Seconds to sleep between two consecutive checks
CHECK_INTERVAL=60

# State files: last observed block height and timestamp of the last alert
LAST_BLOCK_FILE='/tmp/last_story_block_height'
ALERT_FILE='/tmp/story_last_alert'

# Alert thresholds: CPU / RAM / disk usage in percent, and minimum peer count
CPU_LIMIT=80
MEM_LIMIT=80
DISK_LIMIT=90
MIN_PEERS=3

# === FUNCTIONS ===
#######################################
# Log an alert to stdout and post it to the configured Telegram chat.
# Globals:   TELEGRAM_TOKEN, TELEGRAM_CHAT_ID (read)
# Arguments: $1 - message text; backslash escapes such as \n are expanded
# Outputs:   writes "<date>: <message>" to stdout
# Returns:   exit status of curl
#######################################
send_telegram() {
  local message
  # Expand escapes once so the log line and the Telegram text are identical
  # (callers join alert lines with a literal "\n").
  printf -v message '%b' "$1"
  printf '%s: %s\n' "$(date)" "$message"
  # --data-urlencode keeps "&", "%", "+", newlines and emoji intact in the
  # form body; plain -d would send them unencoded and corrupt the request.
  curl -s -X POST "https://api.telegram.org/bot${TELEGRAM_TOKEN}/sendMessage" \
    --data-urlencode "chat_id=${TELEGRAM_CHAT_ID}" \
    --data-urlencode "text=${message}" > /dev/null
}

#######################################
# Rate-limit alerts: allow one only if more than 600 seconds have passed
# since the timestamp stored in ALERT_FILE, and record the new timestamp.
# Globals:   ALERT_FILE (read, written)
# Arguments: none
# Returns:   0 if an alert may be sent now, 1 if still inside the cooldown
#######################################
can_alert() {
  local now last=0
  now=$(date +%s)
  if [[ -r "$ALERT_FILE" ]]; then
    # read fails on an empty file or a missing trailing newline; that is fine
    read -r last < "$ALERT_FILE" || true
  fi
  # A missing, empty or corrupt state file must not mute alerts forever:
  # anything that is not a plain integer counts as "never alerted".
  [[ "$last" =~ ^[0-9]+$ ]] || last=0
  # 10# forces base 10 so a value with leading zeros is not parsed as octal
  if (( now - 10#$last > 600 )); then
    echo "$now" > "$ALERT_FILE"
    return 0
  fi
  return 1
}

# === MAIN LOOP ===
# Every CHECK_INTERVAL seconds: query the node RPC, sample CPU / RAM / disk,
# collect every failed check into ALERT_MSG and send one combined alert
# (rate-limited by can_alert).

# STORY_PORT is the port prefix placed in front of "657". It is not defined in
# this script and is not inherited when running under systemd, so fall back to
# 26, which gives the default CometBFT RPC port 26657.
RPC_URL="http://localhost:${STORY_PORT:-26}657"

while true; do
  # Fetch node status; --max-time keeps a hung RPC from stalling the monitor
  status=$(curl -s --max-time 10 "${RPC_URL}/status")
  if [[ -z "$status" ]]; then
    if can_alert; then
      send_telegram "🚨 Node unreachable! No response from your node"
    fi
    sleep "$CHECK_INTERVAL"
    continue
  fi

  catching_up=$(echo "$status" | jq -r .result.sync_info.catching_up)
  height=$(echo "$status" | jq -r .result.sync_info.latest_block_height)

  # Fetch peer information
  netinfo=$(curl -s --max-time 10 "${RPC_URL}/net_info")
  peers=$(echo "$netinfo" | jq -r .result.n_peers)

  # CPU usage = 100 - idle. The idle value is located by its "id" label rather
  # than by field position: top prints "ni,100.0 id" without a space when idle
  # hits 100.0, which shifts the fields and made the old "$8" lookup report
  # 100% CPU on an idle machine. LC_ALL=C keeps "." as the decimal separator.
  cpu_load=$(LC_ALL=C top -bn1 | awk '
    /Cpu\(s\)/ && match($0, /[0-9.]+[ %]*id/) {
      idle = substr($0, RSTART, RLENGTH)
      sub(/[ %]*id$/, "", idle)
      print 100 - idle
      exit
    }')

  # RAM usage: used / total as a whole-number percentage
  # (LC_ALL=C so the "Mem:" row label is not translated)
  mem_load=$(LC_ALL=C free | awk '/Mem:/ {printf("%.0f"), $3/$2 * 100.0}')

  # Disk usage of the root filesystem, without the trailing "%"
  disk_usage=$(df / | awk 'END{print $(NF-1)}' | tr -d '%')

  ALERT_MSG=""

  # Block growth check: alert when the height equals the one seen last cycle
  LAST_HEIGHT=$(cat "$LAST_BLOCK_FILE" 2>/dev/null || echo 0)
  if [[ "$height" == "$LAST_HEIGHT" ]]; then
    ALERT_MSG="$ALERT_MSG\n❗️Block height not increasing (current: $height)!"
  fi
  echo "$height" > "$LAST_BLOCK_FILE"

  # Synchronization status
  if [[ "$catching_up" == "true" ]]; then
    ALERT_MSG="$ALERT_MSG\n❗️Node is still synchronizing! Block: $height"
  fi

  # Peer count
  if [[ "$peers" -lt "$MIN_PEERS" ]]; then
    ALERT_MSG="$ALERT_MSG\n❗️Low peer count: $peers"
  fi

  # CPU usage (integer part only; [[ -ge ]] cannot compare decimals)
  CPU_INT=${cpu_load%.*}
  if [[ "$CPU_INT" -ge "$CPU_LIMIT" ]]; then
    ALERT_MSG="$ALERT_MSG\n🔥 High CPU usage: $cpu_load%"
  fi

  # RAM usage
  if [[ "$mem_load" -ge "$MEM_LIMIT" ]]; then
    ALERT_MSG="$ALERT_MSG\n🔥 High memory usage: $mem_load%"
  fi

  # Disk usage
  if [[ "$disk_usage" -ge "$DISK_LIMIT" ]]; then
    ALERT_MSG="$ALERT_MSG\n💾 High disk usage: $disk_usage%"
  fi

  # Send alert if any issues detected
  if [[ -n "$ALERT_MSG" ]]; then
    if can_alert; then
      send_telegram "Story Node Alert: $ALERT_MSG"
    fi
  fi

  sleep "$CHECK_INTERVAL"
done

Make the script executable:

# Path is quoted so it still works if $HOME contains spaces
sudo chmod +x "$HOME/story_monitor.sh"

Create a systemd service:

# Write the unit file. The heredoc is unquoted on purpose: $USER, $HOME and
# $STORY_PORT are expanded by your current shell when the file is created.
sudo tee /etc/systemd/system/story-monitor.service > /dev/null <<EOF
[Unit]
Description=Story Node Monitor
After=network.target

[Service]
User=$USER
# systemd does not read your shell profile, so hand the RPC port prefix
# used by the script to the service explicitly.
Environment="STORY_PORT=$STORY_PORT"
ExecStart=$HOME/story_monitor.sh
Restart=always
RestartSec=10

[Install]
WantedBy=multi-user.target
EOF

Enable and start the service:

# Pick up the new unit file, then enable it at boot and start it right away
sudo systemctl daemon-reload
sudo systemctl enable --now story-monitor

Previous: Monitoring tool | Next: Aptos

Last updated 2 months ago