#!/bin/bash
#
# Nagios checking script that whines if a node hasn't checked in with the
# master for two hours or more and sets a critical status if the node hasn't
# checked in for 24 hours or longer. These are the default thresholds
# (INTERVAL_WARNING and INTERVAL_CRITICAL below); you can of course tweak
# them if you want.

# Default settings. YAMLPATH is the directory holding the per-node yaml files.
YAMLPATH=/var/lib/puppet/yaml/node

# Exit codes used for the overall check result.
STATUS_OK=0
STATUS_WARNING=1
STATUS_CRITICAL=2
STATUS_UNKNOWN=3

# Thresholds in seconds: warning after two hours, critical after 24 hours.
INTERVAL_WARNING=$((2 * 60 * 60))
INTERVAL_CRITICAL=$((24 * 60 * 60))

# Options handed to puppetca when listing the certificates.
PARAMS=-la

# Hostnames to skip, separated by spaces. Handy for machines such as laptops
# that are not switched on every day and therefore do not sync regularly.
IGNORE_HOSTS=

# A config file, when present, may override any of the settings above.
if [[ -f /etc/default/check_puppetmaster ]]; then
  . /etc/default/check_puppetmaster
fi

# Early exit if no read access to the yaml files.
# YAMLPATH is quoted so it is always tested as exactly one operand: unquoted,
# an empty value collapses the test to `[ ! -r ]`, which is false and lets the
# script carry on without a usable path; a value containing whitespace would
# be split into several operands.
if [[ ! -r "${YAMLPATH}" ]]; then
  echo "UNKNOWN: Cannot access ${YAMLPATH}"
  exit ${STATUS_UNKNOWN}
fi

# Bookkeeping for the status report: the overall state and exit code, one
# host counter per category (i = ignored, o = ok, w = warning, e = critical)
# and the host name lists for the warning and critical categories.
status="OK"
ret=${STATUS_OK}
i_count=0 o_count=0 w_count=0 e_count=0
w_string="" e_string=""

# Reference timestamp in seconds since the epoch, taken once up front.
NOW=$(date +%s)

# The meat.

# Succeeds when host $1 is an entry of the whitespace separated IGNORE_HOSTS
# list. Whole entries are compared literally, so "web1" does not match an
# ignored "web10.example.com" and dots in host names are not treated as
# regular expression wildcards.
_host_is_ignored() {
  local list=" ${IGNORE_HOSTS//[$'\t\n']/ } "
  [[ -n "$1" && "${list}" == *" $1 "* ]]
}

# Prints the epoch time derived from the "expiration" entry in the yaml file
# of node $1. Prints 0 when the file cannot be read, holds no expiration
# value, or the value cannot be parsed by date. Without this guard an empty
# value would be handed to `date -d`, which GNU date interprets as today at
# midnight rather than as an error.
_node_checkin_epoch() {
  local yaml_file="${YAMLPATH}/$1.yaml"
  local expiration epoch

  if [[ ! -r "${yaml_file}" ]]; then
    echo 0
    return
  fi

  expiration=$(awk '/expiration/ {printf("%s %s", $2, $3);}' "${yaml_file}")
  if [[ -z "${expiration//[[:space:]]/}" ]]; then
    echo 0
    return
  fi

  epoch=$(date +"%s" -d "${expiration}" 2>/dev/null)
  echo "${epoch:-0}"
}

# Get all hostnames associated with active certificates, and check the time
# each of these last checked in with the server. Do this by converting the
# yaml file expiration datestamp to epoch format and subtracting it from now.
# PARAMS is left unquoted on purpose so it can carry several options.
for node in $(/usr/sbin/puppetca ${PARAMS} | awk '/^\+/ {print $2}' | tr -d '"'); do

  CHECKIN=$(_node_checkin_epoch "${node}")
  DIFFERENCE=$((NOW - CHECKIN))

  # Count hosts and generate some output strings based on the status.
  if (( DIFFERENCE < INTERVAL_WARNING )); then
    o_count=$((o_count + 1))
    continue
  fi

  # There is an issue; first check if we can ignore this host.
  if _host_is_ignored "${node}"; then
    i_count=$((i_count + 1))
    continue
  fi

  if (( DIFFERENCE > INTERVAL_CRITICAL )); then
    e_count=$((e_count + 1))
    e_string="${e_string} ${node}"
  else
    w_count=$((w_count + 1))
    w_string="${w_string} ${node}"
  fi
done

# Generate a status string for user display.

# Rebuilds s_string from the per-category counters and host lists: one
# "<n> critical (...);" / "<n> warning (...);" / "<n> ignored;" segment for
# every category that applies, always followed by "<n> ok.".
# Globals: reads e_count, e_string, w_count, w_string, i_count, o_count;
#          writes s_string.
_build_status_string() {
  # Start from an empty string so a value inherited from the environment or
  # set by the sourced config file cannot leak into the output.
  s_string=""

  if [[ -n "${e_string}" ]]; then
    # "## " drops the leading separator space of the host list.
    s_string+=" ${e_count} critical (${e_string## });"
  fi
  if [[ -n "${w_string}" ]]; then
    s_string+=" ${w_count} warning (${w_string## });"
  fi
  if (( i_count > 0 )); then
    s_string+=" ${i_count} ignored;"
  fi
  s_string+=" ${o_count} ok."
}
_build_status_string

# Derive the overall state and exit code. Critical hosts take precedence over
# warning hosts; when there are neither, status and ret keep the values they
# already hold.
if (( e_count > 0 )); then
  ret=${STATUS_CRITICAL}
  status="CRITICAL"
elif (( w_count > 0 )); then
  ret=${STATUS_WARNING}
  status="WARNING"
fi

# Print the result as "<status>:<summary>" without a trailing newline, listing
# the lagging hosts, then exit with the matching status code.
printf '%s' "${status}:${s_string}"
exit ${ret}