summaryrefslogtreecommitdiff
path: root/files/master/lastruncheck
blob: eeeeafbf10f046b61f3e491c54482ae665d9a77c (plain)
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
#!/bin/bash
#
# Nagios checking script that whines if a node hasn't checked in with the
# master for two hours or more and sets a critical status if the node hasn't
# checked in for more than a day. You can of course tweak those thresholds
# (INTERVAL_WARNING and INTERVAL_CRITICAL, both in seconds) if you want.

# Configuration defaults. Every value here may be replaced by the optional
# config file that is sourced right after this section.

# Directory holding the per-node yaml files.
YAMLPATH="/var/lib/puppet/yaml/node"

# Command line used to list the certificates known to the master; the three
# parts are run together as "${PUPPET} ${COMMAND} ${PARAMS}".
PUPPET="/usr/bin/puppet"
COMMAND="cert"
PARAMS="-la"

# Age thresholds in seconds: two hours for a warning, one day for critical.
INTERVAL_WARNING=$(( 2 * 60 * 60 ))
INTERVAL_CRITICAL=$(( 24 * 60 * 60 ))

# Exit codes returned to Nagios.
STATUS_OK=0
STATUS_WARNING=1
STATUS_CRITICAL=2
STATUS_UNKNOWN=3

# Set to "yes" to append perfdata to the status line.
PERFDATA="no"

# A space separated list of hostnames to ignore. These might for instance be
# laptops that just don't get used every day and thus don't sync.
IGNORE_HOSTS=""

# Pull in site-specific settings when the optional config file is present.
# The file is sourced into this shell, so any variable it assigns replaces
# the value that was set before this point.
if [[ -f /etc/default/check_puppetmaster ]]; then
  source /etc/default/check_puppetmaster
fi

# Early exit if no read access to the yaml files.
# The path is quoted on purpose: unquoted, an empty YAMLPATH reduces the test
# to "! -r" (always false) and a value containing whitespace makes the test
# error out, so in both cases this guard would be skipped silently.
if [[ ! -r "${YAMLPATH}" ]]; then
  echo "UNKNOWN: Cannot access ${YAMLPATH}"
  exit ${STATUS_UNKNOWN}
fi

# Bunch of internal vars used for status info output.
status="OK"        # status word printed at the start of the output line
ret=${STATUS_OK}   # exit code of the script; raised later if nodes are lagging
i_count=0          # lagging nodes that are listed in IGNORE_HOSTS
o_count=0          # nodes younger than the warning interval
w_count=0          # nodes in the warning range
w_string=""        # names of the warning nodes, each prefixed with a space
e_count=0          # nodes beyond the critical interval
e_string=""        # names of the critical nodes, each prefixed with a space
p_string=""        # optional perfdata suffix
# The summary is built by appending, so start from a known-empty value;
# otherwise an s_string inherited from the environment or set by the sourced
# config file would end up in the output.
s_string=""

# Current time, in seconds since the epoch.
NOW=$(date +"%s")

# The meat.

#######################################
# Tell whether a hostname is listed in IGNORE_HOSTS.
# Globals:   IGNORE_HOSTS (read) - whitespace separated list of hostnames
# Arguments: $1 - hostname to look up
# Returns:   0 if the hostname is an exact entry of the list, 1 otherwise
#######################################
is_ignored_host() {
  local host=$1
  # Turn tabs/newlines into spaces so every entry is delimited by spaces.
  local list=${IGNORE_HOSTS//[$'\t\n']/ }
  # Compare whole entries literally. A regex or substring match would treat
  # the dots of a hostname as wildcards and would also consider "host1"
  # ignored when only "host10" is listed.
  [[ -n "${host}" && " ${list} " == *" ${host} "* ]]
}

# Get all hostnames associated with active certificates, and check the time
# each of these last checked in with the server. Do this by converting the
# yaml file expiration datestamp to epoch format and subtracting it from now.
#
# The command is deliberately expanded unquoted so that PARAMS (and PUPPET, if
# it carries a wrapper such as sudo) is split into separate arguments. Names
# are read line by line so they are never subject to globbing.
while read -r node; do
  [[ -n "${node}" ]] || continue

  # Nodes without a yaml file are skipped and not counted at all.
  node_yaml="${YAMLPATH}/${node}.yaml"
  [[ -e "${node_yaml}" ]] || continue

  EXPIRATION=$(grep -m1 expiration "${node_yaml}" | awk '{printf("%s %s", $2, $3);}')
  # Assignment kept separate from any declaration so a date(1) failure is not
  # masked; an unparsable timestamp falls back to epoch 0, which makes the
  # node show up as long overdue instead of passing unnoticed.
  CHECKIN=$(date +"%s" -d "${EXPIRATION}") || CHECKIN=0
  DIFFERENCE=$(( NOW - CHECKIN ))

  # Count hosts and generate some output strings based on the status.
  if (( DIFFERENCE < INTERVAL_WARNING )); then
    o_count=$(( o_count + 1 ))
  elif is_ignored_host "${node}"; then
    # Lagging, but explicitly excluded from alerting.
    i_count=$(( i_count + 1 ))
  elif (( DIFFERENCE > INTERVAL_CRITICAL )); then
    e_count=$(( e_count + 1 ))
    e_string="${e_string} ${node}"
  else
    w_count=$(( w_count + 1 ))
    w_string="${w_string} ${node}"
  fi
done < <(${PUPPET} ${COMMAND} ${PARAMS} | awk '/^\+/ {print $2}' | tr -d '"')

# Assemble the summary shown to the user: critical nodes first, then warning
# nodes, then the number of ignored nodes, and finally the ok count, which is
# always present. The "## " expansion drops the leading space of each list.
if [[ -n "${e_string}" ]]; then
  s_string+=" ${e_count} critical (${e_string## });"
fi
if [[ -n "${w_string}" ]]; then
  s_string+=" ${w_count} warning (${w_string## });"
fi
if (( i_count > 0 )); then
  s_string+=" ${i_count} ignored;"
fi
s_string+=" ${o_count} ok."

# Derive the status word and return value from the counters. The warning
# state is applied first and then overridden by critical, so critical always
# wins; with neither, status and ret keep their earlier values.
if (( w_count > 0 )); then
  status="WARNING"
  ret=${STATUS_WARNING}
fi
if (( e_count > 0 )); then
  status="CRITICAL"
  ret=${STATUS_CRITICAL}
fi

# Optionally add perfdata to output.
# Nagios perfdata is a space separated list of label=value pairs after the
# "|"; separating the pairs with commas makes the values unparsable for
# perfdata consumers, so plain spaces are used.
if [[ "${PERFDATA}" == "yes" ]]; then
  p_string=" | ok=${o_count} err=${e_count} warn=${w_count} ign=${i_count}"
fi


# Print the status word, the summary of lagging hosts and the optional
# perfdata as one line without a trailing newline, then exit with the
# return value.
printf '%s' "${status}:${s_string}${p_string}"
exit ${ret}