blob: 72c0eb536debdaa79910983286f37d447c77c3cd (
plain)
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
|
#!/bin/bash
#
# Nagios checking script that whines if a node hasn't checked in with the
# master for two hours or more and sets a critical status if the node hasn't
# checked in for a day or longer. You can of course tweak those thresholds
# (INTERVAL_WARNING and INTERVAL_CRITICAL) if you want.
# Configuration defaults. Everything in this section can be overridden from
# /etc/default/check_puppetmaster (sourced at the end of the section).

# Directory holding one <node>.yaml file per node.
YAMLPATH=/var/lib/puppet/yaml/node

# Exit codes handed back to Nagios.
STATUS_OK=0
STATUS_WARNING=1
STATUS_CRITICAL=2
STATUS_UNKNOWN=3

# Age thresholds in seconds: two hours for a warning, one day for critical.
INTERVAL_WARNING=$((2 * 60 * 60))
INTERVAL_CRITICAL=$((24 * 60 * 60))

# Options passed to puppetca when listing certificates.
PARAMS="-la"

# Space separated list of hostnames whose lag should not raise an alert,
# e.g. laptops that are not switched on every day and thus don't sync.
IGNORE_HOSTS=""

# Let a local defaults file override any of the settings above.
if [[ -f /etc/default/check_puppetmaster ]]; then
  source /etc/default/check_puppetmaster
fi
# Early exit if no read access to the yaml files.
# YAMLPATH is quoted on purpose: unquoted, an empty value collapses the test
# to `[ ! -r ]`, which is false, so the guard would be skipped; a path that
# contains whitespace would make the test itself fail with a syntax error.
if [ ! -r "${YAMLPATH}" ]; then
  echo "UNKNOWN: Cannot access ${YAMLPATH}"
  exit ${STATUS_UNKNOWN}
fi
# Bunch of internal vars used for status info output.
status="OK"
ret=${STATUS_OK}
# Per-category host counters: ignored, ok, warning, critical (error).
i_count=0
o_count=0
w_count=0
w_string=""
e_count=0
e_string=""
# The summary string is only ever appended to further down, so start it out
# empty; otherwise a value inherited from the environment or from the sourced
# config file would leak into the plugin output.
s_string=""
# Current time, in seconds since the epoch.
NOW=$(date +"%s")
# The meat.
# Get all hostnames associated with active certificates, and check the time
# each of these last checked in with the server. Do this by converting the
# yaml file expiration datestamp to epoch format and subtracting it from now.

# Succeeds when host $1 is an entry of the whitespace separated IGNORE_HOSTS
# list. The comparison is an exact, whole-entry string match, so "web1" is
# not ignored merely because "web10.example.com" is listed, and dots in the
# hostname are not treated as regex wildcards.
_is_ignored_host() {
  # Turn tabs/newlines into spaces and pad both ends so that every entry is
  # delimited by a space on each side.
  local list=${IGNORE_HOSTS//[$'\t\n']/ }
  list=" ${list} "
  [[ -n "$1" && "${list}" == *" $1 "* ]]
}

# Prints the epoch time of the first "expiration" stamp found in the node
# yaml file $1. Prints 0 when the file cannot be read, carries no stamp or
# the stamp cannot be parsed, so such a node counts as never having checked
# in (an empty stamp handed to `date -d` would resolve to today's midnight
# and make the node look recent).
_checkin_epoch() {
  local stamp epoch
  # Fields 2 and 3 of the expiration line are the date and the time.
  # stderr is dropped on purpose: a missing file is reported through the 0.
  stamp=$(awk '/expiration/ {printf("%s %s", $2, $3); exit}' "$1" 2>/dev/null)
  if [[ "${stamp}" == *[![:space:]]* ]] \
    && epoch=$(date +"%s" -d "${stamp}" 2>/dev/null); then
    printf '%s\n' "${epoch}"
  else
    printf '0\n'
  fi
}

# PARAMS is deliberately left unquoted so it can carry several options.
while IFS= read -r node; do
  [[ -n "${node}" ]] || continue
  CHECKIN=$(_checkin_epoch "${YAMLPATH}/${node}.yaml")
  DIFFERENCE=$((NOW - CHECKIN))
  # Count hosts and generate some output strings based on the status.
  if ((DIFFERENCE < INTERVAL_WARNING)); then
    o_count=$((o_count + 1))
  elif _is_ignored_host "${node}"; then
    # There is an issue, but this host is on the ignore list.
    i_count=$((i_count + 1))
  elif ((DIFFERENCE > INTERVAL_CRITICAL)); then
    e_count=$((e_count + 1))
    e_string="${e_string} ${node}"
  else
    w_count=$((w_count + 1))
    w_string="${w_string} ${node}"
  fi
done < <(/usr/sbin/puppetca ${PARAMS} | awk '/^\+/ {print $2}' | tr -d '"')
# Assemble the human readable summary, most severe category first. Each
# host list starts with a separator space, which is stripped for display.
[[ -z "${e_string}" ]] || s_string+=" ${e_count} critical (${e_string## });"
[[ -z "${w_string}" ]] || s_string+=" ${w_count} warning (${w_string## });"
if ((i_count > 0)); then
  s_string+=" ${i_count} ignored;"
fi
# The number of healthy hosts is always reported.
s_string+=" ${o_count} ok."
# Map the counters onto a state and exit code: any critical host outranks
# warnings; with neither, the preset OK status and return value stay as is.
if ((e_count > 0)); then
  status="CRITICAL"
  ret=${STATUS_CRITICAL}
elif ((w_count > 0)); then
  status="WARNING"
  ret=${STATUS_WARNING}
fi
# Print the single status line (no trailing newline) naming the lagging
# hosts, then hand the state back through the exit code.
printf '%s' "${status}:${s_string}"
exit $ret
|