diff options
Diffstat (limited to 'puppet/modules/site_check_mk')
39 files changed, 1501 insertions, 0 deletions
diff --git a/puppet/modules/site_check_mk/files/agent/local_checks/all_hosts/run_node_tests.sh b/puppet/modules/site_check_mk/files/agent/local_checks/all_hosts/run_node_tests.sh new file mode 100644 index 00000000..1dd0afc9 --- /dev/null +++ b/puppet/modules/site_check_mk/files/agent/local_checks/all_hosts/run_node_tests.sh @@ -0,0 +1,5 @@ +#!/bin/sh +# +# runs node tests + +/srv/leap/bin/run_tests --checkmk diff --git a/puppet/modules/site_check_mk/files/agent/local_checks/couchdb/leap_couch_stats.sh b/puppet/modules/site_check_mk/files/agent/local_checks/couchdb/leap_couch_stats.sh new file mode 100755 index 00000000..c7477b18 --- /dev/null +++ b/puppet/modules/site_check_mk/files/agent/local_checks/couchdb/leap_couch_stats.sh @@ -0,0 +1,122 @@ +#!/bin/bash +# +# todo: +# - thresholds +# - couch response time +# - make CURL/URL/DBLIST_EXCLUDE vars configurable +# - move load_nagios_utils() to helper library so we can use it from multiple scripts + +start_time=$(date +%s.%N) + +CURL='curl -s --netrc-file /etc/couchdb/couchdb.netrc' +URL='http://127.0.0.1:5984' +TMPFILE=$(mktemp) +DBLIST_EXCLUDE='(user-|sessions_|tokens_|_replicator|_users)' +PREFIX='Couchdb_' + + +load_nagios_utils () { + # load the nagios utils + # in debian, the package nagios-plugins-common installs utils.sh to /usr/lib/nagios/plugins/utils.sh + utilsfn= + for d in $PROGPATH /usr/lib/nagios/plugins /usr/lib64/nagios/plugins /usr/local/nagios/libexec /opt/nagios-plugins/libexec . ; do + if [ -f "$d/utils.sh" ]; then + utilsfn=$d/utils.sh; + fi + done + if [ "$utilsfn" = "" ]; then + echo "UNKNOWN - cannot find utils.sh (part of nagios plugins)"; + exit 3; + fi + . "$utilsfn"; + STATE[$STATE_OK]='OK' + STATE[$STATE_WARNING]='Warning' + STATE[$STATE_CRITICAL]='Critical' + STATE[$STATE_UNKNOWN]='Unknown' + STATE[$STATE_DEPENDENT]='Dependend' +} + +get_global_stats_perf () { + trap "localexit=3" ERR + local localexit db_count + localexit=0 + + # get a list of all dbs + $CURL -X GET $URL/_all_dbs | json_pp | egrep -v '(\[|\])' > $TMPFILE + + db_count=$( wc -l < $TMPFILE) + excluded_db_count=$( egrep -c "$DBLIST_EXCLUDE" $TMPFILE ) + + echo "db_count=$db_count|excluded_db_count=$excluded_db_count" + return ${localexit} +} + +db_stats () { + trap "localexit=3" ERR + local db db_stats doc_count del_doc_count localexit + localexit=0 + + db="$1" + name="$2" + + if [ -z "$name" ] + then + name="$db" + fi + + perf="$perf|${db}_docs=$( $CURL -s -X GET ${URL}/$db | json_pp |grep 'doc_count' | sed 's/[^0-9]//g' )" + db_stats=$( $CURL -s -X GET ${URL}/$db | json_pp ) + + doc_count=$( echo "$db_stats" | grep 'doc_count' | grep -v 'deleted_doc_count' | sed 's/[^0-9]//g' ) + del_doc_count=$( echo "$db_stats" | grep 'doc_del_count' | sed 's/[^0-9]//g' ) + + # don't divide by zero + if [ $del_doc_count -eq 0 ] + then + del_doc_perc=0 + else + del_doc_perc=$(( del_doc_count * 100 / doc_count )) + fi + + bytes=$( echo "$db_stats" | grep disk_size | sed 's/[^0-9]//g' ) + disk_size=$( echo "scale = 2; $bytes / 1024 / 1024" | bc -l ) + + echo -n "${localexit} ${PREFIX}${name}_database ${name}_docs=$doc_count|${name}_deleted_docs=$del_doc_count|${name}_deleted_docs_percentage=${del_doc_perc}%" + printf "|${name}_disksize_mb=%02.2fmb ${STATE[localexit]}: database $name\n" "$disk_size" + + return ${localexit} +} + +# main + +load_nagios_utils + +# per-db stats +# get a list of all dbs +$CURL -X GET $URL/_all_dbs | json_pp | egrep -v '(\[|\])' > $TMPFILE + +# get list of dbs to check +dbs=$( egrep -v "${DBLIST_EXCLUDE}" $TMPFILE | tr -d '\n"' | sed 's/,/ /g' ) + +for db in $dbs +do + db_stats "$db" +done + +# special handling for rotated dbs +suffix=$(($(date +'%s') / (60*60*24*30))) +db_stats "sessions_${suffix}" "sessions" +db_stats "tokens_${suffix}" "tokens" + + +# show global couchdb stats +global_stats_perf=$(get_global_stats_perf) +exitcode=$? + +end_time=$(date +%s.%N) +duration=$( echo "scale = 2; $end_time - $start_time" | bc -l ) + +printf "${exitcode} ${PREFIX}global_stats ${global_stats_perf}|script_duration=%02.2fs ${STATE[exitcode]}: global couchdb status\n" "$duration" + +rm "$TMPFILE" + diff --git a/puppet/modules/site_check_mk/files/agent/local_checks/mx/check_leap_mx.sh b/puppet/modules/site_check_mk/files/agent/local_checks/mx/check_leap_mx.sh new file mode 100755 index 00000000..4711e247 --- /dev/null +++ b/puppet/modules/site_check_mk/files/agent/local_checks/mx/check_leap_mx.sh @@ -0,0 +1,33 @@ +#!/bin/bash + + +WARN=1 +CRIT=5 + +# in minutes +MAXAGE=10 + +STATUS[0]='OK' +STATUS[1]='Warning' +STATUS[2]='Critical' +CHECKNAME='Leap_MX_Queue' + +WATCHDIR='/var/mail/leap-mx/Maildir/new/' + + +total=`find $WATCHDIR -type f -mmin +$MAXAGE | wc -l` + +if [ $total -lt $WARN ] +then + exitcode=0 +else + if [ $total -le $CRIT ] + then + exitcode=1 + else + exitcode=2 + fi +fi + +echo "${exitcode} ${CHECKNAME} stale_files=${total} ${STATUS[exitcode]}: ${total} stale files (>=${MAXAGE} min) in ${WATCHDIR}." + diff --git a/puppet/modules/site_check_mk/files/agent/logwatch/bigcouch.cfg b/puppet/modules/site_check_mk/files/agent/logwatch/bigcouch.cfg new file mode 100644 index 00000000..0f378a5a --- /dev/null +++ b/puppet/modules/site_check_mk/files/agent/logwatch/bigcouch.cfg @@ -0,0 +1,28 @@ +/opt/bigcouch/var/log/bigcouch.log nocontext=1 +# ignore requests that are fine + I undefined - -.*200$ + I undefined - -.*201$ + I 127.0.0.1 undefined.* ok + I 127.0.0.1 localhost:5984 .* ok + # https://leap.se/code/issues/5246 + I Shutting down group server + # ignore bigcouch conflict errors + I Error in process.*{{nocatch,conflict} + # ignore "Uncaught error in HTTP request: {exit, normal}" error + # it's suppressed in later versions of bigcouch anhow + # see https://leap.se/code/issues/5226 + I Uncaught error in HTTP request: {exit,normal} + I Uncaught error in HTTP request: {exit, + # Ignore rexi_EXIT bigcouch error (Bug #6512) + I Error in process <[0-9.]+> on node .* with exit value: {{rexi_EXIT,{(killed|noproc|shutdown),\[{couch_db,collect_results + # Ignore "Generic server terminating" bigcouch message (Feature #6544) + I Generic server <.*> terminating + I {error_report,<.*>, + I {error_info, + C Uncaught error in HTTP request: {error, + C Response abnormally terminated: {nodedown, + C rexi_DOWN,noproc + C rexi_DOWN,noconnection + C error + C Connection attempt from disallowed node + W Apache CouchDB has started diff --git a/puppet/modules/site_check_mk/files/agent/logwatch/leap_mx.cfg b/puppet/modules/site_check_mk/files/agent/logwatch/leap_mx.cfg new file mode 100644 index 00000000..166d0230 --- /dev/null +++ b/puppet/modules/site_check_mk/files/agent/logwatch/leap_mx.cfg @@ -0,0 +1,4 @@ +/var/log/leap/mx.log + W Don't know how to deliver mail + W No public key, stopping the processing chain + diff --git a/puppet/modules/site_check_mk/files/agent/logwatch/logwatch.cfg b/puppet/modules/site_check_mk/files/agent/logwatch/logwatch.cfg new file mode 100644 index 00000000..4f16d1bd --- /dev/null +++ b/puppet/modules/site_check_mk/files/agent/logwatch/logwatch.cfg @@ -0,0 +1,31 @@ +# This file is managed by Puppet. DO NOT EDIT. + +# logwatch.cfg +# This file configures mk_logwatch. Define your logfiles +# and patterns to be looked for here. + +# Name one or more logfiles +/var/log/messages +# Patterns are indented with one space are prefixed with: +# C: Critical messages +# W: Warning messages +# I: ignore these lines (OK) +# The first match decided. Lines that do not match any pattern +# are ignored + C Fail event detected on md device + I mdadm.*: Rebuild.*event detected + W mdadm\[ + W ata.*hard resetting link + W ata.*soft reset failed (.*FIS failed) + W device-mapper: thin:.*reached low water mark + C device-mapper: thin:.*no free space + +/var/log/auth.log + W sshd.*Corrupted MAC on input + +/var/log/kern.log + C panic + C Oops + W generic protection rip + W .*Unrecovered read error - auto reallocate failed + diff --git a/puppet/modules/site_check_mk/files/agent/logwatch/openvpn.cfg b/puppet/modules/site_check_mk/files/agent/logwatch/openvpn.cfg new file mode 100644 index 00000000..d99dcde9 --- /dev/null +++ b/puppet/modules/site_check_mk/files/agent/logwatch/openvpn.cfg @@ -0,0 +1,19 @@ +/var/log/leap/openvpn.log +# ignore openvpn TLS initialization errors when clients +# suddenly hangup before properly establishing +# a tls connection + I ovpn-.*TLS Error: Unroutable control packet received from + I ovpn-.*TLS Error: TLS key negotiation failed to occur within 60 seconds \(check your network connectivity\) + I ovpn-.*TLS Error: TLS handshake failed + I ovpn-.*TLS Error: TLS object -> incoming plaintext read error + I ovpn-.*Fatal TLS error \(check_tls_errors_co\), restarting + I ovpn-.*TLS_ERROR: BIO read tls_read_plaintext error: error:140890B2:SSL routines:SSL3_GET_CLIENT_CERTIFICATE:no certificate + I ovpn-.*TLS_ERROR: BIO read tls_read_plaintext error: error:140890C7:SSL routines:SSL3_GET_CLIENT_CERTIFICATE:peer did not return a certificate + I ovpn-.*TLS Error: unknown opcode received from + I ovpn-.*Authenticate/Decrypt packet error: packet HMAC authentication failed + I ovpn-.*TLS Error: reading acknowledgement record from packet + I ovpn-.*TLS Error: session-id not found in packet from + + I ovpn-.*SIGUSR1\[soft,tls-error\] received, client-instance restarting + I ovpn-.*VERIFY ERROR: depth=0, error=certificate has expired + diff --git a/puppet/modules/site_check_mk/files/agent/logwatch/soledad.cfg b/puppet/modules/site_check_mk/files/agent/logwatch/soledad.cfg new file mode 100644 index 00000000..3af5045b --- /dev/null +++ b/puppet/modules/site_check_mk/files/agent/logwatch/soledad.cfg @@ -0,0 +1,6 @@ +/var/log/soledad.log + C WSGI application error + C Error + C error +# Removed this line because we determined it was better to ignore it (#6566) +# W Timing out client: diff --git a/puppet/modules/site_check_mk/files/agent/logwatch/stunnel.cfg b/puppet/modules/site_check_mk/files/agent/logwatch/stunnel.cfg new file mode 100644 index 00000000..b1e6cf2f --- /dev/null +++ b/puppet/modules/site_check_mk/files/agent/logwatch/stunnel.cfg @@ -0,0 +1,10 @@ +/var/log/leap/stunnel.log +# check for stunnel failures +# +# these are temporary failures and happen very often, so we +# ignore them until we tuned stunnel timeouts/logging, +# see https://leap.se/code/issues/5218 + I stunnel:.*Connection reset by peer + I stunnel:.*Peer suddenly disconnected + I stunnel:.*Connection refused + diff --git a/puppet/modules/site_check_mk/files/agent/logwatch/syslog/bigcouch.cfg b/puppet/modules/site_check_mk/files/agent/logwatch/syslog/bigcouch.cfg new file mode 100644 index 00000000..f53f0780 --- /dev/null +++ b/puppet/modules/site_check_mk/files/agent/logwatch/syslog/bigcouch.cfg @@ -0,0 +1,5 @@ +# on one-node bigcouch setups, we'll get this msg +# a lot, so we ignore it here until we fix +# https://leap.se/code/issues/5244 + I epmd: got partial packet only on file descriptor + diff --git a/puppet/modules/site_check_mk/files/agent/logwatch/syslog/couchdb.cfg b/puppet/modules/site_check_mk/files/agent/logwatch/syslog/couchdb.cfg new file mode 100644 index 00000000..5f8d5b95 --- /dev/null +++ b/puppet/modules/site_check_mk/files/agent/logwatch/syslog/couchdb.cfg @@ -0,0 +1,2 @@ + C /usr/local/bin/couch-doc-update.*failed + C /usr/local/bin/couch-doc-update.*ERROR diff --git a/puppet/modules/site_check_mk/files/agent/logwatch/syslog_header.cfg b/puppet/modules/site_check_mk/files/agent/logwatch/syslog_header.cfg new file mode 100644 index 00000000..f60d752b --- /dev/null +++ b/puppet/modules/site_check_mk/files/agent/logwatch/syslog_header.cfg @@ -0,0 +1 @@ +/var/log/syslog diff --git a/puppet/modules/site_check_mk/files/agent/logwatch/syslog_tail.cfg b/puppet/modules/site_check_mk/files/agent/logwatch/syslog_tail.cfg new file mode 100644 index 00000000..7daf0cac --- /dev/null +++ b/puppet/modules/site_check_mk/files/agent/logwatch/syslog_tail.cfg @@ -0,0 +1,21 @@ +# some general patterns + I Error: Driver 'pcspkr' is already registered, aborting... +# ignore postfix errors on lost connection (Bug #6476) + I postfix/smtpd.*SSL_accept error from.*lost connection +# ignore postfix too many errors after DATA (#6545) + I postfix/smtpd.*too many errors after DATA from + C panic + C Oops + C Error +# ignore ipv6 icmp errors for now (Bug #6540) + I kernel: .*icmpv6_send: no reply to icmp error + C error + W generic protection rip + W .*Unrecovered read error - auto reallocate failed +# 401 Unauthorized error logged by webapp and possible other +# applications + C Unauthorized +# catch abnormal termination of processes (due to segfault/fpe +# signals etc). +# see https://github.com/pixelated/pixelated-user-agent/issues/683 + C systemd.*: main process exited, code=killed, status= diff --git a/puppet/modules/site_check_mk/files/agent/logwatch/webapp.cfg b/puppet/modules/site_check_mk/files/agent/logwatch/webapp.cfg new file mode 100644 index 00000000..337d9ec6 --- /dev/null +++ b/puppet/modules/site_check_mk/files/agent/logwatch/webapp.cfg @@ -0,0 +1,8 @@ +/var/log/leap/webapp.log +# check for webapp errors + C Completed 500 +# couch connection issues + C webapp.*Could not connect to couch database messages due to 401 Unauthorized: {"error":"unauthorized","reason":"You are not a server admin."} +# ignore RoutingErrors that rails throw when it can't handle a url +# see https://leap.se/code/issues/5173 + I webapp.*ActionController::RoutingError diff --git a/puppet/modules/site_check_mk/files/agent/nagios_plugins/check_unix_open_fds.pl b/puppet/modules/site_check_mk/files/agent/nagios_plugins/check_unix_open_fds.pl new file mode 100755 index 00000000..06163d49 --- /dev/null +++ b/puppet/modules/site_check_mk/files/agent/nagios_plugins/check_unix_open_fds.pl @@ -0,0 +1,322 @@ +#!/usr/bin/perl -w + +# check_unix_open_fds Nagios Plugin +# +# TComm - Carlos Peris Pla +# +# This nagios plugin is free software, and comes with ABSOLUTELY +# NO WARRANTY. It may be used, redistributed and/or modified under +# the terms of the GNU General Public Licence (see +# http://www.fsf.org/licensing/licenses/gpl.txt). + + +# MODULE DECLARATION + +use strict; +use Nagios::Plugin; + + +# FUNCTION DECLARATION + +sub CreateNagiosManager (); +sub CheckArguments (); +sub PerformCheck (); + + +# CONSTANT DEFINITION + +use constant NAME => 'check_unix_open_fds'; +use constant VERSION => '0.1b'; +use constant USAGE => "Usage:\ncheck_unix_open_fds -w <process_threshold,application_threshold> -c <process_threshold,application_threshold>\n". + "\t\t[-V <version>]\n"; +use constant BLURB => "This plugin checks, in UNIX systems with the command lsof installed and with its SUID bit activated, the number\n". + "of file descriptors opened by an application and its processes.\n"; +use constant LICENSE => "This nagios plugin is free software, and comes with ABSOLUTELY\n". + "no WARRANTY. It may be used, redistributed and/or modified under\n". + "the terms of the GNU General Public Licence\n". + "(see http://www.fsf.org/licensing/licenses/gpl.txt).\n"; +use constant EXAMPLE => "\n\n". + "Example:\n". + "\n". + "check_unix_open_fds -a /usr/local/nagios/bin/ndo2db -w 20,75 -c 25,85\n". + "\n". + "It returns CRITICAL if number of file descriptors opened by ndo2db is higher than 85,\n". + "if not it returns WARNING if number of file descriptors opened by ndo2db is higher \n". + "than 75, if not it returns CRITICAL if number of file descriptors opened by any process\n". + "of ndo2db is higher than 25, if not it returns WARNING if number of file descriptors \n". + "opened by any process of ndo2db is higher than 20.\n". + "In other cases it returns OK if check has been performed succesfully.\n\n"; + + +# VARIABLE DEFINITION + +my $Nagios; +my $Error; +my $PluginResult; +my $PluginOutput; +my @WVRange; +my @CVRange; + + +# MAIN FUNCTION + +# Get command line arguments +$Nagios = &CreateNagiosManager(USAGE, VERSION, BLURB, LICENSE, NAME, EXAMPLE); +eval {$Nagios->getopts}; + +if (!$@) { + # Command line parsed + if (&CheckArguments($Nagios, \$Error, \@WVRange, \@CVRange)) { + # Argument checking passed + $PluginResult = &PerformCheck($Nagios, \$PluginOutput, \@WVRange, \@CVRange) + } + else { + # Error checking arguments + $PluginOutput = $Error; + $PluginResult = UNKNOWN; + } + $Nagios->nagios_exit($PluginResult,$PluginOutput); +} +else { + # Error parsing command line + $Nagios->nagios_exit(UNKNOWN,$@); +} + + + +# FUNCTION DEFINITIONS + +# Creates and configures a Nagios plugin object +# Input: strings (usage, version, blurb, license, name and example) to configure argument parsing functionality +# Return value: reference to a Nagios plugin object + +sub CreateNagiosManager() { + # Create GetOpt object + my $Nagios = Nagios::Plugin->new(usage => $_[0], version => $_[1], blurb => $_[2], license => $_[3], plugin => $_[4], extra => $_[5]); + + # Add argument units + $Nagios->add_arg(spec => 'application|a=s', + help => 'Application path for which you want to check the number of open file descriptors', + required => 1); + + # Add argument warning + $Nagios->add_arg(spec => 'warning|w=s', + help => "Warning thresholds. Format: <process_threshold,application_threshold>", + required => 1); + # Add argument critical + $Nagios->add_arg(spec => 'critical|c=s', + help => "Critical thresholds. Format: <process_threshold,application_threshold>", + required => 1); + + # Return value + return $Nagios; +} + + +# Checks argument values and sets some default values +# Input: Nagios Plugin object +# Output: reference to Error description string, Memory Unit, Swap Unit, reference to WVRange ($_[4]), reference to CVRange ($_[5]) +# Return value: True if arguments ok, false if not + +sub CheckArguments() { + my ($Nagios, $Error, $WVRange, $CVRange) = @_; + my $commas; + my $units; + my $i; + my $firstpos; + my $secondpos; + + # Check Warning thresholds list + $commas = $Nagios->opts->warning =~ tr/,//; + if ($commas !=1){ + ${$Error} = "Invalid Warning list format. One comma is expected."; + return 0; + } + else{ + $i=0; + $firstpos=0; + my $warning=$Nagios->opts->warning; + while ($warning =~ /[,]/g) { + $secondpos=pos $warning; + if ($secondpos - $firstpos==1){ + @{$WVRange}[$i] = "~:"; + } + else{ + @{$WVRange}[$i] = substr $Nagios->opts->warning, $firstpos, ($secondpos-$firstpos-1); + } + $firstpos=$secondpos; + $i++ + } + if (length($Nagios->opts->warning) - $firstpos==0){#La coma es el ultimo elemento del string + @{$WVRange}[$i] = "~:"; + } + else{ + @{$WVRange}[$i] = substr $Nagios->opts->warning, $firstpos, (length($Nagios->opts->warning)-$firstpos); + } + + if (@{$WVRange}[0] !~/^(@?(\d+|(\d+|~):(\d+)?))?$/){ + ${$Error} = "Invalid Process Warning threshold in ${$WVRange[0]}"; + return 0; + }if (@{$WVRange}[1] !~/^(@?(\d+|(\d+|~):(\d+)?))?$/){ + ${$Error} = "Invalid Application Warning threshold in ${$WVRange[1]}"; + return 0; + } + } + + # Check Critical thresholds list + $commas = $Nagios->opts->critical =~ tr/,//; + if ($commas !=1){ + ${$Error} = "Invalid Critical list format. One comma is expected."; + return 0; + } + else{ + $i=0; + $firstpos=0; + my $critical=$Nagios->opts->critical; + while ($critical =~ /[,]/g) { + $secondpos=pos $critical ; + if ($secondpos - $firstpos==1){ + @{$CVRange}[$i] = "~:"; + } + else{ + @{$CVRange}[$i] =substr $Nagios->opts->critical, $firstpos, ($secondpos-$firstpos-1); + } + $firstpos=$secondpos; + $i++ + } + if (length($Nagios->opts->critical) - $firstpos==0){#La coma es el ultimo elemento del string + @{$CVRange}[$i] = "~:"; + } + else{ + @{$CVRange}[$i] = substr $Nagios->opts->critical, $firstpos, (length($Nagios->opts->critical)-$firstpos); + } + + if (@{$CVRange}[0] !~/^(@?(\d+|(\d+|~):(\d+)?))?$/) { + ${$Error} = "Invalid Process Critical threshold in @{$CVRange}[0]"; + return 0; + } + if (@{$CVRange}[1] !~/^(@?(\d+|(\d+|~):(\d+)?))?$/) { + ${$Error} = "Invalid Application Critical threshold in @{$CVRange}[1]"; + return 0; + } + } + + return 1; +} + + +# Performs whole check: +# Input: Nagios Plugin object, reference to Plugin output string, Application, referece to WVRange, reference to CVRange +# Output: Plugin output string +# Return value: Plugin return value + +sub PerformCheck() { + my ($Nagios, $PluginOutput, $WVRange, $CVRange) = @_; + my $Application; + my @AppNameSplitted; + my $ApplicationName; + my $PsCommand; + my $PsResult; + my @PsResultLines; + my $ProcLine; + my $ProcPid; + my $LsofCommand; + my $LsofResult; + my $ProcCount = 0; + my $FDCount = 0; + my $ProcFDAvg = 0; + my $PerProcMaxFD = 0; + my $ProcOKFlag = 0; + my $ProcWarningFlag = 0; + my $ProcCriticalFlag = 0; + my $OKFlag = 0; + my $WarningFlag = 0; + my $CriticalFlag = 0; + my $LastWarningProcFDs = 0; + my $LastWarningProc = -1; + my $LastCriticalProcFDs = 0; + my $LastCriticalProc = -1; + my $ProcPluginReturnValue = UNKNOWN; + my $AppPluginReturnValue = UNKNOWN; + my $PluginReturnValue = UNKNOWN; + my $PerformanceData = ""; + my $PerfdataUnit = "FDs"; + + $Application = $Nagios->opts->application; + $PsCommand = "ps -eaf | grep $Application"; + $PsResult = `$PsCommand`; + @AppNameSplitted = split(/\//, $Application); + $ApplicationName = $AppNameSplitted[$#AppNameSplitted]; + @PsResultLines = split(/\n/, $PsResult); + if ( $#PsResultLines > 1 ) { + foreach my $Proc (split(/\n/, $PsResult)) { + if ($Proc !~ /check_unix_open_fds/ && $Proc !~ / grep /) { + $ProcCount += 1; + $ProcPid = (split(/\s+/, $Proc))[1]; + $LsofCommand = "lsof -p $ProcPid | wc -l"; + $LsofResult = `$LsofCommand`; + $LsofResult = ($LsofResult > 0 ) ? ($LsofResult - 1) : 0; + $FDCount += $LsofResult; + if ($LsofResult >= $PerProcMaxFD) { $PerProcMaxFD = $LsofResult; } + $ProcPluginReturnValue = $Nagios->check_threshold(check => $LsofResult,warning => @{$WVRange}[0],critical => @{$CVRange}[0]); + if ($ProcPluginReturnValue eq OK) { + $ProcOKFlag = 1; + } + elsif ($ProcPluginReturnValue eq WARNING) { + $ProcWarningFlag = 1; + if ($LsofResult >= $LastWarningProcFDs) { + $LastWarningProcFDs = $LsofResult; + $LastWarningProc = $ProcPid; + } + } + #if ($LsofResult >= $PCT) { + elsif ($ProcPluginReturnValue eq CRITICAL) { + $ProcCriticalFlag = 1; + if ($LsofResult >= $LastCriticalProcFDs) { + $LastCriticalProcFDs = $LsofResult; + $LastCriticalProc = $ProcPid; + } + } + } + } + if ($ProcCount) { $ProcFDAvg = int($FDCount / $ProcCount); } + $AppPluginReturnValue = $Nagios->check_threshold(check => $FDCount,warning => @{$WVRange}[1],critical => @{$CVRange}[1]); + #if ($FDCount >= $TWT) { + if ($AppPluginReturnValue eq OK) { $OKFlag = 1; } + elsif ($AppPluginReturnValue eq WARNING) { $WarningFlag = 1; } + elsif ($AppPluginReturnValue eq CRITICAL) { $CriticalFlag = 1; } + + # PluginReturnValue and PluginOutput + if ($CriticalFlag) { + $PluginReturnValue = CRITICAL; + ${$PluginOutput} .= "$ApplicationName handling $FDCount files (critical threshold set to @{$CVRange}[1])"; + } + elsif ($WarningFlag) { + $PluginReturnValue = WARNING; + ${$PluginOutput} .= "$ApplicationName handling $FDCount files (warning threshold set to @{$WVRange}[1])"; + } + elsif ($ProcCriticalFlag) { + $PluginReturnValue = CRITICAL; + ${$PluginOutput} .= "Process ID $LastCriticalProc handling $LastCriticalProcFDs files (critical threshold set to @{$CVRange}[0])"; + } + elsif ($ProcWarningFlag) { + $PluginReturnValue = WARNING; + ${$PluginOutput} .= "Process ID $LastWarningProc handling $LastWarningProcFDs files (warning threshold set to @{$WVRange}[0])"; + } + elsif ($OKFlag && $ProcOKFlag) { + $PluginReturnValue = OK; + ${$PluginOutput} .= "$ApplicationName handling $FDCount files"; + } + } + else { + ${$PluginOutput} .= "No existe la aplicacion $ApplicationName"; + } + + + $PerformanceData .= "ProcCount=$ProcCount$PerfdataUnit FDCount=$FDCount$PerfdataUnit ProcFDAvg=$ProcFDAvg$PerfdataUnit PerProcMaxFD=$PerProcMaxFD$PerfdataUnit"; + + # Output with performance data: + ${$PluginOutput} .= " | $PerformanceData"; + + return $PluginReturnValue; +} diff --git a/puppet/modules/site_check_mk/files/agent/plugins/mk_logwatch.1.2.4 b/puppet/modules/site_check_mk/files/agent/plugins/mk_logwatch.1.2.4 new file mode 100755 index 00000000..3dbca322 --- /dev/null +++ b/puppet/modules/site_check_mk/files/agent/plugins/mk_logwatch.1.2.4 @@ -0,0 +1,374 @@ +#!/usr/bin/python +# -*- encoding: utf-8; py-indent-offset: 4 -*- +# +------------------------------------------------------------------+ +# | ____ _ _ __ __ _ __ | +# | / ___| |__ ___ ___| | __ | \/ | |/ / | +# | | | | '_ \ / _ \/ __| |/ / | |\/| | ' / | +# | | |___| | | | __/ (__| < | | | | . \ | +# | \____|_| |_|\___|\___|_|\_\___|_| |_|_|\_\ | +# | | +# | Copyright Mathias Kettner 2010 mk@mathias-kettner.de | +# +------------------------------------------------------------------+ +# +# This file is part of Check_MK. +# The official homepage is at http://mathias-kettner.de/check_mk. +# +# check_mk is free software; you can redistribute it and/or modify it +# under the terms of the GNU General Public License as published by +# the Free Software Foundation in version 2. check_mk is distributed +# in the hope that it will be useful, but WITHOUT ANY WARRANTY; with- +# out even the implied warranty of MERCHANTABILITY or FITNESS FOR A +# PARTICULAR PURPOSE. See the GNU General Public License for more de- +# ails. You should have received a copy of the GNU General Public +# License along with GNU Make; see the file COPYING. If not, write +# to the Free Software Foundation, Inc., 51 Franklin St, Fifth Floor, +# Boston, MA 02110-1301 USA. + +# Call with -d for debug mode: colored output, no saving of status + +import sys, os, re, time +import glob + +if '-d' in sys.argv[1:] or '--debug' in sys.argv[1:]: + tty_red = '\033[1;31m' + tty_green = '\033[1;32m' + tty_yellow = '\033[1;33m' + tty_blue = '\033[1;34m' + tty_normal = '\033[0m' + debug = True +else: + tty_red = '' + tty_green = '' + tty_yellow = '' + tty_blue = '' + tty_normal = '' + debug = False + +# The configuration file and status file are searched +# in the directory named by the environment variable +# LOGWATCH_DIR. If that is not set, MK_CONFDIR is used. +# If that is not set either, the current directory ist +# used. +logwatch_dir = os.getenv("LOGWATCH_DIR") +if not logwatch_dir: + logwatch_dir = os.getenv("MK_CONFDIR") + if not logwatch_dir: + logwatch_dir = "." + +print "<<<logwatch>>>" + +config_filename = logwatch_dir + "/logwatch.cfg" +status_filename = logwatch_dir + "/logwatch.state" +config_dir = logwatch_dir + "/logwatch.d/*.cfg" + +def is_not_comment(line): + if line.lstrip().startswith('#') or \ + line.strip() == '': + return False + return True + +def parse_filenames(line): + return line.split() + +def parse_pattern(level, pattern): + if level not in [ 'C', 'W', 'I', 'O' ]: + raise(Exception("Invalid pattern line '%s'" % line)) + try: + compiled = re.compile(pattern) + except: + raise(Exception("Invalid regular expression in line '%s'" % line)) + return (level, compiled) + +def read_config(): + config_lines = [ line.rstrip() for line in filter(is_not_comment, file(config_filename).readlines()) ] + # Add config from a logwatch.d folder + for config_file in glob.glob(config_dir): + config_lines += [ line.rstrip() for line in filter(is_not_comment, file(config_file).readlines()) ] + + have_filenames = False + config = [] + + for line in config_lines: + rewrite = False + if line[0].isspace(): # pattern line + if not have_filenames: + raise Exception("Missing logfile names") + level, pattern = line.split(None, 1) + if level == 'A': + cont_list.append(parse_cont_pattern(pattern)) + elif level == 'R': + rewrite_list.append(pattern) + else: + level, compiled = parse_pattern(level, pattern) + cont_list = [] # List of continuation patterns + rewrite_list = [] # List of rewrite patterns + patterns.append((level, compiled, cont_list, rewrite_list)) + else: # filename line + patterns = [] + config.append((parse_filenames(line), patterns)) + have_filenames = True + return config + +def parse_cont_pattern(pattern): + try: + return int(pattern) + except: + try: + return re.compile(pattern) + except: + if debug: + raise + raise Exception("Invalid regular expression in line '%s'" % pattern) + +# structure of statusfile +# # LOGFILE OFFSET INODE +# /var/log/messages|7767698|32455445 +# /var/test/x12134.log|12345|32444355 +def read_status(): + if debug: + return {} + + status = {} + for line in file(status_filename): + # TODO: Remove variants with spaces. rsplit is + # not portable. split fails if logfilename contains + # spaces + inode = -1 + try: + parts = line.split('|') + filename = parts[0] + offset = parts[1] + if len(parts) >= 3: + inode = parts[2] + + except: + try: + filename, offset = line.rsplit(None, 1) + except: + filename, offset = line.split(None, 1) + status[filename] = int(offset), int(inode) + return status + +def save_status(status): + f = file(status_filename, "w") + for filename, (offset, inode) in status.items(): + f.write("%s|%d|%d\n" % (filename, offset, inode)) + +pushed_back_line = None +def next_line(f): + global pushed_back_line + if pushed_back_line != None: + line = pushed_back_line + pushed_back_line = None + return line + else: + try: + line = f.next() + return line + except: + return None + + +def process_logfile(logfile, patterns): + global pushed_back_line + + # Look at which file offset we have finished scanning + # the logfile last time. If we have never seen this file + # before, we set the offset to -1 + offset, prev_inode = status.get(logfile, (-1, -1)) + try: + fl = os.open(logfile, os.O_RDONLY) + inode = os.fstat(fl)[1] # 1 = st_ino + except: + if debug: + raise + print "[[[%s:cannotopen]]]" % logfile + return + + print "[[[%s]]]" % logfile + + # Seek to the current end in order to determine file size + current_end = os.lseek(fl, 0, 2) # os.SEEK_END not available in Python 2.4 + status[logfile] = current_end, inode + + # If we have never seen this file before, we just set the + # current pointer to the file end. We do not want to make + # a fuss about ancient log messages... + if offset == -1: + if not debug: + return + else: + offset = 0 + + + # If the inode of the logfile has changed it has appearently + # been started from new (logfile rotation). At least we must + # assume that. In some rare cases (restore of a backup, etc) + # we are wrong and resend old log messages + if prev_inode >= 0 and inode != prev_inode: + offset = 0 + + # Our previously stored offset is the current end -> + # no new lines in this file + if offset == current_end: + return # nothing new + + # If our offset is beyond the current end, the logfile has been + # truncated or wrapped while keeping the same inode. We assume + # that it contains all new data in that case and restart from + # offset 0. + if offset > current_end: + offset = 0 + + # now seek to offset where interesting data begins + os.lseek(fl, offset, 0) # os.SEEK_SET not available in Python 2.4 + f = os.fdopen(fl) + worst = -1 + outputtxt = "" + lines_parsed = 0 + start_time = time.time() + + while True: + line = next_line(f) + if line == None: + break # End of file + + lines_parsed += 1 + # Check if maximum number of new log messages is exceeded + if opt_maxlines != None and lines_parsed > opt_maxlines: + outputtxt += "%s Maximum number (%d) of new log messages exceeded.\n" % ( + opt_overflow, opt_maxlines) + worst = max(worst, opt_overflow_level) + os.lseek(fl, 0, 2) # Seek to end of file, skip all other messages + break + + # Check if maximum processing time (per file) is exceeded. Check only + # every 100'th line in order to save system calls + if opt_maxtime != None and lines_parsed % 100 == 10 \ + and time.time() - start_time > opt_maxtime: + outputtxt += "%s Maximum parsing time (%.1f sec) of this log file exceeded.\n" % ( + opt_overflow, opt_maxtime) + worst = max(worst, opt_overflow_level) + os.lseek(fl, 0, 2) # Seek to end of file, skip all other messages + break + + level = "." + for lev, pattern, cont_patterns, replacements in patterns: + matches = pattern.search(line[:-1]) + if matches: + level = lev + levelint = {'C': 2, 'W': 1, 'O': 0, 'I': -1, '.': -1}[lev] + worst = max(levelint, worst) + + # Check for continuation lines + for cont_pattern in cont_patterns: + if type(cont_pattern) == int: # add that many lines + for x in range(cont_pattern): + cont_line = next_line(f) + if cont_line == None: # end of file + break + line = line[:-1] + "\1" + cont_line + + else: # pattern is regex + while True: + cont_line = next_line(f) + if cont_line == None: # end of file + break + elif cont_pattern.search(cont_line[:-1]): + line = line[:-1] + "\1" + cont_line + else: + pushed_back_line = cont_line # sorry for stealing this line + break + + # Replacement + for replace in replacements: + line = replace.replace('\\0', line) + "\n" + for nr, group in enumerate(matches.groups()): + line = line.replace('\\%d' % (nr+1), group) + + break # matching rule found and executed + + color = {'C': tty_red, 'W': tty_yellow, 'O': tty_green, 'I': tty_blue, '.': ''}[level] + if debug: + line = line.replace("\1", "\nCONT:") + if level == "I": + level = "." + if opt_nocontext and level == '.': + continue + outputtxt += "%s%s %s%s\n" % (color, level, line[:-1], tty_normal) + + new_offset = os.lseek(fl, 0, 1) # os.SEEK_CUR not available in Python 2.4 + status[logfile] = new_offset, inode + + # output all lines if at least one warning, error or ok has been found + if worst > -1: + sys.stdout.write(outputtxt) + sys.stdout.flush() + +try: + config = read_config() +except Exception, e: + if debug: + raise + print "CANNOT READ CONFIG FILE: %s" % e + sys.exit(1) + +# Simply ignore errors in the status file. In case of a corrupted status file we simply begin +# with an empty status. That keeps the monitoring up and running - even if we might loose a +# message in the extreme case of a corrupted status file. +try: + status = read_status() +except Exception, e: + status = {} + + +# The filename line may contain options like 'maxlines=100' or 'maxtime=10' +for filenames, patterns in config: + # Initialize options with default values + opt_maxlines = None + opt_maxtime = None + opt_regex = None + opt_overflow = 'C' + opt_overflow_level = 2 + opt_nocontext = False + try: + options = [ o.split('=', 1) for o in filenames if '=' in o ] + for key, value in options: + if key == 'maxlines': + opt_maxlines = int(value) + elif key == 'maxtime': + opt_maxtime = float(value) + elif key == 'overflow': + if value not in [ 'C', 'I', 'W', 'O' ]: + raise Exception("Invalid value %s for overflow. Allowed are C, I, O and W" % value) + opt_overflow = value + opt_overflow_level = {'C':2, 'W':1, 'O':0, 'I':0}[value] + elif key == 'regex': + opt_regex = re.compile(value) + elif key == 'iregex': + opt_regex = re.compile(value, re.I) + elif key == 'nocontext': + opt_nocontext = True + else: + raise Exception("Invalid option %s" % key) + except Exception, e: + if debug: + raise + print "INVALID CONFIGURATION: %s" % e + sys.exit(1) + + + for glob in filenames: + if '=' in glob: + continue + logfiles = [ l.strip() for l in os.popen("ls %s 2>/dev/null" % glob).readlines() ] + if opt_regex: + logfiles = [ f for f in logfiles if opt_regex.search(f) ] + if len(logfiles) == 0: + print '[[[%s:missing]]]' % glob + else: + for logfile in logfiles: + process_logfile(logfile, patterns) + +if not debug: + save_status(status) diff --git a/puppet/modules/site_check_mk/files/extra_service_conf.mk b/puppet/modules/site_check_mk/files/extra_service_conf.mk new file mode 100644 index 00000000..c7120a96 --- /dev/null +++ b/puppet/modules/site_check_mk/files/extra_service_conf.mk @@ -0,0 +1,14 @@ +# retry 3 times before setting a service into a hard state +# and send out notification +extra_service_conf["max_check_attempts"] = [ + ("4", ALL_HOSTS , ALL_SERVICES ) +] + +# +# run check_mk_agent every 4 minutes if it terminates successfully. +# see https://leap.se/code/issues/6539 for the rationale +# +extra_service_conf["normal_check_interval"] = [ + ("4", ALL_HOSTS , "Check_MK" ) +] + diff --git a/puppet/modules/site_check_mk/files/ignored_services.mk b/puppet/modules/site_check_mk/files/ignored_services.mk new file mode 100644 index 00000000..35dc4433 --- /dev/null +++ b/puppet/modules/site_check_mk/files/ignored_services.mk @@ -0,0 +1,3 @@ +ignored_services = [ + ( ALL_HOSTS, [ "NTP Time" ] ) +] diff --git a/puppet/modules/site_check_mk/manifests/agent.pp b/puppet/modules/site_check_mk/manifests/agent.pp new file mode 100644 index 00000000..b95d5d64 --- /dev/null +++ b/puppet/modules/site_check_mk/manifests/agent.pp @@ -0,0 +1,35 @@ +# installs check-mk agent +class site_check_mk::agent { + + $ssh_hash = hiera('ssh') + $pubkey = $ssh_hash['authorized_keys']['monitor']['key'] + $type = $ssh_hash['authorized_keys']['monitor']['type'] + + + # /usr/bin/mk-job depends on /usr/bin/time + ensure_packages('time') + + class { 'site_apt::preferences::check_mk': } -> + + class { 'check_mk::agent': + agent_package_name => 'check-mk-agent', + agent_logwatch_package_name => 'check-mk-agent-logwatch', + method => 'ssh', + authdir => '/root/.ssh', + authfile => 'authorized_keys', + register_agent => false, + require => Package['time'] + } -> + + class { 'site_check_mk::agent::mrpe': } -> + class { 'site_check_mk::agent::logwatch': } -> + + file { + [ '/srv/leap/nagios', '/srv/leap/nagios/plugins' ]: + ensure => directory; + '/usr/lib/check_mk_agent/local/run_node_tests.sh': + source => 'puppet:///modules/site_check_mk/agent/local_checks/all_hosts/run_node_tests.sh', + mode => '0755'; + } + +} diff --git a/puppet/modules/site_check_mk/manifests/agent/couchdb.pp b/puppet/modules/site_check_mk/manifests/agent/couchdb.pp new file mode 100644 index 00000000..1554fd3c --- /dev/null +++ b/puppet/modules/site_check_mk/manifests/agent/couchdb.pp @@ -0,0 +1,34 @@ +# configure logwatch and nagios checks for couchdb (both bigcouch and plain +# couchdb installations) +class site_check_mk::agent::couchdb { + + concat::fragment { 'syslog_couchdb': + source => 'puppet:///modules/site_check_mk/agent/logwatch/syslog/couchdb.cfg', + target => '/etc/check_mk/logwatch.d/syslog.cfg', + order => '02'; + } + + # check different couchdb stats + file { '/usr/lib/check_mk_agent/local/leap_couch_stats.sh': + source => 'puppet:///modules/site_check_mk/agent/local_checks/couchdb/leap_couch_stats.sh', + mode => '0755', + require => Package['check_mk-agent'] + } + + # check open files for bigcouch proc + include site_check_mk::agent::package::perl_plugin + file { '/srv/leap/nagios/plugins/check_unix_open_fds.pl': + source => 'puppet:///modules/site_check_mk/agent/nagios_plugins/check_unix_open_fds.pl', + mode => '0755' + } + augeas { + 'Couchdb_open_files': + incl => '/etc/check_mk/mrpe.cfg', + lens => 'Spacevars.lns', + changes => [ + 'rm /files/etc/check_mk/mrpe.cfg/Couchdb_open_files', + 'set Couchdb_open_files \'/srv/leap/nagios/plugins/check_unix_open_fds.pl -a beam -w 28672,28672 -c 30720,30720\'' ], + require => File['/etc/check_mk/mrpe.cfg']; + } + +} diff --git a/puppet/modules/site_check_mk/manifests/agent/couchdb/bigcouch.pp b/puppet/modules/site_check_mk/manifests/agent/couchdb/bigcouch.pp new file mode 100644 index 00000000..82c3ac72 --- /dev/null +++ b/puppet/modules/site_check_mk/manifests/agent/couchdb/bigcouch.pp @@ -0,0 +1,49 @@ +# configure logwatch and nagios checks for bigcouch +class site_check_mk::agent::couchdb::bigcouch { + + # watch bigcouch logs + # currently disabled because bigcouch is too noisy + # see https://leap.se/code/issues/7375 for more details + # and site_config::remove_files for removing leftovers + #file { '/etc/check_mk/logwatch.d/bigcouch.cfg': + # source => 'puppet:///modules/site_check_mk/agent/logwatch/bigcouch.cfg', + #} + + # check syslog msg from: + # - empd + # - /usr/local/bin/couch-doc-update + concat::fragment { 'syslog_bigcouch': + source => 'puppet:///modules/site_check_mk/agent/logwatch/syslog/bigcouch.cfg', + target => '/etc/check_mk/logwatch.d/syslog.cfg', + order => '02'; + } + + # check bigcouch processes + augeas { + 'Bigcouch_epmd_procs': + incl => '/etc/check_mk/mrpe.cfg', + lens => 'Spacevars.lns', + changes => [ + 'rm /files/etc/check_mk/mrpe.cfg/Bigcouch_epmd_procs', + 'set Bigcouch_epmd_procs \'/usr/lib/nagios/plugins/check_procs -w 1:1 -c 1:1 -a /opt/bigcouch/erts-5.9.1/bin/epmd\'' ], + require => File['/etc/check_mk/mrpe.cfg']; + 'Bigcouch_beam_procs': + incl => '/etc/check_mk/mrpe.cfg', + lens => 'Spacevars.lns', + changes => [ + 'rm /files/etc/check_mk/mrpe.cfg/Bigcouch_beam_procs', + 'set Bigcouch_beam_procs \'/usr/lib/nagios/plugins/check_procs -w 1:1 -c 1:1 -a /opt/bigcouch/erts-5.9.1/bin/beam\'' ], + require => File['/etc/check_mk/mrpe.cfg']; + } + + augeas { + 'Bigcouch_open_files': + incl => '/etc/check_mk/mrpe.cfg', + lens => 'Spacevars.lns', + changes => [ + 'rm /files/etc/check_mk/mrpe.cfg/Bigcouch_open_files', + 'set Bigcouch_open_files \'/srv/leap/nagios/plugins/check_unix_open_fds.pl -a beam -w 28672,28672 -c 30720,30720\'' ], + require => File['/etc/check_mk/mrpe.cfg']; + } + +} diff --git a/puppet/modules/site_check_mk/manifests/agent/couchdb/plain.pp b/puppet/modules/site_check_mk/manifests/agent/couchdb/plain.pp new file mode 100644 index 00000000..3ec2267b --- /dev/null +++ b/puppet/modules/site_check_mk/manifests/agent/couchdb/plain.pp @@ -0,0 +1,23 @@ +# configure logwatch and nagios checks for plain single couchdb master +class site_check_mk::agent::couchdb::plain { + + # remove bigcouch leftovers + augeas { + 'Bigcouch_epmd_procs': + incl => '/etc/check_mk/mrpe.cfg', + lens => 'Spacevars.lns', + changes => 'rm /files/etc/check_mk/mrpe.cfg/Bigcouch_epmd_procs', + require => File['/etc/check_mk/mrpe.cfg']; + 'Bigcouch_beam_procs': + incl => '/etc/check_mk/mrpe.cfg', + lens => 'Spacevars.lns', + changes => 'rm /files/etc/check_mk/mrpe.cfg/Bigcouch_beam_procs', + require => File['/etc/check_mk/mrpe.cfg']; + 'Bigcouch_open_files': + incl => '/etc/check_mk/mrpe.cfg', + lens => 'Spacevars.lns', + changes => 'rm /files/etc/check_mk/mrpe.cfg/Bigcouch_open_files', + require => File['/etc/check_mk/mrpe.cfg']; + } + +} diff --git a/puppet/modules/site_check_mk/manifests/agent/haproxy.pp b/puppet/modules/site_check_mk/manifests/agent/haproxy.pp new file mode 100644 index 00000000..6d52efba --- /dev/null +++ b/puppet/modules/site_check_mk/manifests/agent/haproxy.pp @@ -0,0 +1,15 @@ +class site_check_mk::agent::haproxy { + + include site_check_mk::agent::package::nagios_plugins_contrib + + # local nagios plugin checks via mrpe + augeas { 'haproxy': + incl => '/etc/check_mk/mrpe.cfg', + lens => 'Spacevars.lns', + changes => [ + 'rm /files/etc/check_mk/mrpe.cfg/Haproxy', + 'set Haproxy \'/usr/lib/nagios/plugins/check_haproxy -u "http://localhost:8000/haproxy;csv"\'' ], + require => File['/etc/check_mk/mrpe.cfg']; + } + +} diff --git a/puppet/modules/site_check_mk/manifests/agent/haveged.pp b/puppet/modules/site_check_mk/manifests/agent/haveged.pp new file mode 100644 index 00000000..cacbea8c --- /dev/null +++ b/puppet/modules/site_check_mk/manifests/agent/haveged.pp @@ -0,0 +1,15 @@ +class site_check_mk::agent::haveged { + +# check haveged process + augeas { + 'haveged_proc': + incl => '/etc/check_mk/mrpe.cfg', + lens => 'Spacevars.lns', + changes => [ + 'rm /files/etc/check_mk/mrpe.cfg/haveged_proc', + 'set haveged_proc \'/usr/lib/nagios/plugins/check_procs -w 1:1 -c 1:1 -a /usr/sbin/haveged\'' ], + require => File['/etc/check_mk/mrpe.cfg']; + + } + +} diff --git a/puppet/modules/site_check_mk/manifests/agent/logwatch.pp b/puppet/modules/site_check_mk/manifests/agent/logwatch.pp new file mode 100644 index 00000000..423cace2 --- /dev/null +++ b/puppet/modules/site_check_mk/manifests/agent/logwatch.pp @@ -0,0 +1,36 @@ +class site_check_mk::agent::logwatch { + # Deploy mk_logwatch 1.2.4 so we can split the config + # into multiple config files in /etc/check_mk/logwatch.d + # see https://leap.se/code/issues/5135 + + file { '/usr/lib/check_mk_agent/plugins/mk_logwatch': + source => 'puppet:///modules/site_check_mk/agent/plugins/mk_logwatch.1.2.4', + mode => '0755', + require => Package['check-mk-agent-logwatch'] + } + + # only config files that watch a distinct logfile should go in logwatch.d/ + file { '/etc/check_mk/logwatch.d': + ensure => directory, + recurse => true, + purge => true, + require => Package['check-mk-agent-logwatch'] + } + + # service that share a common logfile (i.e. /var/log/syslog) need to get + # concanated in one file, otherwise the last file sourced will override + # the config before + # see mk_logwatch: "logwatch.cfg overwrites config files in logwatch.d", + # https://leap.se/code/issues/5155 + + # first, we need to deploy a custom logwatch.cfg that doesn't include + # a section about /var/log/syslog + + file { '/etc/check_mk/logwatch.cfg': + source => 'puppet:///modules/site_check_mk/agent/logwatch/logwatch.cfg', + require => Package['check_mk-agent-logwatch'] + } + + include concat::setup + include site_check_mk::agent::logwatch::syslog +} diff --git a/puppet/modules/site_check_mk/manifests/agent/logwatch/syslog.pp b/puppet/modules/site_check_mk/manifests/agent/logwatch/syslog.pp new file mode 100644 index 00000000..c927780d --- /dev/null +++ b/puppet/modules/site_check_mk/manifests/agent/logwatch/syslog.pp @@ -0,0 +1,18 @@ +class site_check_mk::agent::logwatch::syslog { + + concat { '/etc/check_mk/logwatch.d/syslog.cfg': + warn => true + } + + concat::fragment { 'syslog_header': + source => 'puppet:///modules/site_check_mk/agent/logwatch/syslog_header.cfg', + target => '/etc/check_mk/logwatch.d/syslog.cfg', + order => '01'; + } + concat::fragment { 'syslog_tail': + source => 'puppet:///modules/site_check_mk/agent/logwatch/syslog_tail.cfg', + target => '/etc/check_mk/logwatch.d/syslog.cfg', + order => '99'; + } + +} diff --git a/puppet/modules/site_check_mk/manifests/agent/mrpe.pp b/puppet/modules/site_check_mk/manifests/agent/mrpe.pp new file mode 100644 index 00000000..5e1f087a --- /dev/null +++ b/puppet/modules/site_check_mk/manifests/agent/mrpe.pp @@ -0,0 +1,24 @@ +class site_check_mk::agent::mrpe { + # check_mk can use standard nagios plugins using + # a wrapper called mrpe + # see http://mathias-kettner.de/checkmk_mrpe.html + + package { 'nagios-plugins-basic': + ensure => latest, + } + + file { '/etc/check_mk/mrpe.cfg': + ensure => present, + require => Package['check-mk-agent'] + } -> + + augeas { + 'Apt': + incl => '/etc/check_mk/mrpe.cfg', + lens => 'Spacevars.lns', + changes => [ + 'rm /files/etc/check_mk/mrpe.cfg/APT', + 'set APT \'/usr/lib/nagios/plugins/check_apt\'' ]; + } + +} diff --git a/puppet/modules/site_check_mk/manifests/agent/mx.pp b/puppet/modules/site_check_mk/manifests/agent/mx.pp new file mode 100644 index 00000000..20cbcade --- /dev/null +++ b/puppet/modules/site_check_mk/manifests/agent/mx.pp @@ -0,0 +1,27 @@ +# check check_mk agent checks for mx service +class site_check_mk::agent::mx { + + # watch logs + file { '/etc/check_mk/logwatch.d/leap_mx.cfg': + source => 'puppet:///modules/site_check_mk/agent/logwatch/leap_mx.cfg', + } + + # local nagios plugin checks via mrpe + # removed because leap_cli integrates a check for running mx procs already, + # which is also integrated into nagios (called "Mx/Are_MX_daemons_running") + augeas { + 'Leap_MX_Procs': + incl => '/etc/check_mk/mrpe.cfg', + lens => 'Spacevars.lns', + changes => 'rm /files/etc/check_mk/mrpe.cfg/Leap_MX_Procs', + require => File['/etc/check_mk/mrpe.cfg']; + } + + # check stale files in queue dir + file { '/usr/lib/check_mk_agent/local/check_leap_mx.sh': + source => 'puppet:///modules/site_check_mk/agent/local_checks/mx/check_leap_mx.sh', + mode => '0755', + require => Package['check_mk-agent'] + } + +} diff --git a/puppet/modules/site_check_mk/manifests/agent/openvpn.pp b/puppet/modules/site_check_mk/manifests/agent/openvpn.pp new file mode 100644 index 00000000..0596a497 --- /dev/null +++ b/puppet/modules/site_check_mk/manifests/agent/openvpn.pp @@ -0,0 +1,10 @@ +class site_check_mk::agent::openvpn { + + # check syslog + concat::fragment { 'syslog_openpvn': + source => 'puppet:///modules/site_check_mk/agent/logwatch/openvpn.cfg', + target => '/etc/check_mk/logwatch.d/syslog.cfg', + order => '02'; + } + +} diff --git a/puppet/modules/site_check_mk/manifests/agent/package/nagios_plugins_contrib.pp b/puppet/modules/site_check_mk/manifests/agent/package/nagios_plugins_contrib.pp new file mode 100644 index 00000000..95a60d17 --- /dev/null +++ b/puppet/modules/site_check_mk/manifests/agent/package/nagios_plugins_contrib.pp @@ -0,0 +1,5 @@ +class site_check_mk::agent::package::nagios_plugins_contrib { + package { 'nagios-plugins-contrib': + ensure => installed, + } +} diff --git a/puppet/modules/site_check_mk/manifests/agent/package/perl_plugin.pp b/puppet/modules/site_check_mk/manifests/agent/package/perl_plugin.pp new file mode 100644 index 00000000..4feda375 --- /dev/null +++ b/puppet/modules/site_check_mk/manifests/agent/package/perl_plugin.pp @@ -0,0 +1,5 @@ +class site_check_mk::agent::package::perl_plugin { + package { 'libnagios-plugin-perl': + ensure => installed, + } +} diff --git a/puppet/modules/site_check_mk/manifests/agent/soledad.pp b/puppet/modules/site_check_mk/manifests/agent/soledad.pp new file mode 100644 index 00000000..f4a3f3a6 --- /dev/null +++ b/puppet/modules/site_check_mk/manifests/agent/soledad.pp @@ -0,0 +1,17 @@ +class site_check_mk::agent::soledad { + + file { '/etc/check_mk/logwatch.d/soledad.cfg': + source => 'puppet:///modules/site_check_mk/agent/logwatch/soledad.cfg', + } + + # local nagios plugin checks via mrpe + + augeas { 'Soledad_Procs': + incl => '/etc/check_mk/mrpe.cfg', + lens => 'Spacevars.lns', + changes => [ + 'rm /files/etc/check_mk/mrpe.cfg/Soledad_Procs', + 'set Soledad_Procs \'/usr/lib/nagios/plugins/check_procs -w 1:1 -c 1:1 -a "/usr/bin/python /usr/bin/twistd --uid=soledad --gid=soledad --pidfile=/var/run/soledad.pid --logfile=/var/log/soledad.log web --wsgi=leap.soledad.server.application --port=ssl:2323:privateKey=/etc/x509/keys/leap.key:certKey=/etc/x509/certs/leap.crt:sslmethod=SSLv23_METHOD"\'' ], + require => File['/etc/check_mk/mrpe.cfg']; + } +} diff --git a/puppet/modules/site_check_mk/manifests/agent/stunnel.pp b/puppet/modules/site_check_mk/manifests/agent/stunnel.pp new file mode 100644 index 00000000..7f765771 --- /dev/null +++ b/puppet/modules/site_check_mk/manifests/agent/stunnel.pp @@ -0,0 +1,9 @@ +class site_check_mk::agent::stunnel { + + concat::fragment { 'syslog_stunnel': + source => 'puppet:///modules/site_check_mk/agent/logwatch/stunnel.cfg', + target => '/etc/check_mk/logwatch.d/syslog.cfg', + order => '02'; + } + +} diff --git a/puppet/modules/site_check_mk/manifests/agent/webapp.pp b/puppet/modules/site_check_mk/manifests/agent/webapp.pp new file mode 100644 index 00000000..9bf3b197 --- /dev/null +++ b/puppet/modules/site_check_mk/manifests/agent/webapp.pp @@ -0,0 +1,15 @@ +class site_check_mk::agent::webapp { + + # remove leftovers of webapp python checks + file { + [ '/usr/lib/check_mk_agent/local/nagios-webapp_login.py', + '/usr/lib/check_mk_agent/local/soledad_sync.py' ]: + ensure => absent + } + + # watch logs + file { '/etc/check_mk/logwatch.d/webapp.cfg': + source => 'puppet:///modules/site_check_mk/agent/logwatch/webapp.cfg', + } + +} diff --git a/puppet/modules/site_check_mk/manifests/server.pp b/puppet/modules/site_check_mk/manifests/server.pp new file mode 100644 index 00000000..7ff9eb4a --- /dev/null +++ b/puppet/modules/site_check_mk/manifests/server.pp @@ -0,0 +1,103 @@ +# setup check_mk on the monitoring server +class site_check_mk::server { + + $ssh_hash = hiera('ssh') + $pubkey = $ssh_hash['authorized_keys']['monitor']['key'] + $type = $ssh_hash['authorized_keys']['monitor']['type'] + $seckey = $ssh_hash['monitor']['private_key'] + + $nagios_hiera = hiera_hash('nagios') + $hosts = $nagios_hiera['hosts'] + + $all_hosts = inline_template ('<% @hosts.keys.sort.each do |key| -%><% if @hosts[key]["environment"] != "disabled" %>"<%= @hosts[key]["domain_internal"] %>", <% end -%><% end -%>') + $domains_internal = $nagios_hiera['domains_internal'] + $environments = $nagios_hiera['environments'] + + package { 'check-mk-server': + ensure => installed, + } + + # we don't use check-mk-multisite, and the jessie version + # of this config file breaks with apache 2.4 + # until https://gitlab.com/shared-puppet-modules-group/apache/issues/11 + # is not fixed, we need to use a generic file type here + #apache::config::global { 'check-mk-multisite.conf': + # ensure => absent + #} + + file { '/etc/apache2/conf-enabled/check-mk-multisite.conf': + ensure => absent, + require => Package['check-mk-server']; + } + + # override paths to use the system check_mk rather than OMD + class { 'check_mk::config': + site => '', + etc_dir => '/etc', + nagios_subdir => 'nagios3', + bin_dir => '/usr/bin', + host_groups => undef, + use_storedconfigs => false, + inventory_only_on_changes => false, + require => Package['check-mk-server'] + } + + Exec['check_mk-refresh'] -> + Exec['check_mk-refresh-inventory-daily'] -> + Exec['check_mk-reload'] -> + Service['nagios'] + + file { + '/etc/check_mk/conf.d/use_ssh.mk': + content => template('site_check_mk/use_ssh.mk'), + notify => Exec['check_mk-refresh'], + require => Package['check-mk-server']; + '/etc/check_mk/conf.d/hostgroups.mk': + content => template('site_check_mk/hostgroups.mk'), + notify => Exec['check_mk-refresh'], + require => Package['check-mk-server']; + '/etc/check_mk/conf.d/host_contactgroups.mk': + content => template('site_check_mk/host_contactgroups.mk'), + notify => Exec['check_mk-refresh'], + require => Package['check-mk-server']; + '/etc/check_mk/conf.d/ignored_services.mk': + source => 'puppet:///modules/site_check_mk/ignored_services.mk', + notify => Exec['check_mk-refresh'], + require => Package['check-mk-server']; + '/etc/check_mk/conf.d/extra_service_conf.mk': + source => 'puppet:///modules/site_check_mk/extra_service_conf.mk', + notify => Exec['check_mk-refresh'], + require => Package['check-mk-server']; + '/etc/check_mk/conf.d/extra_host_conf.mk': + content => template('site_check_mk/extra_host_conf.mk'), + notify => Exec['check_mk-refresh'], + require => Package['check-mk-server']; + + '/etc/check_mk/all_hosts_static': + content => $all_hosts, + notify => Exec['check_mk-refresh'], + require => Package['check-mk-server']; + + '/etc/check_mk/.ssh': + ensure => directory, + require => Package['check-mk-server']; + '/etc/check_mk/.ssh/id_rsa': + content => $seckey, + owner => 'nagios', + mode => '0600', + require => Package['check-mk-server']; + '/etc/check_mk/.ssh/id_rsa.pub': + content => "${type} ${pubkey} monitor", + owner => 'nagios', + mode => '0644', + require => Package['check-mk-server']; + + # check_icmp must be suid root or called by sudo + # see https://leap.se/code/issues/5171 + '/usr/lib/nagios/plugins/check_icmp': + mode => '4755', + require => Package['nagios-plugins-basic']; + } + + include check_mk::agent::local_checks +} diff --git a/puppet/modules/site_check_mk/templates/extra_host_conf.mk b/puppet/modules/site_check_mk/templates/extra_host_conf.mk new file mode 100644 index 00000000..bc27b514 --- /dev/null +++ b/puppet/modules/site_check_mk/templates/extra_host_conf.mk @@ -0,0 +1,13 @@ +# retry 3 times before setting a host into a hard state +# and send out notification +extra_host_conf["max_check_attempts"] = [ + ("4", ALL_HOSTS ) +] + +# Use hostnames as alias so notification mail subjects +# are more readable and not so long. Alias defaults to +# the fqdn of a host is not changed. +extra_host_conf["alias"] = [ +<% @hosts.keys.sort.each do |key| -%> ( "<%= key.strip %>", ["<%= @hosts[key]['domain_internal']%>"]), +<% end -%> +] diff --git a/puppet/modules/site_check_mk/templates/host_contactgroups.mk b/puppet/modules/site_check_mk/templates/host_contactgroups.mk new file mode 100644 index 00000000..6a534967 --- /dev/null +++ b/puppet/modules/site_check_mk/templates/host_contactgroups.mk @@ -0,0 +1,17 @@ +<% + contact_groups = [] + @environments.keys.sort.each do |env_name| + hosts = "" + @nagios_hosts.keys.sort.each do |hostname| + hostdata = @nagios_hosts[hostname] + domain_internal = hostdata['domain_internal'] + if hostdata['environment'] == env_name + hosts << '"' + domain_internal + '", ' + end + end + contact_groups << ' ( "%s", [%s] )' % [env_name, hosts] + end +%> +host_contactgroups = [ +<%= contact_groups.join(",\n") %> +] diff --git a/puppet/modules/site_check_mk/templates/hostgroups.mk b/puppet/modules/site_check_mk/templates/hostgroups.mk new file mode 100644 index 00000000..7158dcd1 --- /dev/null +++ b/puppet/modules/site_check_mk/templates/hostgroups.mk @@ -0,0 +1,17 @@ +<% + host_groups = [] + @environments.keys.sort.each do |env_name| + hosts = "" + @nagios_hosts.keys.sort.each do |hostname| + hostdata = @nagios_hosts[hostname] + domain_internal = hostdata['domain_internal'] + if hostdata['environment'] == env_name + hosts << '"' + domain_internal + '", ' + end + end + host_groups << ' ( "%s", [%s] )' % [env_name, hosts] + end +%> +host_groups = [ +<%= host_groups.join(",\n") %> +] diff --git a/puppet/modules/site_check_mk/templates/use_ssh.mk b/puppet/modules/site_check_mk/templates/use_ssh.mk new file mode 100644 index 00000000..55269536 --- /dev/null +++ b/puppet/modules/site_check_mk/templates/use_ssh.mk @@ -0,0 +1,6 @@ +# http://mathias-kettner.de/checkmk_datasource_programs.html +datasource_programs = [ +<% @nagios_hosts.sort.each do |name,config| %> + ( "ssh -l root -i /etc/check_mk/.ssh/id_rsa -p <%=config['ssh_port']%> <%=config['domain_internal']%> check_mk_agent", [ "<%=config['domain_internal']%>" ], ),<%- end -%> + +] |