#!/bin/bash
#
# Runs the leap platform deploy tests (see usage() for options and commands).
#
# Settings are sourced from ~/.config/leap/platform_ci/.platform-test.conf
# and/or the file passed with --config. LEAP_CMD, TAG, PROVIDERDIR, LOGDIR and
# LOCKDIR are mandatory; the functions below additionally read OPTS, NODES,
# IP_SUFFIX_START, PLATFORM_BRANCH, PLATFORMDIR, CONTACTS, DOMAIN, PROVIDER,
# SSHKEY, LEAP_SRC, MAIL_FROM, MAIL_TO and FILTER_COMMON when they are set.

# in case the script gets canceled, use a trap to finally remove the
# lockfile that indicates a running process
# NOTE(review): the handler only removes the lockfile and does not exit, so
# the script carries on with the next command after a signal - confirm that
# this is intended.
trap "cleanup" INT TERM

# Filter: removes ANSI color / erase-line escape sequences from stdin.
strip_colors() {
  sed -r 's/\x1B\[([0-9]{1,2}(;[0-9]{1,2})?)?[m|K]//g'
}

# Prints the command line help to stdout.
usage() {
cat << EOF

usage: $0 [options] command [arguments...]

This script runs the leap platform deploy tests

OPTIONS
  -a|--all          run command on all nodes
  -c|--config file  specify config file
  -h|--help         show help
  -l|--lock         refuse to deploy if lockfile from previous failures exists
  -v|--verbose      show also successful puppet resource changes, not only errors
  -V|--versions     show versions/git revision of leap_cli and leap_platoform in provider dir

COMMANDS
  bootstrap         bootstrap node(s):
                      - leap local start
                      - leap node int
                      - sets up hostname and runs apt-get dist-upgrade
                      - leap local save
  create_provider   creates a provider instance
  deploy            deploy node(s)
  init_deploy       initialize node, then do a deploy
  destroy_deploy    destroy vms, init, and deploy
  reset_deploy      reset and deploy node(s)
  test              run leap test

EOF
}

# Adds local nodes to the provider in the current directory.
# Arguments: one "name:services" spec per node.
# Globals:   IP_SUFFIX_START, IP_PREFIX, TAG, LEAP_CMD (read)
# Each node gets the next free IP below IP_PREFIX; nodes whose services
# contain "openvpn" additionally get the two gateway addresses .98 and .99.
add_nodes() {
  local suffix=$IP_SUFFIX_START
  local spec node services ip
  # kept local so the global $config (the --config file path) is not clobbered
  local -a extra_config

  for spec in "$@"
  do
    node=${spec%:*}
    services=${spec#*:}
    suffix=$(( suffix + 1 ))
    ip="${IP_PREFIX}.$suffix"
    if [[ "$services" == *openvpn* ]]
    then
      extra_config=(
        "openvpn.gateway_address:${IP_PREFIX}.98"
        "openvpn.second_gateway_address:${IP_PREFIX}.99"
      )
    else
      extra_config=()
    fi
    # drop a possibly existing node definition before re-adding the node
    rm "nodes/${node}.json"
    $LEAP_CMD node add --local "$node" ip_address:"$ip" "${extra_config[@]}" services:"$services" tags:"$TAG"
  done
}

# Destroys the local VM of every node name given as argument.
destroy_vms() {
  local vm
  for vm in "$@"
  do
    $LEAP_CMD local destroy "$vm"
  done
}

# Starts the local VMs, waits for ssh on every node listed in $nodes, runs
# "leap node init" on it and finally reloads the VMs.
# Arguments are ignored; the node list is always taken from $nodes.
bootstrap_nodes() {
  local vm

  $LEAP_CMD $OPTS local start

  # $nodes is a whitespace separated list, word splitting is intended
  for vm in $nodes
  do
    wait_for_node "$vm"
    $LEAP_CMD $OPTS node init "$vm"
    # set hostname + do dist-upgrade
    # deploying these classes have too much dependencies,
    # this is done now by the "~/vagrant/bootstrap_only.sh" provisioner
    #$LEAP_CMD $OPTS deploy "$vm" --tags site_apt::dist_upgrade,site_config::hosts,site_squid_deb_proxy::client
  done

  # make sure machines are rebooted in order to be able to load kernel modules
  # after a kernel update
  # https://leap.se/code/issues/6494
  # (subshell: the caller's working directory stays untouched)
  ( cd "${PROVIDERDIR}/test" && vagrant reload )
  #$LEAP_CMD $OPTS local save
}

# Removes the global lockfile if it exists.
cleanup () {
  echo "cleaning up..."
  [ -e "$LOCKFILE" ] && rm "$LOCKFILE"
}

# Exits with status 1 when another run holds $LOCKFILE, or when a failure
# lockfile of a previous run exists and --lock was given (a warning mail is
# sent first if MAIL_TO is set). Without --lock a stale failure lockfile is
# simply removed.
check_for_running_instances() {
  local subj msg

  if [[ -f $LOCKFILE ]]
  then
    echo "Lockfile found at $LOCKFILE - maybe other process(es) found running for $(basename "$0") - exiting. Please investigate and then remove lockfile."
    exit 1
  fi

  if [[ -f $FAILURE_LOCKFILE ]]
  then
    if [[ $lock ]]
    then
      subj="WARNING: CI failure lockfile found for branch ${PLATFORM_BRANCH} - previous deploy tests failed !"
      msg="CI lock found, and --lock in use. This means that leap test failed on the previous run.\n Please investigate and then remove $FAILURE_LOCKFILE\n\n"
      if [ "$MAIL_TO" != '' ]; then
        echo "$subj Sending mail to $MAIL_TO:"
        sendemail -f "$MAIL_FROM" -t "$MAIL_TO" -o tls=no -m "${msg}" -u "${subj}"
      fi
      exit 1
    else
      rm "$FAILURE_LOCKFILE"
    fi
  fi
}

# Creates a fresh provider in $PROVIDERDIR: clones the platform, runs
# "leap new", adds the user and certificates, adds the nodes listed in $NODES
# and commits everything to a new git repository.
# Exits with status 1 if $PROVIDERDIR already exists.
create_provider() {
  if [ -e "$PROVIDERDIR" ]
  then
    echo "$PROVIDERDIR exists - exiting"
    exit 1
  fi

  git clone -b "$PLATFORM_BRANCH" --recursive https://leap.se/git/leap_platform.git "$PLATFORMDIR"
  mkdir -p "$PROVIDERDIR"
  # never run the commands below in the wrong directory
  cd "$PROVIDERDIR" || return 1
  $LEAP_CMD $OPTS new --contacts "$CONTACTS" --domain "$DOMAIN" --name "$PROVIDER" --platform="$PLATFORMDIR" .

  # for now, we use the vagrant pubkey until https://leap.se/code/issues/2039 is solved
  $LEAP_CMD $OPTS add-user --self --ssh-pub-key="$SSHKEY"

  $LEAP_CMD $OPTS cert ca && $LEAP_CMD $OPTS cert csr
  # copy for faster testing
  $LEAP_CMD $OPTS cert dh

  # $NODES holds several "name:services" words, word splitting is intended
  # shellcheck disable=SC2086
  add_nodes $NODES

  git init
  git add .
  git commit -m"finished create_provider"
}

# Deploys to all nodes tagged $TAG and collects everything that survives the
# noise filters in an error log.
# Arguments are ignored; the target is always $TAG.
# Returns 1 (after touching $FAILURE_LOCKFILE and optionally mailing the
# error log to $MAIL_TO) if the filtered error log is non-empty, 0 otherwise.
deploy() {
  local rc=0
  local filter_cli filter_puppet filter_all
  local stamp log_tag log_run errlog_run errlog_all
  local versions subj msg

  # we need to deploy with verbose level 2, and filter out unwanted stuff
  # until puppet errors show up in verbose level 0 +1 (#1750)
  filter_cli=' - \[.*\] Changed /etc/hostname to| - \[.*\] Changed hostname to|= read|= loading|= no change| - executing| = executing| = applying| = ran git| = checking| = synching| = skipping file_path| - cd .*; rsync -| - hiera| = created | = updated hiera/| = updated secrets.json| - cd /root/| - rolling backexecuting| - files/|\[bin,tests,puppet\] ->|] Hostname updated.| = Updating submodule puppet/modules|Warning: Permanently added.*to the list of known hosts.| = leap command v| = leap platform v| - \[.*\] ok| - \[.*\] STARTING APPLY| - \[.*\] APPLY COMPLETE|net.ssh.authentication.agent.*could not connect to ssh-agent|net.ssh.service.forward.*: could not establish forwarding of authentication agent|Deploying | - mx/dkim.pub, mx/dkim.key -> .*/srv/leap/files'
  filter_puppet="] notice: |^Notice: |] No change to hostname|] Puppet apply complete \(changes made\).|] completed in |] warning: Dynamic lookup|] warning: Scope\(Class|Skipping because of failed dependencies|warning: You cannot collect without storeconfigs being set|warning: Not collecting exported resources without storeconfigs|warning: default \`to_a' will be obsolete|Warning: Found multiple default providers for vcsrepo"

  # FILTER_COMMON lets the sourced config add further patterns
  if [ -n "$FILTER_COMMON" ]
  then
    filter_all="($filter_cli|$filter_puppet|$FILTER_COMMON)"
  else
    filter_all="($filter_cli|$filter_puppet)"
  fi

  [ -e "$LOGDIR" ] || mkdir -p "$LOGDIR"
  stamp=$( date +"%F-%H%M%S" )
  log_tag="$LOGDIR/$TAG.log"
  log_run="$LOGDIR/deploy-$TAG-$stamp.log"
  errlog_run="$LOGDIR/deploy-$TAG-$stamp-error.log"
  errlog_all="$LOGDIR/deploy-error.log"

  echo "Deploying tag \"$TAG\" on $( date )"|tee -a "$log_tag" "$log_run"

  # $OPTS is deliberately unquoted (as everywhere else in this script): quoted,
  # an empty OPTS would be handed to leap as an empty argument
  if $verbose
  then
    # show the complete output on the terminal, filter the log file afterwards
    $LEAP_CMD $OPTS -v 2 deploy "$TAG" 2>&1 | tee -a "$log_tag" "$log_run" "$LOG_GLOBAL"
    strip_colors < "$log_run" \
      | grep -E -iv "$filter_all" \
      | tee -a "$errlog_run" "$errlog_all" "$ERRLOG_GLOBAL" > /dev/null
  else
    $LEAP_CMD $OPTS -v 2 deploy "$TAG" 2>&1 \
      | tee -a "$log_tag" "$log_run" "$LOG_GLOBAL" \
      | strip_colors \
      | grep -E -iv "$filter_all" \
      | tee -a "$errlog_run" "$errlog_all" "$ERRLOG_GLOBAL" > /dev/null
  fi

  # send an tag-specific error mail on deploy failures
  if [ -s "$errlog_run" ]
  then
    touch "$FAILURE_LOCKFILE"
    rc=1
    versions=$( versions )
    subj="WARNING - \"leap deploy\" of platform $PLATFORM_BRANCH branch on tag \"$TAG\" had errors !"
    # log the warning itself; the mail body is only assembled further down
    echo "$( date ): $subj" | tee -a "$log_tag" "$log_run" "$errlog_all"
    echo
    msg="Output of error log below:\n\n$( cat "$errlog_run" ) \n\n"
    msg="${msg}-------------------------------------------------------------------\n\n"
    msg="${msg}error log: ${errlog_run}\n"
    msg="${msg}comlete log: ${log_run}\n\n"
    msg="${msg}Tested on $( date ) on tag \"$TAG\" with following versions/git commit IDs: \n\n$versions"
    cat "$errlog_run"
    if [ "$MAIL_TO" != '' ]; then
      echo "Sending this mail to $MAIL_TO:"
      sendemail -f "$MAIL_FROM" -t "$MAIL_TO" -o tls=no -m "${msg}" -u "${subj}"
    fi
  else
    echo "Deploy to tag $TAG on $( date ) went fine."| tee -a "$log_tag" "$log_run"
    rm "$errlog_run"
  fi

  return $rc
}

# Prints the ip_address value found in the node definition
# $PROVIDERDIR/nodes/$1.json.
get_ip () {
  grep ip_address "$PROVIDERDIR/nodes/$1.json" |cut -f 2 -d:|sed 's/[ ",]//g'
}

# Succeeds if host $1 answers a single ping (timeout option -W10).
ip_pingable () {
  ping -q -W10 -c1 "$1" >/dev/null 2>&1
}

# Prints the start banner of a run.
log_start() {
  echo
  echo "Starting $0 on $( date )"
}

# Sends the "deploy went fine" mail to $MAIL_TO.
# NOTE(review): all callers in this file invoke this only when
# $FAILURE_LOCKFILE exists, while the mail is only sent when it does NOT
# exist - so from those call sites no mail is ever sent. Confirm the intent.
deploy_failure_email() {
  local versions subj msg

  # only send out a mail on success, because there are mails send out
  # for every hosts that has deploy errors anyway
  if [[ ! -e "$FAILURE_LOCKFILE" && -n "$MAIL_TO" ]]
  then
    versions=$( versions )
    subj="OK - \"leap deploy\" of platform $PLATFORM_BRANCH branch went fine."
    msg="Tested on $( date ) on these nodes: \"$nodes\"\nwith following versions/git commit IDs: \n\n$versions"
    echo "Sending deploy success mail to $MAIL_TO"
    sendemail -f "$MAIL_FROM" -t "$MAIL_TO" -o tls=no -m "${msg}" -u "${subj}"
  fi
}

# Mails the result of the command that ran immediately before this function
# was called (status 0 = green) to $MAIL_TO, with $TEST_LOG2 attached.
test_failure_email() {
  # has to stay the very first statement, otherwise $? is already overwritten
  test_failure=$?
  local versions subj msg

  versions=$( versions )
  msg="Tested on $( date ) on these nodes: \"$nodes\"\nwith following versions/git commit IDs: \n\n$versions"

  if [ $test_failure -eq 0 ]
  then
    subj="OK - \"leap test\" of platform $PLATFORM_BRANCH branch is all green."
  else
    subj="WARNING - \"leap test\" of platform $PLATFORM_BRANCH branch failed !!"
  fi

  echo "$subj"
  echo "Sending test mail to $MAIL_TO"
  # unfortunately, no tls atm, fixed in sendemail 1.56-3
  # https://bugs.debian.org/cgi-bin/bugreport.cgi?bug=679911
  sendemail -f "$MAIL_FROM" -t "$MAIL_TO" -o tls=no -m "${msg}" -u "${subj}" -a "$TEST_LOG2"
}

# Runs "vagrant provision" (plus any arguments given) inside ./test, i.e. the
# custom provisioning script, for example one specified in ~/.leaprc.
# Runs in a subshell so the caller's working directory is never changed, and
# nothing is provisioned from the wrong directory if ./test is missing.
provision_vms() {
  ( cd test && vagrant provision "$@" )
}

# Common epilogue of the *_deploy commands: prints the collected errors if a
# deploy left $FAILURE_LOCKFILE behind, prints the summary, and returns 1 on
# failure, 0 otherwise.
finish_deploy() {
  local rc=0

  if [ -e "$FAILURE_LOCKFILE" ]
  then
    print_global_errorlog
    deploy_failure_email
    rc=1
  fi

  summary
  return $rc
}

# init_deploy tests that a re-init of a node, then a deploy works this
# accepts that the limited tagged deploy that happens in bootstrap_nodes()
# is ok. The following steps happen:
#   #. run bootstrap_nodes (this will: start vm, run init, provision)
#   #. deploy the nodes
#   #. send email
# Returns 1 if the deploy left a failure lockfile, 0 otherwise.
init_deploy() {
  cd "$PROVIDERDIR" || return 1
  log_start
  echo "Starting init_deploy for nodes $* as background tasks on $( date )"

  $LEAP_CMD $OPTS local start
  #$LEAP_CMD $OPTS local reset
  wait_for_nodes
  bootstrap_nodes

  provision_vms
  deploy
  finish_deploy
}

# reset_deploy tests that a deploy works after the node has been reset to
# the bootstrap_nodes() state it does not re-run bootstrap_nodes(), so a
# 'leap node init' is not run again, this is usually sufficient. The
# following steps happen:
#   #. reset the nodes to their saved state (saved is after bootstrap_nodes has
#      been run)
#   #. deploy the nodes
#   #. send email if MAIL_TO is set
# Returns 1 if the deploy left a failure lockfile, 0 otherwise.
reset_deploy() {
  cd "$PROVIDERDIR" || return 1
  log_start
  echo "Starting reset_deploy on $(date)"

  $LEAP_CMD $OPTS local start
  $LEAP_CMD $OPTS local reset
  wait_for_nodes

  provision_vms
  deploy
  finish_deploy
}

# Simple deploy to an existing,initialized and optional already deployed node.
# Following steps happen:
#   #. deploy the nodes
#   #. send email if MAIL_TO is set
# Returns 1 if the deploy left a failure lockfile, 0 otherwise.
simple_deploy() {
  cd "$PROVIDERDIR" || return 1
  log_start
  echo "Starting simple_deploy on $(date)"

  $LEAP_CMD $OPTS local start
  wait_for_nodes

  provision_vms
  deploy
  finish_deploy
}

# destroy_deploy tests a full-cycle, it destroys the VMs, re-creates them
# from scratch and then bootstraps them. The following steps happen:
#   #. make sure the platform and leap cli are up-to-date
#   #. destroy the vms
#   #. run bootstrap_nodes (this will: start vm, run init, run a limited tag
#      deploy, then reset the node)
#   #. deploy the nodes
#   #. send email
# Arguments: the node names (whitespace separated, in one or several words).
# Returns 1 if a deploy left a failure lockfile, 0 otherwise.
destroy_deploy() {
  # shadows the global $nodes; bootstrap_nodes reads this list
  local nodes="$*"
  local i

  update_leap_cli

  cd "$PROVIDERDIR" || return 1
  log_start
  echo "Starting destroy_deploy for nodes $* as background tasks on $(date)"

  # split on whitespace so that every VM is destroyed on its own, even when
  # the whole node list arrived as one single argument
  # shellcheck disable=SC2086
  destroy_vms $nodes

  for i in $nodes
  do
    bootstrap_nodes "$i"
    provision_vms "$i"
    deploy "$i" &
  done

  # needed in a detached screen session, otherwise it would terminate before
  # deploy jobs have finished
  echo "Waiting until last deploy process has finished..."
  wait

  finish_deploy
}

# Prints the global error log of this run (colors stripped) in red.
print_global_errorlog () {
  local red="\033[31m"
  local plain="\033[0m"

  echo -e "${red}"
  echo -e "\n==================================================================\n"
  echo "Errors during deploy:"
  strip_colors < "$ERRLOG_GLOBAL"
  echo -e "\n==================================================================\n\n"
  echo -e "${plain}"
}

# Prints the locations of the global logs followed by the version info.
summary () {
  echo "Complete deploy log: $LOG_GLOBAL"
  [ -f "$ERRLOG_GLOBAL" ] && echo "Error deploy log: $ERRLOG_GLOBAL"
  versions
}

# Succeeds if a TCP connection to port 22 of host $1 works (nc, timeout 4s).
ssh_up () {
  nc -w 4 "$1" 22 > /dev/null
}

# Runs "leap test" on $TAG and logs the result to $TEST_LOG1 / $TEST_LOG2.
# Arguments are ignored. Touches $FAILURE_LOCKFILE on failure.
# Returns the exit status of the leap command (also left in $test_failure).
run_tests () {
  local stamp test_filter

  stamp=$( date +"%F-%H%M%S" )
  test_filter='net.ssh.authentication.agent.*could not connect to ssh-agent'

  echo -e "\nRunning leap test on $stamp" | tee -a "$TEST_LOG1" "$TEST_LOG2"

  if $verbose
  then
    $LEAP_CMD $OPTS test "$TAG" --continue 2>&1 | tee -a "$TEST_LOG1" "$TEST_LOG2"
    test_failure=${PIPESTATUS[0]}
    # NOTE(review): this re-reads $TEST_LOG2 while appending the filtered copy
    # to that very file again - behavior kept as is, please confirm the intent.
    grep -E -iv "$test_filter" "$TEST_LOG2" | tee -a "$TEST_LOG1" "$TEST_LOG2" > /dev/null
  else
    $LEAP_CMD $OPTS test "$TAG" --continue 2>&1 | grep -E -iv "$test_filter" | tee -a "$TEST_LOG1" "$TEST_LOG2"
    test_failure=${PIPESTATUS[0]}
  fi

  if [ "$test_failure" -ne 0 ]
  then
    echo 'WARNING - "leap test" failed !' | tee -a "$TEST_LOG1" "$TEST_LOG2"
    touch "$FAILURE_LOCKFILE"
  else
    echo 'OK - "leap test" is all green !' | tee -a "$TEST_LOG1" "$TEST_LOG2"
  fi

  return "$test_failure"
}

# Updates the leap_cli checkout in $LEAP_SRC (git pull + bundle) and fixes the
# permissions of the vagrant ssh key. Leaves the working directory in
# $LEAP_SRC; returns 1 without touching anything if that directory is missing.
update_leap_cli () {
  cd "$LEAP_SRC" || return 1
  git pull
  sudo bundle
  chmod 600 vendor/vagrant_ssh_keys/vagrant.key
}

# Prints the git revision of the provider directory and the output of
# "$LEAP_CMD --version". Changes the working directory to $PROVIDERDIR.
versions () {
  # local + reset: never report a revision left over from an earlier call
  local provider_head=''

  # best effort: without the directory the checks below use the current one
  cd "$PROVIDERDIR"
  [ -d .git ] && provider_head=$( git rev-parse HEAD )
  if [ -z "$provider_head" ]
  then
    provider_head='not under version control'
  fi

  echo "Provider config ($PROVIDERDIR): $provider_head"
  echo
  $LEAP_CMD --version
  echo
  echo
}

# Blocks until ssh (port 22) on the VM named $1 accepts connections, polling
# once per second.
wait_for_node() {
  local vm=$1
  local ip

  ip=$( get_ip "$vm" )
  echo "Waiting for ssh on VM $vm (IP: $ip) to come up..."

  until ssh_up "$ip"
  do
    sleep 1
  done
  # one more second of grace once the port answered
  sleep 1
}

# Gives freshly started VMs a fixed 10 seconds to come up.
wait_for_nodes() {
  sleep 10
}

#
# main
#

config=""
lock=""
verbose=false
print_versions=false

# default in lib/leap_cli/leapfile.rb
IP_PREFIX='10.5.5'

if ! options=$(getopt -o vVlc:h -l lock,verbose,versions,config:,help -- "$@")
then
  # something went wrong, getopt will put out an error message for us
  usage
  exit 1
fi

eval set -- "$options"

while [ $# -gt 0 ]
do
  case $1 in
    -c|--config)   config=$2; shift ;;
    -h|--help)     usage; exit 1;;
    -l|--lock)     lock=true;;
    -v|--verbose)  verbose=true;;
    -V|--versions) print_versions=true;;
    (--) shift; break;;
    (-*) echo "$0: error - unrecognized option $1" 1>&2; exit 1;;
    (*) break;;
  esac
  shift
done

cmd=$1
shift

# source custom user config
source ~/.config/leap/platform_ci/.platform-test.conf &> /dev/null || /bin/true

# source --config
if [ -n "$config" ]; then
  source "$config"
fi

# check requirements
REQUIRED_ENV="LEAP_CMD TAG PROVIDERDIR LOGDIR LOCKDIR"
for env in $REQUIRED_ENV; do
  if [ -z "${!env}" ]; then
    # print the NAME of the missing variable (its value is empty by definition)
    echo "Environment variable \$${env} is required."
    exit 1
  fi
done

date=$( date +"%F-%H%M%S" )
LOG_GLOBAL="$LOGDIR/deploy-$date.log"
ERRLOG_GLOBAL="$LOGDIR/deploy-$date-error.log"
TEST_LOG1="$LOGDIR/test.log"
TEST_LOG2="$LOGDIR/test-$date.log"

[ -e "$LOCKDIR" ] || mkdir -p "$LOCKDIR"
LOCKFILE="${LOCKDIR}/$(basename "$0").lock"
FAILURE_LOCKFILE="${LOCKDIR}/failure.lock"

exitcode=0

[ -e "$PROVIDERDIR" ] && cd "$PROVIDERDIR"

if $print_versions
then
  versions=$( versions )
  echo "$versions"
  echo
  exit 0
fi

check_for_running_instances || exit $?

# set global lockfile
touch "$LOCKFILE"

nodes=$($LEAP_CMD list "$TAG" --print= | sed "s/ //g")

case $cmd in
  # $NODES holds several "name:services" words, word splitting is intended
  # (same call as in create_provider)
  # shellcheck disable=SC2086
  add_nodes)       add_nodes $NODES;;
  bootstrap)       bootstrap_nodes "$nodes";;
  create_provider) create_provider;;
  deploy)          deploy "$nodes";;
  destroy_deploy)  destroy_deploy "$nodes";;
  init_deploy)     init_deploy "$nodes";;
  reset_deploy)    reset_deploy "$nodes";;
  simple_deploy)   simple_deploy "$nodes";;
  test)            run_tests "$nodes";;
  (*)
    usage
    echo "Please specify a command."
    # the lockfile was created just above - do not leave it behind
    cleanup
    exit 1
    ;;
esac
exitcode=$?

cleanup
exit $exitcode