#!/bin/bash
#


# In case the script gets canceled (SIGINT/SIGTERM), use a trap to finally
# remove the lockfile that indicates a running process.
# The handler only calls cleanup (defined further down in this file); it does
# not call exit itself.
trap "cleanup" INT TERM

# bash does not expand aliases in non-interactive shells unless asked to;
# enable it so that the strip_colors alias below can be used in this script.
shopt -s expand_aliases
# strip_colors: sed filter that removes escape sequences of the form
# ESC [ <1-2 digits> [; <1-2 digits>] followed by "m" or "K" from its input.
# NOTE(review): the bracket expression [m|K] also matches a literal "|".
alias strip_colors='sed -r "s/\x1B\[([0-9]{1,2}(;[0-9]{1,2})?)?[m|K]//g"'

# Print the command line help (options and commands) to stdout.
# Only options accepted by the getopt call of this script and commands handled
# by its command dispatcher are listed.
usage()
{
    cat << EOF

usage: $0 [options] command [arguments...]

This script runs the leap platform deploy tests

OPTIONS

  -c|--config file         specify config file
  -h|--help                show help
  -l|--lock                refuse to deploy if lockfile from previous failures exists
  -v|--verbose             show also successful puppet resource changes, not only errors
  -V|--versions            show versions/git revision of leap_cli and leap_platform in provider dir

COMMANDS
  add_nodes                add the nodes listed in \$NODES to the provider
  bootstrap      <node(s)> bootstrap node(s):
                             - leap local start
                             - leap node init
                             - sets up hostname and runs apt-get dist-upgrade
                             - leap local save
  create_provider          creates a provider instance
  deploy         <node(s)> deploy node(s)
  init_deploy    <node(s)> initialize node, then do a deploy
  destroy_deploy <node(s)> destroy vms, init, and deploy
  reset_deploy   <node(s)> reset and deploy node(s)
  simple_deploy  <node(s)> deploy to existing, already initialized node(s)
  test           <node(s)> run leap test
EOF
}

# Add nodes to the provider with "leap node add --local".
# Globals:   IP_SUFFIX_START, IP_PREFIX, LEAP_CMD, TAG (read)
# Arguments: one "<name>:<services>" spec per node
# Each node gets the next IP suffix after IP_SUFFIX_START. Nodes whose service
# list contains "openvpn" additionally get the two gateway addresses.
# An existing nodes/<name>.json (relative to the current directory) is removed
# before the node is added.
add_nodes() {
  # keep all helpers local: the script's global "config" holds the --config
  # file path and must not be overwritten from here
  local spec node services ip
  local suffix=$IP_SUFFIX_START
  local -a extra

  for spec in "$@"
  do
    node=${spec%:*}
    services=${spec#*:}
    suffix=$((suffix + 1))
    ip="${IP_PREFIX}.$suffix"

    if [[ "$services" == *openvpn* ]]
    then
      extra=(
        "openvpn.gateway_address:${IP_PREFIX}.98"
        "openvpn.second_gateway_address:${IP_PREFIX}.99"
      )
    else
      extra=()
    fi

    # -f: a node that was never added before has no json file yet
    rm -f -- "nodes/${node}.json"
    # LEAP_CMD is expanded unquoted on purpose, it may carry arguments
    $LEAP_CMD node add --local "$node" ip_address:"$ip" "${extra[@]}" services:"$services" tags:"$TAG"
  done
}

# Run "leap local destroy" for every VM name given as argument, in order.
destroy_vms() {
    while [ "$#" -gt 0 ]
    do
        vm=$1
        shift
        $LEAP_CMD local destroy "$vm"
    done
}

# Start the local VMs, wait for ssh on every node and run "leap node init" on
# it, then reload the vagrant machines.
# Globals:   LEAP_CMD, OPTS, PROVIDERDIR (read); nodes (read): whitespace
#            separated list of node names
# Arguments: ignored, the node list is always taken from $nodes
# Returns:   1 when ${PROVIDERDIR}/test cannot be entered
bootstrap_nodes() {
  local vm

  $LEAP_CMD $OPTS local start

  # $nodes is split on whitespace on purpose
  for vm in $nodes
  do
    wait_for_node "$vm"
    $LEAP_CMD $OPTS node init "$vm"

    # set hostname + do dist-upgrade
    # deploying these classes has too many dependencies,
    # this is done now by the "~/vagrant/bootstrap_only.sh" provisioner
    #$LEAP_CMD $OPTS deploy "$vm" --tags site_apt::dist_upgrade,site_config::hosts,site_squid_deb_proxy::client

  done

  # make sure machines are rebooted in order to be able to load kernel modules after a kernel update
  # https://leap.se/code/issues/6494
  # Do not run vagrant (and the "cd .." below) from an unrelated directory
  # when the vagrant directory is missing.
  cd "${PROVIDERDIR}/test" || return 1
  vagrant reload
  cd ..

  #$LEAP_CMD $OPTS local save
}

# Announce the cleanup and remove the run lockfile ($LOCKFILE).
# Returns non-zero when there is no lockfile to remove.
cleanup () {
    printf '%s\n' "cleaning up..."
    if [ ! -e "$LOCKFILE" ]
    then
        return 1
    fi
    rm "$LOCKFILE"
}


# Refuse to run when a lockfile is in the way.
# Globals: LOCKFILE, FAILURE_LOCKFILE, lock, PLATFORM_BRANCH, MAIL_TO,
#          MAIL_FROM (read); subj, msg (written when the failure lock blocks)
# - $LOCKFILE exists: print a hint and exit 1.
# - $FAILURE_LOCKFILE exists and $lock is set: optionally send a warning mail
#   (only if MAIL_TO is non-empty) and exit 1.
# - $FAILURE_LOCKFILE exists and $lock is not set: remove the failure lockfile.
check_for_running_instances() {

    if [[ -f $LOCKFILE ]]
    then
        echo "Lockfile found at $LOCKFILE - maybe other process(es) found running for $(basename "$0") - exiting. Please investigate and then remove lockfile."
        exit 1
    fi

    # nothing left to check without a failure lockfile
    [[ -f $FAILURE_LOCKFILE ]] || return 0

    if [[ -z $lock ]]
    then
        # a failure lock is only binding when --lock was requested
        rm "$FAILURE_LOCKFILE"
        return
    fi

    subj="WARNING: CI failure lockfile found for branch ${PLATFORM_BRANCH} - previous deploy tests failed !"
    msg="CI lock found, and --lock in use. This means that leap test failed on the previous run.\n Please investigate and then remove $FAILURE_LOCKFILE\n\n"
    if [ -n "$MAIL_TO" ]
    then
        echo "$subj Sending mail to $MAIL_TO:"
        sendemail -f "$MAIL_FROM" -t "$MAIL_TO" -o tls=no -m "${msg}" -u "${subj}"
    fi
    exit 1
}

# Create a fresh provider instance in $PROVIDERDIR.
# Globals: PROVIDERDIR, PLATFORMDIR, PLATFORM_BRANCH, LEAP_CMD, OPTS, CONTACTS,
#          DOMAIN, PROVIDER, SSHKEY, NODES (read)
# Steps: clone the platform branch, run "leap new" in the provider directory,
# add the user and certificates, add the nodes from $NODES and commit the
# result to a new git repository.
# Exits 1 if $PROVIDERDIR already exists or cannot be created/entered.
create_provider() {

    if [ -e "$PROVIDERDIR" ]
    then
        echo "$PROVIDERDIR" exists - exiting
        exit 1
    fi

    git clone -b "$PLATFORM_BRANCH" --recursive https://leap.se/git/leap_platform.git "$PLATFORMDIR"

    # Everything below writes into the current directory ("leap new .",
    # "git add ."), so never continue from a different directory.
    if ! mkdir -p "$PROVIDERDIR" || ! cd "$PROVIDERDIR"
    then
        echo "cannot create or enter $PROVIDERDIR - exiting" >&2
        exit 1
    fi
    $LEAP_CMD $OPTS new --contacts "$CONTACTS" --domain "$DOMAIN" --name "$PROVIDER" --platform="$PLATFORMDIR" .


    # for now, we use the vagrant pubkey until https://leap.se/code/issues/2039 is solved
    $LEAP_CMD $OPTS add-user --self --ssh-pub-key="$SSHKEY"
    $LEAP_CMD $OPTS cert ca && $LEAP_CMD $OPTS cert csr

    # copy for faster testing
    $LEAP_CMD $OPTS cert dh
    # NODES is a whitespace separated list, splitting is intended
    add_nodes $NODES

    git init
    git add .
    git commit -m"finished create_provider"
}

# Deploys to all nodes tagged with $TAG and collects everything that is not
# known-harmless output in error logs.
# Globals:   LEAP_CMD, OPTS, TAG, LOGDIR, LOG_GLOBAL, ERRLOG_GLOBAL, verbose,
#            FILTER_COMMON, FAILURE_LOCKFILE, PLATFORM_BRANCH, MAIL_TO,
#            MAIL_FROM (read); returncode, date, LOG1, LOG2, ERRLOG1, ERRLOG2,
#            FILTER_*, subj, msg, versions (written)
# Arguments: ignored
# Returns:   0 when the filtered error log stayed empty, 1 otherwise. On errors
#            the failure lockfile is created and, if MAIL_TO is set, a mail
#            with the error log is sent.
deploy() {

    returncode=0
    # we need to deploy with verbose level 2, and filter out unwanted stuff
    # until puppet errors show up in verbose level 0 +1 (#1750)
    FILTER_CLI=' - \[.*\] Changed /etc/hostname to| - \[.*\] Changed hostname to|= read|= loading|= no change| - executing| = executing| = applying| = ran git| = checking| = synching| = skipping file_path|   - cd .*; rsync -| - hiera| = created | = updated hiera/| = updated secrets.json| - cd /root/| - rolling backexecuting| - files/|\[bin,tests,puppet\] ->|] Hostname updated.| = Updating submodule puppet/modules|Warning: Permanently added.*to the list of known hosts.| = leap command v| = leap platform v| - \[.*\] ok| - \[.*\] STARTING APPLY| - \[.*\] APPLY COMPLETE|net.ssh.authentication.agent.*could not connect to ssh-agent|net.ssh.service.forward.*: could not establish forwarding of authentication agent|Deploying | - mx/dkim.pub, mx/dkim.key -> .*/srv/leap/files'

    FILTER_PUPPET="] notice: |^Notice: |] No change to hostname|] Puppet apply complete \(changes made\).|] completed in |] warning: Dynamic lookup|] warning: Scope\(Class|Skipping because of failed dependencies|warning: You cannot collect without storeconfigs being set|warning: Not collecting exported resources without storeconfigs|warning: default \`to_a' will be obsolete|Warning: Found multiple default providers for vcsrepo"

    if [ -n "$FILTER_COMMON" ]
    then
        FILTER_ALL="($FILTER_CLI|$FILTER_PUPPET|$FILTER_COMMON)"
    else
        FILTER_ALL="($FILTER_CLI|$FILTER_PUPPET)"
    fi

    [ -e "$LOGDIR" ] || mkdir -p "$LOGDIR"
    date=$( date +"%F-%H%M%S" )
    LOG1="$LOGDIR/$TAG.log"
    LOG2="$LOGDIR/deploy-$TAG-$date.log"
    ERRLOG1="$LOGDIR/deploy-$TAG-$date-error.log"
    ERRLOG2="$LOGDIR/deploy-error.log"
    echo "Deploying tag \"$TAG\" on $( date )"|tee -a "$LOG1" "$LOG2"

    # OPTS is expanded unquoted like everywhere else in this script: quoted,
    # an empty OPTS would be handed to leap as an empty argument and several
    # options would arrive as one word.
    if $verbose
    then
      # verbose: show the complete output, then filter the log into the error logs
      $LEAP_CMD $OPTS -v 2 deploy "$TAG" 2>&1 | tee -a "$LOG1" "$LOG2" "$LOG_GLOBAL"
      strip_colors < "$LOG2" | grep -E -iv "$FILTER_ALL" | tee -a "$ERRLOG1" "$ERRLOG2" "$ERRLOG_GLOBAL" > /dev/null
    else
      # quiet: log the complete output, only the filtered rest reaches the error logs
      $LEAP_CMD $OPTS -v 2 deploy "$TAG" 2>&1 | tee -a "$LOG1" "$LOG2" "$LOG_GLOBAL" | strip_colors | grep -E -iv "$FILTER_ALL" | tee -a "$ERRLOG1" "$ERRLOG2" "$ERRLOG_GLOBAL" > /dev/null
    fi

    # send a tag-specific error mail on deploy failures
    if [ -s "$ERRLOG1" ]
    then
      touch "$FAILURE_LOCKFILE"
      returncode=1
      versions=$( versions )
      subj="WARNING - \"leap deploy\" of platform $PLATFORM_BRANCH branch on tag \"$TAG\" had errors !"
      # log the warning itself; the mail body is only assembled below
      echo "$( date ): $subj" | tee -a "$LOG1" "$LOG2" "$ERRLOG2"
      echo
      msg="Output of error log below:\n\n$( cat "$ERRLOG1" ) \n\n"
      msg="${msg}-------------------------------------------------------------------\n\n"
      msg="${msg}error log: ${ERRLOG1}\n"
      msg="${msg}complete log: ${LOG2}\n\n"

      msg="${msg}Tested on $( date ) on tag \"$TAG\" with following versions/git commit IDs: \n\n$versions"

      cat "$ERRLOG1"

      if [ "$MAIL_TO" != '' ]; then
        echo "Sending this mail to $MAIL_TO:"
        sendemail -f "$MAIL_FROM" -t "$MAIL_TO" -o tls=no -m "${msg}" -u "${subj}"
      fi
    else
      echo "Deploy to tag $TAG on $( date ) went fine."| tee -a "$LOG1" "$LOG2"
      rm "$ERRLOG1"
    fi
    return $returncode
}

# Print the ip address of node $1: takes the "ip_address" line(s) of
# $PROVIDERDIR/nodes/$1.json, keeps the second ":"-separated field and drops
# blanks, double quotes and commas from it.
get_ip () {
    local node_json="$PROVIDERDIR/nodes/$1.json"

    grep ip_address "$node_json" \
      | cut -d: -f2 \
      | tr -d ' ",'
}

# Succeed if host $1 answers a single ping (ping -q -W10 -c1); all ping output
# is discarded and the exit status of ping is returned.
ip_pingable () {
    ping -q -W10 -c1 "$1" &> /dev/null
}

# Print an empty line followed by a "Starting <script> on <date>" banner.
log_start() {
    printf '\nStarting %s on %s\n' "$0" "$( date )"
}

# Send the deploy result mail.
# Despite its name this only sends a mail on success, because mails are
# already sent out for every host that has deploy errors: nothing happens
# when $FAILURE_LOCKFILE exists or when MAIL_TO is empty.
deploy_failure_email() {
    [ ! -e "$FAILURE_LOCKFILE" ] || return 0
    [ -n "$MAIL_TO" ] || return 0

    versions=$( versions )
    subj="OK - \"leap deploy\" of platform $PLATFORM_BRANCH branch went fine."
    msg="Tested on $( date ) on these nodes: \"$nodes\"\nwith following versions/git commit IDs: \n\n$versions"

    printf '%s\n' "Sending deploy success mail to $MAIL_TO"
    sendemail -f "$MAIL_FROM" -t "$MAIL_TO" -o tls=no -m "${msg}" -u "${subj}"
}

# Mail the result of a test run, attaching $TEST_LOG2.
# The exit status of the command that ran right before this function is called
# decides between the "all green" and the "failed" subject, so the first
# statement must stay the capture of $?.
test_failure_email() {
    test_failure=$?

    versions=$( versions )
    msg="Tested on $( date ) on these nodes: \"$nodes\"\nwith following versions/git commit IDs: \n\n$versions"

    case $test_failure in
        0) subj="OK - \"leap test\" of platform $PLATFORM_BRANCH branch is all green." ;;
        *) subj="WARNING - \"leap test\" of platform $PLATFORM_BRANCH branch failed !!" ;;
    esac

    printf '%s\n' "$subj" "Sending test mail to $MAIL_TO"

    # unfortunately, no tls atm, fixed in sendemail 1.56-3
    # https://bugs.debian.org/cgi-bin/bugreport.cgi?bug=679911
    sendemail -f "$MAIL_FROM" -t "$MAIL_TO" -o tls=no -m "${msg}" -u "${subj}" -a "$TEST_LOG2"
}

# init_deploy tests that a re-init of a node, then a deploy works. This
# accepts that the limited tagged deploy that happens in bootstrap_nodes()
# is ok. The following steps happen:
#
#. run bootstrap_nodes (this will: start vm, run init, provision)
#. deploy the nodes
#. send email
#
# Globals: PROVIDERDIR, LEAP_CMD, OPTS, FAILURE_LOCKFILE (read);
#          returncode (written)
# Returns: 1 when $PROVIDERDIR cannot be entered or when the failure lockfile
#          exists after the deploy, otherwise the returncode left by deploy
init_deploy() {
    # start from a clean state like reset_deploy and simple_deploy do
    returncode=0

    if ! cd "$PROVIDERDIR"
    then
        echo "cannot change to provider directory \"$PROVIDERDIR\"" >&2
        return 1
    fi

    log_start

    echo "Starting init_deploy for nodes $* as background tasks on $( date )"

    $LEAP_CMD $OPTS local start
    #$LEAP_CMD $OPTS local reset
    wait_for_nodes
    bootstrap_nodes

    # run custom provisioning script, i.e. specified in ~/.leaprc
    # (in a subshell: a missing test directory must not make us leave the
    # provider directory before the deploy)
    ( cd test && vagrant provision )

    deploy

    if [ -e "$FAILURE_LOCKFILE" ]
    then
      print_global_errorlog
      deploy_failure_email
      returncode=1
    fi
    summary
    return $returncode

}

# reset_deploy tests that a deploy works after the node has been reset to
# the bootstrap_nodes() state. It does not re-run bootstrap_nodes(), so a
# 'leap node init' is not run again, this is usually sufficient. The
# following steps happen:
#
#. reset the nodes to their saved state (saved is after bootstrap_nodes has
#  been run)
#. deploy the nodes
#. send email if MAIL_TO is set
#
# Globals: PROVIDERDIR, LEAP_CMD, OPTS, FAILURE_LOCKFILE (read);
#          returncode (written)
# Returns: 1 when $PROVIDERDIR cannot be entered or when the failure lockfile
#          exists after the deploy, otherwise the returncode left by deploy
reset_deploy() {
    returncode=0

    if ! cd "$PROVIDERDIR"
    then
        echo "cannot change to provider directory \"$PROVIDERDIR\"" >&2
        return 1
    fi

    log_start

    echo "Starting reset_deploy on $(date)"

    $LEAP_CMD $OPTS local start
    $LEAP_CMD $OPTS local reset
    wait_for_nodes

    # run custom provisioning script, i.e. specified in ~/.leaprc
    # (in a subshell: a missing test directory must not make us leave the
    # provider directory before the deploy)
    ( cd test && vagrant provision )

    deploy

    if [ -e "$FAILURE_LOCKFILE" ]
    then
      print_global_errorlog
      deploy_failure_email
      returncode=1
    fi
    summary
    return $returncode
}

# Simple deploy to an existing, initialized and optionally already deployed
# node. Following steps happen:
#
#. deploy the nodes
#. send email if MAIL_TO is set
#
# Globals: PROVIDERDIR, LEAP_CMD, OPTS, FAILURE_LOCKFILE (read);
#          returncode (written)
# Returns: 1 when $PROVIDERDIR cannot be entered or when the failure lockfile
#          exists after the deploy, otherwise the returncode left by deploy
simple_deploy() {
    returncode=0

    if ! cd "$PROVIDERDIR"
    then
        echo "cannot change to provider directory \"$PROVIDERDIR\"" >&2
        return 1
    fi

    log_start

    echo "Starting simple_deploy on $(date)"

    $LEAP_CMD $OPTS local start
    wait_for_nodes

    # run custom provisioning script, i.e. specified in ~/.leaprc
    # (in a subshell: a missing test directory must not make us leave the
    # provider directory before the deploy)
    ( cd test && vagrant provision )

    deploy

    if [ -e "$FAILURE_LOCKFILE" ]
    then
      print_global_errorlog
      deploy_failure_email
      returncode=1
    fi
    summary
    return $returncode
}

# destroy_deploy tests a full-cycle, it destroys the VMs, re-creates them
# from scratch and then bootstraps them. The following steps happen:
#. make sure the platform and leap cli are up-to-date
#. destroy the vms
#. run bootstrap_nodes (this will: start vm, run init, run a limited tag
#  deploy, then reset the node)
#. deploy the nodes (one background job per node)
#. send email
#
# Globals:   PROVIDERDIR, FAILURE_LOCKFILE (read); returncode (written)
# Arguments: the names of the nodes to destroy and redeploy
# Returns:   1 when $PROVIDERDIR cannot be entered or when the failure
#            lockfile exists after the deploys, 0 otherwise
destroy_deploy() {
    local nodes="$*"
    local i

    # the deploys run as background jobs and cannot set returncode for us,
    # so start from a defined value instead of a possibly stale global
    returncode=0

    update_leap_cli

    if ! cd "$PROVIDERDIR"
    then
        echo "cannot change to provider directory \"$PROVIDERDIR\"" >&2
        return 1
    fi

    log_start

    echo "Starting destroy_deploy for nodes $* as background tasks on $(date)"

    destroy_vms "$@"

    for i in $nodes
    do
      bootstrap_nodes "$i"

      # run custom provisioning script, i.e. specified in ~/.leaprc
      # (in a subshell: a missing test directory must not make us leave the
      # provider directory before the deploy)
      ( cd test && vagrant provision "$i" )

      deploy "$i" &
    done

    # needed in a detached screen session, otherwise it would terminate before deploy jobs
    # have finished
    echo "Waiting until last deploy process has finished..."
    wait

    if [ -e "$FAILURE_LOCKFILE" ]
    then
      print_global_errorlog
      deploy_failure_email
      returncode=1
    fi
    summary
    return $returncode
}

# Print the global error log ($ERRLOG_GLOBAL) framed by separator lines.
# The escape sequence kept in COLOR is emitted before the frame and the one in
# PLAIN after it; the log content itself is passed through strip_colors.
print_global_errorlog () {
  COLOR="\033[31m"
  PLAIN="\033[0m"

  printf '%b\n' "${COLOR}"
  printf '\n%s\n\n' "=================================================================="
  printf '%s\n' "Errors during deploy:"

  strip_colors < "$ERRLOG_GLOBAL"

  printf '\n%s\n\n\n' "=================================================================="

  printf '%b\n' "${PLAIN}"

}

# Print where the logs of this run can be found (the error log only if it
# exists as a regular file), followed by the output of versions.
summary () {
  printf '%s\n' "Complete deploy log: $LOG_GLOBAL"
  if [ -f "$ERRLOG_GLOBAL" ]
  then
    printf '%s\n' "Error deploy log: $ERRLOG_GLOBAL"
  fi
  versions
}

# Probe port 22 of host $1 with nc (4 second timeout); the output of nc on
# stdout is discarded and its exit status is returned.
ssh_up () {
    local host=$1

    nc -w 4 "$host" 22 1> /dev/null
}

# Run "leap test" for $TAG and log its output to $TEST_LOG1 and $TEST_LOG2.
# Globals:   LEAP_CMD, OPTS, TAG, TEST_LOG1, TEST_LOG2, verbose,
#            FAILURE_LOCKFILE (read); date, TEST_FILTER, test_failure (written)
# Arguments: ignored
# Returns:   the exit status of the leap test command; on failure the failure
#            lockfile is created
run_tests () {

    date=$( date +"%F-%H%M%S" )
    TEST_FILTER='net.ssh.authentication.agent.*could not connect to ssh-agent'


    echo -e "\nRunning leap test on $date" | tee -a "$TEST_LOG1" "$TEST_LOG2"

    if $verbose
    then
      # verbose: show and log the complete, unfiltered output.
      # (The log must not be filtered back into itself afterwards, that would
      # append every line a second time.)
      $LEAP_CMD $OPTS test "$TAG" --continue 2>&1 | tee -a "$TEST_LOG1" "$TEST_LOG2"
      test_failure=${PIPESTATUS[0]}
    else
      # quiet: drop the ssh-agent noise before showing and logging
      $LEAP_CMD $OPTS test "$TAG" --continue 2>&1 | grep -E -iv "$TEST_FILTER" | tee -a "$TEST_LOG1" "$TEST_LOG2"
      test_failure=${PIPESTATUS[0]}
    fi

    if [ "$test_failure" -ne 0 ]
    then
        echo 'WARNING - "leap test" failed !' | tee -a "$TEST_LOG1" "$TEST_LOG2"
        touch "$FAILURE_LOCKFILE"
    else
        echo 'OK - "leap test" is all green !' | tee -a "$TEST_LOG1" "$TEST_LOG2"
    fi
    return "$test_failure"
}

# Update the leap_cli checkout in $LEAP_SRC: git pull, "sudo bundle" and
# restrict the permissions of the vagrant ssh key.
# Leaves the shell inside $LEAP_SRC.
# Returns 1 without running anything when $LEAP_SRC is empty or cannot be
# entered, so that git/bundle/chmod never run in an unrelated directory.
update_leap_cli () {
    if [ -z "$LEAP_SRC" ] || ! cd "$LEAP_SRC"
    then
        echo "cannot change to leap_cli source directory \"$LEAP_SRC\"" >&2
        return 1
    fi
    git pull
    sudo bundle
    chmod 600 vendor/vagrant_ssh_keys/vagrant.key
}


# Print the git revision of the provider configuration in $PROVIDERDIR
# followed by the output of "$LEAP_CMD --version".
# Changes the working directory to $PROVIDERDIR when that is possible.
# "not under version control" is reported when $PROVIDERDIR cannot be entered,
# has no .git directory, or git prints no revision.
versions () {
    # local and reset on every call: a revision found by an earlier call (or
    # in whatever directory we happen to be in) must not be reported here
    local provider_head=''

    if cd "$PROVIDERDIR" && [ -d .git ]
    then
        provider_head=$( git rev-parse HEAD )
    fi
    if [ -z "$provider_head" ]
    then
        provider_head='not under version control'
    fi

    echo "Provider config ($PROVIDERDIR): $provider_head"
    echo

    $LEAP_CMD --version
    echo
    echo
}

# Block until ssh_up succeeds for node $1.
# The ip of the node is looked up with get_ip; ssh_up is retried with a one
# second pause after every attempt (also after the successful one). There is
# no upper limit on the number of attempts.
wait_for_node() {
    vm=$1
    ip=$( get_ip "$vm" )
    online=0
    printf '%s\n' "Waiting for ssh on VM $vm (IP: $ip) to come up..."
    until [ "$online" -ne 0 ]
    do
        if ssh_up "$ip"
        then
            online=1
        fi
        sleep 1
    done
}

# Give the nodes time to come up: a fixed pause, no actual check is made.
wait_for_nodes() {
  local -r settle_seconds=10

  sleep "$settle_seconds"
}

# Defaults for everything the command line options can change.
config=""
verbose=false
print_versions=false
# Reset explicitly: --lock is detected by a non-empty $lock, so a "lock"
# variable inherited from the environment must not switch it on.
lock=""
# default in lib/leap_cli/leapfile.rb
IP_PREFIX='10.5.5'


# Normalize the command line with getopt, then consume the options. What is
# left afterwards is the command followed by its arguments.
if ! options=$(getopt -o vVlc:h -l lock,verbose,versions,config:,help -- "$@")
then
    # something went wrong, getopt will put out an error message for us
    usage
    exit 1
fi

eval set -- "$options"

while [ $# -gt 0 ]
do
    case $1 in
        -c|--config)   config=$2; shift ;;
        -h|--help)     usage; exit 1;;
        -l|--lock)     lock=true;;
        -v|--verbose)  verbose=true;;
        -V|--versions) print_versions=true;;
        (--) shift; break;;
        (-*) echo "$0: error - unrecognized option $1" 1>&2; exit 1;;
        (*) break;;
    esac
    shift
done

cmd=$1
shift

# source custom user config
# (best effort: a missing or failing user config file is silently ignored)
source ~/.config/leap/platform_ci/.platform-test.conf &> /dev/null || /bin/true

# source --config
if [ -n "$config" ]; then
    source "$config"
fi

# check requirements: each of these must be set (non-empty) by the
# environment or by one of the config files sourced above
REQUIRED_ENV="LEAP_CMD TAG PROVIDERDIR LOGDIR LOCKDIR"
for env in $REQUIRED_ENV; do
    if [ -z "${!env}" ]; then
        # name the missing variable; its value is empty by definition here
        echo "Environment variable \$${env} is required."
        exit 1
    fi
done

# Per-run log file names.
date=$( date +"%F-%H%M%S" )
LOG_GLOBAL="$LOGDIR/deploy-$date.log"
ERRLOG_GLOBAL="$LOGDIR/deploy-$date-error.log"

TEST_LOG1="$LOGDIR/test.log"
TEST_LOG2="$LOGDIR/test-$date.log"

[ -e "$LOCKDIR" ] || mkdir -p "$LOCKDIR"
LOCKFILE="${LOCKDIR}/$(basename "$0").lock"
FAILURE_LOCKFILE="${LOCKDIR}/failure.lock"
exitcode=0

[ -e "$PROVIDERDIR" ] && cd  "$PROVIDERDIR"

# -V|--versions: only report the versions, no lockfile handling needed
if $print_versions
then
    versions=$( versions )
    echo "$versions"
    echo
    exit 0
fi

check_for_running_instances || exit $?

# set global lockfile
touch "$LOCKFILE"
# From here on the lockfile is ours: remove it on every way out of the script
# (also when a command function or the unknown-command branch below calls
# exit), otherwise the next run would refuse to start.
trap cleanup EXIT

nodes=$($LEAP_CMD list "$TAG" --print= | sed "s/ //g")

# NODES and nodes are whitespace separated lists. They are expanded unquoted
# where the called function works on its arguments, so that every node arrives
# as an argument of its own.
case $cmd in
    add_nodes)        add_nodes $NODES;;
    bootstrap)        bootstrap_nodes "$nodes";;
    create_provider)  create_provider;;
    deploy)           deploy "$nodes";;
    destroy_deploy)   destroy_deploy $nodes;;
    init_deploy)      init_deploy "$nodes";;
    reset_deploy)     reset_deploy "$nodes";;
    simple_deploy)    simple_deploy "$nodes";;
    test)             run_tests "$nodes";;
    (*)               usage; echo "Please specify a command."; exit 1;;
esac

exitcode=$?

# cleanup runs from the EXIT trap installed above
exit $exitcode