#!/bin/bash
#


# If the script is interrupted (SIGINT) or terminated (SIGTERM), run the
# cleanup function (defined further below), which removes the lockfile
# that marks a running process and then exits.

trap "cleanup" INT TERM


# Prints the command line help (options and commands) to stdout.
# $0 is expanded inside the here-document, so the text shows the name the
# script was invoked with.
usage()
{
    cat << EOF

usage: $0 [options] command [arguments...]

This script runs the leap platform deploy tests

OPTIONS

  -a|--all                 run command on all nodes
  -c|--config file         specify config file
  -h|--help                show help
  -l|--lock                refuse to deploy if lockfile from previous failures exists
  -V|--versions            show versions/git revision of leap_cli and leap_platform in provider dir

COMMANDS
  add_nodes      <node(s)> add node(s), given as name:services, to the provider
  bootstrap      <node(s)> bootstrap node(s):
                             - leap local start
                             - leap node init
                             - sets up hostname and runs apt-get dist-upgrade
                             - leap local save
  create_provider          creates a provider instance
  deploy         <node(s)> deploy node(s)
  init_deploy    <node(s)> initialize node, then do a deploy
  destroy_deploy <node(s)> destroy vms, init, and deploy
  reset_deploy   <node(s)> reset and deploy node(s)
  test           <node(s)> run leap test
EOF
}

# Adds nodes to the provider with "leap node add --local".
#
# Arguments: one "name:services" specification per node.
# Globals read: IP_SUFFIX_START, IP_PREFIX, LEAP_CMD.
#
# Every node gets the next IP address, counting up from IP_SUFFIX_START + 1
# inside IP_PREFIX. Nodes whose services are exactly "openvpn" additionally
# get the two gateway addresses ${IP_PREFIX}.98 and ${IP_PREFIX}.99.
#
# All working variables are local so that the script-wide "config" variable
# (path of the config file) and "ip" are not overwritten.
add_nodes() {
    local spec node services ip
    local suffix=$IP_SUFFIX_START
    local -a extra_config

    for spec in "$@"
    do
        # "name:services" -> name / services
        node=${spec%:*}
        services=${spec#*:}
        suffix=$(( suffix + 1 ))
        ip="${IP_PREFIX}.${suffix}"
        case $services in
            openvpn)
                extra_config=(
                    "openvpn.gateway_address:${IP_PREFIX}.98"
                    "openvpn.second_gateway_address:${IP_PREFIX}.99"
                )
                ;;
            *)
                extra_config=()
                ;;
        esac

        $LEAP_CMD node add --local "$node" ip_address:"$ip" "${extra_config[@]}" services:"$services"
    done
}

# Runs "leap local destroy" for every argument, in order.
# The VM name is expanded unquoted on purpose: an argument that holds several
# whitespace-separated names is handed to leap as several words.
destroy_vms() {
    while [ $# -gt 0 ]
    do
        vm=$1
        shift
        $LEAP_CMD local destroy $vm
    done
}

# Bootstraps every given node: start the VM, wait for ssh, run
# "leap node init", do a limited tagged deploy (hostname + dist-upgrade),
# reboot the VM through vagrant and finally save its state.
#
# Globals read: LEAP_CMD, OPTS, PROVIDERDIR.
# The working directory of the caller is left untouched.
bootstrap_nodes() {
    local vm

    # $@ is intentionally unquoted: the dispatcher at the end of this script
    # passes the whole node list as one whitespace-separated string.
    for vm in $@
    do
        $LEAP_CMD $OPTS local start "$vm"
        wait_for_node "$vm"
        $LEAP_CMD $OPTS node init "$vm"
        # set hostname + do dist-upgrade
        $LEAP_CMD $OPTS deploy "$vm" --tags site_apt::dist_upgrade,site_config::hosts,site_squid_deb_proxy::client

        # make sure machines are rebooted in order to be able to load kernel modules after a kernel update
        # https://leap.se/code/issues/6494
        # Run vagrant in a subshell so the directory change does not leak into
        # the following leap commands or into the caller.
        ( cd "${PROVIDERDIR}/test" && vagrant reload "$vm" )

        $LEAP_CMD $OPTS local save "$vm"
    done
}

# Removes the lockfile of this run (if it exists) and terminates the script.
# The exit status is that of the last command run here, i.e. of the
# existence test or of rm.
cleanup () {
    echo "cleaning up..."
    # quoted, so a lockfile path with whitespace (or an empty one) is handled
    [ -e "$LOCKFILE" ] && rm -- "$LOCKFILE"
    exit
}


# Refuses to continue (exit 1) when
#  - the lockfile of a running instance exists, or
#  - --lock was given and the failure lockfile of a previous run exists;
#    in that case a warning mail is sent first if MAIL_TO is set.
# Returns 0 when neither condition applies.
check_for_running_instances() {
    local mail_subject mail_body

    if [ -f "$LOCKFILE" ]; then
        echo "Lockfile found at $LOCKFILE - maybe other process(es) found running for $(basename "$0") - exiting. Please investigate and then remove lockfile."
        exit 1
    fi

    # the failure lock only matters when --lock was requested
    if [ -z "$lock" ] || [ ! -f "$FAILURE_LOCKFILE" ]; then
        return 0
    fi

    mail_subject="WARNING: CI failure lockfile found for branch ${PLATFORM_BRANCH} - previous deploy tests failed !"
    mail_body="CI lock found, and --lock in use. This means that leap test failed on the previous run.\n Please investigate and then remove $FAILURE_LOCKFILE\n\n"
    if [ -n "$MAIL_TO" ]; then
        echo "$mail_subject Sending mail to $MAIL_TO:"
        sendemail -f "$MAIL_FROM" -t "$MAIL_TO" -o tls=no -m "${mail_body}" -u "${mail_subject}"
    fi
    exit 1
}

# Creates a new provider instance in PROVIDERDIR:
#  - clones the platform branch PLATFORM_BRANCH into PLATFORMDIR
#  - runs "leap new", adds the ssh user and creates the certificates
#  - adds all nodes listed in NODES
#  - puts the resulting provider directory under git
#
# Exits with status 1 if PROVIDERDIR already exists or cannot be
# created/entered, so that the git commands below never run in some
# unrelated directory.
create_provider() {

    if [ -e "$PROVIDERDIR" ]
    then
        echo "$PROVIDERDIR" exists - exiting
        exit 1
    fi

    git clone -b "$PLATFORM_BRANCH" --recursive https://leap.se/git/leap_platform.git "$PLATFORMDIR"

    if ! mkdir -p "$PROVIDERDIR" || ! cd "$PROVIDERDIR"
    then
        echo "could not create or enter $PROVIDERDIR - exiting" >&2
        exit 1
    fi
    $LEAP_CMD $OPTS new --contacts "$CONTACTS" --domain "$DOMAIN" --name "$PROVIDER" --platform="$PLATFORMDIR" .


    # for now, we use the vagrant pubkey until https://leap.se/code/issues/2039 is solved
    $LEAP_CMD $OPTS add-user --self --ssh-pub-key="$SSHKEY"
    $LEAP_CMD $OPTS cert ca && $LEAP_CMD $OPTS cert csr

    # copy for faster testing
    #cp $ROOTDIR/dh.pem.test $PROVIDERDIR/files/ca/dh.pem
    $LEAP_CMD $OPTS cert dh
    # NODES is a whitespace-separated list, word-splitting is intended here
    add_nodes $NODES

    git init
    git add .
    git commit -m"finished create_provider"
}

# Deploys every given node with "leap deploy" at verbose level 2.
#
# For each node the complete output goes to $LOGDIR/deploy-<node>.log
# (appended) and $LOGDIR/deploy-<node>-<date>.log. Everything that is not
# matched by the noise filters below is treated as an error and written to
# $LOGDIR/deploy-<node>-<date>-error.log and $LOGDIR/deploy-error.log.
# If the per-run error log is not empty, the failure lockfile is created and
# (when MAIL_TO is set) a mail with the error log is sent.
#
# Globals read: LEAP_CMD, OPTS, LOGDIR, FAILURE_LOCKFILE, PLATFORM_BRANCH,
#               MAIL_FROM, MAIL_TO, FILTER_COMMON (optional extra filter).
deploy() {
    local vm stamp log_latest log_run errlog_run errlog_all
    local versions subj msg
    local filter_cli filter_puppet filter_all

    # we need to deploy with verbose level 2, and filter out unwanted stuff
    # until puppet errors show up in verbose level 0 +1 (#1750)
    filter_cli='= read|= loading|= no change| - executing| = executing| = applying| = ran git| = checking| = synching| = skipping file_path|   - cd .*; rsync -| - hiera| = created hiera/| = updated hiera/| = updated secrets.json| - cd /root/| - rolling backexecuting| - files/|\[bin,tests,puppet\] ->|] Hostname updated.| = Updating submodule puppet/modules|Warning: Permanently added.*to the list of known hosts.| = leap command v| = leap platform v| - \[.*\] ok| - \[.*\] STARTING APPLY| - \[.*\] APPLY COMPLETE'

    filter_puppet="] notice: |] No change to hostname|] Puppet apply complete \(changes made\).|] warning: Dynamic lookup|] warning: Scope\(Class|Skipping because of failed dependencies|warning: You cannot collect without storeconfigs being set|warning: default \`to_a' will be obsolete"

    if [ -n "$FILTER_COMMON" ]
    then
        filter_all="($filter_cli|$filter_puppet|$FILTER_COMMON)"
    else
        filter_all="($filter_cli|$filter_puppet)"
    fi


    for vm in "$@"
    do
        [ -e "$LOGDIR" ] || mkdir -p "$LOGDIR"
        stamp=$( date +"%F-%H%M%S" )
        log_latest="$LOGDIR/deploy-$vm.log"
        log_run="$LOGDIR/deploy-$vm-$stamp.log"
        errlog_run="$LOGDIR/deploy-$vm-$stamp-error.log"
        errlog_all="$LOGDIR/deploy-error.log"
        echo "Deploying \"$vm\" on $( date )"|tee -a "$log_latest" "$log_run"

        # full output -> logs; whatever survives the filter -> error logs
        $LEAP_CMD $OPTS -v 2 deploy "$vm" 2>&1 | ts | tee -a "$log_latest" "$log_run" | grep -E -v "$filter_all" | tee -a "$errlog_run" "$errlog_all" > /dev/null

        # send an host-specific error mail on deploy failures
        if [ -s "$errlog_run" ]
        then
            touch "$FAILURE_LOCKFILE"
            versions=$( versions )
            subj="WARNING - \"leap deploy\" of platform $PLATFORM_BRANCH branch on \"$vm\" had errors !"
            # log the warning line itself (the mail body is only built below)
            echo "$( date ): $subj" | tee -a "$log_latest" "$log_run" "$errlog_all"
            echo
            msg="Output of error log below:\n\n$( cat "$errlog_run" ) \n\n"
            msg="${msg}-------------------------------------------------------------------\n\n"
            msg="${msg}error log: ${errlog_run}\n"
            msg="${msg}comlete log: ${log_run}\n\n"

            msg="${msg}Tested on $( date ) on \"$vm\" with following versions/git commit IDs: \n\n$versions"

            cat "$errlog_run"

            if [ "$MAIL_TO" != '' ]; then
                echo "Sending this mail to $MAIL_TO:"
                sendemail -f "$MAIL_FROM" -t "$MAIL_TO" -o tls=no -m "${msg}" -u "${subj}"
            fi
        else
            echo "Deploy to $vm on $( date ) went fine."| tee -a "$log_latest" "$log_run"
            # tee created the (empty) error log, remove it again
            rm "$errlog_run"
        fi
    done
}

# Prints the ip_address value found in the node's json file
# ($PROVIDERDIR/nodes/<node>.json): second ':'-separated field of every
# matching line, stripped of blanks, double quotes and commas.
get_ip () {
    local node_json="$PROVIDERDIR/nodes/$1.json"

    grep ip_address "$node_json" \
        | cut -d : -f 2 \
        | tr -d ' ",'
}

# Sends a single quiet ping (10 second wait) to the given address.
# Produces no output; the return status is that of ping.
ip_pingable () {
    local target=$1

    ping -q -W10 -c1 "$target" > /dev/null 2>&1
}

# Prints an empty line followed by a "Starting <script> on <date>" banner.
log_start() {
    printf '\n%s\n' "Starting $0 on $( date )"
}

# Sends the "deploy went fine" mail.
# Despite its name this only mails on success: hosts with deploy errors
# already get their own mail, so nothing is sent when the failure lockfile
# exists. Nothing is sent either when MAIL_TO is empty.
# Reads $nodes (node list of the calling *_deploy function).
deploy_failure_email() {
    if [[ -e "$FAILURE_LOCKFILE" || -z "$MAIL_TO" ]]
    then
        return 0
    fi

    versions=$( versions )
    subj="OK - \"leap deploy\" of platform $PLATFORM_BRANCH branch went fine."
    msg="Tested on $( date ) on these nodes: \"$nodes\"\nwith following versions/git commit IDs: \n\n$versions"

    echo "Sending deploy success mail to $MAIL_TO"
    sendemail -f "$MAIL_FROM" -t "$MAIL_TO" -o tls=no -m "${msg}" -u "${subj}"
}

# Reports the result of the preceding command (expected: run_tests) and
# mails it, with the per-run test log attached.
#
# Must be called directly after run_tests: the exit status of the previous
# command decides between the "OK" and the "WARNING" subject.
# Like the other mail helpers in this script, nothing is sent when MAIL_TO
# is empty; the result line is still printed.
# Reads $nodes (node list of the calling *_deploy function).
test_failure_email() {
    # has to stay the very first command, it captures the status of run_tests
    test_failure=$?
    local versions subj msg

    versions=$( versions )
    msg="Tested on $( date ) on these nodes: \"$nodes\"\nwith following versions/git commit IDs: \n\n$versions"
    if [ "$test_failure" -eq 0 ]
    then
        subj="OK - \"leap test\" of platform $PLATFORM_BRANCH branch is all green."
    else
        subj="WARNING - \"leap test\" of platform $PLATFORM_BRANCH branch failed !!"
    fi

    echo "$subj"

    if [ -z "$MAIL_TO" ]
    then
        echo "MAIL_TO is not set - not sending test mail"
        return 0
    fi

    echo "Sending test mail to $MAIL_TO"
    # unfortunatly, no tls atm, fixed in sendemail 1.56-3
    # https://bugs.debian.org/cgi-bin/bugreport.cgi?bug=679911
    sendemail -f "$MAIL_FROM" -t "$MAIL_TO" -o tls=no -m "${msg}" -u "${subj}" -a "$TEST_LOG2"
}

init_deploy() {
    # init_deploy tests that a re-init of a node, then a deploy works this
    # accepts that the limited tagged deploy that happens in bootstrap_nodes()
    # is ok. The following steps happen:
    #
    #. make sure the platform and leap cli are up-to-date
    #. reset the nodes to their saved state (saved is after bootstrap_nodes has
    #  been run)
    #. run bootstrap_nodes (this will: start vm, run init, run a limited tag
    #  deploy, then reset the node)
    #. deploy the nodes (one background job per node)
    #. run tests
    #. send email
    #
    # Arguments: the nodes, as separate arguments or as one
    # whitespace-separated string. Returns 1 if PROVIDERDIR cannot be entered.


    local nodes="$*"
    local node

    update_platform
    update_leap_cli

    cd "$PROVIDERDIR" || return 1

    log_start

    echo "Starting init_deploy for nodes $* as background tasks on $( date )"

    # word-splitting of $nodes is intended
    for node in $nodes
    do
        $LEAP_CMD $OPTS local reset "$node"
        wait_for_node "$node"
        bootstrap_nodes "$node"

        # do not rely on the working directory bootstrap_nodes leaves behind
        cd "$PROVIDERDIR" || return 1

        # run custom provisioning script, i.e. specified in ~/.leaprc
        # (in a subshell, so a missing test directory cannot move the
        # following deploy out of the provider directory)
        ( cd "$PROVIDERDIR/test" && vagrant provision "$node" ) \
            || echo "WARNING: vagrant provision failed for $node" >&2

        deploy "$node" &
    done

    # needed in a detached screen session, otherwise it would terminate before deploy jobs
    # have finished
    echo "Waiting until last deploy process has finished..."
    wait

    # send the success mail, unless a deploy failed (failures are mailed per
    # host by deploy itself)
    deploy_failure_email

    # run tests
    run_tests

    # mail the test result; must directly follow run_tests (reads its status)
    test_failure_email
}

reset_deploy() {
    # reset_deploy tests that a deploy works after the node has been reset to
    # the bootstrap_nodes() state it does not re-run bootstrap_nodes(), so a
    # 'leap node init' is not run again, this is usually sufficient. The
    # following steps happen:
    #
    #. make sure the platform and leap cli are up-to-date
    #. reset the nodes to their saved state (saved is after bootstrap_nodes has
    #  been run)
    #. deploy the nodes (one background job per node)
    #. run tests
    #. send email
    #
    # Arguments: the nodes, as separate arguments or as one
    # whitespace-separated string. Returns 1 if PROVIDERDIR cannot be entered.


    local nodes="$*"
    local node

    update_platform
    update_leap_cli

    cd "$PROVIDERDIR" || return 1

    log_start

    echo "Starting reset_deploy for nodes $* as background tasks on $( date )"

    # word-splitting of $nodes is intended
    for node in $nodes
    do
        $LEAP_CMD $OPTS local reset "$node"
        wait_for_node "$node"

        # run custom provisioning script, i.e. specified in ~/.leaprc
        # (in a subshell, so a missing test directory cannot move the
        # following deploy out of the provider directory)
        ( cd "$PROVIDERDIR/test" && vagrant provision "$node" ) \
            || echo "WARNING: vagrant provision failed for $node" >&2

        deploy "$node" &
    done

    # needed in a detached screen session, otherwise it would terminate before deploy jobs
    # have finished
    echo "Waiting until last deploy process has finished..."
    wait

    # send the success mail, unless a deploy failed (failures are mailed per
    # host by deploy itself)
    deploy_failure_email

    # run tests
    run_tests

    # mail the test result; must directly follow run_tests (reads its status)
    test_failure_email
}

destroy_deploy() {
    # destroy_deploy tests a full-cycle, it destroys the VMs, re-creates them
    # from scratch and then bootstraps them. The following steps happen:
    #. make sure the platform and leap cli are up-to-date
    #. destroy the vms
    #. run bootstrap_nodes (this will: start vm, run init, run a limited tag
    #  deploy, then reset the node)
    #. deploy the nodes (one background job per node)
    #. run tests
    #. send email
    #
    # Arguments: the nodes, as separate arguments or as one
    # whitespace-separated string. Returns 1 if PROVIDERDIR cannot be entered.


    local nodes="$*"
    local node

    update_platform
    update_leap_cli

    cd "$PROVIDERDIR" || return 1

    log_start

    echo "Starting destroy_deploy for nodes $* as background tasks on $( date )"

    destroy_vms "$@"

    # word-splitting of $nodes is intended
    for node in $nodes
    do
        bootstrap_nodes "$node"

        # do not rely on the working directory bootstrap_nodes leaves behind
        cd "$PROVIDERDIR" || return 1

        # run custom provisioning script, i.e. specified in ~/.leaprc
        # (in a subshell, so a missing test directory cannot move the
        # following deploy out of the provider directory)
        ( cd "$PROVIDERDIR/test" && vagrant provision "$node" ) \
            || echo "WARNING: vagrant provision failed for $node" >&2

        deploy "$node" &
    done

    # needed in a detached screen session, otherwise it would terminate before deploy jobs
    # have finished
    echo "Waiting until last deploy process has finished..."
    wait

    # send the success mail, unless a deploy failed (failures are mailed per
    # host by deploy itself)
    deploy_failure_email

    # run tests
    run_tests

    # mail the test result; must directly follow run_tests (reads its status)
    test_failure_email

}

# Probes TCP port 22 of the given host with nc (4 second timeout).
# Output of nc is discarded; the return status is that of nc.
ssh_up () {
    local host=$1

    nc -w 4 "$host" 22 > /dev/null
}

# Runs "leap test --continue" and appends its timestamped output to
# TEST_LOG1 and TEST_LOG2 (ssh-agent noise is filtered out).
# On failure the failure lockfile is created.
# Arguments are ignored. Returns the exit status of the leap command.
run_tests () {
    local stamp
    local test_filter='net.ssh.authentication.agent.*could not connect to ssh-agent'

    stamp=$( date +"%F-%H%M%S" )

    echo -e "\nRunning leap test on $stamp" | tee -a "$TEST_LOG1" "$TEST_LOG2"
    $LEAP_CMD $OPTS test --continue 2>&1 | ts | grep -E -v "$test_filter" | tee -a "$TEST_LOG1" "$TEST_LOG2"
    # status of the leap command itself, not of the filters behind it
    test_failure=${PIPESTATUS[0]}

    if [ "$test_failure" -ne 0 ]
    then
        echo 'WARNING - "leap test" failed !' | tee -a "$TEST_LOG1" "$TEST_LOG2"
        touch "$FAILURE_LOCKFILE"
    else
        echo 'OK - "leap test" is all green !' | tee -a "$TEST_LOG1" "$TEST_LOG2"
    fi
    return "$test_failure"

}

# Updates the leap_cli checkout in LEAP_SRC (git pull) and runs
# "sudo bundle" there.
# Returns 1 without running anything if LEAP_SRC cannot be entered, so git
# and bundle never operate on an unrelated directory.
# Note: changes the working directory of the caller to LEAP_SRC.
update_leap_cli () {
    cd "$LEAP_SRC" || return 1
    git pull
    sudo bundle
}


# Brings the platform checkout in PLATFORMDIR to the current state of
# origin/PLATFORM_BRANCH, discarding local changes, and updates submodules.
# Returns 1 without touching anything if PLATFORMDIR cannot be entered:
# the reset/clean below are destructive and must never hit another repo.
# Note: changes the working directory of the caller to PLATFORMDIR.
update_platform () {
    cd "$PLATFORMDIR" || return 1
    # works also with forces updates, i.e. rebased branches like citest
    # http://stackoverflow.com/questions/4550937/how-to-force-update-when-doing-git-pull/14359894#14359894

    # throw away local uncommitted changes
    git reset --hard HEAD
    # remove untracked files
    git clean -f

    git fetch
    # check out the remote state, then (re)create the local branch from it
    git checkout "origin/$PLATFORM_BRANCH"
    git checkout -B "$PLATFORM_BRANCH"

    git submodule sync
    git submodule update --init
}

# Prints the git revision of the provider directory (or "not under version
# control" when there is none) followed by the leap command and leap
# platform version lines taken from "leap -v 2 list".
# Changes into PROVIDERDIR; callers in this script run it in a command
# substitution, so their own working directory is not affected.
versions () {
    local provider_head=''
    local leap_list

    cd "$PROVIDERDIR"
    [ -d .git ] && provider_head=$( git rev-parse HEAD )
    # no revision found -> say so instead of printing an empty value
    if [ -z "$provider_head" ]
    then
        provider_head='not under version control'
    fi

    echo "Provider ($PROVIDERDIR): $provider_head"
    echo

    # one leap invocation is enough for both version lines
    leap_list=$( $LEAP_CMD -v 2 list )

    printf '%s\n' "$leap_list" | grep ' = leap command v'
    echo
    echo

    #echo "leap_platform:"
    printf '%s\n' "$leap_list" | grep ' = leap platform v'
    echo
    echo
}

# Blocks until ssh on the given VM answers.
# The address is looked up with get_ip; ssh_up is probed once per second.
# There is no timeout: this loops for as long as the probe keeps failing.
wait_for_node() {
    local node=$1
    local address

    address=$( get_ip "$node" )
    echo "Waiting for ssh on VM $node (IP: $address) to come up..."
    until ssh_up "$address"
    do
        sleep 1
    done
    # always pause one more second after ssh has answered
    sleep 1
}


# http://docs.vagrantup.com/v2/providers/default.html
export VAGRANT_DEFAULT_PROVIDER="libvirt"


# defaults; every flag is initialised here so that a variable of the same
# name in the environment cannot switch an option on
config=""
all=false
lock=""
print_versions=false
# default in lib/leap_cli/leapfile.rb
IP_PREFIX='10.5.5'

# Only options that are actually handled below are declared to getopt,
# anything else is rejected by getopt itself (followed by the usage text).
if ! options=$(getopt -o aVlc:h -l all,lock,versions,config:,help -- "$@")
then
    # something went wrong, getopt will put out an error message for us
    usage
    exit 1
fi

# replace the positional parameters with getopt's normalised version
eval set -- "$options"

while [ $# -gt 0 ]
do
    case "$1" in
        -a|--all)      all=true;;
        -c|--config)   config=$2; shift ;;
        -h|--help)     usage; exit 1;;
        -l|--lock)     lock=true;;
        -V|--versions)  print_versions=true;;
        (--) shift; break;;
        (-*) echo "$0: error - unrecognized option $1" 1>&2; exit 1;;
        (*) break;;
    esac
    shift
done


# first remaining argument is the command, the rest is the node list
cmd=$1
shift
nodelist="$*"

#echo "config:   $config"
#echo "cmd:      $cmd"
#echo "nodelist: $nodelist"
#echo "all:      $all"

if [ -z "$config" ]
then
    usage
    echo "Please provide a config file"
    exit 1
fi

# provider specific config
source "$config"
# common config for all providers
source /etc/leap/platform-test-common.cfg

date=$( date +"%F-%H%M%S" )

TEST_LOG1="$LOGDIR/test.log"
TEST_LOG2="$LOGDIR/test-$date.log"

# one lock directory per platform branch
LOCKFILE_DIR="/var/run/lock/leap_ci/${PLATFORM_BRANCH}"
[ -e "$LOCKFILE_DIR" ] || mkdir -p "$LOCKFILE_DIR"
LOCKFILE="${LOCKFILE_DIR}/$(basename "$0").lock"
FAILURE_LOCKFILE="${LOCKFILE_DIR}/failure.lock"

check_for_running_instances || exit $?

# set branch specifc lockfile so a deploy test cannot be run twice in parallel
touch "$LOCKFILE"


[ -e "$PROVIDERDIR" ] && cd  "$PROVIDERDIR"

if $print_versions
then
    versions=$( versions )
    echo "$versions"
    echo
    # release the lock taken above, otherwise the next run would refuse to start
    rm -f -- "$LOCKFILE"
    exit 0
fi

if [ -z "$LEAP_CMD" ] || [ -z "$LEAP_SRC" ]
then
    echo "please provide a path to the leap_cli binary and the source in the config file, using the LEAP_CMD and LEAP_SRC var."
    # nothing was run, do not leave the lock of this run behind
    rm -f -- "$LOCKFILE"
    exit 1
fi

if $all ; then
    # use NODES variable from the config file and
    # strip services from nodes_services
    # i.e. nodes_services='redevcouchdb1:couchdb redevcouchdb2:couchdb'
    # -> nodes='redevcouchdb1 redevcouchdb2'
    nodes=$( echo "$NODES" | sed 's/:[[:alnum:],]*//g' )
else
    # use nodelist provided via cmdline parameters
    nodes=$nodelist
fi

# Split the whitespace-separated node list into one argument per node
# (word-splitting is intended), so every command sees the nodes one by one.
node_args=( $nodes )

case $cmd in
    add_nodes)        add_nodes "${node_args[@]}";;
    bootstrap)        bootstrap_nodes "${node_args[@]}";;
    create_provider)  create_provider;;
    deploy)           deploy "${node_args[@]}";;
    destroy_deploy)   destroy_deploy "${node_args[@]}";;
    init_deploy)      init_deploy "${node_args[@]}";;
    reset_deploy)     reset_deploy "${node_args[@]}";;
    test)             run_tests "${node_args[@]}";;
    (*)
        usage
        echo "Please specify a command."
        # nothing was run, do not leave the lock of this run behind
        rm -f -- "$LOCKFILE"
        exit 1
        ;;
esac

# removes the lockfile and exits
cleanup