From 3b46ddd3b336f55510fc7304eb7badbd5005350d Mon Sep 17 00:00:00 2001 From: drebs Date: Tue, 17 Oct 2017 10:49:31 -0200 Subject: [ci] improve outlier detection output --- scripts/benchmark/check-for-outliers.py | 175 ++++++++++++++++++++++ scripts/benchmark/compare-results-with-history.py | 166 -------------------- scripts/benchmark/run-benchmarks-ci-job.sh | 14 +- 3 files changed, 179 insertions(+), 176 deletions(-) create mode 100755 scripts/benchmark/check-for-outliers.py delete mode 100755 scripts/benchmark/compare-results-with-history.py diff --git a/scripts/benchmark/check-for-outliers.py b/scripts/benchmark/check-for-outliers.py new file mode 100755 index 00000000..6037ef00 --- /dev/null +++ b/scripts/benchmark/check-for-outliers.py @@ -0,0 +1,175 @@ +#!/usr/bin/env python + +# Given a JSON file output by pytest-benchmark, this script compares the +# results of a test session with the results stored in elasticsearch. +# +# - iterate through test results in pytest-benchmark JSON file. +# +# - for each one, get mean and stddev of the mean of last 20 results from +# master branch. +# +# - compare the result in the file with the results in elastic. +# +# - if there are bad outliers, exit with status code given in command line. + +import argparse +import copy +import json +import requests +import sys + + +URL = "https://moose.leap.se:9200/benchmark/_search" +BLOCK_SIZE = 20 +MULTIPLIER = 1.5 + + +def parse_args(): + parser = argparse.ArgumentParser() + parser.add_argument( + 'file', + help='The file with JSON results of pytest-benchmark') + parser.add_argument( + '--status-code', type=int, default=0, + help='The status code to exit with in case bad outliers are detected.') + return parser.parse_args() + + +def parse_file(file): + data = None + tests = [] + with open(file) as f: + data = json.loads(f.read()) + for test in data['benchmarks']: + name = test['name'] + mean = test['stats']['mean'] + extra = test['extra_info'] + tests.append((name, mean, extra)) + return tests + + +base_query = { + "query": { + "bool": { + "must": [ + {"term": {"machine_info.host": "weasel"}}, + {"term": {"commit_info.branch": "master"}}, + {"term": {"commit_info.project": "soledad"}}, + {"exists": {"field": "extra_info"}}, + {"exists": {"field": "extra_info.cpu_percent"}} + ], + "must_not": [ + ], + }, + }, + "aggs": { + "commit_id_time": { + "terms": { + "field": "commit_info.id", + "size": BLOCK_SIZE, + "order": {"commit_info_time": "desc"}, + }, + "aggs": { + "commit_info_time": {"max": {"field": "commit_info.time"}}, + } + } + }, +} + + +def get_time_cpu_stats(test): + query = copy.deepcopy(base_query) + query['query']['bool']['must'].append({ + 'term': {'name': test}}) + query['query']['bool']['must_not'].append( + {'exists': {'field': "extra_info.memory_percent"}}) + query['aggs']['commit_id_time']['aggs']['time'] = \ + {"stats": {"field": "stats.mean"}} + query['aggs']['commit_id_time']['aggs']['cpu'] = \ + {"stats": {"field": "extra_info.cpu_percent"}} + response = requests.get("%s?size=0" % URL, data=json.dumps(query)) + data = response.json() + time = [] + cpu = [] + buckets = data['aggregations']['commit_id_time']['buckets'] + for bucket in buckets: + time.append(bucket['time']['avg']) + cpu.append(bucket['cpu']['avg']) + return time, cpu + + +def get_mem_stats(test): + query = copy.deepcopy(base_query) + query['query']['bool']['must'].append({ + 'term': {'name': test}}) + query['query']['bool']['must'].append( + {'exists': {'field': "extra_info.memory_percent"}}) + query['aggs']['commit_id_time']['aggs']['mem'] = \ + {"stats": {"field": "extra_info.memory_percent.stats.max"}} + response = requests.get("%s?size=0" % URL, data=json.dumps(query)) + data = response.json() + mem = [] + buckets = data['aggregations']['commit_id_time']['buckets'] + for bucket in buckets: + mem.append(bucket['mem']['avg']) + return mem + + +def _mean(l): + return float(sum(l)) / len(l) + + +def _std(l): + if len(l) <= 1: + return 0 + mean = _mean(l) + squares = [(x - mean) ** 2 for x in l] + return (sum(squares) / (len(l) - 1)) ** 0.5 + + +def detect_bad_outlier(test, mean, extra): + bad = False + if 'memory_percent' in extra: + mem = get_mem_stats(test) + value = extra['memory_percent']['stats']['max'] + bad |= _detect_outlier(test, 'mem', value, mem) > 0 + else: + time, cpu = get_time_cpu_stats(test) + + value = mean + bad |= _detect_outlier(test, 'time', value, time) > 0 + + value = extra['cpu_percent'] + bad |= _detect_outlier(test, 'cpu', value, cpu) > 0 + return bad + + +def _detect_outlier(test, name, value, list): + mean = _mean(list) + std = _std(list) + result = 0 + print "Checking %s (%s):" % (test, name) + print " value: %f" % (value,) + print " lower limit: %f" % (mean - (MULTIPLIER * std)) + print " upper limit: %f" % (mean + (MULTIPLIER * std)) + if value < mean - MULTIPLIER * std: + print " => good outlier detected!" + result = -1 + elif value > mean + MULTIPLIER * std: + print " => bad outlier detected!" + result = 1 + return result + + +if __name__ == '__main__': + args = parse_args() + tests = parse_file(args.file) + print "Checking %d test results for outliers..." % len(tests) + failed = False + for test, mean, extra in tests: + failed |= detect_bad_outlier(test, mean, extra) + if failed: + print "Tests have bad outliers! o_O" + sys.exit(args.status_code) + else: + print "All good, no outliers were detected. :-)" diff --git a/scripts/benchmark/compare-results-with-history.py b/scripts/benchmark/compare-results-with-history.py deleted file mode 100755 index ed609552..00000000 --- a/scripts/benchmark/compare-results-with-history.py +++ /dev/null @@ -1,166 +0,0 @@ -#!/usr/bin/env python - -# Given a JSON file output by pytest-benchmark, this script compares the -# results of a test session with the results stored in elasticsearch. -# -# - iterate through test results in pytest-benchmark JSON file. -# -# - for each one, get mean and stddev of the mean of last 20 results from -# master branch. -# -# - compare the result in the file with the results in elastic. - -import argparse -import copy -import json -import requests -import sys - - -URL = "https://moose.leap.se:9200/benchmark/_search" -BLOCK_SIZE = 20 -MULTIPLIER = 1.5 - - -def parse_args(): - parser = argparse.ArgumentParser() - parser.add_argument( - 'file', - help='The file with JSON results of pytest-benchmark') - return parser.parse_args() - - -def parse_file(file): - data = None - tests = [] - with open(file) as f: - data = json.loads(f.read()) - for test in data['benchmarks']: - name = test['name'] - mean = test['stats']['mean'] - extra = test['extra_info'] - tests.append((name, mean, extra)) - return tests - - -base_query = { - "query": { - "bool": { - "must": [ - {"term": {"machine_info.host": "weasel"}}, - {"term": {"commit_info.branch": "master"}}, - {"term": {"commit_info.project": "soledad"}}, - {"exists": {"field": "extra_info"}}, - {"exists": {"field": "extra_info.cpu_percent"}} - ], - "must_not": [ - ], - }, - }, - "aggs": { - "commit_id_time": { - "terms": { - "field": "commit_info.id", - "size": BLOCK_SIZE, - "order": {"commit_info_time": "desc"}, - }, - "aggs": { - "commit_info_time": {"max": {"field": "commit_info.time"}}, - } - } - }, -} - - -def get_time_cpu_stats(test): - query = copy.deepcopy(base_query) - query['query']['bool']['must'].append({ - 'term': {'name': test}}) - query['query']['bool']['must_not'].append( - {'exists': {'field': "extra_info.memory_percent"}}) - query['aggs']['commit_id_time']['aggs']['time'] = \ - {"stats": {"field": "stats.mean"}} - query['aggs']['commit_id_time']['aggs']['cpu'] = \ - {"stats": {"field": "extra_info.cpu_percent"}} - response = requests.get("%s?size=0" % URL, data=json.dumps(query)) - data = response.json() - time = [] - cpu = [] - buckets = data['aggregations']['commit_id_time']['buckets'] - for bucket in buckets: - time.append(bucket['time']['avg']) - cpu.append(bucket['cpu']['avg']) - return time, cpu - - -def get_mem_stats(test): - query = copy.deepcopy(base_query) - query['query']['bool']['must'].append({ - 'term': {'name': test}}) - query['query']['bool']['must'].append( - {'exists': {'field': "extra_info.memory_percent"}}) - query['aggs']['commit_id_time']['aggs']['mem'] = \ - {"stats": {"field": "extra_info.memory_percent.stats.max"}} - response = requests.get("%s?size=0" % URL, data=json.dumps(query)) - data = response.json() - mem = [] - buckets = data['aggregations']['commit_id_time']['buckets'] - for bucket in buckets: - mem.append(bucket['mem']['avg']) - return mem - - -def _mean(l): - return float(sum(l)) / len(l) - - -def _std(l): - if len(l) <= 1: - return 0 - mean = _mean(l) - squares = [(x - mean) ** 2 for x in l] - return (sum(squares) / (len(l) - 1)) ** 0.5 - - -def detect_bad_outlier(test, mean, extra): - bad = False - if 'memory_percent' in extra: - mem = get_mem_stats(test) - value = extra['memory_percent']['stats']['max'] - bad |= _detect_outlier('mem', value, mem) > 0 - else: - time, cpu = get_time_cpu_stats(test) - - value = mean - bad |= _detect_outlier('time', value, time) > 0 - - value = extra['cpu_percent'] - bad |= _detect_outlier('cpu', value, cpu) > 0 - return bad - - -def _detect_outlier(name, value, list): - mean = _mean(list) - std = _std(list) - result = 0 - print "%s: %f ? %f +- %f * %f" \ - % (name, value, mean, MULTIPLIER, std) - if value < mean - MULTIPLIER * std: - print "%s: %f < %f - %f * %f" \ - % (name, value, mean, MULTIPLIER, std) - result = -1 - elif value > mean + MULTIPLIER * std: - print "%s: %f > %f - %f * %f" \ - % (name, value, mean, MULTIPLIER, std) - result = 1 - return result - - -if __name__ == '__main__': - args = parse_args() - tests = parse_file(args.file) - failed = False - for test, mean, extra in tests: - failed |= detect_bad_outlier(test, mean, extra) - if failed: - sys.exit(1) diff --git a/scripts/benchmark/run-benchmarks-ci-job.sh b/scripts/benchmark/run-benchmarks-ci-job.sh index 30c6ecf5..adf37b7a 100755 --- a/scripts/benchmark/run-benchmarks-ci-job.sh +++ b/scripts/benchmark/run-benchmarks-ci-job.sh @@ -16,8 +16,8 @@ # Environment Variables # --------------------- # -# RUN_BENCHMARKS: If not set, skip this run. -# CHECK_FOR_OUTLIERS: If set, check if results are outliers. +# RUN_BENCHMARKS - If not set, skip this run. +# STATUS_CODE_IF_OUTLIERS - Exit with this status code if outliers are detected. set -eu set -o xtrace @@ -55,12 +55,6 @@ if [ -z "$(echo ${ENVIRONMENT} | grep ^benchmark-)" ]; then exit 0 fi -# stop here unless the CHECK_FOR_OUTLIERS environment variable is set -if [ -z "${CHECK_FOR_OUTLIERS:-}" ]; then - exit 0 -fi - -# fail test for bad outliers -echo "Comparing current test results with history..." +# check for bad outliers basedir=$(dirname "${0}") -${basedir}/compare-results-with-history.py ${tempfile} +${basedir}/check-for-outliers.py --status-code ${STATUS_CODE_IF_OUTLIERS:-0} ${tempfile} -- cgit v1.2.3