author    drebs <drebs@leap.se>    2017-07-13 15:10:10 -0300
committer drebs <drebs@leap.se>    2017-07-13 15:10:10 -0300
commit    d1abf301c75548938009a650ec953920ac9bdd1e (patch)
tree      a8e6d07a727dd764bd4517803d2f3e3dcecac3bd
parent    256182a91414b8a57e9614f65409fc06d62b39d1 (diff)
[docs] add code example for plotting elasticsearch with python
-rw-r--r--  docs/misc.rst                            1
-rw-r--r--  docs/misc/benchmarks-python-graphs.rst   148
2 files changed, 149 insertions, 0 deletions
diff --git a/docs/misc.rst b/docs/misc.rst
index 9f585242..97d9ad73 100644
--- a/docs/misc.rst
+++ b/docs/misc.rst
@@ -8,3 +8,4 @@ relevant information about the development process.
    :maxdepth: 2

    misc/benchmarks-website.rst
+   misc/benchmarks-python-graphs.rst
diff --git a/docs/misc/benchmarks-python-graphs.rst b/docs/misc/benchmarks-python-graphs.rst
new file mode 100644
index 00000000..4d1db339
--- /dev/null
+++ b/docs/misc/benchmarks-python-graphs.rst
@@ -0,0 +1,148 @@
+Benchmark graphs with Python
+============================
+
+This page documents an example of how to query Elasticsearch and plot the
+results with Python. We currently use ``kibana`` to plot and show graphs, but
+in the future we might want or need the flexibility of Python for that.
+
+Some notes about the code example:
+
+* Depends on ``elasticsearch`` for querying, ``python-dateutil`` for parsing
+  commit timestamps, and ``matplotlib`` for plotting.
+* Searches need to be scrolled with a ``scroll_id`` in order to get all
+  results.
+* Commit datetime ranges (`1 <https://www.elastic.co/guide/en/elasticsearch/reference/current/query-dsl-range-query.html>`_, `2 <https://www.elastic.co/guide/en/elasticsearch/reference/current/common-options.html#date-math>`_, `3 <https://www.elastic.co/guide/en/elasticsearch/reference/current/search-aggregations-bucket-daterange-aggregation.html#date-format-pattern>`_) can be used to limit results (see the sketch after this list).
+
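+The scrolling and the date range can also be sketched with the Python client's
+``elasticsearch.helpers.scan`` utility, which takes care of the ``scroll_id``
+bookkeeping, combined with a query DSL ``bool`` query whose ``range`` filter
+uses date math. This is only a minimal sketch, not part of the tested example:
+the host, index and field names are the ones from the code below, and the
+30-day window is arbitrary.
+
+.. code:: python
+
+ import os
+
+ from elasticsearch import Elasticsearch, helpers
+
+ es = Elasticsearch([{
+     'host': 'moose.leap.se',
+     'port': 9200,
+     'use_ssl': True,
+     'http_auth': os.environ['ES_CREDS'].split(':'),  # "user:password"
+ }])
+
+ # query DSL equivalent of the query string used in the full example,
+ # with a date math range keeping only the last 30 days of commits
+ query = {
+     'query': {
+         'bool': {
+             'must': [
+                 {'query_string': {'query': 'commit_info.project:soledad'}},
+             ],
+             'filter': [
+                 {'range': {
+                     'commit_info.time': {'gte': 'now-30d', 'lte': 'now'},
+                 }},
+             ],
+         },
+     },
+ }
+
+ # scan() scrolls transparently and yields every matching hit
+ hits = list(helpers.scan(es, query=query, index='benchmark', scroll='1m'))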
+
+Code example
+------------
+
+.. code:: python
+
+ #!/usr/bin/env python
+
+ import os
+
+ from math import sqrt
+ from dateutil.parser import parse
+
+ from elasticsearch import Elasticsearch
+ from elasticsearch.exceptions import NotFoundError
+
+ import matplotlib.pyplot as plt
+
+
+ def commit_time(hit):
+     # sort key: the commit's author time parsed into a datetime
+     return parse(hit['_source']['commit_info']['author_time'])
+
+
+ def _query_elasticsearch():
+     # ES_CREDS is expected to hold "user:password" for the ES instance
+     http_auth = os.environ['ES_CREDS'].split(':')
+
+     es = Elasticsearch([{
+         'host': 'moose.leap.se',
+         'port': 9200,
+         'use_ssl': True,
+         'http_auth': http_auth,
+     }])
+
+     q = "commit_info.project:soledad " \
+         "AND machine_info.host='weasel' " \
+         "AND name='test_async_create_1000_10k' " \
+         "AND commit_info.time:[\"now-1d\" TO \"now\"]"
+
+     res = es.search(index='benchmark', q=q, scroll='1m', size=50)
+
+     total = res['hits']['total']
+     scroll_size = total
+     scroll_id = res['_scroll_id']
+     hits = res['hits']['hits'][:]
+
+     print("There are %d hits to get." % total)
+     print("(got %d...)" % len(hits))
+
+     # scroll to get all results
+     print("(started scrolling...)")
+     while scroll_size > 0:
+         try:
+             res = es.scroll(scroll_id=scroll_id, scroll='1m')
+             scroll_size = len(res['hits']['hits'])
+             scroll_id = res['_scroll_id']
+             print("(got %d more...)" % scroll_size)
+             hits += res['hits']['hits'][:]
+         except NotFoundError:
+             # the scroll context expired or was already consumed
+             print("(finished scrolling.)")
+             break
+
+     print("Found %d hits." % len(hits))
+     # order hits chronologically by commit author time
+     hits.sort(key=commit_time)
+
+     stats = []
+     means = []
+
+     for hit in hits:
+         st = hit['_source']['stats']
+         commit_id = hit['_source']['commit_info']['id'][:7]
+
+         # build one boxplot stats dict per commit, in the format
+         # expected by matplotlib's Axes.bxp()
+         item = {}
+
+         item["label"] = commit_id
+         item["mean"] = st['mean']
+         item["med"] = st['median']
+         item["q1"] = st['q1']
+         item["q3"] = st['q3']
+         # item["cilo"] = 5.3  # not required
+         # item["cihi"] = 5.7  # not required
+         item["whislo"] = st['mean'] - st['stddev']
+         item["whishi"] = st['mean'] + st['stddev']
+         item["fliers"] = []  # required if showfliers=True
+
+         stats.append(item)
+         means.append(st['mean'])
+
+         # print(hit['_source']['commit_info'])
+
+     return stats, means
+
+
+ def mean(lst):
+     return sum(lst) / len(lst)
+
+
+ def stddev(lst):
+     mn = mean(lst)
+     variance = sum([(e - mn) ** 2 for e in lst]) / len(lst)
+     return sqrt(variance)
+
+
+ def _plot_graph(results):
+     print("Plotting graph...")
+     stats, means = results
+     fig, axes = plt.subplots(1, 1)
+     plt.grid()
+     # draw one box per commit from the precomputed stats
+     axes.bxp(stats)
+
+     # horizontal reference lines: overall mean and mean +/- 1.5 stddev
+     mmean = mean(means)
+     mstddev = stddev(means)
+     plt.axhline(y=mmean + (1.5 * mstddev), color='b', linestyle='--')
+     plt.axhline(y=mmean - (1.5 * mstddev), color='b', linestyle='--')
+     plt.axhline(y=mmean, color='b', linestyle='-')
+
+     axes.set_title('Time taken for test_async_create_1000_10k')
+
+     plt.xticks(rotation=45, ha='right', size='small')
+     plt.ylim(ymin=0)
+     plt.tight_layout()
+
+     # save the boxplot to a file instead of displaying it
+     filename = '/tmp/test.png'
+     print("Saving to %s" % filename)
+     plt.savefig(filename)
+     # plt.figure()
+     # plt.show()
+
+
+ if __name__ == '__main__':
+     results = _query_elasticsearch()
+     _plot_graph(results)
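+
+To run the example, set ``ES_CREDS`` to the ``user:password`` credentials of
+the Elasticsearch instance and run the script with Python; the resulting
+boxplot is written to ``/tmp/test.png``.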