author    drebs <drebs@leap.se>    2017-07-13 15:10:10 -0300
committer drebs <drebs@leap.se>    2017-07-13 15:10:10 -0300
commit    d1abf301c75548938009a650ec953920ac9bdd1e (patch)
tree      a8e6d07a727dd764bd4517803d2f3e3dcecac3bd
parent    256182a91414b8a57e9614f65409fc06d62b39d1 (diff)
[docs] add code example for plotting elasticsearch with python
-rw-r--r--  docs/misc.rst                            1
-rw-r--r--  docs/misc/benchmarks-python-graphs.rst   148
2 files changed, 149 insertions, 0 deletions
diff --git a/docs/misc.rst b/docs/misc.rst
index 9f585242..97d9ad73 100644
--- a/docs/misc.rst
+++ b/docs/misc.rst
@@ -8,3 +8,4 @@ relevant information about the development process.
    :maxdepth: 2

    misc/benchmarks-website.rst
+   misc/benchmarks-python-graphs.rst
diff --git a/docs/misc/benchmarks-python-graphs.rst b/docs/misc/benchmarks-python-graphs.rst
new file mode 100644
index 00000000..4d1db339
--- /dev/null
+++ b/docs/misc/benchmarks-python-graphs.rst
@@ -0,0 +1,148 @@
+Benchmark graphs with Python
+============================
+
+This page documents an example of how to query Elasticsearch and plot the
+results with Python. We currently use ``kibana`` to plot and show graphs, but
+in the future we might want or need the flexibility of Python for that.
+
+Some notes about the code example:
+
+* Depends on ``elasticsearch`` for querying, ``python-dateutil`` for parsing
+  commit timestamps, and ``matplotlib`` for plotting.
+* Searches need to be scrolled with a ``scroll_id`` in order to get all
+  results.
+* Commit datetime ranges (`1 <https://www.elastic.co/guide/en/elasticsearch/reference/current/query-dsl-range-query.html>`_, `2 <https://www.elastic.co/guide/en/elasticsearch/reference/current/common-options.html#date-math>`_, `3 <https://www.elastic.co/guide/en/elasticsearch/reference/current/search-aggregations-bucket-daterange-aggregation.html#date-format-pattern>`_) can be used to limit results (see the sketch after this list).
+
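+The scrolling and the date range can also be sketched with the Python client's
+``elasticsearch.helpers.scan`` utility, which takes care of the ``scroll_id``
+bookkeeping, combined with a query DSL ``bool`` query whose ``range`` filter
+uses date math. This is only a minimal sketch, not part of the tested example:
+the host, index and field names are the ones from the code below, and the
+30-day window is arbitrary.
+
+.. code:: python
+
+ import os
+
+ from elasticsearch import Elasticsearch, helpers
+
+ es = Elasticsearch([{
+     'host': 'moose.leap.se',
+     'port': 9200,
+     'use_ssl': True,
+     'http_auth': os.environ['ES_CREDS'].split(':'),  # "user:password"
+ }])
+
+ # query DSL equivalent of the query string used in the full example,
+ # with a date math range keeping only the last 30 days of commits
+ query = {
+     'query': {
+         'bool': {
+             'must': [
+                 {'query_string': {'query': 'commit_info.project:soledad'}},
+             ],
+             'filter': [
+                 {'range': {
+                     'commit_info.time': {'gte': 'now-30d', 'lte': 'now'},
+                 }},
+             ],
+         },
+     },
+ }
+
+ # scan() scrolls transparently and yields every matching hit
+ hits = list(helpers.scan(es, query=query, index='benchmark', scroll='1m'))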
+
+Code example
+------------
+
+.. code:: python
+
+ #!/usr/bin/env python
+
+ import os
+
+ from math import sqrt
+ from dateutil.parser import parse
+
+ from elasticsearch import Elasticsearch
+ from elasticsearch.exceptions import NotFoundError
+
+ import matplotlib.pyplot as plt
+
+
+ def commit_time(hit):
+     # sort key: the commit's author time parsed into a datetime
+     return parse(hit['_source']['commit_info']['author_time'])
+
+
+ def _query_elasticsearch():
+     # ES_CREDS is expected to hold "user:password" for the ES instance
+     http_auth = os.environ['ES_CREDS'].split(':')
+
+     es = Elasticsearch([{
+         'host': 'moose.leap.se',
+         'port': 9200,
+         'use_ssl': True,
+         'http_auth': http_auth,
+     }])
+
+     q = "commit_info.project:soledad " \
+         "AND machine_info.host='weasel' " \
+         "AND name='test_async_create_1000_10k' " \
+         "AND commit_info.time:[\"now-1d\" TO \"now\"]"
+
+     res = es.search(index='benchmark', q=q, scroll='1m', size=50)
+
+     total = res['hits']['total']
+     scroll_size = total
+     scroll_id = res['_scroll_id']
+     hits = res['hits']['hits'][:]
+
+     print("There are %d hits to get." % total)
+     print("(got %d...)" % len(hits))
+
+     # scroll to get all results
+     print("(started scrolling...)")
+     while scroll_size > 0:
+         try:
+             res = es.scroll(scroll_id=scroll_id, scroll='1m')
+             scroll_size = len(res['hits']['hits'])
+             scroll_id = res['_scroll_id']
+             print("(got %d more...)" % scroll_size)
+             hits += res['hits']['hits'][:]
+         except NotFoundError:
+             # the scroll context expired or was already consumed
+             print("(finished scrolling.)")
+             break
+
+     print("Found %d hits." % len(hits))
+     # order hits chronologically by commit author time
+     hits.sort(key=commit_time)
+
+     stats = []
+     means = []
+
+     for hit in hits:
+         st = hit['_source']['stats']
+         commit_id = hit['_source']['commit_info']['id'][:7]
+
+         # build one boxplot stats dict per commit, in the format
+         # expected by matplotlib's Axes.bxp()
+         item = {}
+
+         item["label"] = commit_id
+         item["mean"] = st['mean']
+         item["med"] = st['median']
+         item["q1"] = st['q1']
+         item["q3"] = st['q3']
+         # item["cilo"] = 5.3  # not required
+         # item["cihi"] = 5.7  # not required
+         item["whislo"] = st['mean'] - st['stddev']
+         item["whishi"] = st['mean'] + st['stddev']
+         item["fliers"] = []  # required if showfliers=True
+
+         stats.append(item)
+         means.append(st['mean'])
+
+         # print(hit['_source']['commit_info'])
+
+     return stats, means
+
+
+ def mean(lst):
+     return sum(lst) / len(lst)
+
+
+ def stddev(lst):
+     mn = mean(lst)
+     variance = sum([(e - mn) ** 2 for e in lst]) / len(lst)
+     return sqrt(variance)
+
+
+ def _plot_graph(results):
+     print("Plotting graph...")
+     stats, means = results
+     fig, axes = plt.subplots(1, 1)
+     plt.grid()
+     # draw one box per commit from the precomputed stats
+     axes.bxp(stats)
+
+     # horizontal reference lines: overall mean and mean +/- 1.5 stddev
+     mmean = mean(means)
+     mstddev = stddev(means)
+     plt.axhline(y=mmean + (1.5 * mstddev), color='b', linestyle='--')
+     plt.axhline(y=mmean - (1.5 * mstddev), color='b', linestyle='--')
+     plt.axhline(y=mmean, color='b', linestyle='-')
+
+     axes.set_title('Time taken for test_async_create_1000_10k')
+
+     plt.xticks(rotation=45, ha='right', size='small')
+     plt.ylim(ymin=0)
+     plt.tight_layout()
+
+     # save the boxplot to a file instead of displaying it
+     filename = '/tmp/test.png'
+     print("Saving to %s" % filename)
+     plt.savefig(filename)
+     # plt.figure()
+     # plt.show()
+
+
+ if __name__ == '__main__':
+     results = _query_elasticsearch()
+     _plot_graph(results)
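+
+To run the example, set ``ES_CREDS`` to the ``user:password`` credentials of
+the Elasticsearch instance and run the script with Python; the resulting
+boxplot is written to ``/tmp/test.png``.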