summaryrefslogtreecommitdiff
path: root/scripts/legacy-vs-blobs/legacy-vs-blobs.py
diff options
context:
space:
mode:
Diffstat (limited to 'scripts/legacy-vs-blobs/legacy-vs-blobs.py')
-rwxr-xr-xscripts/legacy-vs-blobs/legacy-vs-blobs.py124
1 files changed, 124 insertions, 0 deletions
diff --git a/scripts/legacy-vs-blobs/legacy-vs-blobs.py b/scripts/legacy-vs-blobs/legacy-vs-blobs.py
new file mode 100755
index 00000000..c10ac269
--- /dev/null
+++ b/scripts/legacy-vs-blobs/legacy-vs-blobs.py
@@ -0,0 +1,124 @@
+#!/usr/bin/env python
+
+# Plot bars comparing different implementations of mail pipeline.
+#
+# This script can be improved to account for arbitrary number of data sets, but
+# it is not doing it right now.
+
+import json
+import numpy as np
+import matplotlib.pyplot as plt
+
# Make a prettier graph. Recent matplotlib releases ship the 'ggplot' style
# sheet themselves (matplotlib.style), so the separate mpltools package is only
# used as a fallback when pyplot has no `style` attribute (older installs).
try:
    plt.style.use('ggplot')
except AttributeError:
    from mpltools import style
    style.use('ggplot')

# File the rendered chart is written to (relative to the working directory).
OUTPUT_FILENAME = 'legacy-vs-blobs.png'
+
# Every entry below produces one group of bars, one group per (amount, size)
# pair. For each set of implementations the script expects to find a file
# ./data/SET/<entry>.json.
#
# The baseline values are the legacy results found in ./data/no-cache/.

graphs = [
    '%d_%dk' % (amount, size)
    for amount, size in ((1, 10000), (10, 1000), (100, 100), (1000, 10))
]
+
+
def get_data(folders=('cache', 'no-cache', 'persistent'), graph_names=None,
             base_dir='data'):
    """Load the benchmark stats needed by the plot into one nested dict.

    The structure returned is not pretty, but the original JSON files are
    even less convenient, so this reshapes them to make the life of the rest
    of the script easier. The result looks like:

        data[variation][graph][implementation] = <stats>

    Where:

      - variation is one data set directory under ``base_dir`` (i.e.
        no-cache, cache, persistent, etc).
      - graph is one of the names in ``graph_names``.
      - implementation is either 'legacy' or 'blobs' (legacy is only needed
        for the no-cache variation, as that is the one used as baseline).

    An implementation key is simply absent when no benchmark whose name
    contains it exists in the file.

    :param folders: names of the data set directories to read.
    :param graph_names: base names (without '.json') of the files to read in
        each directory; defaults to the module-level ``graphs`` list.
    :param base_dir: directory that contains the data set directories.
    :return: the nested dict described above.
    :raises IOError: if one of the expected JSON files cannot be opened.
    :raises KeyError: if a file has no 'benchmarks' entry, or a benchmark
        lacks 'name' or 'stats'.
    """
    if graph_names is None:
        graph_names = graphs

    data = {}
    for folder in folders:
        data[folder] = {}
        for graph in graph_names:
            with open('%s/%s/%s.json' % (base_dir, folder, graph)) as f:
                benchmarks = json.load(f)['benchmarks']

            stats = {}
            for implementation in ('blobs', 'legacy'):
                # Build a real list: on Python 3 filter() returns a lazy
                # iterator, which is always truthy and has no pop().
                matches = [b for b in benchmarks
                           if implementation in b['name']]
                if matches:
                    # when several benchmarks match, the last one wins
                    stats[implementation] = matches[-1]['stats']
            data[folder][graph] = stats
    return data
+
+
def plot_data(data):
    """Render the grouped bar chart and save it to OUTPUT_FILENAME.

    For every entry in the module-level ``graphs`` list one group of bars is
    drawn: a baseline bar of height 1 (legacy / legacy) followed by the mean
    time of the blobs implementation in each variation, divided by the mean
    time of the legacy implementation in the 'no-cache' variation.

    :param data: nested dict as returned by get_data(), indexed as
        data[variation][graph][implementation]['mean'].
    :raises KeyError: if a required variation, graph or implementation is
        missing from ``data``.
    """
    # (variation key in data, legend label), in left-to-right bar order
    variations = (
        ('no-cache', 'blobs'),
        ('cache', 'blobs + session cache'),
        ('persistent', 'blobs + session cache + persistent http'),
    )

    # one group per graph; derived from graphs so both stay in sync
    ind = np.arange(len(graphs))  # the x locations for the groups
    width = 0.20                  # the width of the bars

    _, ax = plt.subplots()

    # this is our baseline (i.e. legacy / legacy), so it is always 1
    bars = [ax.bar(ind, np.ones(len(graphs)), width)]
    labels = ['legacy']

    # one more bar per variation, each shifted one bar width to the right
    for position, (variation, label) in enumerate(variations, 1):
        ratios = [
            # float() keeps Python 2 from truncating if the means are ints
            data[variation][graph]['blobs']['mean'] /
            float(data['no-cache'][graph]['legacy']['mean'])
            for graph in graphs
        ]
        bars.append(ax.bar(ind + (position * width), ratios, width))
        labels.append(label)

    # add some text for labels, title and axes ticks
    ax.set_ylabel('Normalized execution time')
    ax.set_title('Legacy vs Blobs mail pipeline')
    # put the tick in the middle of each group (1.5 * width for four bars)
    ax.set_xticks(ind + (width * (len(bars) - 1) / 2.0))
    ax.set_xticklabels([name.replace('_', ' ') for name in graphs])

    # the first rectangle of each bar series stands in for it in the legend
    ax.legend([series[0] for series in bars], labels)

    # swap for plt.show() to preview the chart interactively instead
    plt.savefig(OUTPUT_FILENAME)
+
+
if __name__ == '__main__':
    # script entry point: load every data set, then draw and save the chart
    plot_data(get_data())