diff options
Diffstat (limited to 'scripts/legacy-vs-blobs/legacy-vs-blobs.py')
-rwxr-xr-x | scripts/legacy-vs-blobs/legacy-vs-blobs.py | 124 |
1 files changed, 124 insertions, 0 deletions
diff --git a/scripts/legacy-vs-blobs/legacy-vs-blobs.py b/scripts/legacy-vs-blobs/legacy-vs-blobs.py new file mode 100755 index 00000000..c10ac269 --- /dev/null +++ b/scripts/legacy-vs-blobs/legacy-vs-blobs.py @@ -0,0 +1,124 @@ +#!/usr/bin/env python + +# Plot bars comparing different implementations of mail pipeline. +# +# This script can be improved to account for arbitrary number of data sets, but +# it is not doing it right now. + +import json +import numpy as np +import matplotlib.pyplot as plt + +# make a prettier graph +from mpltools import style +style.use('ggplot') + +OUTPUT_FILENAME = 'legacy-vs-blobs.png' + +# each value below will generate one bar for each for each (amount, size) pair. +# The script expects to find files in ./data/SET/ for each set of +# implementations. +# +# The baseline values will be the legacy results in ./data/no-cache/. + +graphs = [ + '1_10000k', + '10_1000k', + '100_100k', + '1000_10k', +] + + +# the JSON structure returned by the following function is ugly, but the +# original JSONs are even uglier, so this is here just to make the life of the +# script easier. +# +# We want to have something like: +# +# data[variation][graph][implementation] = <stats> +# +# Where: +# +# - variation is one data set under ./data (i.e. no-cache, cache, persistent, +# etc). +# - graph is one of the values in graphs variable above. +# - implementation is either legacy or blobs (we just need legacy for the +# no-cache variation, as that is the one we are using as baseline. + +def get_data(): + folders = ['cache', 'no-cache', 'persistent'] + data = {} + for folder in folders: + data[folder] = {} + for graph in graphs: + with open('data/%s/%s.json' % (folder, graph)) as f: + d = json.loads(f.read()) + benchmarks = d['benchmarks'] + data[folder][graph] = {} + for t in ['blobs', 'legacy']: + result = filter(lambda b: t in b['name'], benchmarks) + if result: + result = result.pop() + data[folder][graph][t] = result['stats'] + return data + + +def plot_data(data): + + N = 4 + + # this is our baseline (i.e. legacy / legacy) + absolutes = (1, 1, 1, 1) + + ind = np.arange(N) # the x locations for the groups + width = 0.20 # the width of the bars + + fig, ax = plt.subplots() + rects1 = ax.bar(ind, absolutes, width) + + # for each graph, calculate the ratios + ratios = {'no-cache': [], 'cache': [], 'persistent': []} + for graph in graphs: + legacy = data['no-cache'][graph]['legacy']['mean'] + + # calculate ratios for no-cache / legacy + ratio = data['no-cache'][graph]['blobs']['mean'] / legacy + ratios['no-cache'].append(ratio) + + # calculate ratios for cache / legacy + ratio = data['cache'][graph]['blobs']['mean'] / legacy + ratios['cache'].append(ratio) + + # calculate ratios for persistent / legacy + ratio = data['persistent'][graph]['blobs']['mean'] / legacy + ratios['persistent'].append(ratio) + + # create the boxes with the ratios + nocache = tuple(ratios['no-cache']) + rects2 = ax.bar(ind + width, nocache, width) + + cache = tuple(ratios['cache']) + rects3 = ax.bar(ind + (2 * width), cache, width) + + persistent = tuple(ratios['persistent']) + rects4 = ax.bar(ind + (3 * width), persistent, width) + + # add some text for labels, title and axes ticks + ax.set_ylabel('Normalized execution time') + ax.set_title('Legacy vs Blobs mail pipeline') + ax.set_xticks(ind + (1.5 * width)) + ax.set_xticklabels(tuple(map(lambda name: name.replace('_', ' '), graphs))) + + ax.legend( + (rects1[0], rects2[0], rects3[0], rects4[0]), + ('legacy', 'blobs', 'blobs + session cache', + 'blobs + session cache + persistent http')) + # ax.grid() + + plt.savefig(OUTPUT_FILENAME) + # plt.show() + + +if __name__ == '__main__': + data = get_data() + plot_data(data) |