#!/usr/bin/env python
# Plot bars comparing different implementations of the mail pipeline.
#
# This script could be improved to handle an arbitrary number of data sets,
# but it does not do that right now.

import json

import numpy as np
import matplotlib.pyplot as plt

# make a prettier graph
from mpltools import style
style.use('ggplot')


OUTPUT_FILENAME = 'legacy-vs-blobs.png'

# Each value below represents one (amount, size) pair and will generate one
# group of bars (see the example layout below). The script expects to find the
# files in ./data/SET/ for each set of implementations.
#
# The baseline values will be the legacy results in ./data/no-cache/.
graphs = [
    '1_10000k',
    '10_1000k',
    '100_100k',
    '1000_10k',
]
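
# A hypothetical example of the directory layout the script reads from (the
# file names are derived from the folders used in get_data() below and the
# graphs listed above):
#
#   data/no-cache/1_10000k.json
#   data/no-cache/10_1000k.json
#   data/cache/1_10000k.json
#   data/persistent/1_10000k.json
#   ...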

# The structure returned by the following function is ugly, but the original
# JSON files are even uglier, so this is here just to make the life of the
# script easier.
#
# We want to have something like:
#
#   data[variation][graph][implementation] = <stats>
#
# Where:
#
# - variation is one data set under ./data (i.e. no-cache, cache, persistent,
#   etc).
# - graph is one of the values in the graphs variable above.
# - implementation is either legacy or blobs (we only need legacy for the
#   no-cache variation, as that is the one we use as the baseline).
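#
# A hypothetical example of the resulting structure (the numbers are made up,
# only the shape matters):
#
#   data['cache']['1_10000k']['blobs'] == {'mean': 1.23, ...}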
def get_data():
    folders = ['cache', 'no-cache', 'persistent']
    data = {}
    for folder in folders:
        data[folder] = {}
        for graph in graphs:
            with open('data/%s/%s.json' % (folder, graph)) as f:
                d = json.loads(f.read())
            benchmarks = d['benchmarks']
            data[folder][graph] = {}
            for t in ['blobs', 'legacy']:
                # use a list comprehension instead of filter() so this also
                # works on Python 3, where filter() returns an iterator
                result = [b for b in benchmarks if t in b['name']]
                if result:
                    result = result.pop()
                    data[folder][graph][t] = result['stats']
    return data
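
# The function above expects each input JSON file to look roughly like this
# (hypothetical values; only the fields this script actually uses are shown):
#
#   {
#       "benchmarks": [
#           {"name": "test_blobs_pipeline", "stats": {"mean": 1.23}},
#           {"name": "test_legacy_pipeline", "stats": {"mean": 2.46}}
#       ]
#   }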


def plot_data(data):
    N = 4
    # this is our baseline (i.e. legacy / legacy)
    absolutes = (1, 1, 1, 1)

    ind = np.arange(N)  # the x locations for the groups
    width = 0.20        # the width of the bars

    fig, ax = plt.subplots()
    rects1 = ax.bar(ind, absolutes, width)

    # for each graph, calculate the ratios
    ratios = {'no-cache': [], 'cache': [], 'persistent': []}
    for graph in graphs:
        legacy = data['no-cache'][graph]['legacy']['mean']
        # calculate ratios for no-cache / legacy
        ratio = data['no-cache'][graph]['blobs']['mean'] / legacy
        ratios['no-cache'].append(ratio)
        # calculate ratios for cache / legacy
        ratio = data['cache'][graph]['blobs']['mean'] / legacy
        ratios['cache'].append(ratio)
        # calculate ratios for persistent / legacy
        ratio = data['persistent'][graph]['blobs']['mean'] / legacy
        ratios['persistent'].append(ratio)
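
    # Hypothetical example of the normalization: if the legacy mean for a
    # graph is 2.0 s and the blobs mean is 1.0 s, the corresponding bar is
    # plotted at 0.5, i.e. half of the legacy execution time.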

    # create the bars with the ratios
    nocache = tuple(ratios['no-cache'])
    rects2 = ax.bar(ind + width, nocache, width)
    cache = tuple(ratios['cache'])
    rects3 = ax.bar(ind + (2 * width), cache, width)
    persistent = tuple(ratios['persistent'])
    rects4 = ax.bar(ind + (3 * width), persistent, width)

    # add some text for labels, title and axes ticks
    ax.set_ylabel('Normalized execution time')
    ax.set_title('Legacy vs Blobs mail pipeline')
    ax.set_xticks(ind + (1.5 * width))
    ax.set_xticklabels(tuple(map(lambda name: name.replace('_', ' '), graphs)))

    ax.legend(
        (rects1[0], rects2[0], rects3[0], rects4[0]),
        ('legacy', 'blobs', 'blobs + session cache',
         'blobs + session cache + persistent http'))

    # ax.grid()
    plt.savefig(OUTPUT_FILENAME)
    # plt.show()


if __name__ == '__main__':
    data = get_data()
    plot_data(data)
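
# Usage sketch (assuming this script is saved as plot.py and run from a
# directory containing the ./data tree described above):
#
#   $ python plot.py
#
# The comparison graph is written to legacy-vs-blobs.png in the current
# directory.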