summaryrefslogtreecommitdiff
path: root/src/pixelated/support/markov.py
blob: 8f7c0ef31083eaa00430495423e69db1756566ba (plain)
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
#
# Copyright (c) 2015 ThoughtWorks, Inc.
#
# Pixelated is free software: you can redistribute it and/or modify
# it under the terms of the GNU Affero General Public License as published by
# the Free Software Foundation, either version 3 of the License, or
# (at your option) any later version.
#
# Pixelated is distributed in the hope that it will be useful,
# but WITHOUT ANY WARRANTY; without even the implied warranty of
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
# GNU Affero General Public License for more details.
#
# You should have received a copy of the GNU Affero General Public License
# along with Pixelated. If not, see <http://www.gnu.org/licenses/>.

from random import Random

# Marker emitted between generated word runs when the chain has to be
# restarted and ``add_paragraph_on_empty_chain`` is enabled.
NEW_PARAGRAPH = '\n\n'


class MarkovGenerator(object):
    """Generate pseudo-random text from a word-level Markov chain.

    The chain maps every pair of consecutive words found in the input texts
    to the set of words that were seen directly after that pair. Generated
    text always starts from a pair whose first word begins with an upper-case
    character.
    """

    def __init__(self, texts, random=None, add_paragraph_on_empty_chain=False):
        """Build the chain from an iterable of texts.

        :param texts: iterable of strings; ``None`` entries are skipped.
        :param random: object providing ``choice(sequence)``; a fresh
            ``random.Random()`` is created when omitted.
        :param add_paragraph_on_empty_chain: when true, ``NEW_PARAGRAPH`` is
            emitted every time generation has to restart from a new seed.
        :raises ValueError: if a text has fewer than three words.
        """
        self._markov_chain = {}
        self._random = random if random else Random()
        self._add_paragraph_on_empty_chain = add_paragraph_on_empty_chain

        for text in texts:
            if text is not None:
                self._extend_chain_with(text)

    def add(self, text):
        """Extend the chain with the words of one more text.

        :raises ValueError: if ``text`` has fewer than three words.
        """
        self._extend_chain_with(text)

    @staticmethod
    def _triplet_generator(words):
        """Yield ``((first, second), third)`` for every run of three words.

        The length check runs when the generator is first iterated, so the
        ``ValueError`` surfaces in the consuming loop.
        """
        if len(words) < 3:
            raise ValueError('Expected input with at least three words')

        # range() instead of xrange() keeps this working on Python 2 and 3.
        for i in range(len(words) - 2):
            yield ((words[i], words[i + 1]), words[i + 2])

    def _extend_chain_with(self, input_text):
        """Split ``input_text`` on whitespace and record all its triplets."""
        words = input_text.split()

        for key, value in self._triplet_generator(words):
            self._markov_chain.setdefault(key, set()).add(value)

    def _generate_chain(self, length):
        """Yield ``length`` words, restarting from a new seed at dead ends.

        ``NEW_PARAGRAPH`` markers (emitted only when enabled) come on top of
        the ``length`` words; they are not counted against ``length``.
        """
        word, next_word = self._find_good_seed()
        new_seed = False

        for _ in range(length):
            yield word

            if new_seed:
                # The word just yielded had no known follower: start over.
                word, next_word = self._find_good_seed()
                if self._add_paragraph_on_empty_chain:
                    yield NEW_PARAGRAPH
                new_seed = False
            else:
                prev_word, word = word, next_word

                try:
                    next_word = self._random_next_word(prev_word, word)
                except KeyError:
                    # ``word`` is still yielded on the next iteration; only
                    # after that is a new seed pair picked.
                    new_seed = True

    def _random_next_word(self, prev_word, word):
        """Return a random follower of the pair ``(prev_word, word)``.

        :raises KeyError: if the pair was never seen in the input.
        """
        return self._random.choice(list(self._markov_chain[(prev_word, word)]))

    def _find_good_seed(self):
        """Return a random word pair whose first word starts upper-case.

        All eligible pairs are collected first, so a seed is always found
        when one exists; drawing from every key and retrying could give up
        (or reject a valid final draw) although a capitalised pair existed.

        :raises ValueError: if no pair starts with an upper-case character.
        """
        candidates = [pair for pair in self._markov_chain
                      if pair[0][0].isupper()]

        if not candidates:
            raise ValueError('Not able find start word with captial letter')

        return self._random.choice(candidates)

    def generate(self, length):
        """Return ``length`` generated words joined by single spaces.

        :raises ValueError: if the chain is empty or contains no pair that
            starts with an upper-case character.
        """
        if not self._markov_chain:
            raise ValueError('Expected at least three words input')
        return ' '.join(self._generate_chain(length))