1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
|
#
# Copyright (c) 2015 ThoughtWorks, Inc.
#
# Pixelated is free software: you can redistribute it and/or modify
# it under the terms of the GNU Affero General Public License as published by
# the Free Software Foundation, either version 3 of the License, or
# (at your option) any later version.
#
# Pixelated is distributed in the hope that it will be useful,
# but WITHOUT ANY WARRANTY; without even the implied warranty of
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
# GNU Affero General Public License for more details.
#
# You should have received a copy of the GNU Affero General Public License
# along with Pixelated. If not, see <http://www.gnu.org/licenses/>.
from random import Random
# Paragraph separator token. MarkovGenerator yields it into the generated word
# stream right after picking a fresh seed pair, but only when the generator
# was created with add_paragraph_on_empty_chain enabled.
NEW_PARAGRAPH = '\n\n'
class MarkovGenerator(object):
    """Generate pseudo-random text from a second-order Markov chain.

    The chain maps every pair of consecutive words found in the input texts
    to the set of words that were seen directly after that pair.
    """

    def __init__(self, texts, random=None, add_paragraph_on_empty_chain=False):
        """Build the chain from an iterable of texts.

        :param texts: iterable of strings; ``None`` entries are skipped.
        :param random: object providing ``choice(seq)``. A new
            ``random.Random`` is created when omitted (or falsy).
        :param add_paragraph_on_empty_chain: when true, ``NEW_PARAGRAPH`` is
            emitted every time generation restarts from a new seed pair.
        :raises ValueError: if any text has fewer than three words.
        """
        self._markov_chain = {}
        self._random = random if random else Random()
        self._add_paragraph_on_empty_chain = add_paragraph_on_empty_chain
        for text in texts:
            if text is not None:
                self._extend_chain_with(text)

    def add(self, text):
        """Extend the chain with the words of one more text.

        :raises ValueError: if ``text`` has fewer than three words.
        """
        self._extend_chain_with(text)

    @staticmethod
    def _triplet_generator(words):
        """Yield ``((word, next_word), following_word)`` for each 3-word window.

        This is a generator, so the ``ValueError`` for inputs shorter than
        three words is raised on first iteration, not at call time.
        """
        if len(words) < 3:
            raise ValueError('Expected input with at least three words')

        for first, second, third in zip(words, words[1:], words[2:]):
            yield ((first, second), third)

    def _extend_chain_with(self, input_text):
        """Split ``input_text`` on whitespace and record every transition."""
        words = input_text.split()
        for key, value in self._triplet_generator(words):
            self._markov_chain.setdefault(key, set()).add(value)

    def _generate_chain(self, length):
        """Yield ``length`` words by walking the chain.

        When the current word pair has no known successor, the pending word
        is still emitted and the walk restarts from a new seed pair. The
        optional ``NEW_PARAGRAPH`` token emitted on such a restart is extra:
        it does not count towards ``length``.
        """
        word, next_word = self._find_good_seed()
        new_seed = False

        for _ in range(length):
            yield word

            if new_seed:
                word, next_word = self._find_good_seed()
                if self._add_paragraph_on_empty_chain:
                    yield NEW_PARAGRAPH
                new_seed = False
            else:
                prev_word, word = word, next_word
                try:
                    next_word = self._random_next_word(prev_word, word)
                except KeyError:
                    # Dead end: no successor recorded for this pair. Emit the
                    # current word on the next iteration, then reseed.
                    new_seed = True

    def _random_next_word(self, prev_word, word):
        """Return a random successor of the pair ``(prev_word, word)``.

        :raises KeyError: if the pair is not in the chain.
        """
        return self._random.choice(list(self._markov_chain[(prev_word, word)]))

    def _find_good_seed(self):
        """Randomly pick a chain key whose first word starts with a capital.

        One initial draw is made, followed by at most ``len(chain) + 1``
        redraws.

        :raises ValueError: if none of the draws produced a capitalised seed.
        """
        # Materialise the keys once: dict views cannot be indexed by
        # random.choice on Python 3, and the list need not be rebuilt per try.
        candidates = list(self._markov_chain)
        max_tries = len(candidates)
        try_count = 0

        seed_pair = self._random.choice(candidates)
        while not seed_pair[0][0].isupper() and try_count <= max_tries:
            seed_pair = self._random.choice(candidates)
            try_count += 1

        # Judge the seed itself, not the counter: the very last permitted
        # draw may be a valid seed and must not be rejected.
        if not seed_pair[0][0].isupper():
            raise ValueError('Not able find start word with captial letter')

        return seed_pair

    def generate(self, length):
        """Return ``length`` generated words joined by single spaces.

        :raises ValueError: if the chain is empty or no capitalised seed
            pair could be drawn.
        """
        if not self._markov_chain:
            raise ValueError('Expected at least three words input')

        return ' '.join(self._generate_chain(length))
|