diff options
author | Folker Bernitt <fbernitt@thoughtworks.com> | 2015-10-28 12:05:59 +0100 |
---|---|---|
committer | Folker Bernitt <fbernitt@thoughtworks.com> | 2015-10-28 12:08:41 +0100 |
commit | 70c770635199cfa473608162ec7d31e030a11c5f (patch) | |
tree | 5bb5d9ff72a6edabf756256ca30eed0a21e180a4 | |
parent | 41b462e9b29d62dc197be6d8a633c1b9a46688cf (diff) |
Add markov-generate to load-mails
- Allows generating mails based
on sample mails
- use it with:
pixelated-maintenance markov-generate --seed 21 --limit 10 <mbox-file> [<mbox-file> ...]
-rw-r--r-- | service/pixelated/config/arguments.py | 5 | ||||
-rw-r--r-- | service/pixelated/maintenance.py | 96 | ||||
-rw-r--r-- | service/pixelated/support/mail_generator.py | 150 | ||||
-rw-r--r-- | service/pixelated/support/markov.py | 94 | ||||
-rw-r--r-- | service/test/unit/fixtures/mbox | 17 | ||||
-rw-r--r-- | service/test/unit/maintenance/test_commands.py | 24 | ||||
-rw-r--r-- | service/test/unit/support/mail_generator_test.py | 42 | ||||
-rw-r--r-- | service/test/unit/support/test_markov.py | 83 |
8 files changed, 493 insertions, 18 deletions
diff --git a/service/pixelated/config/arguments.py b/service/pixelated/config/arguments.py index 7a7abe49..87484b9b 100644 --- a/service/pixelated/config/arguments.py +++ b/service/pixelated/config/arguments.py @@ -41,6 +41,11 @@ def parse_maintenance_args(): mails_parser = subparsers.add_parser('load-mails', help='load mails into account') mails_parser.add_argument('file', nargs='+', help='file(s) with mail data') + markov_mails_parser = subparsers.add_parser('markov-generate', help='generate mails using markov chains') + markov_mails_parser.add_argument('--seed', default=None, help='Specify a seed to always generate the same output') + markov_mails_parser.add_argument('-l', '--limit', metavar='count', default='5', help='limit number of generated mails', dest='limit') + markov_mails_parser.add_argument('file', nargs='+', help='file(s) with mail data') + subparsers.add_parser('dump-soledad', help='dump the soledad database') subparsers.add_parser('sync', help='sync the soledad database') subparsers.add_parser('repair', help='repair database if possible') diff --git a/service/pixelated/maintenance.py b/service/pixelated/maintenance.py index f011658d..b18e881d 100644 --- a/service/pixelated/maintenance.py +++ b/service/pixelated/maintenance.py @@ -15,7 +15,9 @@ # along with Pixelated. If not, see <http://www.gnu.org/licenses/>. 
import logging -from mailbox import Maildir +from os.path import isfile +from mailbox import Maildir, mbox, MaildirMessage +import random from twisted.internet import reactor, defer from twisted.internet.threads import deferToThread from pixelated.adapter.mailstore.maintenance import SoledadMaintenance @@ -23,7 +25,7 @@ from pixelated.config.leap import initialize_leap from pixelated.config import logger, arguments from leap.mail.constants import MessageFlags - +from pixelated.support.mail_generator import MailGenerator REPAIR_COMMAND = 'repair' @@ -98,6 +100,9 @@ def add_command_callback(args, prepareDeferred, finalizeDeferred): elif args.command == 'load-mails': prepareDeferred.addCallback(load_mails, args.file) prepareDeferred.addCallback(flush_to_soledad, finalizeDeferred) + elif args.command == 'markov-generate': + prepareDeferred.addCallback(markov_generate, args.file, int(args.limit), args.seed) + prepareDeferred.addCallback(flush_to_soledad, finalizeDeferred) elif args.command == 'dump-soledad': prepareDeferred.addCallback(dump_soledad) prepareDeferred.chainDeferred(finalizeDeferred) @@ -130,22 +135,46 @@ def is_keep_file(mail): return mail['subject'] is None +def _is_new_mail(mail): + return _is_maildir_msg(mail) and mail.get_subdir() == 'new' + + +def _is_maildir_msg(mail): + return isinstance(mail, MaildirMessage) + + @defer.inlineCallbacks -def add_mail_folder(store, maildir, folder_name, deferreds): +def _add_mail(store, folder_name, mail, flags, tags): + created_mail = yield store.add_mail(folder_name, mail.as_string()) + leap_mail = yield store.get_mail(created_mail.mail_id) + leap_mail.tags |= set(tags) + for flag in flags: + leap_mail.flags.add(flag) + + yield store.update_mail(leap_mail) + + +@defer.inlineCallbacks +def add_mail_folder(store, mailbox, folder_name, deferreds): yield store.add_mailbox(folder_name) - for mail in maildir: + for mail in mailbox: if is_keep_file(mail): continue - flags = (MessageFlags.RECENT_FLAG,) if mail.get_subdir() 
== 'new' else () - if 'S' in mail.get_flags(): - flags = (MessageFlags.SEEN_FLAG,) + flags - if 'R' in mail.get_flags(): - flags = (MessageFlags.ANSWERED_FLAG,) + flags + if _is_maildir_msg(mail): + flags = {MessageFlags.RECENT_FLAG} if _is_new_mail(mail) else set() + + if 'S' in mail.get_flags(): + flags = flags.add(MessageFlags.SEEN_FLAG) + if 'R' in mail.get_flags(): + flags = flags.add(MessageFlags.ANSWERED_FLAG) + else: + flags = {MessageFlags.RECENT_FLAG} + + tags = mail['X-Tags'].split() if mail['X-Tags'] else [] - deferreds.append(store.add_mail(folder_name, mail.as_string())) - # FIXME support flags + deferreds.append(_add_mail(store, folder_name, mail, flags, tags)) @defer.inlineCallbacks @@ -153,20 +182,55 @@ def load_mails(args, mail_paths): leap_session, soledad = args store = leap_session.mail_store + yield _load_mails_as_is(mail_paths, store) + + defer.returnValue(args) + + +@defer.inlineCallbacks +def _load_mails_as_is(mail_paths, store): deferreds = [] for path in mail_paths: - maildir = Maildir(path, factory=None) - yield add_mail_folder(store, maildir, 'INBOX', deferreds) - for mail_folder_name in maildir.list_folders(): - mail_folder = maildir.get_folder(mail_folder_name) - yield add_mail_folder(store, mail_folder, mail_folder_name, deferreds) + if isfile(path): + mbox_mails = mbox(path, factory=None) + yield add_mail_folder(store, mbox_mails, 'INBOX', deferreds) + else: + maildir = Maildir(path, factory=None) + yield add_mail_folder(store, maildir, 'INBOX', deferreds) + for mail_folder_name in maildir.list_folders(): + mail_folder = maildir.get_folder(mail_folder_name) + yield add_mail_folder(store, mail_folder, mail_folder_name, deferreds) + + yield defer.gatherResults(deferreds, consumeErrors=True) + + +@defer.inlineCallbacks +def markov_generate(args, mail_paths, limit, seed): + leap_session, soledad = args + store = leap_session.mail_store + username = leap_session.user_auth.username + server_name = leap_session.provider.server_name + + 
markov_mails = _generate_mails(limit, mail_paths, seed, server_name, username) + deferreds = [] + yield add_mail_folder(store, markov_mails, 'INBOX', deferreds) yield defer.gatherResults(deferreds, consumeErrors=True) defer.returnValue(args) +def _generate_mails(limit, mail_paths, seed, server_name, username): + mails = [] + for path in mail_paths: + mbox_mails = mbox(path, factory=None) + mails.extend(mbox_mails) + gen = MailGenerator(username, server_name, mails, random=random.Random(seed)) + markov_mails = [gen.generate_mail() for _ in range(limit)] + return markov_mails + + def flush_to_soledad(args, finalize): leap_session, soledad = args diff --git a/service/pixelated/support/mail_generator.py b/service/pixelated/support/mail_generator.py new file mode 100644 index 00000000..af8dd4cc --- /dev/null +++ b/service/pixelated/support/mail_generator.py @@ -0,0 +1,150 @@ +# +# Copyright (c) 2015 ThoughtWorks, Inc. +# +# Pixelated is free software: you can redistribute it and/or modify +# it under the terms of the GNU Affero General Public License as published by +# the Free Software Foundation, either version 3 of the License, or +# (at your option) any later version. +# +# Pixelated is distributed in the hope that it will be useful, +# but WITHOUT ANY WARRANTY; without even the implied warranty of +# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the +# GNU Affero General Public License for more details. +# +# You should have received a copy of the GNU Affero General Public License +# along with Pixelated. If not, see <http://www.gnu.org/licenses/>. 
+ + +from email.mime.text import MIMEText +from email.utils import formatdate +from random import Random +from pixelated.support.markov import MarkovGenerator +import re +from collections import Counter +import time + + +def filter_two_line_on_wrote(lines): + skip_next = False + if len(lines) > 0: + for i in xrange(len(lines) - 1): + if skip_next: + skip_next = False + continue + + if lines[i].startswith('On') and lines[i + 1].endswith('wrote:'): + skip_next = True + else: + yield lines[i].strip() + + yield lines[-1] + + +def filter_lines(text): + pattern = re.compile('\s*[>-].*') + wrote_pattern = re.compile('\s*On.*wrote.*') + + lines = text.splitlines() + + lines = filter(lambda line: not pattern.match(line), lines) + lines = filter(lambda line: not len(line.strip()) == 0, lines) + lines = filter(lambda line: not wrote_pattern.match(line), lines) + lines = filter(lambda line: not line.endswith('writes:'), lines) + lines = filter(lambda line: ' ' in line.strip(), lines) + + lines = filter_two_line_on_wrote(lines) + + return ' '.join(lines) + + +def decode_multipart_mail_text(mail): + for payload in mail.get_payload(): + if payload.get_content_type() == 'text/plain': + return payload.get_payload(decode=True) + return '' + + +def search_for_tags(content): + words = content.split() + + only_alnum = filter(lambda word: word.isalnum(), words) + only_longer = filter(lambda word: len(word) > 5, only_alnum) + lower_case = map(lambda word: word.lower(), only_longer) + + counter = Counter(lower_case) + potential_tags = counter.most_common(10) + + return map(lambda tag: tag[0], potential_tags) + + +def load_all_mails(mail_list): + subjects = set() + mail_bodies = [] + + for mail in mail_list: + subjects.add(mail['Subject']) + if mail.is_multipart(): + mail_bodies.append(filter_lines(decode_multipart_mail_text(mail))) + else: + if mail.get_content_type() == 'text/plain': + mail_bodies.append(filter_lines(mail.get_payload(decode=True))) + else: + raise 
Exception(mail.get_content_type()) + + return subjects, mail_bodies + + +class MailGenerator(object): + + NAMES = ['alice', 'bob', 'eve'] + + def __init__(self, receiver, domain_name, sample_mail_list, random=None): + self._random = random if random else Random() + self._receiver = receiver + self._domain_name = domain_name + self._subjects, self._bodies = load_all_mails(sample_mail_list) + + self._potential_tags = search_for_tags(' '.join(self._bodies)) + self._subject_markov = MarkovGenerator(self._subjects, random=self._random) + self._body_markov = MarkovGenerator(self._bodies, random=self._random, add_paragraph_on_empty_chain=True) + + def generate_mail(self): + body = self._body_markov.generate(150) + mail = MIMEText(body) + + mail['Subject'] = self._subject_markov.generate(8) + mail['To'] = '%s@%s' % (self._receiver, self._domain_name) + mail['From'] = self._random_from() + mail['Date'] = self._random_date() + mail['X-Tags'] = self._random_tags() + mail['X-Leap-Encryption'] = self._random_encryption_state() + mail['X-Leap-Signature'] = self._random_signature_state() + + return mail + + def _random_date(self): + now = int(time.time()) + ten_days = 60 * 60 * 24 * 10 + mail_time = self._random.randint(now - ten_days, now) + + return formatdate(mail_time) + + def _random_encryption_state(self): + return self._random.choice(['true', 'decrypted']) + + def _random_signature_state(self): + return self._random.choice(['could not verify', 'valid']) + + def _random_from(self): + name = self._random.choice(filter(lambda name: name != self._receiver, MailGenerator.NAMES)) + + return '%s@%s' % (name, self._domain_name) + + def _random_tags(self): + barrier = 0.5 + tags = set() + while self._random.random() > barrier: + tags.add(self._random.choice(self._potential_tags)) + barrier += 0.15 + + return ' '.join(tags) diff --git a/service/pixelated/support/markov.py b/service/pixelated/support/markov.py new file mode 100644 index 00000000..8f7c0ef3 --- /dev/null +++ 
b/service/pixelated/support/markov.py @@ -0,0 +1,94 @@ +# +# Copyright (c) 2015 ThoughtWorks, Inc. +# +# Pixelated is free software: you can redistribute it and/or modify +# it under the terms of the GNU Affero General Public License as published by +# the Free Software Foundation, either version 3 of the License, or +# (at your option) any later version. +# +# Pixelated is distributed in the hope that it will be useful, +# but WITHOUT ANY WARRANTY; without even the implied warranty of +# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the +# GNU Affero General Public License for more details. +# +# You should have received a copy of the GNU Affero General Public License +# along with Pixelated. If not, see <http://www.gnu.org/licenses/>. + +from random import Random + +NEW_PARAGRAPH = '\n\n' + + +class MarkovGenerator(object): + + def __init__(self, texts, random=None, add_paragraph_on_empty_chain=False): + self._markov_chain = {} + self._random = random if random else Random() + self._add_paragraph_on_empty_chain = add_paragraph_on_empty_chain + + for text in filter(lambda _: _ is not None, texts): + self._extend_chain_with(text) + + def add(self, text): + self._extend_chain_with(text) + + @staticmethod + def _triplet_generator(words): + if len(words) < 3: + raise ValueError('Expected input with at least three words') + + for i in xrange(len(words) - 2): + yield ((words[i], words[i + 1]), words[i + 2]) + + def _extend_chain_with(self, input_text): + words = input_text.split() + gen = self._triplet_generator(words) + + for key, value in gen: + if key in self._markov_chain: + self._markov_chain[key].add(value) + else: + self._markov_chain[key] = {value} + + def _generate_chain(self, length): + seed_pair = self._find_good_seed() + word, next_word = seed_pair + new_seed = False + + for i in xrange(length): + yield word + + if new_seed: + word, next_word = self._find_good_seed() + if self._add_paragraph_on_empty_chain: + yield NEW_PARAGRAPH + new_seed = False 
+ else: + prev_word, word = word, next_word + + try: + next_word = self._random_next_word(prev_word, word) + except KeyError: + new_seed = True + + def _random_next_word(self, prev_word, word): + return self._random.choice(list(self._markov_chain[(prev_word, word)])) + + def _find_good_seed(self): + max_tries = len(self._markov_chain.keys()) + try_count = 0 + + seed_pair = self._random.choice(self._markov_chain.keys()) + while not seed_pair[0][0].isupper() and try_count <= max_tries: + seed_pair = self._random.choice(self._markov_chain.keys()) + try_count += 1 + + if try_count > max_tries: + raise ValueError('Not able find start word with captial letter') + + return seed_pair + + def generate(self, length): + if len(self._markov_chain.keys()) == 0: + raise ValueError('Expected at least three words input') + return ' '.join(self._generate_chain(length)) diff --git a/service/test/unit/fixtures/mbox b/service/test/unit/fixtures/mbox new file mode 100644 index 00000000..c3506805 --- /dev/null +++ b/service/test/unit/fixtures/mbox @@ -0,0 +1,17 @@ +From guninski@guninski.com Thu Apr 9 23:42:05 2015 +Subject: Scott's Laws with a longer subject +From: scott@try.pixelated-project.org +To: alice@try.pixelated-project.org +X-Pixelated-encryption-status: true +Date: Tue, 09 Apr 2015 23:42:05 +0000 (UTC) + +Scott's First Law: + No matter what goes wrong, it will probably look right. + +Scott's Second Law: + When an error has been detected and corrected, it will be found + to have been wrong in the first place. +Corollary: + After the correction has been found in error, it will be + impossible to fit the original quantity back into the + equation. 
diff --git a/service/test/unit/maintenance/test_commands.py b/service/test/unit/maintenance/test_commands.py index 52fe6ca2..a0fc58d6 100644 --- a/service/test/unit/maintenance/test_commands.py +++ b/service/test/unit/maintenance/test_commands.py @@ -80,8 +80,8 @@ class TestCommands(unittest.TestCase): def test_load_mails_adds_mails(self): # given mail_root = pkg_resources.resource_filename('test.unit.fixtures', 'mailset') - firstMailDeferred = defer.succeed(None) - secondMailDeferred = defer.succeed(None) + firstMailDeferred = defer.succeed(MagicMock()) + secondMailDeferred = defer.succeed(MagicMock()) self.mail_store.add_mail.side_effect = [firstMailDeferred, secondMailDeferred] self.mail_store.add_mailbox.return_value = defer.succeed(None) @@ -104,6 +104,26 @@ class TestCommands(unittest.TestCase): return d + def test_load_mails_supports_mbox(self): + # given + mbox_file = pkg_resources.resource_filename('test.unit.fixtures', 'mbox') + + d = load_mails(self.args, [mbox_file]) + + # then + def assert_mails_added(_): + self.assertTrue(self.mail_store.add_mail.called) + self.mail_store.add_mail.assert_any_call('INBOX', self._mail_content(mbox_file)) + + def error_callack(err): + print err + self.assertTrue(False) + + d.addCallback(assert_mails_added) + d.addErrback(error_callack) + + return d + def _mail_content(self, mail_file): with open(mail_file, 'r') as fp: m = email.message_from_file(fp) diff --git a/service/test/unit/support/mail_generator_test.py b/service/test/unit/support/mail_generator_test.py new file mode 100644 index 00000000..9d604378 --- /dev/null +++ b/service/test/unit/support/mail_generator_test.py @@ -0,0 +1,42 @@ +# +# Copyright (c) 2015 ThoughtWorks, Inc. +# +# Pixelated is free software: you can redistribute it and/or modify +# it under the terms of the GNU Affero General Public License as published by +# the Free Software Foundation, either version 3 of the License, or +# (at your option) any later version. 
+# +# Pixelated is distributed in the hope that it will be useful, +# but WITHOUT ANY WARRANTY; without even the implied warranty of +# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the +# GNU Affero General Public License for more details. +# +# You should have received a copy of the GNU Affero General Public License +# along with Pixelated. If not, see <http://www.gnu.org/licenses/>. +from mailbox import mbox +import unittest +import pkg_resources +import random +from mock import patch + +from pixelated.support.mail_generator import MailGenerator + + +class MailGeneratorTest(unittest.TestCase): + + def test_generator(self): + mail = """Content-Type: text/plain; charset="us-ascii"\nMIME-Version: 1.0\nContent-Transfer-Encoding: 7bit\nSubject: Scott\'s Laws with a longer subject Scott\'s Laws\nTo: alice@domain.test\nFrom: bob@domain.test\nDate: Sun, 18 Oct 2015 21:45:13 -0000\nX-Tags: \nX-Leap-Encryption: true\nX-Leap-Signature: valid\n\nFirst Law: No matter what goes wrong, it will probably look right. Scott\'s Second Law: When an error has been detected and corrected, it will probably look right. Scott\'s Second Law: When an error has been found in error, it will probably look right. Scott\'s Second Law: When an error has been found in error, it will probably look right. Scott\'s Second Law: When an error has been found in error, it will be impossible to fit the original quantity back into the \n\n First Law: No matter what goes wrong, it will be impossible to fit the original quantity back into the \n\n Scott\'s Second Law: When an error has been found in error, it will be found to have been wrong in the first place. 
After the correction has been found in error, it will be impossible to fit the original quantity back into the \n\n Second Law: When an error""" + receiver = 'alice' + domain_name = 'domain.test' + mbox_file = pkg_resources.resource_filename('test.unit.fixtures', 'mbox') + mails = mbox(mbox_file) + rnd = random.Random(0) + + with patch('pixelated.support.mail_generator.time.time') as time_mock: + time_mock.return_value = 1446029232.636018 + + gen = MailGenerator(receiver, domain_name, mails, rnd) + + result = gen.generate_mail() + + self.assertEqual(mail, result.as_string()) diff --git a/service/test/unit/support/test_markov.py b/service/test/unit/support/test_markov.py new file mode 100644 index 00000000..f0b0277d --- /dev/null +++ b/service/test/unit/support/test_markov.py @@ -0,0 +1,83 @@ +# +# Copyright (c) 2015 ThoughtWorks, Inc. +# +# Pixelated is free software: you can redistribute it and/or modify +# it under the terms of the GNU Affero General Public License as published by +# the Free Software Foundation, either version 3 of the License, or +# (at your option) any later version. +# +# Pixelated is distributed in the hope that it will be useful, +# but WITHOUT ANY WARRANTY; without even the implied warranty of +# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the +# GNU Affero General Public License for more details. +# +# You should have received a copy of the GNU Affero General Public License +# along with Pixelated. If not, see <http://www.gnu.org/licenses/>. 
+ + +import unittest +from pixelated.support.markov import MarkovGenerator +import random + + +SAMPLE_TEXT = 'One two three four' + + +class MarkovGeneratorTest(unittest.TestCase): + + def setUp(self): + self.random = random.Random(0) + + def test_starts_with_capital_case_workd(self): + gen = MarkovGenerator(['lower Upper smaller Capital'], random=self.random) + + result = gen.generate(1) + + self.assertTrue(result.startswith('Upper')) + + def test_aborts_if_no_upper_letter_word_found(self): + gen = MarkovGenerator(['all lower case'], random=self.random) + + self.assertRaises(ValueError, gen.generate, 1) + + def test_generate(self): + gen = MarkovGenerator([SAMPLE_TEXT], random=self.random) + + result = gen.generate(3) + + self.assertEqual('One two three', result) + + def test_minimum_three_words(self): + self.assertRaises(ValueError, MarkovGenerator([]).generate, 1) + self.assertRaises(ValueError, MarkovGenerator, ['1']) + self.assertRaises(ValueError, MarkovGenerator, ['1', '2']) + self.assertRaises(ValueError, MarkovGenerator, ['1', '2', '3']) + + def test_add_paragraph_on_empty_chain(self): + gen = MarkovGenerator([SAMPLE_TEXT], random=self.random, add_paragraph_on_empty_chain=True) + + result = gen.generate(5) + + self.assertEqual('One two three four \n\n One', result) + + def test_multiple_inputs(self): + gen = MarkovGenerator([SAMPLE_TEXT, 'Five Six seven eight'], random=self.random) + + result = gen.generate(3) + + self.assertEqual('Five Six seven', result) + + def test_add(self): + gen = MarkovGenerator([], random=self.random) + + gen.add(SAMPLE_TEXT) + result = gen.generate(3) + + self.assertEqual('One two three', result) + + def test_multiple_word_occurences(self): + gen = MarkovGenerator(['One Two One Three One Two One Four'], random=self.random) + + result = gen.generate(2) + + self.assertEqual('Two One', result) |