From 70c770635199cfa473608162ec7d31e030a11c5f Mon Sep 17 00:00:00 2001 From: Folker Bernitt Date: Wed, 28 Oct 2015 12:05:59 +0100 Subject: Add markov-generate to load-mails - Allows generating mails based on sample mails - use it with: pixelated-maintenance markov-generate --seed 21 --limit 10 --- service/pixelated/config/arguments.py | 5 + service/pixelated/maintenance.py | 96 +++++++++++++++--- service/pixelated/support/mail_generator.py | 150 ++++++++++++++++++++++++++++ service/pixelated/support/markov.py | 94 +++++++++++++++++ 4 files changed, 329 insertions(+), 16 deletions(-) create mode 100644 service/pixelated/support/mail_generator.py create mode 100644 service/pixelated/support/markov.py (limited to 'service/pixelated') diff --git a/service/pixelated/config/arguments.py b/service/pixelated/config/arguments.py index 7a7abe49..87484b9b 100644 --- a/service/pixelated/config/arguments.py +++ b/service/pixelated/config/arguments.py @@ -41,6 +41,11 @@ def parse_maintenance_args(): mails_parser = subparsers.add_parser('load-mails', help='load mails into account') mails_parser.add_argument('file', nargs='+', help='file(s) with mail data') + markov_mails_parser = subparsers.add_parser('markov-generate', help='generate mails using markov chains') + markov_mails_parser.add_argument('--seed', default=None, help='Specify a seed to always generate the same output') + markov_mails_parser.add_argument('-l', '--limit', metavar='count', default='5', help='limit number of generated mails', dest='limit') + markov_mails_parser.add_argument('file', nargs='+', help='file(s) with mail data') + subparsers.add_parser('dump-soledad', help='dump the soledad database') subparsers.add_parser('sync', help='sync the soledad database') subparsers.add_parser('repair', help='repair database if possible') diff --git a/service/pixelated/maintenance.py b/service/pixelated/maintenance.py index f011658d..b18e881d 100644 --- a/service/pixelated/maintenance.py +++ 
@defer.inlineCallbacks
def add_mail_folder(store, mailbox, folder_name, deferreds):
    """Create ``folder_name`` in the store and queue all mails of ``mailbox``.

    Every message that is not a keep file is handed to ``_add_mail`` together
    with the flags and tags derived from it. The deferred returned by
    ``_add_mail`` is appended to ``deferreds`` instead of being waited for
    here, so the caller decides when to wait for the imports.

    :param store: mail store; ``add_mailbox`` is the only method called on it
        directly in this function
    :param mailbox: iterable of mail messages (callers pass a Maildir, a
        Maildir folder, an mbox or a plain list of messages)
    :param folder_name: name of the target folder in the store
    :param deferreds: list that collects one deferred per queued mail
    """
    yield store.add_mailbox(folder_name)

    for mail in mailbox:
        if is_keep_file(mail):
            continue

        if _is_maildir_msg(mail):
            flags = {MessageFlags.RECENT_FLAG} if _is_new_mail(mail) else set()

            # set.add() mutates the set in place and returns None, so its
            # result must never be assigned back to ``flags``.
            maildir_flags = mail.get_flags()
            if 'S' in maildir_flags:
                flags.add(MessageFlags.SEEN_FLAG)
            if 'R' in maildir_flags:
                flags.add(MessageFlags.ANSWERED_FLAG)
        else:
            # Messages that carry no Maildir state are imported as recent.
            flags = {MessageFlags.RECENT_FLAG}

        # The X-Tags header holds whitespace separated tags; it is optional.
        tags_header = mail['X-Tags']
        tags = tags_header.split() if tags_header else []

        deferreds.append(_add_mail(store, folder_name, mail, flags, tags))
def _generate_mails(limit, mail_paths, seed, server_name, username):
    """Generate ``limit`` mails from the sample mails found in ``mail_paths``.

    :param limit: number of mails to generate
    :param mail_paths: paths of mbox files holding the sample mails
    :param seed: seed for the random source handed to the generator; ``None``
        lets ``random.Random`` pick its own seed
    :param server_name: domain name passed on to ``MailGenerator``
    :param username: receiver name passed on to ``MailGenerator``
    :return: list with the generated mails
    """
    sample_mails = []
    for path in mail_paths:
        sample_box = mbox(path, factory=None)
        try:
            # Iterating the mailbox yields message objects; they are copied
            # into the list so the mailbox file can be closed right away.
            sample_mails.extend(sample_box)
        finally:
            sample_box.close()

    generator = MailGenerator(username, server_name, sample_mails, random=random.Random(seed))
    return [generator.generate_mail() for _ in range(limit)]
def filter_two_line_on_wrote(lines):
    """Yield the stripped lines, leaving out two-line reply attributions.

    A reply attribution is a line starting with ``On`` that is directly
    followed by a line ending with ``wrote:``. Both lines of such a pair are
    dropped, including a pair that forms the last two lines of the input.

    :param lines: iterable of text lines
    :return: generator over the remaining lines, each one stripped
    """
    lines = list(lines)
    total = len(lines)
    index = 0

    while index < total:
        current = lines[index]
        has_next = index + 1 < total
        if has_next and current.startswith('On') and lines[index + 1].endswith('wrote:'):
            # Skip the 'On ...' line and the '... wrote:' line that follows.
            index += 2
            continue

        yield current.strip()
        index += 1


def filter_lines(text):
    """Reduce a mail body to its own prose and return it as a single line.

    Dropped are: lines starting (after optional whitespace) with ``>`` or
    ``-``, blank lines, lines without a space between words, single-line
    ``On ... wrote`` attributions, lines ending with ``writes:`` and
    two-line attributions (see ``filter_two_line_on_wrote``).

    :param text: mail body as one string
    :return: the remaining lines joined by single spaces
    """
    quote_pattern = re.compile(r'\s*[>-].*')
    wrote_pattern = re.compile(r'\s*On.*wrote.*')

    kept = []
    for line in text.splitlines():
        stripped = line.strip()
        # Blank lines and one-word lines carry nothing useful for the chain.
        if not stripped or ' ' not in stripped:
            continue
        if quote_pattern.match(line) or wrote_pattern.match(line):
            continue
        if line.endswith('writes:'):
            continue
        kept.append(line)

    return ' '.join(filter_two_line_on_wrote(kept))
class MailGenerator(object):
    """Builds sample mails whose text is derived from existing mails.

    Subjects and bodies are produced by Markov chains fed with the sample
    mails. Sender, date, tags and the X-Leap-* headers are drawn from the
    random source, which is also shared with both Markov generators.
    """

    # Local parts the sender address of a generated mail is chosen from.
    NAMES = ['alice', 'bob', 'eve']

    def __init__(self, receiver, domain_name, sample_mail_list, random=None):
        """Prepare the generator.

        :param receiver: local part of the address used in the To header
        :param domain_name: domain used for the To and From addresses
        :param sample_mail_list: mails the subjects and bodies are taken from
        :param random: random source to use; a fresh ``Random`` if omitted
        """
        self._random = random if random else Random()
        self._receiver = receiver
        self._domain_name = domain_name
        self._subjects, self._bodies = load_all_mails(sample_mail_list)

        self._potential_tags = search_for_tags(' '.join(self._bodies))
        self._subject_markov = MarkovGenerator(self._subjects, random=self._random)
        self._body_markov = MarkovGenerator(self._bodies, random=self._random, add_paragraph_on_empty_chain=True)

    def generate_mail(self):
        """Return a new ``MIMEText`` mail with generated body and headers."""
        # The calls below all consume the shared random source; their order
        # decides which mail a given seed produces, so keep it stable.
        body = self._body_markov.generate(150)
        mail = MIMEText(body)

        mail['Subject'] = self._subject_markov.generate(8)
        mail['To'] = '%s@%s' % (self._receiver, self._domain_name)
        mail['From'] = self._random_from()
        mail['Date'] = self._random_date()
        mail['X-Tags'] = self._random_tags()
        mail['X-Leap-Encryption'] = self._random_encryption_state()
        mail['X-Leap-Signature'] = self._random_signature_state()

        return mail

    def _random_date(self):
        """Return a formatted date picked from the last ten days."""
        now = int(time.time())
        ten_days = 60 * 60 * 24 * 10
        mail_time = self._random.randint(now - ten_days, now)

        return formatdate(mail_time)

    def _random_encryption_state(self):
        """Return a random value for the X-Leap-Encryption header."""
        return self._random.choice(['true', 'decrypted'])

    def _random_signature_state(self):
        """Return a random value for the X-Leap-Signature header."""
        return self._random.choice(['could not verify', 'valid'])

    def _random_from(self):
        """Return a sender address whose name differs from the receiver."""
        # A real list is required: choice() needs a sequence it can index.
        candidates = [name for name in MailGenerator.NAMES if name != self._receiver]
        name = self._random.choice(candidates)

        return '%s@%s' % (name, self._domain_name)

    def _random_tags(self):
        """Return zero or more distinct potential tags joined by spaces.

        Every round the random draw has to exceed a barrier that starts at
        0.5 and rises by 0.15, so each additional tag is less likely.
        Returns an empty string when there are no potential tags.
        """
        barrier = 0.5
        tags = set()
        while self._random.random() > barrier:
            # choice() raises IndexError on an empty sequence. The draw above
            # is still consumed so the random sequence stays the same as in
            # the runs that did not enter this loop before.
            if self._potential_tags:
                tags.add(self._random.choice(self._potential_tags))
            barrier += 0.15

        return ' '.join(tags)
from random import Random

# Emitted between two chains when paragraphs are requested.
NEW_PARAGRAPH = '\n\n'


class MarkovGenerator(object):
    """Generates text from a word-level Markov chain.

    The chain maps every pair of consecutive words found in the input texts
    to the set of words that followed that pair. Followers are kept in a
    set, so how often a word followed a pair does not influence the choice.
    """

    def __init__(self, texts, random=None, add_paragraph_on_empty_chain=False):
        """Build the chain from ``texts``.

        :param texts: iterable of strings; ``None`` entries are ignored
        :param random: random source to use; a fresh ``Random`` if omitted
        :param add_paragraph_on_empty_chain: emit ``NEW_PARAGRAPH`` whenever
            generation has to restart from a new seed
        :raises ValueError: if a text has fewer than three words
        """
        self._markov_chain = {}
        self._random = random if random else Random()
        self._add_paragraph_on_empty_chain = add_paragraph_on_empty_chain

        for text in texts:
            if text is not None:
                self._extend_chain_with(text)

    def add(self, text):
        """Extend the chain with another text of at least three words."""
        self._extend_chain_with(text)

    @staticmethod
    def _triplet_generator(words):
        """Yield ``((word, next_word), follower)`` for each word triplet."""
        if len(words) < 3:
            raise ValueError('Expected input with at least three words')

        for first, second, third in zip(words, words[1:], words[2:]):
            yield ((first, second), third)

    def _extend_chain_with(self, input_text):
        """Add all word triplets of ``input_text`` to the chain."""
        words = input_text.split()

        for key, value in self._triplet_generator(words):
            self._markov_chain.setdefault(key, set()).add(value)

    def _generate_chain(self, length):
        """Yield ``length`` words, plus paragraph markers if enabled."""
        word, next_word = self._find_good_seed()
        new_seed = False

        for _ in range(length):
            yield word

            if new_seed:
                # The previous pair had no follower: restart from a seed.
                word, next_word = self._find_good_seed()
                if self._add_paragraph_on_empty_chain:
                    yield NEW_PARAGRAPH
                new_seed = False
            else:
                prev_word, word = word, next_word

                try:
                    next_word = self._random_next_word(prev_word, word)
                except KeyError:
                    # ``word`` is still emitted by the next round before the
                    # new seed is drawn.
                    new_seed = True

    def _random_next_word(self, prev_word, word):
        """Return a random follower of the pair; KeyError if there is none."""
        return self._random.choice(list(self._markov_chain[(prev_word, word)]))

    def _find_good_seed(self):
        """Return a random pair whose first word starts with a capital.

        Pairs are drawn at random for a bounded number of tries.

        :raises ValueError: if no try produced a capitalised first word
        """
        # Built once per call instead of once per try.
        keys = list(self._markov_chain)
        max_tries = len(keys)
        try_count = 0

        seed_pair = self._random.choice(keys)
        while not seed_pair[0][0].isupper() and try_count <= max_tries:
            seed_pair = self._random.choice(keys)
            try_count += 1

        # Judge the seed itself, not the try counter: a suitable seed found
        # on the very last try is a valid result.
        if not seed_pair[0][0].isupper():
            raise ValueError('Not able find start word with captial letter')

        return seed_pair

    def generate(self, length):
        """Return ``length`` generated words joined by single spaces.

        :raises ValueError: if the chain is empty or no capitalised start
            word could be found
        """
        if not self._markov_chain:
            raise ValueError('Expected at least three words input')
        return ' '.join(self._generate_chain(length))