summaryrefslogtreecommitdiff
diff options
context:
space:
mode:
author Kali Kaneko <kali@leap.se> 2014-02-20 02:52:17 -0400
committer Kali Kaneko <kali@leap.se> 2014-02-20 16:17:25 -0400
commit b2d97c9faef6037a065e2903afe5b0ab2624917e (patch)
tree 268a86e5a668e604bfa8907a8d0f4a7025819238
parent 66c5602b77547ec24674f5e40c1d244f28ff5a49 (diff)
mail parsing performance improvements
Although the do_parse function is deferred to threads, we were actually waiting until it returned to fire the callback of the deferred, and hence the "append ok" was being delayed. During massive appends, this was a tight loop contributing as much as 35 msec out of an average total of 100 msec. Several inefficiencies are addressed here: * use pycryptopp hash functions. * avoid function calling overhead. * avoid the duplicate call to message.as_string. * make use of the string size caching capabilities. * avoid the mail Parser initialization/method call completely, in favor of the module helper to get the object from a string. Overall, these changes cut parsing to 50% of the initial timing by my measurements with line_profiler, YMMV.
-rw-r--r-- src/leap/mail/imap/messages.py 25
-rw-r--r-- src/leap/mail/imap/parser.py 75
-rw-r--r-- src/leap/mail/imap/soledadstore.py 4
-rw-r--r-- src/leap/mail/imap/tests/walktree.py 4
-rw-r--r-- src/leap/mail/walk.py 7
5 files changed, 21 insertions, 94 deletions
diff --git a/src/leap/mail/imap/messages.py b/src/leap/mail/imap/messages.py
index 9f7f6e2..9a001b3 100644
--- a/src/leap/mail/imap/messages.py
+++ b/src/leap/mail/imap/messages.py
@@ -24,8 +24,10 @@ import threading
import StringIO
from collections import defaultdict
+from email import message_from_string
from functools import partial
+from pycryptopp.hash import sha256
from twisted.mail import imap4
from twisted.internet import defer
from zope.interface import implements
@@ -42,7 +44,7 @@ from leap.mail.imap.index import IndexedDB
from leap.mail.imap.fields import fields, WithMsgFields
from leap.mail.imap.memorystore import MessageWrapper
from leap.mail.imap.messageparts import MessagePart
-from leap.mail.imap.parser import MailParser, MBoxParser
+from leap.mail.imap.parser import MBoxParser
logger = logging.getLogger(__name__)
@@ -94,7 +96,7 @@ A dictionary that keeps one lock per mbox and uid.
fdoc_locks = defaultdict(lambda: defaultdict(lambda: threading.Lock()))
-class LeapMessage(fields, MailParser, MBoxParser):
+class LeapMessage(fields, MBoxParser):
"""
The main representation of a message.
@@ -123,7 +125,6 @@ class LeapMessage(fields, MailParser, MBoxParser):
:param container: a IMessageContainer implementor instance
:type container: IMessageContainer
"""
- MailParser.__init__(self)
self._soledad = soledad
self._uid = int(uid)
self._mbox = self._parse_mailbox_name(mbox)
@@ -583,7 +584,7 @@ class LeapMessage(fields, MailParser, MBoxParser):
return not empty(self.fdoc)
-class MessageCollection(WithMsgFields, IndexedDB, MailParser, MBoxParser):
+class MessageCollection(WithMsgFields, IndexedDB, MBoxParser):
"""
A collection of messages, surprisingly.
@@ -713,7 +714,6 @@ class MessageCollection(WithMsgFields, IndexedDB, MailParser, MBoxParser):
:param memstore: a MemoryStore instance
:type memstore: MemoryStore
"""
- MailParser.__init__(self)
leap_assert(mbox, "Need a mailbox name to initialize")
leap_assert(mbox.strip() != "", "mbox cannot be blank space")
leap_assert(isinstance(mbox, (str, unicode)),
@@ -782,11 +782,11 @@ class MessageCollection(WithMsgFields, IndexedDB, MailParser, MBoxParser):
:return: msg, parts, chash, size, multi
:rtype: tuple
"""
- msg = self._get_parsed_msg(raw)
- chash = self._get_hash(msg)
- size = len(msg.as_string())
- multi = msg.is_multipart()
+ msg = message_from_string(raw)
parts = walk.get_parts(msg)
+ size = len(raw)
+ chash = sha256.SHA256(raw).hexdigest()
+ multi = msg.is_multipart()
return msg, parts, chash, size, multi
def _populate_flags(self, flags, uid, chash, size, multi):
@@ -803,7 +803,7 @@ class MessageCollection(WithMsgFields, IndexedDB, MailParser, MBoxParser):
fd[self.SIZE_KEY] = size
fd[self.MULTIPART_KEY] = multi
if flags:
- fd[self.FLAGS_KEY] = map(self._stringify, flags)
+ fd[self.FLAGS_KEY] = flags
fd[self.SEEN_KEY] = self.SEEN_FLAG in flags
fd[self.DEL_KEY] = self.DELETED_FLAG in flags
fd[self.RECENT_KEY] = True # set always by default
@@ -926,11 +926,10 @@ class MessageCollection(WithMsgFields, IndexedDB, MailParser, MBoxParser):
# Watch out! We're reserving a UID right after this!
existing_uid = self._fdoc_already_exists(chash)
if existing_uid:
- uid = existing_uid
- msg = self.get_msg_by_uid(uid)
+ msg = self.get_msg_by_uid(existing_uid)
# We can say the observer that we're done
- self.reactor.callFromThread(observer.callback, uid)
+ self.reactor.callFromThread(observer.callback, existing_uid)
msg.setFlags((fields.DELETED_FLAG,), -1)
return
diff --git a/src/leap/mail/imap/parser.py b/src/leap/mail/imap/parser.py
index 6a9ace9..4a801b0 100644
--- a/src/leap/mail/imap/parser.py
+++ b/src/leap/mail/imap/parser.py
@@ -15,83 +15,10 @@
# You should have received a copy of the GNU General Public License
# along with this program. If not, see <http://www.gnu.org/licenses/>.
"""
-Mail parser mixins.
+Mail parser mixin.
"""
-import cStringIO
-import StringIO
-import hashlib
import re
-from email.message import Message
-from email.parser import Parser
-
-from leap.common.check import leap_assert_type
-
-
-class MailParser(object):
- """
- Mixin with utility methods to parse raw messages.
- """
- def __init__(self):
- """
- Initializes the mail parser.
- """
- self._parser = Parser()
-
- def _get_parsed_msg(self, raw, headersonly=False):
- """
- Return a parsed Message.
-
- :param raw: the raw string to parse
- :type raw: basestring, or StringIO object
-
- :param headersonly: True for parsing only the headers.
- :type headersonly: bool
- """
- msg = self._get_parser_fun(raw)(raw, headersonly=headersonly)
- return msg
-
- def _get_hash(self, msg):
- """
- Returns a hash of the string representation of the raw message,
- suitable for indexing the inmutable pieces.
-
- :param msg: a Message object
- :type msg: Message
- """
- leap_assert_type(msg, Message)
- return hashlib.sha256(msg.as_string()).hexdigest()
-
- def _get_parser_fun(self, o):
- """
- Retunn the proper parser function for an object.
-
- :param o: object
- :type o: object
- :param parser: an instance of email.parser.Parser
- :type parser: email.parser.Parser
- """
- if isinstance(o, (cStringIO.OutputType, StringIO.StringIO)):
- return self._parser.parse
- if isinstance(o, basestring):
- return self._parser.parsestr
- # fallback
- return self._parser.parsestr
-
- def _stringify(self, o):
- """
- Return a string object.
-
- :param o: object
- :type o: object
- """
- # XXX Maybe we don't need no more, we're using
- # msg.as_string()
- if isinstance(o, (cStringIO.OutputType, StringIO.StringIO)):
- return o.getvalue()
- else:
- return o
-
class MBoxParser(object):
"""
diff --git a/src/leap/mail/imap/soledadstore.py b/src/leap/mail/imap/soledadstore.py
index 25f00bb..f3de8eb 100644
--- a/src/leap/mail/imap/soledadstore.py
+++ b/src/leap/mail/imap/soledadstore.py
@@ -314,8 +314,8 @@ class SoledadStore(ContentDedup):
except Exception as exc:
logger.debug("ITEM WAS: %s" % repr(item))
if hasattr(item, 'content'):
- logger.debug("ITEM CONTENT WAS: %s" %
- repr(item.content))
+ logger.debug("ITEM CONTENT WAS: %s" %
+ repr(item.content))
logger.exception(exc)
failed = True
continue
diff --git a/src/leap/mail/imap/tests/walktree.py b/src/leap/mail/imap/tests/walktree.py
index f3cbcb0..695f487 100644
--- a/src/leap/mail/imap/tests/walktree.py
+++ b/src/leap/mail/imap/tests/walktree.py
@@ -36,11 +36,11 @@ p = parser.Parser()
if len(sys.argv) > 1:
FILENAME = sys.argv[1]
else:
- FILENAME = "rfc822.multi-minimal.message"
+ FILENAME = "rfc822.multi-signed.message"
"""
-FILENAME = "rfc822.multi-signed.message"
FILENAME = "rfc822.plain.message"
+FILENAME = "rfc822.multi-minimal.message"
"""
msg = p.parse(open(FILENAME))
diff --git a/src/leap/mail/walk.py b/src/leap/mail/walk.py
index 49f2c22..f747377 100644
--- a/src/leap/mail/walk.py
+++ b/src/leap/mail/walk.py
@@ -17,17 +17,18 @@
"""
Utilities for walking along a message tree.
"""
-import hashlib
import os
+from pycryptopp.hash import sha256
+
from leap.mail.utils import first
DEBUG = os.environ.get("BITMASK_MAIL_DEBUG")
if DEBUG:
- get_hash = lambda s: hashlib.sha256(s).hexdigest()[:10]
+ get_hash = lambda s: sha256.SHA256(s).hexdigest()[:10]
else:
- get_hash = lambda s: hashlib.sha256(s).hexdigest()
+ get_hash = lambda s: sha256.SHA256(s).hexdigest()
"""