diff options
author | Kali Kaneko <kali@leap.se> | 2014-02-20 02:52:17 -0400 |
---|---|---|
committer | Kali Kaneko <kali@leap.se> | 2014-02-20 16:17:25 -0400 |
commit | b2d97c9faef6037a065e2903afe5b0ab2624917e (patch) | |
tree | 268a86e5a668e604bfa8907a8d0f4a7025819238 /src/leap/mail/imap/parser.py | |
parent | 66c5602b77547ec24674f5e40c1d244f28ff5a49 (diff) |
mail parsing performance improvements
Although the do_parse function is deferred to threads, we were actually
waiting till its return to fire the callback of the deferred, and hence
the "append ok" was being delayed. During massive appends, this was a
tight loop contributing as much as 35 msec, of a total of 100 msec
average.
Several inefficiencies are addressed here:
* use pycryptopp hash functions.
* avoid function call overhead.
* avoid duplicate call to message.as_string
* make use of the string size caching capabilities.
* avoid the mail Parser initialization/method call completely,
in favor of the module-level helper that builds the message object from a string.
Overall, these changes cut parsing to 50% of the initial timing by my
measurements with line_profiler, YMMV.
Diffstat (limited to 'src/leap/mail/imap/parser.py')
-rw-r--r-- | src/leap/mail/imap/parser.py | 75 |
1 files changed, 1 insertions, 74 deletions
diff --git a/src/leap/mail/imap/parser.py b/src/leap/mail/imap/parser.py index 6a9ace9..4a801b0 100644 --- a/src/leap/mail/imap/parser.py +++ b/src/leap/mail/imap/parser.py @@ -15,83 +15,10 @@ # You should have received a copy of the GNU General Public License # along with this program. If not, see <http://www.gnu.org/licenses/>. """ -Mail parser mixins. +Mail parser mixin. """ -import cStringIO -import StringIO -import hashlib import re -from email.message import Message -from email.parser import Parser - -from leap.common.check import leap_assert_type - - -class MailParser(object): - """ - Mixin with utility methods to parse raw messages. - """ - def __init__(self): - """ - Initializes the mail parser. - """ - self._parser = Parser() - - def _get_parsed_msg(self, raw, headersonly=False): - """ - Return a parsed Message. - - :param raw: the raw string to parse - :type raw: basestring, or StringIO object - - :param headersonly: True for parsing only the headers. - :type headersonly: bool - """ - msg = self._get_parser_fun(raw)(raw, headersonly=headersonly) - return msg - - def _get_hash(self, msg): - """ - Returns a hash of the string representation of the raw message, - suitable for indexing the inmutable pieces. - - :param msg: a Message object - :type msg: Message - """ - leap_assert_type(msg, Message) - return hashlib.sha256(msg.as_string()).hexdigest() - - def _get_parser_fun(self, o): - """ - Retunn the proper parser function for an object. - - :param o: object - :type o: object - :param parser: an instance of email.parser.Parser - :type parser: email.parser.Parser - """ - if isinstance(o, (cStringIO.OutputType, StringIO.StringIO)): - return self._parser.parse - if isinstance(o, basestring): - return self._parser.parsestr - # fallback - return self._parser.parsestr - - def _stringify(self, o): - """ - Return a string object. 
- - :param o: object - :type o: object - """ - # XXX Maybe we don't need no more, we're using - # msg.as_string() - if isinstance(o, (cStringIO.OutputType, StringIO.StringIO)): - return o.getvalue() - else: - return o - class MBoxParser(object): """ |