From 568720334aa630ea504b2ce3b8c324f0a557d6e6 Mon Sep 17 00:00:00 2001 From: k clair Date: Tue, 9 Oct 2012 13:14:36 -0700 Subject: add source files from upstream --- .../requests/packages/chardet/mbcharsetprober.py | 82 ++++++++++++++++++++++ 1 file changed, 82 insertions(+) create mode 100755 requests-0.14.0/requests/packages/chardet/mbcharsetprober.py (limited to 'requests-0.14.0/requests/packages/chardet/mbcharsetprober.py') diff --git a/requests-0.14.0/requests/packages/chardet/mbcharsetprober.py b/requests-0.14.0/requests/packages/chardet/mbcharsetprober.py new file mode 100755 index 0000000..a813144 --- /dev/null +++ b/requests-0.14.0/requests/packages/chardet/mbcharsetprober.py @@ -0,0 +1,82 @@ +######################## BEGIN LICENSE BLOCK ######################## +# The Original Code is Mozilla Universal charset detector code. +# +# The Initial Developer of the Original Code is +# Netscape Communications Corporation. +# Portions created by the Initial Developer are Copyright (C) 2001 +# the Initial Developer. All Rights Reserved. +# +# Contributor(s): +# Mark Pilgrim - port to Python +# Shy Shalom - original C code +# Proofpoint, Inc. +# +# This library is free software; you can redistribute it and/or +# modify it under the terms of the GNU Lesser General Public +# License as published by the Free Software Foundation; either +# version 2.1 of the License, or (at your option) any later version. +# +# This library is distributed in the hope that it will be useful, +# but WITHOUT ANY WARRANTY; without even the implied warranty of +# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU +# Lesser General Public License for more details. +# +# You should have received a copy of the GNU Lesser General Public +# License along with this library; if not, write to the Free Software +# Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA +# 02110-1301 USA +######################### END LICENSE BLOCK ######################### + +import constants, sys +from constants import eStart, eError, eItsMe +from charsetprober import CharSetProber + +class MultiByteCharSetProber(CharSetProber): + def __init__(self): + CharSetProber.__init__(self) + self._mDistributionAnalyzer = None + self._mCodingSM = None + self._mLastChar = ['\x00', '\x00'] + + def reset(self): + CharSetProber.reset(self) + if self._mCodingSM: + self._mCodingSM.reset() + if self._mDistributionAnalyzer: + self._mDistributionAnalyzer.reset() + self._mLastChar = ['\x00', '\x00'] + + def get_charset_name(self): + pass + + def feed(self, aBuf): + aLen = len(aBuf) + for i in range(0, aLen): + codingState = self._mCodingSM.next_state(aBuf[i]) + if codingState == eError: + if constants._debug: + sys.stderr.write(self.get_charset_name() + ' prober hit error at byte ' + str(i) + '\n') + self._mState = constants.eNotMe + break + elif codingState == eItsMe: + self._mState = constants.eFoundIt + break + elif codingState == eStart: + charLen = self._mCodingSM.get_current_charlen() + if i == 0: + self._mLastChar[1] = aBuf[0] + self._mDistributionAnalyzer.feed(self._mLastChar, charLen) + else: + self._mDistributionAnalyzer.feed(aBuf[i-1:i+1], charLen) + + self._mLastChar[0] = aBuf[aLen - 1] + + if self.get_state() == constants.eDetecting: + if self._mDistributionAnalyzer.got_enough_data() and \ + (self.get_confidence() > constants.SHORTCUT_THRESHOLD): + self._mState = constants.eFoundIt + + return self.get_state() + + def get_confidence(self): + return self._mDistributionAnalyzer.get_confidence() -- cgit v1.2.3