From f94102bc8733b34bdbfeba54e121a32c14e3fff9 Mon Sep 17 00:00:00 2001 From: Kovid Goyal Date: Mon, 25 Feb 2008 17:11:02 +0000 Subject: [PATCH] Improve encoding detection. --- src/libprs500/__init__.py | 1 + src/libprs500/ebooks/BeautifulSoup.py | 7 ++--- src/libprs500/ebooks/chardet/__init__.py | 30 +++++++++++++++++++ src/libprs500/ebooks/lrf/html/convert_from.py | 6 ++-- src/libprs500/ebooks/lrf/lrs/convert_from.py | 5 ++-- 5 files changed, 39 insertions(+), 10 deletions(-) diff --git a/src/libprs500/__init__.py b/src/libprs500/__init__.py index a9a6e21dce..08e7dee80f 100644 --- a/src/libprs500/__init__.py +++ b/src/libprs500/__init__.py @@ -26,6 +26,7 @@ from optparse import OptionParser as _OptionParser from ttfquery import findsystem, describe from libprs500.translations.msgfmt import make +from libprs500.ebooks.chardet import detect iswindows = 'win32' in sys.platform.lower() or 'win64' in sys.platform.lower() isosx = 'darwin' in sys.platform.lower() diff --git a/src/libprs500/ebooks/BeautifulSoup.py b/src/libprs500/ebooks/BeautifulSoup.py index f6ebb259d0..f1abe95aaf 100644 --- a/src/libprs500/ebooks/BeautifulSoup.py +++ b/src/libprs500/ebooks/BeautifulSoup.py @@ -1074,11 +1074,8 @@ class BeautifulStoneSoup(Tag, SGMLParser): if not hasattr(self, 'originalEncoding'): self.originalEncoding = None else: - dammit = UnicodeDammit\ - (markup, [self.fromEncoding, inDocumentEncoding], - smartQuotesTo=self.smartQuotesTo) - markup = dammit.unicode - self.originalEncoding = dammit.originalEncoding + # Changed detection by Kovid + markup, self.originalEncoding = chardet.xml_to_unicode(markup) if markup: if self.markupMassage: if not isList(self.markupMassage): diff --git a/src/libprs500/ebooks/chardet/__init__.py b/src/libprs500/ebooks/chardet/__init__.py index 9475dae58e..646686e6fb 100644 --- a/src/libprs500/ebooks/chardet/__init__.py +++ b/src/libprs500/ebooks/chardet/__init__.py @@ -17,6 +17,8 @@ __version__ = "1.0" +import re + def detect(aBuf): import 
universaldetector u = universaldetector.UniversalDetector() @@ -24,3 +26,31 @@ def detect(aBuf): u.feed(aBuf) u.close() return u.result + +# Added by Kovid +def xml_to_unicode(raw): + ''' + Force conversion of byte string to unicode. Tries to look for XML/HTML + encoding declaration first, if not found uses the chardet library and + prints a warning if detection confidence is < 100% + @return: (unicode, encoding used) + ''' + encoding = None + if isinstance(raw, unicode): + return raw, encoding + match = re.compile('^\s*<\?.*encoding=[\'"](.*?)[\'"].*\?>', re.IGNORECASE).match(raw) + if match is None: + match = re.compile(r'