From 0511622a4577ac9dfd2865a2633c6c132059c0db Mon Sep 17 00:00:00 2001 From: Kovid Goyal Date: Mon, 25 Feb 2008 18:27:27 +0000 Subject: [PATCH] Improve encoding detection. --- src/libprs500/ebooks/chardet/__init__.py | 6 ++++-- src/libprs500/ebooks/lrf/html/convert_from.py | 2 +- src/libprs500/web/fetch/simple.py | 6 ++++-- 3 files changed, 9 insertions(+), 5 deletions(-) diff --git a/src/libprs500/ebooks/chardet/__init__.py b/src/libprs500/ebooks/chardet/__init__.py index 646686e6fb..83763bca1c 100644 --- a/src/libprs500/ebooks/chardet/__init__.py +++ b/src/libprs500/ebooks/chardet/__init__.py @@ -28,13 +28,15 @@ def detect(aBuf): return u.result # Added by Kovid -def xml_to_unicode(raw): +def xml_to_unicode(raw, verbose=False): ''' Force conversion of byte string to unicode. Tries to look for XML/HTML encoding declaration first, if not found uses the chardet library and prints a warning if detection confidence is < 100% @return: (unicode, encoding used) ''' + if not raw: + return u'', None encoding = None if isinstance(raw, unicode): return raw, encoding @@ -46,7 +48,7 @@ def xml_to_unicode(raw): if encoding is None: chardet = detect(raw) encoding = chardet['encoding'] - if chardet['confidence'] < 1: + if chardet['confidence'] < 1 and verbose: print 'WARNING: Encoding detection confidence %d%%'%(chardet['confidence']*100) CHARSET_ALIASES = { "macintosh" : "mac-roman", "x-sjis" : "shift-jis" } diff --git a/src/libprs500/ebooks/lrf/html/convert_from.py b/src/libprs500/ebooks/lrf/html/convert_from.py index 584a72b564..02451d7e50 100644 --- a/src/libprs500/ebooks/lrf/html/convert_from.py +++ b/src/libprs500/ebooks/lrf/html/convert_from.py @@ -360,7 +360,7 @@ class HTMLConverter(object): if self.pdftohtml: # Bug in pdftohtml that causes it to output invalid UTF-8 files raw = raw.decode('utf-8', 'ignore') else: - raw = xml_to_unicode(raw)[0] + raw = xml_to_unicode(raw, self.verbose)[0] f.close() soup = self.preprocess(raw) self.logger.info('\tConverting to BBeB...') 
diff --git a/src/libprs500/web/fetch/simple.py b/src/libprs500/web/fetch/simple.py index b03a3d6f1d..cd7bd98f0b 100644 --- a/src/libprs500/web/fetch/simple.py +++ b/src/libprs500/web/fetch/simple.py @@ -23,6 +23,7 @@ from optparse import OptionParser from libprs500 import __version__, __appname__, __author__, setup_cli_handlers, \ browser, sanitize_file_name from libprs500.ebooks.BeautifulSoup import BeautifulSoup +from libprs500.ebooks.chardet import xml_to_unicode class FetchError(Exception): pass @@ -58,6 +59,7 @@ class RecursiveFetcher(object): os.makedirs(self.base_dir) self.default_timeout = socket.getdefaulttimeout() socket.setdefaulttimeout(options.timeout) + self.verbose = options.verbose self.browser = options.browser if hasattr(options, 'browser') else browser() self.max_recursions = options.max_recursions self.match_regexps = [re.compile(i, re.IGNORECASE) for i in options.match_regexps] @@ -77,7 +79,7 @@ class RecursiveFetcher(object): def get_soup(self, src): nmassage = copy.copy(BeautifulSoup.MARKUP_MASSAGE) nmassage.extend(self.preprocess_regexps) - return BeautifulSoup(src, markupMassage=nmassage) + return BeautifulSoup(xml_to_unicode(src, self.verbose)[0], markupMassage=nmassage) def fetch_url(self, url): f = None @@ -97,7 +99,7 @@ class RecursiveFetcher(object): def start_fetch(self, url): - soup = BeautifulSoup('') + soup = BeautifulSoup(u'') self.logger.info('Downloading') res = self.process_links(soup, url, 0, into_dir='') self.logger.info('%s saved to %s', url, res)