Improve encoding detection.

This commit is contained in:
Kovid Goyal 2008-02-25 18:27:27 +00:00
parent dc11f5051f
commit 0511622a45
3 changed files with 9 additions and 5 deletions

View File

@ -28,13 +28,15 @@ def detect(aBuf):
return u.result
# Added by Kovid
def xml_to_unicode(raw):
def xml_to_unicode(raw, verbose=False):
'''
Force conversion of byte string to unicode. Tries to look for XML/HTML
encoding declaration first, if not found uses the chardet library and
prints a warning if detection confidence is < 100%
@return: (unicode, encoding used)
'''
if not raw:
return u'', None
encoding = None
if isinstance(raw, unicode):
return raw, encoding
@ -46,7 +48,7 @@ def xml_to_unicode(raw):
if encoding is None:
chardet = detect(raw)
encoding = chardet['encoding']
if chardet['confidence'] < 1:
if chardet['confidence'] < 1 and verbose:
print 'WARNING: Encoding detection confidence %d%%'%(chardet['confidence']*100)
CHARSET_ALIASES = { "macintosh" : "mac-roman",
"x-sjis" : "shift-jis" }

View File

@ -360,7 +360,7 @@ class HTMLConverter(object):
if self.pdftohtml: # Bug in pdftohtml that causes it to output invalid UTF-8 files
raw = raw.decode('utf-8', 'ignore')
else:
raw = xml_to_unicode(raw)[0]
raw = xml_to_unicode(raw, self.verbose)[0]
f.close()
soup = self.preprocess(raw)
self.logger.info('\tConverting to BBeB...')

View File

@ -23,6 +23,7 @@ from optparse import OptionParser
from libprs500 import __version__, __appname__, __author__, setup_cli_handlers, \
browser, sanitize_file_name
from libprs500.ebooks.BeautifulSoup import BeautifulSoup
from libprs500.ebooks.chardet import xml_to_unicode
class FetchError(Exception):
pass
@ -58,6 +59,7 @@ class RecursiveFetcher(object):
os.makedirs(self.base_dir)
self.default_timeout = socket.getdefaulttimeout()
socket.setdefaulttimeout(options.timeout)
self.verbose = options.verbose
self.browser = options.browser if hasattr(options, 'browser') else browser()
self.max_recursions = options.max_recursions
self.match_regexps = [re.compile(i, re.IGNORECASE) for i in options.match_regexps]
@ -77,7 +79,7 @@ class RecursiveFetcher(object):
def get_soup(self, src):
nmassage = copy.copy(BeautifulSoup.MARKUP_MASSAGE)
nmassage.extend(self.preprocess_regexps)
return BeautifulSoup(src, markupMassage=nmassage)
return BeautifulSoup(xml_to_unicode(src, self.verbose)[0], markupMassage=nmassage)
def fetch_url(self, url):
f = None
@ -97,7 +99,7 @@ class RecursiveFetcher(object):
def start_fetch(self, url):
soup = BeautifulSoup('<a href="'+url+'" />')
soup = BeautifulSoup(u'<a href="'+url+'" />')
self.logger.info('Downloading')
res = self.process_links(soup, url, 0, into_dir='')
self.logger.info('%s saved to %s', url, res)