Improve encoding detection.

This commit is contained in:
Kovid Goyal 2008-02-25 18:27:27 +00:00
parent dc11f5051f
commit 0511622a45
3 changed files with 9 additions and 5 deletions

View File

@ -28,13 +28,15 @@ def detect(aBuf):
return u.result
# Added by Kovid
def xml_to_unicode(raw):
def xml_to_unicode(raw, verbose=False):
'''
Force conversion of byte string to unicode. Tries to look for XML/HTML
encoding declaration first, if not found uses the chardet library and
prints a warning if detection confidence is < 100%
@return: (unicode, encoding used)
'''
if not raw:
return u'', None
encoding = None
if isinstance(raw, unicode):
return raw, encoding
@ -46,7 +48,7 @@ def xml_to_unicode(raw):
if encoding is None:
chardet = detect(raw)
encoding = chardet['encoding']
if chardet['confidence'] < 1:
if chardet['confidence'] < 1 and verbose:
print 'WARNING: Encoding detection confidence %d%%'%(chardet['confidence']*100)
CHARSET_ALIASES = { "macintosh" : "mac-roman",
"x-sjis" : "shift-jis" }

View File

@ -360,7 +360,7 @@ class HTMLConverter(object):
if self.pdftohtml: # Bug in pdftohtml that causes it to output invalid UTF-8 files
raw = raw.decode('utf-8', 'ignore')
else:
raw = xml_to_unicode(raw)[0]
raw = xml_to_unicode(raw, self.verbose)[0]
f.close()
soup = self.preprocess(raw)
self.logger.info('\tConverting to BBeB...')

View File

@ -23,6 +23,7 @@ from optparse import OptionParser
from libprs500 import __version__, __appname__, __author__, setup_cli_handlers, \
browser, sanitize_file_name
from libprs500.ebooks.BeautifulSoup import BeautifulSoup
from libprs500.ebooks.chardet import xml_to_unicode
class FetchError(Exception):
pass
@ -58,6 +59,7 @@ class RecursiveFetcher(object):
os.makedirs(self.base_dir)
self.default_timeout = socket.getdefaulttimeout()
socket.setdefaulttimeout(options.timeout)
self.verbose = options.verbose
self.browser = options.browser if hasattr(options, 'browser') else browser()
self.max_recursions = options.max_recursions
self.match_regexps = [re.compile(i, re.IGNORECASE) for i in options.match_regexps]
@ -77,7 +79,7 @@ class RecursiveFetcher(object):
def get_soup(self, src):
nmassage = copy.copy(BeautifulSoup.MARKUP_MASSAGE)
nmassage.extend(self.preprocess_regexps)
return BeautifulSoup(src, markupMassage=nmassage)
return BeautifulSoup(xml_to_unicode(src, self.verbose)[0], markupMassage=nmassage)
def fetch_url(self, url):
f = None
@ -97,7 +99,7 @@ class RecursiveFetcher(object):
def start_fetch(self, url):
soup = BeautifulSoup('<a href="'+url+'" />')
soup = BeautifulSoup(u'<a href="'+url+'" />')
self.logger.info('Downloading')
res = self.process_links(soup, url, 0, into_dir='')
self.logger.info('%s saved to %s', url, res)