Improve encoding detection.
commit 0511622a45
parent dc11f5051f
@@ -28,13 +28,15 @@ def detect(aBuf):
     return u.result

 # Added by Kovid
-def xml_to_unicode(raw):
+def xml_to_unicode(raw, verbose=False):
     '''
     Force conversion of byte string to unicode. Tries to look for XML/HTML
     encoding declaration first, if not found uses the chardet library and
     prints a warning if detection confidence is < 100%
     @return: (unicode, encoding used)
     '''
     if not raw:
         return u'', None
     encoding = None
     if isinstance(raw, unicode):
         return raw, encoding
@@ -46,7 +48,7 @@ def xml_to_unicode(raw):
     if encoding is None:
         chardet = detect(raw)
         encoding = chardet['encoding']
-        if chardet['confidence'] < 1:
+        if chardet['confidence'] < 1 and verbose:
             print 'WARNING: Encoding detection confidence %d%%'%(chardet['confidence']*100)
 CHARSET_ALIASES = { "macintosh" : "mac-roman",
                     "x-sjis" : "shift-jis" }
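A minimal usage sketch of the new signature, in the Python 2 style of the codebase (the sample byte strings are illustrative assumptions, not from the commit):

    from libprs500.ebooks.chardet import xml_to_unicode

    # An explicit XML/HTML encoding declaration takes priority, so chardet
    # is never consulted and no warning can fire.
    declared = '<?xml version="1.0" encoding="iso-8859-1"?><p>caf\xe9</p>'
    text, used = xml_to_unicode(declared)

    # No declaration: chardet guesses, and with verbose=True a warning is
    # printed whenever its confidence is below 100%.
    text, used = xml_to_unicode('caf\xe9 au lait', verbose=True)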
@@ -360,7 +360,7 @@ class HTMLConverter(object):
         if self.pdftohtml: # Bug in pdftohtml that causes it to output invalid UTF-8 files
             raw = raw.decode('utf-8', 'ignore')
         else:
-            raw = xml_to_unicode(raw)[0]
+            raw = xml_to_unicode(raw, self.verbose)[0]
         f.close()
         soup = self.preprocess(raw)
         self.logger.info('\tConverting to BBeB...')
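The two branches above reach unicode differently; a hedged sketch of the distinction (the sample bytes are an assumption):

    # pdftohtml branch: the tool can emit invalid UTF-8, so undecodable
    # bytes are dropped outright instead of being run through detection.
    bad = 'valid text \xc3 stray continuation byte'
    cleaned = bad.decode('utf-8', 'ignore')

    # General branch: the full declaration/chardet pipeline, now passing
    # the converter's verbosity through:
    #     raw = xml_to_unicode(raw, self.verbose)[0]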
@@ -23,6 +23,7 @@ from optparse import OptionParser
 from libprs500 import __version__, __appname__, __author__, setup_cli_handlers, \
                       browser, sanitize_file_name
 from libprs500.ebooks.BeautifulSoup import BeautifulSoup
+from libprs500.ebooks.chardet import xml_to_unicode

 class FetchError(Exception):
     pass
@@ -58,6 +59,7 @@ class RecursiveFetcher(object):
             os.makedirs(self.base_dir)
         self.default_timeout = socket.getdefaulttimeout()
         socket.setdefaulttimeout(options.timeout)
+        self.verbose = options.verbose
         self.browser = options.browser if hasattr(options, 'browser') else browser()
         self.max_recursions = options.max_recursions
         self.match_regexps = [re.compile(i, re.IGNORECASE) for i in options.match_regexps]
@@ -77,7 +79,7 @@ class RecursiveFetcher(object):
     def get_soup(self, src):
         nmassage = copy.copy(BeautifulSoup.MARKUP_MASSAGE)
         nmassage.extend(self.preprocess_regexps)
-        return BeautifulSoup(src, markupMassage=nmassage)
+        return BeautifulSoup(xml_to_unicode(src, self.verbose)[0], markupMassage=nmassage)

     def fetch_url(self, url):
         f = None
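The design point: decoding with xml_to_unicode before constructing the soup keeps all encoding decisions in one place, so BeautifulSoup receives unicode and never falls back to its own detection. A sketch, assuming fetcher is an already-constructed RecursiveFetcher and src is the raw byte payload of a fetched page:

    soup = fetcher.get_soup(src)   # src is decoded via xml_to_unicode
                                   # first; BeautifulSoup sees unicode,
                                   # never raw bytes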
@@ -97,7 +99,7 @@ class RecursiveFetcher(object):


     def start_fetch(self, url):
-        soup = BeautifulSoup('<a href="'+url+'" />')
+        soup = BeautifulSoup(u'<a href="'+url+'" />')
         self.logger.info('Downloading')
         res = self.process_links(soup, url, 0, into_dir='')
         self.logger.info('%s saved to %s', url, res)
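Why the u'' prefix matters under Python 2: concatenating a unicode literal with an ASCII str yields unicode, so the seed document handed to BeautifulSoup is already unicode and needs no encoding guesswork. A small sketch (the URL is an assumption):

    url = 'http://example.com/index.html'
    seed = u'<a href="' + url + '" />'   # unicode + ascii str -> unicode
    assert isinstance(seed, unicode)
    soup = BeautifulSoup(seed)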