diff --git a/src/libprs500/ebooks/chardet/__init__.py b/src/libprs500/ebooks/chardet/__init__.py
index 646686e6fb..83763bca1c 100644
--- a/src/libprs500/ebooks/chardet/__init__.py
+++ b/src/libprs500/ebooks/chardet/__init__.py
@@ -28,13 +28,15 @@ def detect(aBuf):
return u.result
# Added by Kovid
-def xml_to_unicode(raw):
+def xml_to_unicode(raw, verbose=False):
'''
Force conversion of byte string to unicode. Tries to look for XML/HTML
encoding declaration first, if not found uses the chardet library and
prints a warning if detection confidence is < 100%
@return: (unicode, encoding used)
'''
+ if not raw:
+ return u'', None
encoding = None
if isinstance(raw, unicode):
return raw, encoding
@@ -46,7 +48,7 @@ def xml_to_unicode(raw):
if encoding is None:
chardet = detect(raw)
encoding = chardet['encoding']
- if chardet['confidence'] < 1:
+ if chardet['confidence'] < 1 and verbose:
print 'WARNING: Encoding detection confidence %d%%'%(chardet['confidence']*100)
CHARSET_ALIASES = { "macintosh" : "mac-roman",
"x-sjis" : "shift-jis" }
diff --git a/src/libprs500/ebooks/lrf/html/convert_from.py b/src/libprs500/ebooks/lrf/html/convert_from.py
index 584a72b564..02451d7e50 100644
--- a/src/libprs500/ebooks/lrf/html/convert_from.py
+++ b/src/libprs500/ebooks/lrf/html/convert_from.py
@@ -360,7 +360,7 @@ class HTMLConverter(object):
if self.pdftohtml: # Bug in pdftohtml that causes it to output invalid UTF-8 files
raw = raw.decode('utf-8', 'ignore')
else:
- raw = xml_to_unicode(raw)[0]
+ raw = xml_to_unicode(raw, self.verbose)[0]
f.close()
soup = self.preprocess(raw)
self.logger.info('\tConverting to BBeB...')
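
For reference, a hedged sketch of the decode branch touched above (the standalone function and its parameters are simplifications, not the actual HTMLConverter code): pdftohtml output is force-decoded as UTF-8 with bad bytes ignored because of the bug noted in the comment, while everything else now goes through xml_to_unicode with the converter's verbose flag.

    from libprs500.ebooks.chardet import xml_to_unicode

    def decode_source(raw, from_pdftohtml, verbose=False):  # illustrative only
        if from_pdftohtml:
            # pdftohtml can emit invalid UTF-8, so drop undecodable bytes
            return raw.decode('utf-8', 'ignore')
        # Otherwise detect the encoding, warning on low confidence if verbose
        return xml_to_unicode(raw, verbose)[0]
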
diff --git a/src/libprs500/web/fetch/simple.py b/src/libprs500/web/fetch/simple.py
index b03a3d6f1d..cd7bd98f0b 100644
--- a/src/libprs500/web/fetch/simple.py
+++ b/src/libprs500/web/fetch/simple.py
@@ -23,6 +23,7 @@ from optparse import OptionParser
from libprs500 import __version__, __appname__, __author__, setup_cli_handlers, \
browser, sanitize_file_name
from libprs500.ebooks.BeautifulSoup import BeautifulSoup
+from libprs500.ebooks.chardet import xml_to_unicode
class FetchError(Exception):
pass
@@ -58,6 +59,7 @@ class RecursiveFetcher(object):
os.makedirs(self.base_dir)
self.default_timeout = socket.getdefaulttimeout()
socket.setdefaulttimeout(options.timeout)
+ self.verbose = options.verbose
self.browser = options.browser if hasattr(options, 'browser') else browser()
self.max_recursions = options.max_recursions
self.match_regexps = [re.compile(i, re.IGNORECASE) for i in options.match_regexps]
@@ -77,7 +79,7 @@ class RecursiveFetcher(object):
def get_soup(self, src):
nmassage = copy.copy(BeautifulSoup.MARKUP_MASSAGE)
nmassage.extend(self.preprocess_regexps)
- return BeautifulSoup(src, markupMassage=nmassage)
+ return BeautifulSoup(xml_to_unicode(src, self.verbose)[0], markupMassage=nmassage)
def fetch_url(self, url):
f = None
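
A sketch of the decode-before-parse pattern that get_soup now follows (the standalone function below is a simplification; the real method also passes the markupMassage regexps shown above): converting src to unicode first means BeautifulSoup receives text instead of raw bytes and does not have to guess the encoding itself.

    from libprs500.ebooks.BeautifulSoup import BeautifulSoup
    from libprs500.ebooks.chardet import xml_to_unicode

    def parse_page(src, verbose=False):  # illustrative only
        # Decode the fetched bytes up front, warning on low-confidence
        # detection when verbose is set
        text = xml_to_unicode(src, verbose)[0]
        return BeautifulSoup(text)
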
@@ -97,7 +99,7 @@ class RecursiveFetcher(object):
def start_fetch(self, url):
- soup = BeautifulSoup('')
+ soup = BeautifulSoup(u'')
self.logger.info('Downloading')
res = self.process_links(soup, url, 0, into_dir='')
self.logger.info('%s saved to %s', url, res)