This commit is contained in:
Kovid Goyal 2007-09-23 06:36:37 +00:00
parent 5a21453397
commit ffd3ac2b21

View File

@@ -196,10 +196,18 @@ class HTMLConverter(object):
raw = open(self.file_name, 'rb').read() raw = open(self.file_name, 'rb').read()
if self.pdftohtml: if self.pdftohtml:
nmassage.extend(HTMLConverter.PDFTOHTML) nmassage.extend(HTMLConverter.PDFTOHTML)
raw = unicode(raw, 'utf8', 'replace') #raw = unicode(raw, 'utf8', 'replace')
soup = BeautifulSoup(raw, try:
soup = BeautifulSoup(raw,
convertEntities=BeautifulSoup.HTML_ENTITIES, convertEntities=BeautifulSoup.HTML_ENTITIES,
markupMassage=nmassage) markupMassage=nmassage)
except ConversionError, err:
if 'Failed to coerce to unicode' in str(err):
raw = unicode(raw, 'utf8', 'replace')
soup = BeautifulSoup(raw,
convertEntities=BeautifulSoup.HTML_ENTITIES,
markupMassage=nmassage)
if not self.baen and self.is_baen(soup): if not self.baen and self.is_baen(soup):
self.baen = True self.baen = True
self.logger.info('Baen file detected. Re-parsing...') self.logger.info('Baen file detected. Re-parsing...')