Fix #2953 (missing ì)

This commit is contained in:
Kovid Goyal 2009-07-27 09:06:00 -06:00
parent 090b322f8d
commit 2330c2a88e

View File

@@ -303,6 +303,11 @@ class MobiReader(object):
self.cleanup_html()
self.log.debug('Parsing HTML...')
try:
root = html.fromstring(self.processed_html)
except:
self.log.warning('MOBI markup appears to contain random bytes. Stripping.')
self.processed_html = self.remove_random_bytes(self.processed_html)
root = html.fromstring(self.processed_html)
if root.xpath('descendant::p/descendant::p'):
from lxml.html import soupparser
@@ -444,7 +449,10 @@ class MobiReader(object):
self.processed_html = '<html><p>' + self.processed_html.replace('\n\n', '<p>') + '</html>'
self.processed_html = self.processed_html.replace('\r\n', '\n')
self.processed_html = self.processed_html.replace('> <', '>\n<')
self.processed_html = re.sub('\x14|\x15|\x1c|\x1d|\xef|\x12|\x13|\xec', '', self.processed_html)
def remove_random_bytes(self, html):
    """Return *html* with a fixed set of stray characters removed.

    The characters stripped are 0x12-0x15, 0x1C, 0x1D, 0xEC and 0xEF.
    Everything else is returned unchanged and in its original order.

    Note that 0xEC and 0xEF can be legitimate text in some encodings
    (the commit that introduced this helper is titled "missing ì"), so
    the visible caller only invokes it after ``html.fromstring`` has
    already failed on the unstripped markup.

    :param html: The markup to clean. The name shadows the ``html`` object
        used for parsing elsewhere in this class; it is kept so existing
        keyword callers keep working.
    :return: A copy of *html* without the characters listed above.
    """
    # A single character class replaces the former eight-way alternation;
    # the raw string lets the regex engine interpret the \xNN escapes.
    return re.sub(r'[\x12-\x15\x1c\x1d\xec\xef]', '', html)
def ensure_unit(self, raw, unit='px'):
if re.search(r'\d+$', raw) is not None: