Fix #3159 (Error Converting mobipocket to sony reader v0.6.7)

This commit is contained in:
Kovid Goyal 2009-08-15 11:40:56 -06:00
parent b1c8d3d9a9
commit bf6f98363e

View File

@ -317,7 +317,13 @@ class MobiReader(object):
if root.xpath('descendant::p/descendant::p'): if root.xpath('descendant::p/descendant::p'):
from lxml.html import soupparser from lxml.html import soupparser
self.log.warning('Malformed markup, parsing using BeautifulSoup') self.log.warning('Malformed markup, parsing using BeautifulSoup')
root = soupparser.fromstring(self.processed_html) try:
root = soupparser.fromstring(self.processed_html)
except Exception, err:
self.log.warning('MOBI markup appears to contain random bytes. Stripping.')
self.processed_html = self.remove_random_bytes(self.processed_html)
root = soupparser.fromstring(self.processed_html)
if root.tag != 'html': if root.tag != 'html':
self.log.warn('File does not have opening <html> tag') self.log.warn('File does not have opening <html> tag')
@ -457,7 +463,7 @@ class MobiReader(object):
self.processed_html = self.processed_html.replace('<mbp: ', '<mbp:') self.processed_html = self.processed_html.replace('<mbp: ', '<mbp:')
def remove_random_bytes(self, html): def remove_random_bytes(self, html):
return re.sub('\x14|\x15|\x1c|\x1d|\xef|\x12|\x13|\xec', return re.sub('\x14|\x15|\x19|\x1c|\x1d|\xef|\x12|\x13|\xec|\x08',
'', html) '', html)
def ensure_unit(self, raw, unit='px'): def ensure_unit(self, raw, unit='px'):
@ -492,11 +498,12 @@ class MobiReader(object):
styles.append(style) styles.append(style)
if attrib.has_key('height'): if attrib.has_key('height'):
height = attrib.pop('height').strip() height = attrib.pop('height').strip()
if height: if height and '<' not in height and '>' not in height and \
re.search(r'\d+', height):
styles.append('margin-top: %s' % self.ensure_unit(height)) styles.append('margin-top: %s' % self.ensure_unit(height))
if attrib.has_key('width'): if attrib.has_key('width'):
width = attrib.pop('width').strip() width = attrib.pop('width').strip()
if width: if width and re.search(r'\d+', width):
styles.append('text-indent: %s' % self.ensure_unit(width)) styles.append('text-indent: %s' % self.ensure_unit(width))
if width.startswith('-'): if width.startswith('-'):
styles.append('margin-left: %s' % self.ensure_unit(width[1:])) styles.append('margin-left: %s' % self.ensure_unit(width[1:]))
@ -714,6 +721,9 @@ class MobiReader(object):
self.processed_html += self.mobi_html[pos:end] + (anchor % oend) self.processed_html += self.mobi_html[pos:end] + (anchor % oend)
pos = end pos = end
self.processed_html += self.mobi_html[pos:] self.processed_html += self.mobi_html[pos:]
# Remove anchors placed inside entities
self.processed_html = re.sub(r'&([^;]*?)(<a id="filepos\d+"></a>)([^;]*);',
r'&\1\3;\2', self.processed_html)
def extract_images(self, processed_records, output_dir): def extract_images(self, processed_records, output_dir):