diff --git a/src/calibre/ebooks/mobi/reader.py b/src/calibre/ebooks/mobi/reader.py index 064751a878..d89b18475a 100644 --- a/src/calibre/ebooks/mobi/reader.py +++ b/src/calibre/ebooks/mobi/reader.py @@ -317,7 +317,13 @@ class MobiReader(object): if root.xpath('descendant::p/descendant::p'): from lxml.html import soupparser self.log.warning('Malformed markup, parsing using BeautifulSoup') - root = soupparser.fromstring(self.processed_html) + try: + root = soupparser.fromstring(self.processed_html) + except Exception, err: + self.log.warning('MOBI markup appears to contain random bytes. Stripping.') + self.processed_html = self.remove_random_bytes(self.processed_html) + root = soupparser.fromstring(self.processed_html) + if root.tag != 'html': self.log.warn('File does not have opening tag') @@ -457,7 +463,7 @@ class MobiReader(object): self.processed_html = self.processed_html.replace('' not in height and \ + re.search(r'\d+', height): styles.append('margin-top: %s' % self.ensure_unit(height)) if attrib.has_key('width'): width = attrib.pop('width').strip() - if width: + if width and re.search(r'\d+', width): styles.append('text-indent: %s' % self.ensure_unit(width)) if width.startswith('-'): styles.append('margin-left: %s' % self.ensure_unit(width[1:])) @@ -714,6 +721,9 @@ class MobiReader(object): self.processed_html += self.mobi_html[pos:end] + (anchor % oend) pos = end self.processed_html += self.mobi_html[pos:] + # Remove anchors placed inside entities + self.processed_html = re.sub(r'&([^;]*?)()([^;]*);', + r'&\1\3;\2', self.processed_html) def extract_images(self, processed_records, output_dir):