MOBI Input: Handle files that have spurious closing </body> and/or </html> tags in their markup. Fixes #925833 (prc file fails to read or convert)

This commit is contained in:
Kovid Goyal 2012-02-07 14:10:18 +05:30
parent 5f4f7b4c09
commit b56891cdae

View File

@ -516,6 +516,17 @@ class MobiReader(object):
self.processed_html = re.sub(r'(?i)(?P<para></p[^>]*>)\s*(?P<styletags>(</(h\d+|i|b|u|em|small|big|strong|tt)>\s*){1,})', '\g<styletags>'+'\g<para>', self.processed_html) self.processed_html = re.sub(r'(?i)(?P<para></p[^>]*>)\s*(?P<styletags>(</(h\d+|i|b|u|em|small|big|strong|tt)>\s*){1,})', '\g<styletags>'+'\g<para>', self.processed_html)
self.processed_html = re.sub(r'(?i)(?P<blockquote>(</(blockquote|div)[^>]*>\s*){1,})(?P<para></p[^>]*>)', '\g<para>'+'\g<blockquote>', self.processed_html) self.processed_html = re.sub(r'(?i)(?P<blockquote>(</(blockquote|div)[^>]*>\s*){1,})(?P<para></p[^>]*>)', '\g<para>'+'\g<blockquote>', self.processed_html)
self.processed_html = re.sub(r'(?i)(?P<para><p[^>]*>)\s*(?P<blockquote>(<(blockquote|div)[^>]*>\s*){1,})', '\g<blockquote>'+'\g<para>', self.processed_html) self.processed_html = re.sub(r'(?i)(?P<para><p[^>]*>)\s*(?P<blockquote>(<(blockquote|div)[^>]*>\s*){1,})', '\g<blockquote>'+'\g<para>', self.processed_html)
# Count the closing </body> and </html> tags so that spurious duplicates
# can be stripped from the markup.
bods = htmls = 0
for x in re.finditer(r'</body>|</html>', self.processed_html):
    # x is a match object: compare the matched text (x.group()), not the
    # match itself, against the tag. Comparing the match object directly
    # is always False and would count every tag as </html>.
    if x.group() == '</body>':
        bods += 1
    else:
        htmls += 1
    # Only "more than one" matters below, so stop scanning once both
    # kinds of tag are known to be duplicated.
    if bods > 1 and htmls > 1:
        break
# When a tag occurs more than once, every occurrence of it is removed,
# including the last one.
if bods > 1:
    self.processed_html = self.processed_html.replace('</body>', '')
if htmls > 1:
    self.processed_html = self.processed_html.replace('</html>', '')
def remove_random_bytes(self, html): def remove_random_bytes(self, html):