Fix for MOBI files that place inline style tags (<i>, <b>, <u>, <em>, <small>, <tt>) outside of paragraph tags, which upsets lxml and BeautifulSoup.

This commit is contained in:
ldolse 2010-11-22 18:35:44 +08:00
parent 3867874839
commit 333cb0652e

View File

@ -476,6 +476,9 @@ class MobiReader(object):
self.processed_html = self.processed_html.replace('> <', '>\n<')
self.processed_html = self.processed_html.replace('<mbp: ', '<mbp:')
self.processed_html = re.sub(r'<?xml[^>]*>', '', self.processed_html)
# Rearrange various style tags & paragraph tags so that lxml and BeautifulSoup don't get upset
self.processed_html = re.sub(r'(?i)(?P<styletags>(<(i|b|u|em|small|tt)>\s*){1,})(?P<para><p[^>]*>)', '\g<para>'+'\g<styletags>', self.processed_html)
self.processed_html = re.sub(r'(?i)(?P<para></p[^>]*>)\s*(?P<styletags>(</(i|b|u|em|small|tt)>\s*){1,})', '\g<styletags>'+'\g<para>', self.processed_html)
def remove_random_bytes(self, html):
return re.sub('\x14|\x15|\x19|\x1c|\x1d|\xef|\x12|\x13|\xec|\x08',