This commit is contained in:
ldolse 2010-11-22 18:49:52 +08:00
parent 333cb0652e
commit 0013d4e059

View File

@@ -477,8 +477,8 @@ class MobiReader(object):
self.processed_html = self.processed_html.replace('<mbp: ', '<mbp:')
self.processed_html = re.sub(r'<?xml[^>]*>', '', self.processed_html)
# Rearrange various style tags & paragraph tags so that lxml and BeautifulSoup don't get upset
self.processed_html = re.sub(r'(?i)(?P<styletags>(<(i|b|u|em|small|tt)>\s*){1,})(?P<para><p[^>]*>)', '\g<para>'+'\g<styletags>', self.processed_html)
self.processed_html = re.sub(r'(?i)(?P<para></p[^>]*>)\s*(?P<styletags>(</(i|b|u|em|small|tt)>\s*){1,})', '\g<styletags>'+'\g<para>', self.processed_html)
self.processed_html = re.sub(r'(?i)(?P<styletags>(<(i|b|u|em|small|big|strong|tt)>\s*){1,})(?P<para><p[^>]*>)', '\g<para>'+'\g<styletags>', self.processed_html)
self.processed_html = re.sub(r'(?i)(?P<para></p[^>]*>)\s*(?P<styletags>(</(i|b|u|em|small|big|strong|tt)>\s*){1,})', '\g<styletags>'+'\g<para>', self.processed_html)
def remove_random_bytes(self, html):
return re.sub('\x14|\x15|\x19|\x1c|\x1d|\xef|\x12|\x13|\xec|\x08',