From da29a58363f770f38f01e02e3cb4221331666c0a Mon Sep 17 00:00:00 2001 From: "Marshall T. Vandegrift" Date: Fri, 11 Jul 2008 14:37:27 -0400 Subject: [PATCH] Integrated own cleanup patch --- src/calibre/ebooks/mobi/reader.py | 30 +++++++++++++++++++++++++----- 1 file changed, 25 insertions(+), 5 deletions(-) diff --git a/src/calibre/ebooks/mobi/reader.py b/src/calibre/ebooks/mobi/reader.py index dea87dbd8c..05093f3c1a 100644 --- a/src/calibre/ebooks/mobi/reader.py +++ b/src/calibre/ebooks/mobi/reader.py @@ -13,7 +13,7 @@ except ImportError: import Image as PILImage from calibre import __appname__ -from calibre.ebooks.BeautifulSoup import BeautifulSoup +from calibre.ebooks.BeautifulSoup import BeautifulSoup, Tag from calibre.ebooks.mobi import MobiError from calibre.ebooks.mobi.huffcdic import HuffReader from calibre.ebooks.mobi.palmdoc import decompress_doc @@ -165,13 +165,14 @@ class MobiReader(object): self.processed_html = self.processed_html.decode(self.book_header.codec, 'ignore') self.extract_images(processed_records, output_dir) self.replace_page_breaks() - self.cleanup() + self.cleanup_html() self.processed_html = re.compile('', re.IGNORECASE).sub( '\n\n', self.processed_html) soup = BeautifulSoup(self.processed_html.replace('> <', '>\n<')) + self.cleanup_soup(soup) guide = soup.find('guide') for elem in soup.findAll(['metadata', 'guide']): elem.extract() @@ -192,10 +193,29 @@ class MobiReader(object): if ncx: open(os.path.splitext(htmlfile)[0]+'.ncx', 'wb').write(ncx) - def cleanup(self): + def cleanup_html(self): self.processed_html = re.sub(r'
', '', self.processed_html) - self.processed_html = re.sub(r'<([^>]*) height="([^"]*)"', r'<\1 style="margin-top: \2"', self.processed_html) - self.processed_html = re.sub(r'<([^>]*) width="([^"]*)"', r'<\1 style="text-indent: \2"', self.processed_html) + + def cleanup_soup(self, soup): + for tag in soup.recursiveChildGenerator(): + if not isinstance(tag, Tag): continue + styles = [] + try: + styles.append(tag['style']) + except KeyError: + pass + try: + styles.append('margin-top: %s' % tag['height']) + del tag['height'] + except KeyError: + pass + try: + styles.append('text-indent: %s' % tag['width']) + del tag['width'] + except KeyError: + pass + if styles: + tag['style'] = '; '.join(styles) def create_opf(self, htmlfile, guide=None): mi = self.book_header.exth.mi