From de6e5f5cc0a90898e0334c1e242d874a48e349a6 Mon Sep 17 00:00:00 2001 From: John Schember Date: Thu, 3 Feb 2011 21:57:52 -0500 Subject: [PATCH] FB2 Output: Fix bug with writing soft scene breaks based on margin. TXT Output: Fix bug with writing soft scene breaks based on margin. PML Input: Reduce number of empty lines needed to produce a soft scene break. PML Output: Handle soft scene breaks based on empty paragraphs and margin. Add handling of left margin. Correct writing of \c and \r tags. Sanitize text so it is not mistaken for a PML code. General work to produce cleaner output. --- src/calibre/ebooks/fb2/fb2ml.py | 2 +- src/calibre/ebooks/pml/pmlconverter.py | 2 +- src/calibre/ebooks/pml/pmlml.py | 93 ++++++++++++++++++++------ src/calibre/ebooks/txt/txtml.py | 2 +- 4 files changed, 77 insertions(+), 22 deletions(-) diff --git a/src/calibre/ebooks/fb2/fb2ml.py b/src/calibre/ebooks/fb2/fb2ml.py index dedfe963f6..6af058da7b 100644 --- a/src/calibre/ebooks/fb2/fb2ml.py +++ b/src/calibre/ebooks/fb2/fb2ml.py @@ -398,7 +398,7 @@ class FB2MLizer(object): tags += p_tag fb2_out.append('' % self.image_hrefs[page.abshref(elem_tree.attrib['src'])]) if tag in ('br', 'hr') or ems: - if not ems: + if ems < 1: multiplier = 1 else: multiplier = ems diff --git a/src/calibre/ebooks/pml/pmlconverter.py b/src/calibre/ebooks/pml/pmlconverter.py index 20d8c7186b..7d1e74e3f4 100644 --- a/src/calibre/ebooks/pml/pmlconverter.py +++ b/src/calibre/ebooks/pml/pmlconverter.py @@ -603,7 +603,7 @@ class PML_HTMLizer(object): if empty: empty_count += 1 - if empty_count == 3: + if empty_count == 2: output.append('

 

') else: empty_count = 0 diff --git a/src/calibre/ebooks/pml/pmlml.py b/src/calibre/ebooks/pml/pmlml.py index ceb7f36124..582854bc69 100644 --- a/src/calibre/ebooks/pml/pmlml.py +++ b/src/calibre/ebooks/pml/pmlml.py @@ -10,10 +10,13 @@ Transform OEB content into PML markup import re +from lxml import etree + from calibre.ebooks.oeb.base import XHTML, XHTML_NS, barename, namespace from calibre.ebooks.oeb.stylizer import Stylizer from calibre.ebooks.pdb.ereader import image_name from calibre.ebooks.pml import unipmlcode +from calibre.utils.cleantext import clean_ascii_chars TAG_MAP = { 'b' : 'B', @@ -64,8 +67,8 @@ SEPARATE_TAGS = [ 'h4', 'h5', 'h6', - 'p', - 'div', + 'hr', + 'img', 'li', 'tr', ] @@ -122,9 +125,12 @@ class PMLMLizer(object): text = [u''] for item in self.oeb_book.spine: self.log.debug('Converting %s to PML markup...' % item.href) - stylizer = Stylizer(item.data, item.href, self.oeb_book, self.opts, self.opts.output_profile) + content = unicode(etree.tostring(item.data, encoding=unicode)) + content = self.prepare_text(content) + content = etree.fromstring(content) + stylizer = Stylizer(content, item.href, self.oeb_book, self.opts, self.opts.output_profile) text.append(self.add_page_anchor(item)) - text += self.dump_text(item.data.find(XHTML('body')), stylizer, item) + text += self.dump_text(content.find(XHTML('body')), stylizer, item) return ''.join(text) def add_page_anchor(self, page): @@ -147,6 +153,21 @@ class PMLMLizer(object): text = text.replace('\r', ' ') return text + def prepare_string_for_pml(self, text): + text = self.remove_newlines(text) + # Replace \ with \\ so \ in the text is not interperted as + # a pml code. + text = text.replace('\\', '\\\\') + # Replace sequences of \\c \\c with pml sequences denoting + # empty lines. + text = text.replace('\\\\c \\\\c', '\\c \n\\c\n') + return text + + def prepare_text(self, text): + # Replace empty paragraphs with \c pml codes used to denote emtpy lines. + text = re.sub(ur'(?<=

)\s*]*>[\xc2\xa0\s]*

', '\\c\n\\c', text) + return text + def clean_text(self, text): # Remove excessive \p tags text = re.sub(r'\\p\s*\\p', '', text) @@ -171,16 +192,19 @@ class PMLMLizer(object): # Remove excessive spaces text = re.sub('[ ]{2,}', ' ', text) + + # Condense excessive \c empty line sequences. + text = re.sub('(\\c\s*\\c\s*){2,}', '\\c \n\\c\n', text) # Remove excessive newlines. text = re.sub('\n[ ]+\n', '\n\n', text) if self.opts.remove_paragraph_spacing: text = re.sub('\n{2,}', '\n', text) - text = re.sub('(?imu)^(?P.+)$', lambda mo: mo.group('text') if re.search(r'\\[XxCm]', mo.group('text')) else ' %s' % mo.group('text'), text) + # Only indent lines that don't have special formatting + text = re.sub('(?imu)^(?P.+)$', lambda mo: mo.group('text') if re.search(r'\\[XxCmrctTp]', mo.group('text')) else ' %s' % mo.group('text'), text) else: text = re.sub('\n{3,}', '\n\n', text) - return text def dump_text(self, elem, stylizer, page, tag_stack=[]): @@ -203,7 +227,7 @@ class PMLMLizer(object): tags.append('block') # Process tags that need special processing and that do not have inner - # text. Usually these require an argument + # text. Usually these require an argument. if tag in IMAGE_TAGS: if elem.attrib.get('src', None): if page.abshref(elem.attrib['src']) not in self.image_hrefs.keys(): @@ -212,7 +236,7 @@ class PMLMLizer(object): else: self.image_hrefs[page.abshref(elem.attrib['src'])] = image_name('%s.png' % len(self.image_hrefs.keys()), self.image_hrefs.keys()).strip('\x00') text.append('\\m="%s"' % self.image_hrefs[page.abshref(elem.attrib['src'])]) - if tag == 'hr': + elif tag == 'hr': w = '\\w' width = elem.get('width') if width: @@ -222,6 +246,10 @@ class PMLMLizer(object): else: w += '="50%"' text.append(w) + elif tag == 'br': + text.append('\n\\c \n\\c\n') + + # TOC markers. toc_name = elem.attrib.get('name', None) toc_id = elem.attrib.get('id', None) if (toc_id or toc_name) and tag not in ('h1', 'h2','h3','h4','h5','h6',): @@ -234,9 +262,10 @@ class PMLMLizer(object): # Process style information that needs holds a single tag # Commented out because every page in an OEB book starts with this style - #if style['page-break-before'] == 'always': - # text.append('\\p') + if style['page-break-before'] == 'always': + text.append('\\p') + # Process basic PML tags. pml_tag = TAG_MAP.get(tag, None) if pml_tag and pml_tag not in tag_stack+tags: text.append('\\%s' % pml_tag) @@ -270,34 +299,60 @@ class PMLMLizer(object): if style_tag and style_tag not in tag_stack+tags: text.append('\\%s' % style_tag) tags.append(style_tag) - # margin + + # margin left + try: + mms = int(float(style['margin-left']) * 100 / style.height) + if mms: + text.append('\\T="%s%%"' % mms) + except: + pass + + # Soft scene breaks. + try: + ems = int(round((float(style.marginTop) / style.fontSize) - 1)) + if ems >= 1: + text.append('\n\\c \n\\c\n') + except: + pass - # Proccess tags that contain text. + # Proccess text within this tag. if hasattr(elem, 'text') and elem.text: - text.append(self.remove_newlines(elem.text)) + text.append(self.prepare_string_for_pml(elem.text)) + # Process inner tags for item in elem: text += self.dump_text(item, stylizer, page, tag_stack+tags) + # Close opened tags. tags.reverse() text += self.close_tags(tags) - if tag in SEPARATE_TAGS: - text.append('\n\n') + #if tag in SEPARATE_TAGS: + # text.append('\n\n') - #if style['page-break-after'] == 'always': - # text.append('\\p') + if style['page-break-after'] == 'always': + text.append('\\p') + # Process text after this tag but not within another. if hasattr(elem, 'tail') and elem.tail: - text.append(self.remove_newlines(elem.tail)) + text.append(self.prepare_string_for_pml(elem.tail)) return text def close_tags(self, tags): text = [] for tag in tags: + # block isn't a real tag we just use + # it to determine when we need to start + # a new text block. if tag == 'block': text.append('\n\n') else: - text.append('\\%s' % tag) + # closing \c and \r need to be placed + # on the next line per PML spec. + if tag in ('c', 'r'): + text.append('\n\\%s' % tag) + else: + text.append('\\%s' % tag) return text diff --git a/src/calibre/ebooks/txt/txtml.py b/src/calibre/ebooks/txt/txtml.py index 6654e70475..c2ee3f37c5 100644 --- a/src/calibre/ebooks/txt/txtml.py +++ b/src/calibre/ebooks/txt/txtml.py @@ -226,7 +226,7 @@ class TXTMLizer(object): # Soft scene breaks. try: ems = int(round((float(style.marginTop) / style.fontSize) - 1)) - if ems: + if ems >= 1: text.append('\n' * ems) except: pass