From de6e5f5cc0a90898e0334c1e242d874a48e349a6 Mon Sep 17 00:00:00 2001
From: John Schember
Date: Thu, 3 Feb 2011 21:57:52 -0500
Subject: [PATCH] FB2 Output: Fix bug with writing soft scene breaks based on
margin. TXT Output: Fix bug with writing soft scene breaks based on margin.
PML Input: Reduce number of empty lines needed to produce a soft scene break.
PML Output: Handle soft scene breaks based on empty paragraphs and margin.
Add handling of left margin. Correct writing of \c and \r tags. Sanitize text
so it is not mistaken for a PML code. General work to produce cleaner output.
---
src/calibre/ebooks/fb2/fb2ml.py | 2 +-
src/calibre/ebooks/pml/pmlconverter.py | 2 +-
src/calibre/ebooks/pml/pmlml.py | 93 ++++++++++++++++++++------
src/calibre/ebooks/txt/txtml.py | 2 +-
4 files changed, 77 insertions(+), 22 deletions(-)
diff --git a/src/calibre/ebooks/fb2/fb2ml.py b/src/calibre/ebooks/fb2/fb2ml.py
index dedfe963f6..6af058da7b 100644
--- a/src/calibre/ebooks/fb2/fb2ml.py
+++ b/src/calibre/ebooks/fb2/fb2ml.py
@@ -398,7 +398,7 @@ class FB2MLizer(object):
tags += p_tag
fb2_out.append('' % self.image_hrefs[page.abshref(elem_tree.attrib['src'])])
if tag in ('br', 'hr') or ems:
- if not ems:
+ if ems < 1:
multiplier = 1
else:
multiplier = ems
diff --git a/src/calibre/ebooks/pml/pmlconverter.py b/src/calibre/ebooks/pml/pmlconverter.py
index 20d8c7186b..7d1e74e3f4 100644
--- a/src/calibre/ebooks/pml/pmlconverter.py
+++ b/src/calibre/ebooks/pml/pmlconverter.py
@@ -603,7 +603,7 @@ class PML_HTMLizer(object):
if empty:
empty_count += 1
- if empty_count == 3:
+ if empty_count == 2:
output.append('
')
else:
empty_count = 0
diff --git a/src/calibre/ebooks/pml/pmlml.py b/src/calibre/ebooks/pml/pmlml.py
index ceb7f36124..582854bc69 100644
--- a/src/calibre/ebooks/pml/pmlml.py
+++ b/src/calibre/ebooks/pml/pmlml.py
@@ -10,10 +10,13 @@ Transform OEB content into PML markup
import re
+from lxml import etree
+
from calibre.ebooks.oeb.base import XHTML, XHTML_NS, barename, namespace
from calibre.ebooks.oeb.stylizer import Stylizer
from calibre.ebooks.pdb.ereader import image_name
from calibre.ebooks.pml import unipmlcode
+from calibre.utils.cleantext import clean_ascii_chars
TAG_MAP = {
'b' : 'B',
@@ -64,8 +67,8 @@ SEPARATE_TAGS = [
'h4',
'h5',
'h6',
- 'p',
- 'div',
+ 'hr',
+ 'img',
'li',
'tr',
]
@@ -122,9 +125,12 @@ class PMLMLizer(object):
text = [u'']
for item in self.oeb_book.spine:
self.log.debug('Converting %s to PML markup...' % item.href)
- stylizer = Stylizer(item.data, item.href, self.oeb_book, self.opts, self.opts.output_profile)
+ content = unicode(etree.tostring(item.data, encoding=unicode))
+ content = self.prepare_text(content)
+ content = etree.fromstring(content)
+ stylizer = Stylizer(content, item.href, self.oeb_book, self.opts, self.opts.output_profile)
text.append(self.add_page_anchor(item))
- text += self.dump_text(item.data.find(XHTML('body')), stylizer, item)
+ text += self.dump_text(content.find(XHTML('body')), stylizer, item)
return ''.join(text)
def add_page_anchor(self, page):
@@ -147,6 +153,21 @@ class PMLMLizer(object):
text = text.replace('\r', ' ')
return text
+ def prepare_string_for_pml(self, text):
+ text = self.remove_newlines(text)
+ # Replace \ with \\ so \ in the text is not interpreted as
+ # a pml code.
+ text = text.replace('\\', '\\\\')
+ # Replace sequences of \\c \\c with pml sequences denoting
+ # empty lines.
+ text = text.replace('\\\\c \\\\c', '\\c \n\\c\n')
+ return text
+
+ def prepare_text(self, text):
+ # Replace empty paragraphs with \c pml codes used to denote empty lines.
+ text = re.sub(ur'(?<=
)\s*]*>[\xc2\xa0\s]*
', '\\c\n\\c', text)
+ return text
+
def clean_text(self, text):
# Remove excessive \p tags
text = re.sub(r'\\p\s*\\p', '', text)
@@ -171,16 +192,19 @@ class PMLMLizer(object):
# Remove excessive spaces
text = re.sub('[ ]{2,}', ' ', text)
+
+ # Condense excessive \c empty line sequences.
+ text = re.sub('(\\c\s*\\c\s*){2,}', '\\c \n\\c\n', text)
# Remove excessive newlines.
text = re.sub('\n[ ]+\n', '\n\n', text)
if self.opts.remove_paragraph_spacing:
text = re.sub('\n{2,}', '\n', text)
- text = re.sub('(?imu)^(?P.+)$', lambda mo: mo.group('text') if re.search(r'\\[XxCm]', mo.group('text')) else ' %s' % mo.group('text'), text)
+ # Only indent lines that don't have special formatting
+ text = re.sub('(?imu)^(?P.+)$', lambda mo: mo.group('text') if re.search(r'\\[XxCmrctTp]', mo.group('text')) else ' %s' % mo.group('text'), text)
else:
text = re.sub('\n{3,}', '\n\n', text)
-
return text
def dump_text(self, elem, stylizer, page, tag_stack=[]):
@@ -203,7 +227,7 @@ class PMLMLizer(object):
tags.append('block')
# Process tags that need special processing and that do not have inner
- # text. Usually these require an argument
+ # text. Usually these require an argument.
if tag in IMAGE_TAGS:
if elem.attrib.get('src', None):
if page.abshref(elem.attrib['src']) not in self.image_hrefs.keys():
@@ -212,7 +236,7 @@ class PMLMLizer(object):
else:
self.image_hrefs[page.abshref(elem.attrib['src'])] = image_name('%s.png' % len(self.image_hrefs.keys()), self.image_hrefs.keys()).strip('\x00')
text.append('\\m="%s"' % self.image_hrefs[page.abshref(elem.attrib['src'])])
- if tag == 'hr':
+ elif tag == 'hr':
w = '\\w'
width = elem.get('width')
if width:
@@ -222,6 +246,10 @@ class PMLMLizer(object):
else:
w += '="50%"'
text.append(w)
+ elif tag == 'br':
+ text.append('\n\\c \n\\c\n')
+
+ # TOC markers.
toc_name = elem.attrib.get('name', None)
toc_id = elem.attrib.get('id', None)
if (toc_id or toc_name) and tag not in ('h1', 'h2','h3','h4','h5','h6',):
@@ -234,9 +262,10 @@ class PMLMLizer(object):
# Process style information that needs holds a single tag
# Commented out because every page in an OEB book starts with this style
- #if style['page-break-before'] == 'always':
- # text.append('\\p')
+ if style['page-break-before'] == 'always':
+ text.append('\\p')
+ # Process basic PML tags.
pml_tag = TAG_MAP.get(tag, None)
if pml_tag and pml_tag not in tag_stack+tags:
text.append('\\%s' % pml_tag)
@@ -270,34 +299,60 @@ class PMLMLizer(object):
if style_tag and style_tag not in tag_stack+tags:
text.append('\\%s' % style_tag)
tags.append(style_tag)
- # margin
+
+ # margin left
+ try:
+ mms = int(float(style['margin-left']) * 100 / style.height)
+ if mms:
+ text.append('\\T="%s%%"' % mms)
+ except:
+ pass
+
+ # Soft scene breaks.
+ try:
+ ems = int(round((float(style.marginTop) / style.fontSize) - 1))
+ if ems >= 1:
+ text.append('\n\\c \n\\c\n')
+ except:
+ pass
- # Proccess tags that contain text.
+ # Process text within this tag.
if hasattr(elem, 'text') and elem.text:
- text.append(self.remove_newlines(elem.text))
+ text.append(self.prepare_string_for_pml(elem.text))
+ # Process inner tags
for item in elem:
text += self.dump_text(item, stylizer, page, tag_stack+tags)
+ # Close opened tags.
tags.reverse()
text += self.close_tags(tags)
- if tag in SEPARATE_TAGS:
- text.append('\n\n')
+ #if tag in SEPARATE_TAGS:
+ # text.append('\n\n')
- #if style['page-break-after'] == 'always':
- # text.append('\\p')
+ if style['page-break-after'] == 'always':
+ text.append('\\p')
+ # Process text after this tag but not within another.
if hasattr(elem, 'tail') and elem.tail:
- text.append(self.remove_newlines(elem.tail))
+ text.append(self.prepare_string_for_pml(elem.tail))
return text
def close_tags(self, tags):
text = []
for tag in tags:
+ # block isn't a real tag; we just use
+ # it to determine when we need to start
+ # a new text block.
if tag == 'block':
text.append('\n\n')
else:
- text.append('\\%s' % tag)
+ # closing \c and \r need to be placed
+ # on the next line per PML spec.
+ if tag in ('c', 'r'):
+ text.append('\n\\%s' % tag)
+ else:
+ text.append('\\%s' % tag)
return text
diff --git a/src/calibre/ebooks/txt/txtml.py b/src/calibre/ebooks/txt/txtml.py
index 6654e70475..c2ee3f37c5 100644
--- a/src/calibre/ebooks/txt/txtml.py
+++ b/src/calibre/ebooks/txt/txtml.py
@@ -226,7 +226,7 @@ class TXTMLizer(object):
# Soft scene breaks.
try:
ems = int(round((float(style.marginTop) / style.fontSize) - 1))
- if ems:
+ if ems >= 1:
text.append('\n' * ems)
except:
pass