diff --git a/src/calibre/ebooks/pdb/ereader/writer.py b/src/calibre/ebooks/pdb/ereader/writer.py
index f49aa4e125..c99c75a929 100644
--- a/src/calibre/ebooks/pdb/ereader/writer.py
+++ b/src/calibre/ebooks/pdb/ereader/writer.py
@@ -16,7 +16,7 @@ from calibre.ebooks.pdb.formatwriter import FormatWriter
from calibre.ebooks.oeb.base import OEB_IMAGES
from calibre.ebooks.pdb.header import PdbHeaderBuilder
from calibre.ebooks.pdb.ereader import image_name
-from calibre.ebooks.pml.pmlconverter import html_to_pml
+from calibre.ebooks.pml.pmlml import PMLMLizer
IDENTITY = 'PNRdPPrs'
@@ -31,7 +31,7 @@ class Writer(FormatWriter):
self.log = log
def write_content(self, oeb_book, out_stream, metadata=None):
- text = self._text(oeb_book.spine)
+ text = self._text(oeb_book)
images = self._images(oeb_book.manifest)
metadata = [self._metadata(metadata)]
@@ -41,16 +41,15 @@ class Writer(FormatWriter):
lengths = [len(i) for i in sections]
- pdbHeaderBuilder = PdbHeaderBuilder(IDENTITY, '')
+ pdbHeaderBuilder = PdbHeaderBuilder(IDENTITY, metadata[0].parition()[0])
pdbHeaderBuilder.build_header(lengths, out_stream)
for item in sections:
out_stream.write(item)
- def _text(self, pages):
- pml = ''
- for page in pages:
- pml += html_to_pml(unicode(page)).encode('cp1252')
+ def _text(self, oeb_book):
+ pmlmlizer = PMLMLizer(ignore_tables=self.opts.linearize_tables)
+ pml = unicode(pmlmlizer.extract_content(oeb_book, self.opts)).encode('cp1252')
pml_pages = []
for i in range(0, (len(pml) / MAX_RECORD_SIZE) + 1):
diff --git a/src/calibre/ebooks/pml/output.py b/src/calibre/ebooks/pml/output.py
index c5fbc990af..9d07718654 100644
--- a/src/calibre/ebooks/pml/output.py
+++ b/src/calibre/ebooks/pml/output.py
@@ -12,7 +12,7 @@ from calibre.customize.conversion import OutputFormatPlugin
from calibre.ptempfile import TemporaryDirectory
from calibre.utils.zipfile import ZipFile
from calibre.ebooks.oeb.base import OEB_IMAGES
-from calibre.ebooks.pml.pmlconverter import html_to_pml
+from calibre.ebooks.pml.pmlml import PMLMLizer
class PMLOutput(OutputFormatPlugin):
@@ -22,22 +22,16 @@ class PMLOutput(OutputFormatPlugin):
def convert(self, oeb_book, output_path, input_plugin, opts, log):
with TemporaryDirectory('_pmlz_output') as tdir:
- self.process_spine(oeb_book.spine, tdir)
+ pmlmlizer = PMLMLizer(ignore_tables=opts.linearize_tables)
+ content = pmlmlizer.extract_content(oeb_book, opts)
+ with open(os.path.join(tdir, 'index.pml'), 'wb') as out:
+ out.write(content.encode('utf-8'))
+
self.write_images(oeb_book.manifest, tdir)
pmlz = ZipFile(output_path, 'w')
pmlz.add_dir(tdir)
- def process_spine(self, spine, out_dir):
- for item in spine:
- html = html_to_pml(unicode(item)).encode('utf-8')
-
- name = os.path.splitext(os.path.basename(item.href))[0] + '.pml'
- path = os.path.join(out_dir, name)
-
- with open(path, 'wb') as out:
- out.write(html)
-
def write_images(self, manifest, out_dir):
for item in manifest:
if item.media_type in OEB_IMAGES:
diff --git a/src/calibre/ebooks/pml/pmlconverter.py b/src/calibre/ebooks/pml/pmlconverter.py
index dded21c38c..0cd7da8e72 100644
--- a/src/calibre/ebooks/pml/pmlconverter.py
+++ b/src/calibre/ebooks/pml/pmlconverter.py
@@ -10,7 +10,6 @@ __docformat__ = 'restructuredtext en'
import re
-from calibre import entity_to_unicode
from calibre.ebooks.pdb.ereader import image_name
from calibre.ebooks.htmlsymbols import HTML_SYMBOLS
@@ -67,75 +66,6 @@ PML_HTML_RULES = [
(re.compile(r'\\\\'), lambda match: '\\'),
]
-HTML_PML_RULES = [
-
- (re.compile(r'\\'), lambda match: '\\\\'),
- (re.compile('(?<=[^\n])[ ]*
'), lambda match: '\n'),
- (re.compile('
(?=^\n|^\r\n)'), lambda match: '\n'),
-
-
- # Clean up HTML
- (re.compile('@page.*?}'), lambda match: ''),
- (re.compile('.*?', re.DOTALL), lambda match: ''),
- (re.compile('.*?', re.DOTALL), lambda match: ''),
-
- # Reflow paragraphs
- (re.compile('(?P.*?)', re.DOTALL), lambda match: match.group('text').replace('\r\n', ' ').replace('\n', ' ')),
-
- # HTML to PML
- (re.compile('.+?).*?">(?P.+?)'), lambda match: '\\Sd="%s"%s\\Sd' % (match.group('target'), match.group('text'))),
- (re.compile('.+?).*?">(?P.+?)'), lambda match: '\\Fn="%s"%s\\Fn' % (match.group('target'), match.group('text'))),
- (re.compile('.+?).*?">'), lambda match: '\\\\Q="%s"' % match.group('target')),
- (re.compile('#.+?).*?">(?P)', re.DOTALL), lambda match: '\\q="%s"%s\\q' % (match.group('target'), match.group('text'))),
- (re.compile('.+?)".*?>(.*?)*'), lambda match: '\\m="%s"' % image_name(match.group('name')).strip('\x00')),
- (re.compile('&(?P#\d+);'), lambda match: entity_to_unicode(match)),
- (re.compile('&(?P.+);'), lambda match: entity_to_unicode(match)),
- (re.compile('(?P.+?)', re.DOTALL), lambda match: '\\k%s\\k' % match.group('text')),
- (re.compile('(?P.+?)', re.DOTALL), lambda match: '\\k%s\\k' % match.group('text')),
- (re.compile('(?P.+?)', re.DOTALL), lambda match: '\\Sb%s\\Sb' % match.group('text')),
- (re.compile('(?P.+?)', re.DOTALL), lambda match: '\\Sb%s\\Sb' % match.group('text')),
- (re.compile('(?P.+?)', re.DOTALL), lambda match: '\\Sp%s\\Sp' % match.group('text')),
- (re.compile('(?P.+?)', re.DOTALL), lambda match: '\\Sp%s\\Sp' % match.group('text')),
- (re.compile('(?P.+?)', re.DOTALL), lambda match: '\\B%s\\B' % match.group('text')),
- (re.compile('(?P.+?)', re.DOTALL), lambda match: '\\B%s\\B' % match.group('text')),
- (re.compile('(?P.+?)', re.DOTALL), lambda match: '\\B%s\\B' % match.group('text')),
- (re.compile('(?P.+?)', re.DOTALL), lambda match: '\\B%s\\B' % match.group('text')),
- (re.compile('(?P.+?)', re.DOTALL), lambda match: '\\l%s\\l' % match.group('text')),
- (re.compile('(?P.+?)', re.DOTALL), lambda match: '\\l%s\\l' % match.group('text')),
- (re.compile('\d+)%".*?>'), lambda match: '\\w="%s%%"' % match.group('val')),
- (re.compile('\d+)%*;.*?>(?P.+?)', re.MULTILINE), lambda match: '\\T="%s%%"%s$' % (match.group('val'), match.group('text'))),
- (re.compile('(?P.+?)', re.DOTALL), lambda match: '\\t%s\\t' % match.group('text')),
- (re.compile('', re.DOTALL), lambda match: '\\v%s\\v' % match.group('text')),
- (re.compile('(?P.+?)', re.DOTALL), lambda match: '\\o%s\\o' % match.group('text')),
- (re.compile('(?P.+?)', re.DOTALL), lambda match: '\\o%s\\o' % match.group('text')),
- (re.compile('(?P.+?)', re.DOTALL), lambda match: '\\u%s\\u' % match.group('text')),
- (re.compile('(?P.+?)', re.DOTALL), lambda match: '\\\\i%s\\i' % match.group('text')),
- (re.compile('(?P.+?)', re.DOTALL), lambda match: '\\\\i%s\\i' % match.group('text')),
- (re.compile('(?P.+?)', re.DOTALL), lambda match: '\\r%s\\r' % match.group('text')),
- (re.compile('(?P.+?)', re.DOTALL), lambda match: '\\c%s\\c' % match.group('text')),
- (re.compile('[0-4]).*?>(?P.+?)', re.DOTALL), lambda match: '\\X%s%s\\X%s' % (int(match.group('val')) + 1, match.group('text'), int(match.group('val')) + 1)),
- (re.compile('(?P.+?)', re.DOTALL), lambda match: '\\x%s\\x' % match.group('text')),
- (re.compile('
'), lambda match: '\n'),
- (re.compile('
'), lambda match: '\n'),
-
- # Remove remaining HTML tags
- (re.compile('<.*?>'), lambda match: ''),
-
- # Remove redundant page break markers
- (re.compile(r'(\\p){2,}'), lambda match: r'\p'),
-
- # Remove whitespace on empty lines
- (re.compile('^[\t\r ]$', re.MULTILINE), lambda match: ''),
- # Remove excess whitespace in lines
- (re.compile('(?<=.)[ ]{2,}(?=.)'), lambda match: ' '),
-
- # Remove excess newlines at the beginning and end
- (re.compile('^(\r\n){1,}'), lambda match: ''),
- (re.compile('^\n{1,}'), lambda match: ''),
- (re.compile('(\r\n){3,}$'), lambda match: ''),
- (re.compile('\n{3,}$'), lambda match: ''),
-]
-
def pml_to_html(pml):
html = pml
for rule in PML_HTML_RULES:
@@ -151,15 +81,3 @@ def footnote_sidebar_to_html(id, pml):
html = '%s' % (id, id, pml_to_html(pml))
return html
-def html_to_pml(html):
- pml = ''
-
- for dom_tree in BeautifulSoup(html).findAll('body'):
- body = unicode(dom_tree.prettify())
-
- for rule in HTML_PML_RULES:
- body = rule[0].sub(rule[1], body)
-
- pml += body
-
- return pml
diff --git a/src/calibre/ebooks/pml/pmlml.py b/src/calibre/ebooks/pml/pmlml.py
new file mode 100644
index 0000000000..a6febdc53f
--- /dev/null
+++ b/src/calibre/ebooks/pml/pmlml.py
@@ -0,0 +1,178 @@
+# -*- coding: utf-8 -*-
+
+__license__ = 'GPL 3'
+__copyright__ = '2009, John Schember '
+__docformat__ = 'restructuredtext en'
+
+'''
+Transform OEB content into PML markup
+'''
+
+import os, re
+
+from calibre.ebooks.oeb.base import XHTML, XHTML_NS, barename, namespace
+from calibre.ebooks.oeb.stylizer import Stylizer
+from calibre.ebooks.pdb.ereader import image_name
+
+# XHTML element name -> PML code. dump_text() writes the code, prefixed with a
+# backslash, both before and after the element's content.
+#
+# PML codes are case sensitive: italics is the lower-case ``\i`` and the
+# indented chapter levels are the upper-case ``\X0`` .. ``\X4`` (lower-case
+# ``\x`` is the top level chapter code used for h1).
+TAG_MAP = {
+    'b'       : 'B',
+    'strong'  : 'B',
+    'i'       : 'i',
+    'small'   : 'k',
+    'sub'     : 'Sb',
+    'sup'     : 'Sp',
+    'big'     : 'l',
+    'del'     : 'o',
+    'h1'      : 'x',
+    'h2'      : 'X0',
+    'h3'      : 'X1',
+    'h4'      : 'X2',
+    'h5'      : 'X3',
+    'h6'      : 'X4',
+    # NOTE(review): dump_text() skips nodes whose tag is not a string, so this
+    # comment entry does not look reachable from there -- confirm before
+    # relying on it.
+    '!--'     : 'v',
+}
+
+# (CSS property, {property value: PML code}) pairs. dump_text() looks up the
+# computed value of each property and opens the matching PML code.
+STYLES = [
+    ('font-weight', {'bold'   : 'B', 'bolder' : 'B'}),
+    ('font-style', {'italic' : 'i'}),
+    ('text-decoration', {'underline' : 'u'}),
+    ('text-align', {'right' : 'r', 'center' : 'c'}),
+]
+
+class PMLMLizer(object):
+    '''
+    Walk the XHTML documents in the spine of an OEB book and serialize them
+    into a single unicode string of PML markup.
+    '''
+
+    def __init__(self, ignore_tables=False):
+        # Stored for callers; none of the methods of this class read it.
+        self.ignore_tables = ignore_tables
+
+    def extract_content(self, oeb_book, opts):
+        '''
+        Convert ``oeb_book`` to PML and return the markup.
+
+        ``opts`` must provide ``output_profile``; it is handed to the Stylizer
+        that is created for every spine item.
+        '''
+        oeb_book.logger.info('Converting XHTML to PML markup...')
+        self.oeb_book = oeb_book
+        self.opts = opts
+        return self.pmlmlize_spine()
+
+    def pmlmlize_spine(self):
+        '''
+        Serialize every spine item in order and normalize the whitespace of
+        the combined result.
+        '''
+        parts = []
+        for item in self.oeb_book.spine:
+            body = item.data.find(XHTML('body'))
+            if body is None:
+                # A document without a body has nothing to render.
+                continue
+            stylizer = Stylizer(item.data, item.href, self.oeb_book, self.opts.output_profile)
+            parts.append(self.dump_text(body, stylizer))
+        output = self.clean_text(u''.join(parts))
+
+        # Every run of line separators becomes exactly one blank line. The
+        # separator is grouped and escaped so that the quantifier applies to
+        # the whole of a multi character separator such as '\r\n'.
+        output = re.sub('(?:%s)+' % re.escape(os.linesep), os.linesep + os.linesep, output)
+        output = re.sub('[ ]{2,}', ' ', output)
+
+        return output
+
+    def clean_text(self, text):
+        '''Post-processing hook for the serialized text; returns it unchanged.'''
+        return text
+
+    def dump_text(self, elem, stylizer, tag_stack=None):
+        '''
+        Return the PML markup for ``elem``, its descendants and its tail text.
+
+        :param elem: Node to serialize. Nodes whose tag is not a string (for
+                     example comments), nodes outside the XHTML namespace and
+                     hidden elements contribute only their tail text.
+        :param stylizer: Object whose ``style(elem)`` result can be indexed by
+                         CSS property name.
+        :param tag_stack: PML codes, plus the ``'block'`` marker, that are
+                          open in the ancestors of ``elem``. Leave it out for
+                          the root call. Everything a call pushes is popped
+                          again before that call returns.
+        '''
+        if tag_stack is None:
+            # A fresh list per root call; a list default would be shared
+            # between calls.
+            tag_stack = []
+
+        if not isinstance(elem.tag, basestring) \
+            or namespace(elem.tag) != XHTML_NS:
+            return self._tail_text(elem, tag_stack)
+
+        style = stylizer.style(elem)
+
+        if style['display'] in ('none', 'oeb-page-head', 'oeb-page-foot') \
+            or style['visibility'] == 'hidden':
+            # The element is dropped, the text following it is still visible.
+            return self._tail_text(elem, tag_stack)
+
+        text = u''
+        tag = barename(elem.tag)
+        # Number of entries this call pushed onto tag_stack.
+        tag_count = 0
+
+        # Are we in a paragraph block?
+        if tag == 'p' or style['display'] == 'block':
+            if 'block' not in tag_stack:
+                tag_count += 1
+                tag_stack.append('block')
+
+        # Process tags that need special processing and that do not have inner
+        # text. Usually these require an argument.
+        if tag == 'img':
+            src = elem.get('src')
+            if src:
+                text += '\\m="%s"' % image_name(os.path.basename(src)).strip('\x00')
+        if tag == 'hr':
+            width = elem.get('width')
+            if width:
+                # The percent sign is written here, so drop one that is
+                # already part of the attribute value.
+                text += '\\w="%s%%"' % width.rstrip('%')
+            else:
+                text += '\\w="50%"'
+
+        if style['page-break-before'] == 'always':
+            text += '\\p'
+
+        # Process tags that contain text.
+        if getattr(elem, 'text', None) is not None and elem.text.strip() != '':
+            pml_tag = TAG_MAP.get(tag, None)
+            if pml_tag and pml_tag not in tag_stack:
+                tag_count += 1
+                text += '\\%s' % pml_tag
+                tag_stack.append(pml_tag)
+
+            # Special processing of tags that require an argument.
+            # Anchor links
+            if tag == 'a' and 'q' not in tag_stack:
+                href = elem.get('href')
+                if href and href.startswith('#'):
+                    tag_count += 1
+                    text += '\\q="%s"' % href
+                    tag_stack.append('q')
+                # Anchor ids
+                id_name = elem.get('id')
+                if id_name:
+                    text += '\\Q="%s"' % id_name
+
+            # Process style information
+            for css_property, pml_codes in STYLES:
+                style_tag = pml_codes.get(style[css_property], None)
+                if style_tag and style_tag not in tag_stack:
+                    tag_count += 1
+                    text += '\\%s' % style_tag
+                    tag_stack.append(style_tag)
+
+        # elem.text may be None here (e.g. <img/>); block_text() copes with it.
+        text += self.elem_text(elem, tag_stack)
+
+        for item in elem:
+            text += self.dump_text(item, stylizer, tag_stack)
+
+        # Close what this call opened, most recently opened code first.
+        first_own = len(tag_stack) - tag_count
+        own_tags = tag_stack[first_own:]
+        del tag_stack[first_own:]
+        text += self.close_tags(own_tags)
+
+        # A break requested after the element belongs behind its content.
+        if style['page-break-after'] == 'always':
+            text += '\\p'
+
+        if tag in ('h1', 'h2', 'h3', 'h4', 'h5', 'h6', 'p', 'div', 'li'):
+            text += os.linesep + os.linesep
+
+        if 'block' not in tag_stack:
+            text += os.linesep + os.linesep
+
+        text += self._tail_text(elem, tag_stack)
+
+        return text
+
+    def _tail_text(self, elem, tag_stack):
+        '''Return the tail text of ``elem`` as PML text, u'' when it is blank.'''
+        tail = getattr(elem, 'tail', None)
+        if tail is None or tail.strip() == '':
+            return u''
+        return self.elem_tail(elem, tag_stack)
+
+    def elem_text(self, elem, tag_stack):
+        '''Return ``elem.text`` prepared for output, see block_text().'''
+        return self.block_text(elem.text, 'block' in tag_stack)
+
+    def elem_tail(self, elem, tag_stack):
+        '''Return ``elem.tail`` prepared for output, see block_text().'''
+        return self.block_text(elem.tail, 'block' in tag_stack)
+
+    def block_text(self, text, in_block):
+        '''
+        Return ``text`` with every line break replaced by a space when
+        ``in_block`` is true, otherwise unchanged. ``None`` (an element
+        without text) yields an empty string.
+        '''
+        if text is None:
+            return u''
+        if in_block:
+            # Replace the two character break first so it becomes one space.
+            text = text.replace('\r\n', ' ')
+            text = text.replace('\n', ' ')
+            text = text.replace('\r', ' ')
+        return text
+
+    def close_tags(self, tags):
+        '''
+        Return the closing PML codes for ``tags``, last entry first.
+
+        The ``'block'`` marker has no PML code and is left out. ``tags`` is
+        not modified.
+        '''
+        return u''.join('\\%s' % tag for tag in reversed(tags) if tag != 'block')
+