mirror of
https://github.com/kovidgoyal/calibre.git
synced 2025-07-09 03:04:10 -04:00
Much better HTML-to-PML parser: it now accounts for style information and produces output that looks more like the input.
This commit is contained in:
parent
d16b8274c6
commit
1f0932ad40
@ -16,7 +16,7 @@ from calibre.ebooks.pdb.formatwriter import FormatWriter
|
||||
from calibre.ebooks.oeb.base import OEB_IMAGES
|
||||
from calibre.ebooks.pdb.header import PdbHeaderBuilder
|
||||
from calibre.ebooks.pdb.ereader import image_name
|
||||
from calibre.ebooks.pml.pmlconverter import html_to_pml
|
||||
from calibre.ebooks.pml.pmlml import PMLMLizer
|
||||
|
||||
IDENTITY = 'PNRdPPrs'
|
||||
|
||||
@ -31,7 +31,7 @@ class Writer(FormatWriter):
|
||||
self.log = log
|
||||
|
||||
def write_content(self, oeb_book, out_stream, metadata=None):
|
||||
text = self._text(oeb_book.spine)
|
||||
text = self._text(oeb_book)
|
||||
images = self._images(oeb_book.manifest)
|
||||
metadata = [self._metadata(metadata)]
|
||||
|
||||
@ -41,16 +41,15 @@ class Writer(FormatWriter):
|
||||
|
||||
lengths = [len(i) for i in sections]
|
||||
|
||||
pdbHeaderBuilder = PdbHeaderBuilder(IDENTITY, '')
|
||||
pdbHeaderBuilder = PdbHeaderBuilder(IDENTITY, metadata[0].parition()[0])
|
||||
pdbHeaderBuilder.build_header(lengths, out_stream)
|
||||
|
||||
for item in sections:
|
||||
out_stream.write(item)
|
||||
|
||||
def _text(self, pages):
|
||||
pml = ''
|
||||
for page in pages:
|
||||
pml += html_to_pml(unicode(page)).encode('cp1252')
|
||||
def _text(self, oeb_book):
|
||||
pmlmlizer = PMLMLizer(ignore_tables=self.opts.linearize_tables)
|
||||
pml = unicode(pmlmlizer.extract_content(oeb_book, self.opts)).encode('cp1252')
|
||||
|
||||
pml_pages = []
|
||||
for i in range(0, (len(pml) / MAX_RECORD_SIZE) + 1):
|
||||
|
@ -12,7 +12,7 @@ from calibre.customize.conversion import OutputFormatPlugin
|
||||
from calibre.ptempfile import TemporaryDirectory
|
||||
from calibre.utils.zipfile import ZipFile
|
||||
from calibre.ebooks.oeb.base import OEB_IMAGES
|
||||
from calibre.ebooks.pml.pmlconverter import html_to_pml
|
||||
from calibre.ebooks.pml.pmlml import PMLMLizer
|
||||
|
||||
class PMLOutput(OutputFormatPlugin):
|
||||
|
||||
@ -22,22 +22,16 @@ class PMLOutput(OutputFormatPlugin):
|
||||
|
||||
def convert(self, oeb_book, output_path, input_plugin, opts, log):
|
||||
with TemporaryDirectory('_pmlz_output') as tdir:
|
||||
self.process_spine(oeb_book.spine, tdir)
|
||||
pmlmlizer = PMLMLizer(ignore_tables=opts.linearize_tables)
|
||||
content = pmlmlizer.extract_content(oeb_book, opts)
|
||||
with open(os.path.join(tdir, 'index.pml'), 'wb') as out:
|
||||
out.write(content.encode('utf-8'))
|
||||
|
||||
self.write_images(oeb_book.manifest, tdir)
|
||||
|
||||
pmlz = ZipFile(output_path, 'w')
|
||||
pmlz.add_dir(tdir)
|
||||
|
||||
def process_spine(self, spine, out_dir):
|
||||
for item in spine:
|
||||
html = html_to_pml(unicode(item)).encode('utf-8')
|
||||
|
||||
name = os.path.splitext(os.path.basename(item.href))[0] + '.pml'
|
||||
path = os.path.join(out_dir, name)
|
||||
|
||||
with open(path, 'wb') as out:
|
||||
out.write(html)
|
||||
|
||||
def write_images(self, manifest, out_dir):
|
||||
for item in manifest:
|
||||
if item.media_type in OEB_IMAGES:
|
||||
|
@ -10,7 +10,6 @@ __docformat__ = 'restructuredtext en'
|
||||
|
||||
import re
|
||||
|
||||
from calibre import entity_to_unicode
|
||||
from calibre.ebooks.pdb.ereader import image_name
|
||||
from calibre.ebooks.htmlsymbols import HTML_SYMBOLS
|
||||
|
||||
@ -67,75 +66,6 @@ PML_HTML_RULES = [
|
||||
(re.compile(r'\\\\'), lambda match: '\\'),
|
||||
]
|
||||
|
||||
HTML_PML_RULES = [
|
||||
|
||||
(re.compile(r'\\'), lambda match: '\\\\'),
|
||||
(re.compile('(?<=[^\n])[ ]*<p.*?>'), lambda match: '\n<p>'),
|
||||
(re.compile('</p>(?=^\n|^\r\n)'), lambda match: '\n'),
|
||||
|
||||
|
||||
# Clean up HTML
|
||||
(re.compile('@page.*?}'), lambda match: ''),
|
||||
(re.compile('<script.*?>.*?</script>', re.DOTALL), lambda match: ''),
|
||||
(re.compile('<style.*?>.*?</style>', re.DOTALL), lambda match: ''),
|
||||
|
||||
# Reflow paragraphs
|
||||
(re.compile('<p.*?>(?P<text>.*?)</p>', re.DOTALL), lambda match: match.group('text').replace('\r\n', ' ').replace('\n', ' ')),
|
||||
|
||||
# HTML to PML
|
||||
(re.compile('<a.*?href="#sidebar-(?P<target>.+?).*?">(?P<text>.+?)</a>'), lambda match: '\\Sd="%s"%s\\Sd' % (match.group('target'), match.group('text'))),
|
||||
(re.compile('<a.*?href="#footnote-(?P<target>.+?).*?">(?P<text>.+?)</a>'), lambda match: '\\Fn="%s"%s\\Fn' % (match.group('target'), match.group('text'))),
|
||||
(re.compile('<div.*?id="(?P<target>.+?).*?"></div>'), lambda match: '\\\\Q="%s"' % match.group('target')),
|
||||
(re.compile('<a.*?href="(?P<target>#.+?).*?">(?P<text>)</a>', re.DOTALL), lambda match: '\\q="%s"%s\\q' % (match.group('target'), match.group('text'))),
|
||||
(re.compile('<img.*?src="(?P<name>.+?)".*?>(.*?</img>)*'), lambda match: '\\m="%s"' % image_name(match.group('name')).strip('\x00')),
|
||||
(re.compile('&(?P<num>#\d+);'), lambda match: entity_to_unicode(match)),
|
||||
(re.compile('&(?P<num>.+);'), lambda match: entity_to_unicode(match)),
|
||||
(re.compile('<small .*?>(?P<text>.+?)</small>', re.DOTALL), lambda match: '\\k%s\\k' % match.group('text')),
|
||||
(re.compile('<small>(?P<text>.+?)</small>', re.DOTALL), lambda match: '\\k%s\\k' % match.group('text')),
|
||||
(re.compile('<sub .*?>(?P<text>.+?)</sub>', re.DOTALL), lambda match: '\\Sb%s\\Sb' % match.group('text')),
|
||||
(re.compile('<sub>(?P<text>.+?)</sub>', re.DOTALL), lambda match: '\\Sb%s\\Sb' % match.group('text')),
|
||||
(re.compile('<sup .*?>(?P<text>.+?)</sup>', re.DOTALL), lambda match: '\\Sp%s\\Sp' % match.group('text')),
|
||||
(re.compile('<sup>(?P<text>.+?)</sup>', re.DOTALL), lambda match: '\\Sp%s\\Sp' % match.group('text')),
|
||||
(re.compile('<b .*?>(?P<text>.+?)</b>', re.DOTALL), lambda match: '\\B%s\\B' % match.group('text')),
|
||||
(re.compile('<b>(?P<text>.+?)</b>', re.DOTALL), lambda match: '\\B%s\\B' % match.group('text')),
|
||||
(re.compile('<strong .*?>(?P<text>.+?)</strong>', re.DOTALL), lambda match: '\\B%s\\B' % match.group('text')),
|
||||
(re.compile('<strong>(?P<text>.+?)</strong>', re.DOTALL), lambda match: '\\B%s\\B' % match.group('text')),
|
||||
(re.compile('<big .*?>(?P<text>.+?)</big>', re.DOTALL), lambda match: '\\l%s\\l' % match.group('text')),
|
||||
(re.compile('<big>(?P<text>.+?)</big>', re.DOTALL), lambda match: '\\l%s\\l' % match.group('text')),
|
||||
(re.compile('<hr.*?width="(?P<val>\d+)%".*?>'), lambda match: '\\w="%s%%"' % match.group('val')),
|
||||
(re.compile('<div.*?style.*?margin-left: (?P<val>\d+)%*;.*?>(?P<text>.+?)</div>', re.MULTILINE), lambda match: '\\T="%s%%"%s$' % (match.group('val'), match.group('text'))),
|
||||
(re.compile('<div.*?style.*?margin-left: \d{1,3}%;.*?>(?P<text>.+?)</div>', re.DOTALL), lambda match: '\\t%s\\t' % match.group('text')),
|
||||
(re.compile('<!--(?P<text>.+?)-->', re.DOTALL), lambda match: '\\v%s\\v' % match.group('text')),
|
||||
(re.compile('<del .*?>(?P<text>.+?)</del>', re.DOTALL), lambda match: '\\o%s\\o' % match.group('text')),
|
||||
(re.compile('<del>(?P<text>.+?)</del>', re.DOTALL), lambda match: '\\o%s\\o' % match.group('text')),
|
||||
(re.compile('<div.*?style.*?text-decoration: underline;.*?>(?P<text>.+?)</div>', re.DOTALL), lambda match: '\\u%s\\u' % match.group('text')),
|
||||
(re.compile('<i .*?>(?P<text>.+?)</i>', re.DOTALL), lambda match: '\\\\i%s\\i' % match.group('text')),
|
||||
(re.compile('<i>(?P<text>.+?)</i>', re.DOTALL), lambda match: '\\\\i%s\\i' % match.group('text')),
|
||||
(re.compile('<div.*?style.*?text-align: right;.*?>(?P<text>.+?)</div>', re.DOTALL), lambda match: '\\r%s\\r' % match.group('text')),
|
||||
(re.compile('<div.*?style.*?text-align: center;.*?".*?>(?P<text>.+?)</div>', re.DOTALL), lambda match: '\\c%s\\c' % match.group('text')),
|
||||
(re.compile('<h(?P<val>[0-4]).*?>(?P<text>.+?)</h[0-4]>', re.DOTALL), lambda match: '\\X%s%s\\X%s' % (int(match.group('val')) + 1, match.group('text'), int(match.group('val')) + 1)),
|
||||
(re.compile('<h1.*?>(?P<text>.+?)</h1>', re.DOTALL), lambda match: '\\x%s\\x' % match.group('text')),
|
||||
(re.compile('<br .*?>'), lambda match: '\n'),
|
||||
(re.compile('<br/*>'), lambda match: '\n'),
|
||||
|
||||
# Remove remaining HTML tags
|
||||
(re.compile('<.*?>'), lambda match: ''),
|
||||
|
||||
# Remove redundant page break markers
|
||||
(re.compile(r'(\\p){2,}'), lambda match: r'\p'),
|
||||
|
||||
# Remove whitespace on empty lines
|
||||
(re.compile('^[\t\r ]$', re.MULTILINE), lambda match: ''),
|
||||
# Remove excess whitespace in lines
|
||||
(re.compile('(?<=.)[ ]{2,}(?=.)'), lambda match: ' '),
|
||||
|
||||
# Remove excess newlines at the beginning and end
|
||||
(re.compile('^(\r\n){1,}'), lambda match: ''),
|
||||
(re.compile('^\n{1,}'), lambda match: ''),
|
||||
(re.compile('(\r\n){3,}$'), lambda match: ''),
|
||||
(re.compile('\n{3,}$'), lambda match: ''),
|
||||
]
|
||||
|
||||
def pml_to_html(pml):
|
||||
html = pml
|
||||
for rule in PML_HTML_RULES:
|
||||
@ -151,15 +81,3 @@ def footnote_sidebar_to_html(id, pml):
|
||||
html = '<div id="sidebar-%s"><dt>%s</dt></div><dd>%s</dd>' % (id, id, pml_to_html(pml))
|
||||
return html
|
||||
|
||||
def html_to_pml(html):
|
||||
pml = ''
|
||||
|
||||
for dom_tree in BeautifulSoup(html).findAll('body'):
|
||||
body = unicode(dom_tree.prettify())
|
||||
|
||||
for rule in HTML_PML_RULES:
|
||||
body = rule[0].sub(rule[1], body)
|
||||
|
||||
pml += body
|
||||
|
||||
return pml
|
||||
|
178
src/calibre/ebooks/pml/pmlml.py
Normal file
178
src/calibre/ebooks/pml/pmlml.py
Normal file
@ -0,0 +1,178 @@
|
||||
# -*- coding: utf-8 -*-
|
||||
|
||||
__license__ = 'GPL 3'
|
||||
__copyright__ = '2009, John Schember <john@nachtimwald.com>'
|
||||
__docformat__ = 'restructuredtext en'
|
||||
|
||||
'''
|
||||
Transform OEB content into PML markup
|
||||
'''
|
||||
|
||||
import os, re
|
||||
|
||||
from calibre.ebooks.oeb.base import XHTML, XHTML_NS, barename, namespace
|
||||
from calibre.ebooks.oeb.stylizer import Stylizer
|
||||
from calibre.ebooks.pdb.ereader import image_name
|
||||
|
||||
# Maps an XHTML tag name to the PML code that wraps its text. The same code
# is emitted once to open and once to close the formatting.
TAG_MAP = {
    'b'       : 'B',
    'strong'  : 'B',
    # PML italic is the lowercase code \i; uppercase \I is not italic.
    'i'       : 'i',
    'small'   : 'k',
    'sub'     : 'Sb',
    'sup'     : 'Sp',
    'big'     : 'l',
    'del'     : 'o',
    # \x is the chapter heading; the numbered sub-headings use an uppercase
    # X followed by the level (\X0 .. \X4).
    'h1'      : 'x',
    'h2'      : 'X0',
    'h3'      : 'X1',
    'h4'      : 'X2',
    'h5'      : 'X3',
    'h6'      : 'X4',
    '!--'     : 'v',
}

# Each entry is (CSS property, {property value: PML code}). The codes must
# match the ones in TAG_MAP so that a tag and a style requesting the same
# formatting are recognised as duplicates of each other.
STYLES = [
    ('font-weight', {'bold'   : 'B', 'bolder' : 'B'}),
    ('font-style', {'italic' : 'i'}),
    ('text-decoration', {'underline' : 'u'}),
    ('text-align', {'right' : 'r', 'center' : 'c'}),
]
|
||||
|
||||
class PMLMLizer(object):
    '''
    Convert the XHTML spine of an OEB book into PML markup.

    Every spine item is walked recursively; element tags and computed CSS
    styles are translated into PML codes through TAG_MAP and STYLES.
    '''

    def __init__(self, ignore_tables=False):
        '''
        :param ignore_tables: Stored on the instance. No method of this class
                              reads it.
        '''
        self.ignore_tables = ignore_tables

    def extract_content(self, oeb_book, opts):
        '''
        Convert all spine items of ``oeb_book`` and return the PML text.

        :param oeb_book: Book object; its ``logger`` and ``spine`` are used.
        :param opts: Conversion options; ``output_profile`` is read from it.
        :return: The generated PML markup.
        '''
        oeb_book.logger.info('Converting XHTML to PML markup...')
        self.oeb_book = oeb_book
        self.opts = opts
        return self.pmlmlize_spine()

    def pmlmlize_spine(self):
        '''
        Dump every spine item to PML, then normalise line breaks and spaces.

        Spine items without a ``<body>`` element are skipped.

        :return: The PML markup for the whole spine.
        '''
        chunks = []
        for item in self.oeb_book.spine:
            body = item.data.find(XHTML('body'))
            if body is None:
                continue
            stylizer = Stylizer(item.data, item.href, self.oeb_book, self.opts.output_profile)
            chunks.append(self.dump_text(body, stylizer))
        output = self.clean_text(u''.join(chunks))
        return self._normalize_whitespace(output)

    def _normalize_whitespace(self, text, linesep=None):
        '''
        Turn every run of line separators into exactly two and every run of
        spaces into one.

        :param text: Text to normalise.
        :param linesep: Line separator to look for; defaults to os.linesep.
        :return: The normalised text.
        '''
        if linesep is None:
            linesep = os.linesep
        # The separator is grouped and escaped so the quantifier applies to
        # the whole sequence; with '\r\n' an ungrouped pattern would only
        # repeat the trailing '\n'.
        text = re.sub('(?:%s)+' % re.escape(linesep), lambda match: linesep * 2, text)
        return re.sub('[ ]{2,}', ' ', text)

    def clean_text(self, text):
        '''
        Hook for post-processing the dumped text; currently returns it as is.
        '''
        return text

    def dump_text(self, elem, stylizer, tag_stack=None):
        '''
        Recursively convert ``elem`` and its children to PML.

        :param elem: Element to convert. Anything that is not an XHTML
                     element (non-string tag or foreign namespace) yields u''.
        :param stylizer: Object whose ``style(elem)`` returns the computed
                         style of an element.
        :param tag_stack: PML codes (plus the marker 'block') opened by the
                          ancestors of ``elem``. A fresh list is used when
                          omitted.
        :return: PML markup for ``elem``, its descendants and its tail text.
        '''
        # A list literal as default would be shared between calls.
        if tag_stack is None:
            tag_stack = []

        if not isinstance(elem.tag, basestring) \
           or namespace(elem.tag) != XHTML_NS:
            return u''

        text = u''
        style = stylizer.style(elem)

        if style['display'] in ('none', 'oeb-page-head', 'oeb-page-foot') \
           or style['visibility'] == 'hidden':
            return u''

        tag = barename(elem.tag)
        # Number of entries this call pushed on tag_stack; exactly that many
        # are popped again after the children have been processed.
        tag_count = 0

        # Are we in a paragraph block?
        if tag == 'p' or style['display'] == 'block':
            if 'block' not in tag_stack:
                tag_count += 1
                tag_stack.append('block')

        # Process tags that need special processing and that do not have
        # inner text. Usually these require an argument.
        if tag == 'img':
            src = elem.get('src')
            if src:
                text += '\\m="%s"' % image_name(os.path.basename(src)).strip('\x00')
        if tag == 'hr':
            # The percent sign is appended here, so drop one the attribute
            # may already carry. A missing or empty width falls back to 50%.
            width = (elem.get('width') or '').strip().rstrip('%')
            if width:
                text += '\\w="%s%%"' % width
            else:
                text += '\\w="50%"'

        # Process style information that is expressed by a single,
        # unpaired code.
        if style['page-break-before'] == 'always':
            text += '\\p'
        if style['page-break-after'] == 'always':
            text += '\\p'

        # Process tags that contain text.
        # NOTE(review): the nesting below follows the comment structure of
        # the original; confirm that anchors and styles are meant to be
        # handled only for elements that carry text of their own.
        if getattr(elem, 'text', None) is not None and elem.text.strip() != '':
            pml_tag = TAG_MAP.get(tag, None)
            if pml_tag and pml_tag not in tag_stack:
                tag_count += 1
                text += '\\%s' % pml_tag
                tag_stack.append(pml_tag)

            # Special processing of tags that require an argument.
            # Anchor links
            if tag == 'a' and 'q' not in tag_stack:
                href = elem.get('href')
                if href and href.startswith('#'):
                    tag_count += 1
                    text += '\\q="%s"' % href
                    tag_stack.append('q')
            # Anchor ids
            id_name = elem.get('id')
            if id_name:
                text += '\\Q="%s"' % id_name

            # Process style information
            for prop, codes in STYLES:
                style_tag = codes.get(style[prop], None)
                if style_tag and style_tag not in tag_stack:
                    tag_count += 1
                    text += '\\%s' % style_tag
                    tag_stack.append(style_tag)

            text += self.elem_text(elem, tag_stack)

        for item in elem:
            text += self.dump_text(item, stylizer, tag_stack)

        # Pop what this call opened, keeping the opening order so that
        # close_tags can emit the closing codes innermost first.
        close_tag_list = []
        for _ in range(tag_count):
            close_tag_list.insert(0, tag_stack.pop())
        text += self.close_tags(close_tag_list)
        if tag in ('h1', 'h2', 'h3', 'h4', 'h5', 'h6', 'p', 'div', 'li'):
            text += os.linesep + os.linesep

        if 'block' not in tag_stack:
            text += os.linesep + os.linesep

        if getattr(elem, 'tail', None) is not None and elem.tail.strip() != '':
            text += self.elem_tail(elem, tag_stack)

        return text

    def elem_text(self, elem, tag_stack):
        '''
        Return the text of ``elem``, reflowed when inside a block.
        '''
        return self.block_text(elem.text, 'block' in tag_stack)

    def elem_tail(self, elem, tag_stack):
        '''
        Return the tail text of ``elem``, reflowed when inside a block.
        '''
        return self.block_text(elem.tail, 'block' in tag_stack)

    def block_text(self, text, in_block):
        '''
        Replace line breaks in ``text`` with spaces when ``in_block`` is true.

        :param text: Text to process; None is treated as empty text.
        :param in_block: Whether the text is part of a paragraph block.
        :return: The processed text.
        '''
        if text is None:
            return u''
        if in_block:
            # '\r\n' goes first so a Windows line break becomes one space.
            text = text.replace('\r\n', ' ')
            text = text.replace('\n', ' ')
            text = text.replace('\r', ' ')
        return text

    def close_tags(self, tags):
        '''
        Return the closing PML codes for ``tags``, last opened first.

        The 'block' marker has no PML code and is skipped. ``tags`` is left
        unmodified.

        :param tags: Codes in the order they were opened.
        :return: The concatenated closing codes.
        '''
        return u''.join('\\%s' % tag for tag in reversed(tags) if tag != 'block')
|
||||
|
Loading…
x
Reference in New Issue
Block a user