mirror of
https://github.com/kovidgoyal/calibre.git
synced 2025-07-09 03:04:10 -04:00
PML to HTML conversion fixes.
This commit is contained in:
parent
f10852a43c
commit
2bd8cb9059
@ -114,6 +114,18 @@ class HTMLPreProcessor(object):
|
|||||||
(re.compile(u'¨\s*(<br.*?>)*\s*a', re.UNICODE), lambda match: u'ä'),
|
(re.compile(u'¨\s*(<br.*?>)*\s*a', re.UNICODE), lambda match: u'ä'),
|
||||||
(re.compile(u'¨\s*(<br.*?>)*\s*A', re.UNICODE), lambda match: u'Ä'),
|
(re.compile(u'¨\s*(<br.*?>)*\s*A', re.UNICODE), lambda match: u'Ä'),
|
||||||
|
|
||||||
|
# Fix French accents
|
||||||
|
(re.compile(u'`\s*(<br.*?>)*\s*o', re.UNICODE), lambda match: u'ò'),
|
||||||
|
(re.compile(u'`\s*(<br.*?>)*\s*O', re.UNICODE), lambda match: u'Ò'),
|
||||||
|
(re.compile(u'`\s*(<br.*?>)*\s*u', re.UNICODE), lambda match: u'ù'),
|
||||||
|
(re.compile(u'`\s*(<br.*?>)*\s*U', re.UNICODE), lambda match: u'Ù'),
|
||||||
|
(re.compile(u'`\s*(<br.*?>)*\s*e', re.UNICODE), lambda match: u'è'),
|
||||||
|
(re.compile(u'`\s*(<br.*?>)*\s*E', re.UNICODE), lambda match: u'È'),
|
||||||
|
(re.compile(u'`\s*(<br.*?>)*\s*i', re.UNICODE), lambda match: u'ì'),
|
||||||
|
(re.compile(u'`\s*(<br.*?>)*\s*I', re.UNICODE), lambda match: u'Ì'),
|
||||||
|
(re.compile(u'`\s*(<br.*?>)*\s*a', re.UNICODE), lambda match: u'à'),
|
||||||
|
(re.compile(u'`\s*(<br.*?>)*\s*A', re.UNICODE), lambda match: u'À'),
|
||||||
|
|
||||||
# Remove page links
|
# Remove page links
|
||||||
(re.compile(r'<a name=\d+></a>', re.IGNORECASE), lambda match: ''),
|
(re.compile(r'<a name=\d+></a>', re.IGNORECASE), lambda match: ''),
|
||||||
# Remove <hr> tags
|
# Remove <hr> tags
|
||||||
|
@ -19,8 +19,6 @@ from calibre.ebooks.compression.palmdoc import decompress_doc
|
|||||||
from calibre.ebooks.metadata.opf2 import OPFCreator
|
from calibre.ebooks.metadata.opf2 import OPFCreator
|
||||||
from calibre.ebooks.pdb.ereader import EreaderError
|
from calibre.ebooks.pdb.ereader import EreaderError
|
||||||
from calibre.ebooks.pdb.formatreader import FormatReader
|
from calibre.ebooks.pdb.formatreader import FormatReader
|
||||||
from calibre.ebooks.pml.pmlconverter import footnote_sidebar_to_html
|
|
||||||
from calibre.ebooks.pml.pmlconverter import pml_to_html
|
|
||||||
|
|
||||||
class HeaderRecord(object):
|
class HeaderRecord(object):
|
||||||
'''
|
'''
|
||||||
@ -99,16 +97,21 @@ class Reader132(FormatReader):
|
|||||||
return self.decompress_text(number)
|
return self.decompress_text(number)
|
||||||
|
|
||||||
def extract_content(self, output_dir):
|
def extract_content(self, output_dir):
|
||||||
|
from calibre.ebooks.pml.pmlconverter import footnote_sidebar_to_html
|
||||||
|
from calibre.ebooks.pml.pmlconverter import pml_to_html
|
||||||
|
|
||||||
output_dir = os.path.abspath(output_dir)
|
output_dir = os.path.abspath(output_dir)
|
||||||
|
|
||||||
if not os.path.exists(output_dir):
|
if not os.path.exists(output_dir):
|
||||||
os.makedirs(output_dir)
|
os.makedirs(output_dir)
|
||||||
|
|
||||||
html = u'<html><head><title></title></head><body>'
|
html = u'<html><head><title>%s</title></head><body>' % self.mi.title
|
||||||
|
|
||||||
|
pml = u''
|
||||||
for i in range(1, self.header_record.num_text_pages + 1):
|
for i in range(1, self.header_record.num_text_pages + 1):
|
||||||
self.log.debug('Extracting text page %i' % i)
|
self.log.debug('Extracting text page %i' % i)
|
||||||
html += pml_to_html(self.get_text_page(i))
|
pml += self.get_text_page(i)
|
||||||
|
html += pml_to_html(pml)
|
||||||
|
|
||||||
if self.header_record.footnote_rec > 0:
|
if self.header_record.footnote_rec > 0:
|
||||||
html += '<br /><h1>%s</h1>' % _('Footnotes')
|
html += '<br /><h1>%s</h1>' % _('Footnotes')
|
||||||
|
@ -12,7 +12,6 @@ import struct
|
|||||||
|
|
||||||
from calibre import CurrentDir
|
from calibre import CurrentDir
|
||||||
from calibre.ebooks.metadata.opf2 import OPFCreator
|
from calibre.ebooks.metadata.opf2 import OPFCreator
|
||||||
from calibre.ebooks.pml.pmlconverter import pml_to_html
|
|
||||||
from calibre.ebooks.compression.palmdoc import decompress_doc
|
from calibre.ebooks.compression.palmdoc import decompress_doc
|
||||||
from calibre.ebooks.pdb.formatreader import FormatReader
|
from calibre.ebooks.pdb.formatreader import FormatReader
|
||||||
from calibre.ebooks.pdb.ereader import EreaderError
|
from calibre.ebooks.pdb.ereader import EreaderError
|
||||||
@ -81,19 +80,20 @@ class Reader202(FormatReader):
|
|||||||
return self.decompress_text(number)
|
return self.decompress_text(number)
|
||||||
|
|
||||||
def extract_content(self, output_dir):
|
def extract_content(self, output_dir):
|
||||||
|
from calibre.ebooks.pml.pmlconverter import pml_to_html
|
||||||
|
|
||||||
output_dir = os.path.abspath(output_dir)
|
output_dir = os.path.abspath(output_dir)
|
||||||
|
|
||||||
if not os.path.exists(output_dir):
|
if not os.path.exists(output_dir):
|
||||||
os.makedirs(output_dir)
|
os.makedirs(output_dir)
|
||||||
|
|
||||||
html = u'<html><head><title></title></head><body>'
|
pml = u''
|
||||||
|
|
||||||
for i in range(1, self.header_record.num_text_pages + 1):
|
for i in range(1, self.header_record.num_text_pages + 1):
|
||||||
self.log.debug('Extracting text page %i' % i)
|
self.log.debug('Extracting text page %i' % i)
|
||||||
html += pml_to_html(self.get_text_page(i))
|
pml += self.get_text_page(i)
|
||||||
|
|
||||||
|
html = u'<html><head><title>%s</title></head><body>%s</body></html>' % \
|
||||||
html += '</body></html>'
|
(self.mi.title, pml_to_html(pml))
|
||||||
|
|
||||||
with CurrentDir(output_dir):
|
with CurrentDir(output_dir):
|
||||||
with open('index.html', 'wb') as index:
|
with open('index.html', 'wb') as index:
|
||||||
|
@ -15,44 +15,47 @@ from calibre.ebooks.pdb.ereader import image_name
|
|||||||
|
|
||||||
PML_HTML_RULES = [
|
PML_HTML_RULES = [
|
||||||
(re.compile(r'\\p'), lambda match: '<br /><br style="page-break-after: always;" />'),
|
(re.compile(r'\\p'), lambda match: '<br /><br style="page-break-after: always;" />'),
|
||||||
(re.compile(r'\\x(?P<text>.+?)\\x', re.DOTALL), lambda match: '<h1 style="page-break-before: always;">%s</h1>' % match.group('text')),
|
(re.compile(r'\\x(?P<text>.*?)\\x', re.DOTALL), lambda match: '<h1 style="page-break-before: always;">%s</h1>' % match.group('text') if match.group('text') else ''),
|
||||||
(re.compile(r'\\X(?P<val>[0-4])(?P<text>.+?)\\X[0-4]', re.DOTALL), lambda match: '<h%s style="page-break-before: always;">%s</h%s>' % (int(match.group('val')) + 1, match.group('text'), int(match.group('val')) + 1)),
|
(re.compile(r'\\X(?P<val>[0-4])(?P<text>.*?)\\X[0-4]', re.DOTALL), lambda match: '<h%s style="page-break-before: always;">%s</h%s>' % (int(match.group('val')) + 1, match.group('text'), int(match.group('val')) + 1) if match.group('text') else ''),
|
||||||
(re.compile(r'\\C\d=".+"'), lambda match: ''), # This should be made to create a TOC entry
|
(re.compile(r'\\C\d=".+?"'), lambda match: ''), # This should be made to create a TOC entry
|
||||||
(re.compile(r'\\c(?P<text>.+?)\\c', re.DOTALL), lambda match: '<div style="text-align: center; display: block; margin: auto;">%s</div>' % match.group('text')),
|
(re.compile(r'\\c(?P<text>.*?)\\c', re.DOTALL), lambda match: '<div style="text-align: center; display: block; margin: auto;">%s</div>' % match.group('text') if match.group('text') else ''),
|
||||||
(re.compile(r'\\r(?P<text>.+?)\\r', re.DOTALL), lambda match: '<div style="text-align: right; display: block;">%s</div>' % match.group('text')),
|
(re.compile(r'\\r(?P<text>.*?)\\r', re.DOTALL), lambda match: '<div style="text-align: right; display: block;">%s</div>' % match.group('text') if match.group('text') else ''),
|
||||||
(re.compile(r'\\i(?P<text>.+?)\\i', re.DOTALL), lambda match: '<i>%s</i>' % match.group('text')),
|
(re.compile(r'\\i(?P<text>.*?)\\i', re.DOTALL), lambda match: '<i>%s</i>' % match.group('text') if match.group('text') else ''),
|
||||||
(re.compile(r'\\u(?P<text>.+?)\\u', re.DOTALL), lambda match: '<div style="text-decoration: underline;">%s</div>' % match.group('text')),
|
(re.compile(r'\\u(?P<text>.*?)\\u', re.DOTALL), lambda match: '<div style="text-decoration: underline;">%s</div>' % match.group('text') if match.group('text') else ''),
|
||||||
(re.compile(r'\\o(?P<text>.+?)\\o', re.DOTALL), lambda match: '<del>%s</del>' % match.group('text')),
|
(re.compile(r'\\o(?P<text>.*?)\\o', re.DOTALL), lambda match: '<del>%s</del>' % match.group('text') if match.group('text') else ''),
|
||||||
(re.compile(r'\\v(?P<text>.+?)\\v', re.DOTALL), lambda match: '<!-- %s -->' % match.group('text')),
|
(re.compile(r'\\v(?P<text>.*?)\\v', re.DOTALL), lambda match: '<!-- %s -->' % match.group('text') if match.group('text') else ''),
|
||||||
(re.compile(r'\\t(?P<text>.+?)\\t', re.DOTALL), lambda match: '<div style="margin-left: 5%%;">%s</div>' % match.group('text')),
|
(re.compile(r'\\t(?P<text>.*?)\\t', re.DOTALL), lambda match: '<div style="margin-left: 5%%;">%s</div>' % match.group('text') if match.group('text') else ''),
|
||||||
(re.compile(r'\\T="(?P<val>\d+)%*"(?P<text>.+?)$', re.MULTILINE), lambda match: r'<div style="margin-left: %s%%;">%s</div>' % (match.group('val'), match.group('text'))),
|
(re.compile(r'\\T="(?P<val>\d+)%*"(?P<text>.*?)$', re.MULTILINE), lambda match: r'<div style="margin-left: %s%%;">%s</div>' % (match.group('val'), match.group('text')) if match.group('text') else ''),
|
||||||
(re.compile(r'\\w="(?P<val>\d+)%"'), lambda match: '<hr width="%s%%" />' % match.group('val')),
|
(re.compile(r'\\w="(?P<val>\d+)%"'), lambda match: '<hr width="%s%%" />' % match.group('val')),
|
||||||
(re.compile(r'\\n'), lambda match: ''),
|
(re.compile(r'\\n'), lambda match: ''),
|
||||||
(re.compile(r'\\s'), lambda match: ''),
|
(re.compile(r'\\s'), lambda match: ''),
|
||||||
(re.compile(r'\\b(?P<text>.+?)\\b', re.DOTALL), lambda match: '<b>%s</b>' % match.group('text')), # \b is deprecated; \B should be used instead.
|
(re.compile(r'\\b(?P<text>.*?)\\b', re.DOTALL), lambda match: '<b>%s</b>' % match.group('text') if match.group('text') else ''), # \b is deprecated; \B should be used instead.
|
||||||
(re.compile(r'\\l(?P<text>.+?)\\l', re.DOTALL), lambda match: '<big>%s</big>' % match.group('text')),
|
(re.compile(r'\\l(?P<text>.*?)\\l', re.DOTALL), lambda match: '<span style="font-size: 175%%">%s</span>' % match.group('text') if match.group('text') else ''),
|
||||||
(re.compile(r'\\B(?P<text>.+?)\\B', re.DOTALL), lambda match: '<b>%s</b>' % match.group('text')),
|
(re.compile(r'\\B(?P<text>.*?)\\B', re.DOTALL), lambda match: '<b>%s</b>' % match.group('text') if match.group('text') else ''),
|
||||||
(re.compile(r'\\Sp(?P<text>.+?)\\Sp', re.DOTALL), lambda match: '<sup>%s</sup>' % match.group('text')),
|
(re.compile(r'\\Sp(?P<text>.*?)\\Sp', re.DOTALL), lambda match: '<sup>%s</sup>' % match.group('text') if match.group('text') else ''),
|
||||||
(re.compile(r'\\Sb(?P<text>.+?)\\Sb', re.DOTALL), lambda match: '<sub>%s</sub>' % match.group('text')),
|
(re.compile(r'\\Sb(?P<text>.*?)\\Sb', re.DOTALL), lambda match: '<sub>%s</sub>' % match.group('text') if match.group('text') else ''),
|
||||||
(re.compile(r'\\k(?P<text>.+?)\\k', re.DOTALL), lambda match: '<small>%s</small>' % match.group('text')),
|
(re.compile(r'\\k(?P<text>.*?)\\k', re.DOTALL), lambda match: '<small>%s</small>' % match.group('text') if match.group('text') else ''),
|
||||||
(re.compile(r'\\a(?P<num>\d\d\d)'), lambda match: '&#%s;' % match.group('num')),
|
(re.compile(r'\\a(?P<num>\d\d\d)'), lambda match: '&#%s;' % match.group('num')),
|
||||||
(re.compile(r'\\U(?P<num>\d\d\d\d)'), lambda match: '%s' % my_unichr(int(match.group('num'), 16))),
|
(re.compile(r'\\U(?P<num>\d\d\d\d)'), lambda match: '%s' % my_unichr(int(match.group('num'), 16))),
|
||||||
(re.compile(r'\\m="(?P<name>.+?)"'), lambda match: '<img src="images/%s" />' % image_name(match.group('name')).strip('\x00')),
|
(re.compile(r'\\m="(?P<name>.+?)"'), lambda match: '<img src="images/%s" />' % image_name(match.group('name')).strip('\x00')),
|
||||||
(re.compile(r'\\q="(?P<target>#.+?)"(?P<text>.+?)\\q', re.DOTALL), lambda match: '<a href="%s">%s</a>' % (match.group('target'), match.group('text'))),
|
(re.compile(r'\\q="(?P<target>#.+?)"(?P<text>.*?)\\q', re.DOTALL), lambda match: '<a href="%s">%s</a>' % (match.group('target'), match.group('text')) if match.group('text') else ''),
|
||||||
(re.compile(r'\\Q="(?P<target>.+?)"'), lambda match: '<span id="%s"></span>' % match.group('target')),
|
(re.compile(r'\\Q="(?P<target>.+?)"'), lambda match: '<span id="%s"></span>' % match.group('target')),
|
||||||
(re.compile(r'\\-'), lambda match: ''),
|
(re.compile(r'\\-'), lambda match: ''),
|
||||||
(re.compile(r'\\Fn="(?P<target>.+?)"(?P<text>.+?)\\Fn'), lambda match: '<a href="#footnote-%s">%s</a>' % (match.group('target'), match.group('text'))),
|
(re.compile(r'\\Fn="(?P<target>.+?)"(?P<text>.*?)\\Fn'), lambda match: '<a href="#footnote-%s">%s</a>' % (match.group('target'), match.group('text')) if match.group('text') else ''),
|
||||||
(re.compile(r'\\Sd="(?P<target>.+?)"(?P<text>.+?)\\Sd'), lambda match: '<a href="#sidebar-%s">%s</a>' % (match.group('target'), match.group('text'))),
|
(re.compile(r'\\Sd="(?P<target>.+?)"(?P<text>.*?)\\Sd'), lambda match: '<a href="#sidebar-%s">%s</a>' % (match.group('target'), match.group('text')) if match.group('text') else ''),
|
||||||
(re.compile(r'\\I'), lambda match: ''),
|
(re.compile(r'\\I'), lambda match: ''),
|
||||||
|
|
||||||
# Sidebar and Footnotes
|
# Sidebar and Footnotes
|
||||||
(re.compile(r'<sidebar\s+id="(?P<target>.+?)">\s*(?P<text>.+?)\s*</sidebar>', re.DOTALL), lambda match: '<div id="sidebar-%s">%s</div>' % (match.group('target'), match.group('text'))),
|
(re.compile(r'<sidebar\s+id="(?P<target>.+?)">\s*(?P<text>.*?)\s*</sidebar>', re.DOTALL), lambda match: '<div id="sidebar-%s">%s</div>' % (match.group('target'), match.group('text')) if match.group('text') else ''),
|
||||||
(re.compile(r'<footnote\s+id="(?P<target>.+?)">\s*(?P<text>.+?)\s*</footnote>', re.DOTALL), lambda match: '<div id="footnote-%s">%s</div>' % (match.group('target'), match.group('text'))),
|
(re.compile(r'<footnote\s+id="(?P<target>.+?)">\s*(?P<text>.*?)\s*</footnote>', re.DOTALL), lambda match: '<div id="footnote-%s">%s</div>' % (match.group('target'), match.group('text')) if match.group('text') else ''),
|
||||||
|
|
||||||
# eReader files are one paragraph per line.
|
# eReader files are one paragraph per line.
|
||||||
# This forces the lines to wrap properly.
|
# This forces the lines to wrap properly.
|
||||||
(re.compile('^(?P<text>.+)$', re.MULTILINE), lambda match: '<p>%s</p>' % match.group('text')),
|
(re.compile('^(?P<text>.+)$', re.MULTILINE), lambda match: '<p>%s</p>' % match.group('text')),
|
||||||
|
# Remove empty <p>'s.
|
||||||
(re.compile('<p>[ ]*</p>'), lambda match: ''),
|
(re.compile('<p>[ ]*</p>'), lambda match: ''),
|
||||||
|
# Ensure empty lines carry over.
|
||||||
|
(re.compile('^$', re.MULTILINE), lambda match: '<br />'),
|
||||||
|
|
||||||
# Remove unmatched PML codes.
|
# Remove unmatched PML codes.
|
||||||
(re.compile(r'(?<=[^\\])\\[pxcriouvtblBk]'), lambda match: ''),
|
(re.compile(r'(?<=[^\\])\\[pxcriouvtblBk]'), lambda match: ''),
|
||||||
|
Loading…
x
Reference in New Issue
Block a user