diff --git a/src/calibre/ebooks/pml/input.py b/src/calibre/ebooks/pml/input.py
index 270c8a7b0f..f2d00742ba 100644
--- a/src/calibre/ebooks/pml/input.py
+++ b/src/calibre/ebooks/pml/input.py
@@ -42,9 +42,23 @@ class PMLInput(InputFormatPlugin):
if self.options.input_encoding:
ienc = self.options.input_encoding
+ style = '''
+
+'''
self.log.debug('Converting PML to HTML...')
html = pml_to_html(pml_stream.read().decode(ienc))
- html_stream.write('
' + html.encode('utf-8', 'replace') + '')
+ html_stream.write('%s' % style)
+ html_stream.write(html.encode('utf-8', 'replace'))
+ html_stream.write('')
if pclose:
pml_stream.close()
@@ -79,7 +93,7 @@ class PMLInput(InputFormatPlugin):
pimg_name = os.path.basename(img)
pimg_path = os.path.join(os.getcwd(), 'images', pimg_name)
- images.append(pimg_name)
+ images.append('images/' + pimg_name)
shutil.move(img, pimg_path)
else:
diff --git a/src/calibre/ebooks/pml/pmlconverter.py b/src/calibre/ebooks/pml/pmlconverter.py
index dafe1e4f6a..1b42f99cc1 100644
--- a/src/calibre/ebooks/pml/pmlconverter.py
+++ b/src/calibre/ebooks/pml/pmlconverter.py
@@ -14,27 +14,38 @@ from calibre import my_unichr
from calibre.ebooks.pdb.ereader import image_name
PML_HTML_RULES = [
- (re.compile(r'\\p'), lambda match: '
'),
- (re.compile(r'\\x(?P.*?)\\x', re.DOTALL), lambda match: '%s
' % match.group('text') if match.group('text') else ''),
- (re.compile(r'\\X(?P[0-4])(?P.*?)\\X[0-4]', re.DOTALL), lambda match: '%s' % (int(match.group('val')) + 1, match.group('text'), int(match.group('val')) + 1) if match.group('text') else ''),
+ # Any literal <, &, and > chars must be escaped to avoid HTML issues (though
+ # <sidebar> and <footnote> tags are handled specially later).
+ (re.compile(r'&'), lambda match: '&'),
+ (re.compile(r'<'), lambda match: '<'),
+ (re.compile(r'>'), lambda match: '>'),
+
+ # NOP-process all \x escapes, turning \\ into an HTML entity for a backslash. This keeps the regex
+ # parsing simple while making sure that we don't try to honor \\x as \x
+ # (and also makes sure we DO honor \\\x as \ followed by \x).
+ (re.compile(r'\\(.)'), lambda match: '\' if match.group(1) == '\\' else '\\' + match.group(1)),
+
+ (re.compile(r'\\p'), lambda match: '
'),
+ (re.compile(r'\\x(?P.*?)\\x', re.DOTALL), lambda match: '%s
' % match.group('text') if match.group('text') else ''),
+ (re.compile(r'\\X(?P[0-4])(?P.*?)\\X[0-4]', re.DOTALL), lambda match: '%s' % (int(match.group('val')) + 1, match.group('text'), int(match.group('val')) + 1) if match.group('text') else ''),
(re.compile(r'\\C\d=".+?"'), lambda match: ''), # This should be made to create a TOC entry
- (re.compile(r'\\c(?P.*?)\\c', re.DOTALL), lambda match: '%s' % match.group('text') if match.group('text') else ''),
- (re.compile(r'\\r(?P.*?)\\r', re.DOTALL), lambda match: '%s' % match.group('text') if match.group('text') else ''),
+ (re.compile(r'\\c(?P.*?)\\c', re.DOTALL), lambda match: '%s
' % match.group('text') if match.group('text') else ''),
+ (re.compile(r'\\r(?P.*?)\\r', re.DOTALL), lambda match: '%s
' % match.group('text') if match.group('text') else ''),
(re.compile(r'\\i(?P.*?)\\i', re.DOTALL), lambda match: '%s' % match.group('text') if match.group('text') else ''),
- (re.compile(r'\\u(?P.*?)\\u', re.DOTALL), lambda match: '%s' % match.group('text') if match.group('text') else ''),
+ (re.compile(r'\\u(?P.*?)\\u', re.DOTALL), lambda match: '%s' % match.group('text') if match.group('text') else ''),
(re.compile(r'\\o(?P.*?)\\o', re.DOTALL), lambda match: '%s' % match.group('text') if match.group('text') else ''),
(re.compile(r'\\v(?P.*?)\\v', re.DOTALL), lambda match: '' % match.group('text') if match.group('text') else ''),
- (re.compile(r'\\t(?P.*?)\\t', re.DOTALL), lambda match: '%s
' % match.group('text') if match.group('text') else ''),
- (re.compile(r'\\T="(?P\d+)%*"(?P.*?)$', re.MULTILINE), lambda match: r'%s
' % (match.group('val'), match.group('text')) if match.group('text') else ''),
- (re.compile(r'\\w="(?P\d+)%"'), lambda match: '
' % match.group('val')),
+ (re.compile(r'\\t(?P.*?)\\t', re.DOTALL), lambda match: '%s
' % match.group('text') if match.group('text') else ''),
+ (re.compile(r'\\T="(?P\d+)%*"(?P.*?)$', re.MULTILINE), lambda match: r'%s
' % (match.group('val'), match.group('text')) if match.group('text') else ''),
+ (re.compile(r'\\w="(?P\d+)%"'), lambda match: '
' % match.group('val')),
(re.compile(r'\\n'), lambda match: ''),
- (re.compile(r'\\s(?P.*?)\\s', re.DOTALL), lambda match: '%s' % match.group('text') if match.group('text') else ''),
+ (re.compile(r'\\s(?P.*?)\\s', re.DOTALL), lambda match: '%s' % match.group('text') if match.group('text') else ''),
(re.compile(r'\\b(?P.*?)\\b', re.DOTALL), lambda match: '%s' % match.group('text') if match.group('text') else ''), # \b is deprecated; \B should be used instead.
- (re.compile(r'\\l(?P.*?)\\l', re.DOTALL), lambda match: '%s' % match.group('text') if match.group('text') else ''),
+ (re.compile(r'\\l(?P.*?)\\l', re.DOTALL), lambda match: '%s' % match.group('text') if match.group('text') else ''),
(re.compile(r'\\B(?P.*?)\\B', re.DOTALL), lambda match: '%s' % match.group('text') if match.group('text') else ''),
(re.compile(r'\\Sp(?P.*?)\\Sp', re.DOTALL), lambda match: '%s' % match.group('text') if match.group('text') else ''),
(re.compile(r'\\Sb(?P.*?)\\Sb', re.DOTALL), lambda match: '%s' % match.group('text') if match.group('text') else ''),
- (re.compile(r'\\k(?P.*?)\\k', re.DOTALL), lambda match: '%s' % match.group('text').upper() if match.group('text') else ''),
+ (re.compile(r'\\k(?P.*?)\\k', re.DOTALL), lambda match: '%s' % match.group('text').upper() if match.group('text') else ''),
(re.compile(r'\\a(?P\d{3})'), lambda match: '%s;' % match.group('num')),
(re.compile(r'\\U(?P[0-9a-f]{4})'), lambda match: '%s' % my_unichr(int(match.group('num'), 16))),
(re.compile(r'\\m="(?P.+?)"'), lambda match: '
' % image_name(match.group('name')).strip('\x00')),
@@ -47,8 +58,8 @@ PML_HTML_RULES = [
(re.compile(r'\\I(?P.*?)\\I', re.DOTALL), lambda match: '%s' % match.group('text') if match.group('text') else ''),
# Sidebar and Footnotes
- (re.compile(r'.+?)">\s*(?P.*?)\s*', re.DOTALL), lambda match: '%s
' % (match.group('target'), match.group('text')) if match.group('text') else ''),
- (re.compile(r'.+?)">\s*(?P.*?)\s*', re.DOTALL), lambda match: '%s
' % (match.group('target'), match.group('text')) if match.group('text') else ''),
+ (re.compile(r'<sidebar\s+id="(?P.+?)">\s*(?P.*?)\s*</sidebar>', re.DOTALL), lambda match: '%s
' % (match.group('target'), match.group('text')) if match.group('text') else ''),
+ (re.compile(r'<footnote\s+id="(?P.+?)">\s*(?P.*?)\s*</footnote>', re.DOTALL), lambda match: '%s
' % (match.group('target'), match.group('text')) if match.group('text') else ''),
# eReader files are one paragraph per line.
# This forces the lines to wrap properly.
@@ -58,16 +69,17 @@ PML_HTML_RULES = [
# Ensure empty lines carry over.
(re.compile('(\r\n|\n|\r){3}'), lambda match: '
'),
- # Remove unmatched plm codes.
- (re.compile(r'(?<=[^\\])\\[pxcriouvtblBk]'), lambda match: ''),
- (re.compile(r'(?<=[^\\])\\X[0-4]'), lambda match: ''),
- (re.compile(r'(?<=[^\\])\\Sp'), lambda match: ''),
- (re.compile(r'(?<=[^\\])\\Sb'), lambda match: ''),
- # Remove invalid single item pml codes.
- (re.compile(r'(?<=[^\\])\\[^\\]'), lambda match: ''),
+ # Try to fix some of the misordering of character-attribute tags.
+ (re.compile(r'(?P(<(i|u|b|del|sup|sub)( [^>]+)?>)+)(?P((div|span)>)+)'), lambda match: match.group('close') + match.group('ch')),
+ (re.compile(r'(?P(<(i|u|b|del|sup|sub|span)( [^>]+)?>)+)(?P(<(div|h\d)( [^>]+)?>)+)'), lambda match: match.group('blk') + match.group('ch')),
- # Replace \\ with \.
- (re.compile(r'\\\\'), lambda match: '\\'),
+ # Remove unmatched pml codes.
+ (re.compile(r'\\X[0-4]'), lambda match: ''),
+ (re.compile(r'\\T="\d+%*"'), lambda match: ''),
+ (re.compile(r'\\Sp'), lambda match: ''),
+ (re.compile(r'\\Sb'), lambda match: ''),
+ # Remove invalid single item pml codes.
+ (re.compile(r'\\.'), lambda match: ''),
]
def pml_to_html(pml):