Fix PML parsing changes, as they badly break PDB eReader input.

2025-11-18 12:33:03 -05:00 · 2009-11-14 20:01:10 -07:00 · 2009-11-14 20:01:10 -07:00 · d377b0f6e3
commit d377b0f6e3
parent cd242c2458
2 changed files with 9 additions and 23 deletions
--- a/src/calibre/ebooks/pml/input.py
+++ b/src/calibre/ebooks/pml/input.py
@@ -43,23 +43,9 @@ class PMLInput(InputFormatPlugin):
        if self.options.input_encoding:
            ienc = self.options.input_encoding
        style = '''
 <style>
 .s {font-size: 1em}
 .l {font-size: 1.5em}
 .k {font-size: 0.75em}
 .c {text-align: center; margin: auto}
 .r {text-align: right}
 .t {margin-left: 5%}
 .p {page-break-after: always}
 .x {page-break-before: always}
 </style>
 '''
        self.log.debug('Converting PML to HTML...')
        html = pml_to_html(pml_stream.read().decode(ienc)) 
-        html_stream.write('<html><head><title />%s</head><body>' % style)
+        html_stream.write('<html><head><title /></head><body>%s</body></html>' % html.encode('utf-8', 'replace'))
        html_stream.write(html.encode('utf-8', 'replace'))
        html_stream.write('</body></html>') 
        if pclose:
            pml_stream.close()
--- a/src/calibre/ebooks/pml/pmlconverter.py
+++ b/src/calibre/ebooks/pml/pmlconverter.py
@@ -25,27 +25,27 @@ PML_HTML_RULES = [
    # (and also makes sure we DO honor \\\x as &#92; followed by \x).
    (re.compile(r'\\(.)'), lambda match: '&#92;' if match.group(1) == '\\' else '\\' + match.group(1)),
-    (re.compile(r'\\p'), lambda match: '<br /><br class="p" />'),
+    (re.compile(r'\\p'), lambda match: '<br /><br style="page-break-after: always;" />'),
-    (re.compile(r'\\x(?P<text>.*?)\\x', re.DOTALL), lambda match: '<h1 class="x">%s</h1>' % match.group('text') if match.group('text') else ''),
+    (re.compile(r'\\x(?P<text>.*?)\\x', re.DOTALL), lambda match: '<h1 style="page-break-before: always;">%s</h1>' % match.group('text') if match.group('text') else ''),
    (re.compile(r'\\X(?P<val>[0-4])(?P<text>.*?)\\X[0-4]', re.DOTALL), lambda match: '<h%s>%s</h%s>' % (int(match.group('val')) + 1, match.group('text'), int(match.group('val')) + 1) if match.group('text') else ''),
    (re.compile(r'\\C\d=".+?"'), lambda match: ''), # This should be made to create a TOC entry
-    (re.compile(r'\\c(?P<text>.*?)\\c', re.DOTALL), lambda match: '<div class="c">%s</div>' % match.group('text') if match.group('text') else ''),
+    (re.compile(r'\\c(?P<text>.*?)\\c', re.DOTALL), lambda match: '<div style="text-align: center; margin: auto;">%s</div>' % match.group('text') if match.group('text') else ''),
-    (re.compile(r'\\r(?P<text>.*?)\\r', re.DOTALL), lambda match: '<div class="r">%s</div>' % match.group('text') if match.group('text') else ''),
+    (re.compile(r'\\r(?P<text>.*?)\\r', re.DOTALL), lambda match: '<div style="text-align: right;">%s</div>' % match.group('text') if match.group('text') else ''),
    (re.compile(r'\\i(?P<text>.*?)\\i', re.DOTALL), lambda match: '<i>%s</i>' % match.group('text') if match.group('text') else ''),
    (re.compile(r'\\u(?P<text>.*?)\\u', re.DOTALL), lambda match: '<u>%s</u>' % match.group('text') if match.group('text') else ''),
    (re.compile(r'\\o(?P<text>.*?)\\o', re.DOTALL), lambda match: '<del>%s</del>' % match.group('text') if match.group('text') else ''),
    (re.compile(r'\\v(?P<text>.*?)\\v', re.DOTALL), lambda match: '<!-- %s -->' % match.group('text') if match.group('text') else ''),
-    (re.compile(r'\\t(?P<text>.*?)\\t', re.DOTALL), lambda match: '<div class="t">%s</div>' % match.group('text') if match.group('text') else ''),
+    (re.compile(r'\\t(?P<text>.*?)\\t', re.DOTALL), lambda match: '<div style="margin-left: 5%;">%s</div>' % match.group('text') if match.group('text') else ''),
    (re.compile(r'\\T="(?P<val>\d+)%*"(?P<text>.*?)$', re.MULTILINE), lambda match: r'<div style="margin-left: %s%%">%s</div>' % (match.group('val'), match.group('text')) if match.group('text') else ''),
    (re.compile(r'\\w="(?P<val>\d+)%"'), lambda match: '<hr width="%s%%" />' % match.group('val')),
    (re.compile(r'\\n'), lambda match: ''),
-    (re.compile(r'\\s(?P<text>.*?)\\s', re.DOTALL), lambda match: '<span class="s">%s</span>' % match.group('text') if match.group('text') else ''),
+    (re.compile(r'\\s(?P<text>.*?)\\s', re.DOTALL), lambda match: '<span style="font-size: 1em;">%s</span>' % match.group('text') if match.group('text') else ''),
    (re.compile(r'\\b(?P<text>.*?)\\b', re.DOTALL), lambda match: '<b>%s</b>' % match.group('text') if match.group('text') else ''), # \b is deprecated; \B should be used instead.
-    (re.compile(r'\\l(?P<text>.*?)\\l', re.DOTALL), lambda match: '<span class="l">%s</span>' % match.group('text') if match.group('text') else ''),
+    (re.compile(r'\\l(?P<text>.*?)\\l', re.DOTALL), lambda match: '<span style="font-size: 1.5em;">%s</span>' % match.group('text') if match.group('text') else ''),
    (re.compile(r'\\B(?P<text>.*?)\\B', re.DOTALL), lambda match: '<b>%s</b>' % match.group('text') if match.group('text') else ''),
    (re.compile(r'\\Sp(?P<text>.*?)\\Sp', re.DOTALL), lambda match: '<sup>%s</sup>' % match.group('text') if match.group('text') else ''),
    (re.compile(r'\\Sb(?P<text>.*?)\\Sb', re.DOTALL), lambda match: '<sub>%s</sub>' % match.group('text') if match.group('text') else ''),
-    (re.compile(r'\\k(?P<text>.*?)\\k', re.DOTALL), lambda match: '<span class="k">%s</span>' % match.group('text').upper() if match.group('text') else ''),
+    (re.compile(r'\\k(?P<text>.*?)\\k', re.DOTALL), lambda match: '<span style="font-size: 0.75em;">%s</span>' % match.group('text').upper() if match.group('text') else ''),
    (re.compile(r'\\a(?P<num>\d{3})'), lambda match: '&#%s;' % match.group('num')),
    (re.compile(r'\\U(?P<num>[0-9a-f]{4})'), lambda match: '%s' % my_unichr(int(match.group('num'), 16))),
    (re.compile(r'\\m="(?P<name>.+?)"'), lambda match: '<img src="images/%s" />' % image_name(match.group('name')).strip('\x00')),