TXT Output: When using preserve spaces, output tab characters as a sequence of four non-breaking spaces as some readers dont handle the \x09 char code. Fix #6770 (Problem converting pmlz to epub). PDF Input: More unicode character matching.

2025-07-09 03:04:10 -04:00 · 2010-09-11 11:58:32 -06:00 · 2010-09-11 11:58:32 -06:00 · 1e77e6538f
commit 1e77e6538f
parent 3766f34aab ef8408869c
3 changed files with 15 additions and 3 deletions
--- a/src/calibre/ebooks/conversion/preprocess.py
+++ b/src/calibre/ebooks/conversion/preprocess.py
@ -166,6 +166,17 @@ class HTMLPreProcessor(object):
                  (re.compile(u'`\s*(<br.*?>)*\s*O', re.UNICODE), lambda match: u'Ò'),
                  (re.compile(u'`\s*(<br.*?>)*\s*u', re.UNICODE), lambda match: u'ù'),
                  (re.compile(u'`\s*(<br.*?>)*\s*U', re.UNICODE), lambda match: u'Ù'),
+                  # ` with letter before
+                  (re.compile(u'a\s*(<br.*?>)*\s*`', re.UNICODE), lambda match: u'à'),
+                  (re.compile(u'A\s*(<br.*?>)*\s*`', re.UNICODE), lambda match: u'À'),
+                  (re.compile(u'e\s*(<br.*?>)*\s*`', re.UNICODE), lambda match: u'è'),
+                  (re.compile(u'E\s*(<br.*?>)*\s*`', re.UNICODE), lambda match: u'È'),
+                  (re.compile(u'i\s*(<br.*?>)*\s*`', re.UNICODE), lambda match: u'ì'),
+                  (re.compile(u'I\s*(<br.*?>)*\s*`', re.UNICODE), lambda match: u'Ì'),
+                  (re.compile(u'o\s*(<br.*?>)*\s*`', re.UNICODE), lambda match: u'ò'),
+                  (re.compile(u'O\s*(<br.*?>)*\s*`', re.UNICODE), lambda match: u'Ò'),
+                  (re.compile(u'u\s*(<br.*?>)*\s*`', re.UNICODE), lambda match: u'ù'),
+                  (re.compile(u'U\s*(<br.*?>)*\s*`', re.UNICODE), lambda match: u'Ù'),

                  # ´
                  (re.compile(u'´\s*(<br.*?>)*\s*a', re.UNICODE), lambda match: u'á'),
--- a/src/calibre/ebooks/pml/pmlconverter.py
+++ b/src/calibre/ebooks/pml/pmlconverter.py
@ -207,6 +207,7 @@ class PML_HTMLizer(object):
        while html != old:
            old = html
            html = self.cleanup_html_remove_redundant(html)
+        html = re.sub(r'(?imu)^\s*', '', html)
        return html

    def cleanup_html_remove_redundant(self, html):
@ -216,7 +217,7 @@ class PML_HTMLizer(object):
                html = re.sub(r'(?u)%s\s*%s' % (open % '.*?', close), '', html)
            else:
                html = re.sub(r'(?u)%s\s*%s' % (open, close), '', html)
-        html = re.sub(r'<p>\s*</p>', '', html)
+        html = re.sub(r'(?imu)<p>\s*</p>', '', html)
        return html

    def start_line(self):
@ -556,7 +557,7 @@ class PML_HTMLizer(object):
                            text = t
                        else:
                            self.toc.add_item(os.path.basename(self.file_name), id, value)
-                            text = '<span id="%s"></span>%s' % (id, t)
+                            text = '%s<span id="%s"></span>' % (t, id)
                    elif c == 'm':
                        empty = False
                        src = self.code_value(line)
--- a/src/calibre/ebooks/txt/processor.py
+++ b/src/calibre/ebooks/txt/processor.py
@ -77,7 +77,7 @@ def separate_paragraphs_print_formatted(txt):

 def preserve_spaces(txt):
    txt = txt.replace(' ', '&nbsp;')
-    txt = txt.replace('\t', '&#09;')
+    txt = txt.replace('\t', '&nbsp;&nbsp;&nbsp;&nbsp;')
    return txt

 def opf_writer(path, opf_name, manifest, spine, mi):