pdftohtml processing: better line re-wrapping

2026-03-06 00:43:42 -05:00 · 2009-04-09 17:31:07 -04:00 · 2009-04-09 17:31:07 -04:00 · f7ec532d57
commit f7ec532d57
parent 2c3e8cccb8
1 changed file with 8 additions and 0 deletions
--- a/src/calibre/ebooks/conversion/preprocess.py
+++ b/src/calibre/ebooks/conversion/preprocess.py
@@ -48,6 +48,8 @@ class HTMLPreProcessor(object):

    # Fix pdftohtml markup
    PDFTOHTML  = [
+                  # Remove page links
+                  (re.compile(r'<a name=\d+></a>', re.IGNORECASE), lambda match: ''),
                  # Remove <hr> tags
                  (re.compile(r'<hr.*?>', re.IGNORECASE), lambda match: '<br />'),
                  # Remove page numbers
@@ -69,6 +71,12 @@ class HTMLPreProcessor(object):
                  
                  # Have paragraphs show better
                  (re.compile(r'<br.*?>'), lambda match : '<p>'),
+                  
+                  # Re wrap lines
+                  (re.compile(r'(?<=\w)\s*</i>\s*<p.*?>\s*<i>\s*(?=\w)'), lambda match: ' '),
+                  (re.compile(r'(?<=\w)\s*<p.*?>\s*(?=\w)', re.UNICODE), lambda match: ' '),
+                  # Clean up spaces
+                  (re.compile(ru'(?<=\.|,|:|;|\?|!|”|"|\')[\s^ ]*(?=<)'), lambda match: ' '),
                  ]

    # Fix Book Designer markup