diff --git a/src/calibre/ebooks/conversion/preprocess.py b/src/calibre/ebooks/conversion/preprocess.py index f544a331d8..bb8ee90364 100644 --- a/src/calibre/ebooks/conversion/preprocess.py +++ b/src/calibre/ebooks/conversion/preprocess.py @@ -66,7 +66,9 @@ class HTMLPreProcessor(object): # Remove non breaking spaces (re.compile(ur'\u00a0'), lambda match : ' '), - + + # Have paragraphs show better + (re.compile(r''), lambda match : '

'), ] # Fix Book Designer markup diff --git a/src/calibre/ebooks/pdf/pdftohtml.py b/src/calibre/ebooks/pdf/pdftohtml.py index 0f6581dea6..e03d7d0647 100644 --- a/src/calibre/ebooks/pdf/pdftohtml.py +++ b/src/calibre/ebooks/pdf/pdftohtml.py @@ -24,32 +24,6 @@ if iswindows and hasattr(sys, 'frozen'): if islinux and getattr(sys, 'frozen_path', False): PDFTOHTML = os.path.join(getattr(sys, 'frozen_path'), 'pdftohtml') -# Fix pdftohtml markup -PDFTOHTML_RULES = [ - # Remove


tags - (re.compile(r'', re.IGNORECASE), lambda match: '
'), - # Remove page numbers - (re.compile(r'\d+
', re.IGNORECASE), lambda match: ''), - # Remove
and replace

with

- (re.compile(r'\s*', re.IGNORECASE), lambda match: '

'), - (re.compile(r'(.*)', re.IGNORECASE), - lambda match: match.group() if re.match('<', match.group(1).lstrip()) or len(match.group(1)) < 40 - else match.group(1)), - # Remove hyphenation - (re.compile(r'-\n\r?'), lambda match: ''), - - # Remove gray background - (re.compile(r']+>'), lambda match : ''), - - # Remove non breaking spaces - (re.compile(ur'\u00a0'), lambda match : ' '), - - # Add second
after first to allow paragraphs to show better - (re.compile(r''), lambda match : '

'), - - ] - - def pdftohtml(pdf_path): ''' Convert the pdf into html using the pdftohtml app. @@ -98,9 +72,4 @@ def pdftohtml(pdf_path): if not '\n' + processed_html(raw) - -def processed_html(html): - for rule in PDFTOHTML_RULES: - html = rule[0].sub(rule[1], html) - return html + return '\n' + raw