import re

# NOTE(review): this block was recovered from a whitespace-collapsed patch to
# src/calibre/ebooks/pdf/pdftohtml.py in which an HTML sanitizer had stripped
# every tag literal (<hr>, <br>, <p>, <BODY ...>).  The literals below were
# restored from the surviving comments and pattern fragments -- confirm them
# against the upstream source before relying on them.
#
# The same patch also changes pdftohtml() so that its final return passes the
# raw output through processed_html(raw); pdftohtml() is only partially
# visible in the patch and is therefore not reproduced here.

# Lines at least this long that do not start with a tag lose their trailing
# <br> (see _strip_break_on_long_lines).
_MIN_WRAPPED_LINE_LEN = 40


def _strip_break_on_long_lines(match):
    '''
    Replacement callback for the ``(.*)<br.*?>`` rule.

    The matched text is kept untouched when the text before the ``<br>``
    starts with a tag (after stripping leading whitespace) or is shorter than
    ``_MIN_WRAPPED_LINE_LEN`` characters.  Otherwise only the text before the
    ``<br>`` is returned, i.e. the break is dropped.
    '''
    before_break = match.group(1)
    if before_break.lstrip().startswith('<'):
        return match.group()
    if len(before_break) < _MIN_WRAPPED_LINE_LEN:
        return match.group()
    return before_break


# Fix pdftohtml markup.
#
# Each entry is a ``(compiled_pattern, replacement_callable)`` pair.  The rules
# are applied in list order by processed_html(), and the order matters: the
# final rule doubles whatever <br> tags the earlier rules left in place.
PDFTOHTML_RULES = [
    # Remove <hr> tags
    (re.compile(r'<hr.*?>', re.IGNORECASE), lambda match: '<br />'),
    # Remove page numbers
    (re.compile(r'\d+<br>', re.IGNORECASE), lambda match: ''),
    # Remove <br> and replace <br><br> with <p>
    (re.compile(r'<br.*?>\s*<br.*?>', re.IGNORECASE), lambda match: '<p>'),
    (re.compile(r'(.*)<br.*?>', re.IGNORECASE), _strip_break_on_long_lines),
    # Remove hyphenation
    (re.compile(r'-\n\r?'), lambda match: ''),

    # Remove gray background
    (re.compile(r'<BODY[^<>]+>'), lambda match: '<BODY>'),

    # Remove non breaking spaces.  The original used a ``ur''`` literal, which
    # is a syntax error on Python 3; ``u''`` is equivalent for this pattern.
    (re.compile(u'\u00a0'), lambda match: ' '),

    # Add second <br> after first to allow paragraphs to show better
    (re.compile(r'<br.*?>'), lambda match: '<br /><br />'),
]


def processed_html(html):
    '''
    Clean up markup produced by pdftohtml.

    Applies every ``(pattern, replacement)`` pair in ``PDFTOHTML_RULES`` to
    ``html`` in list order, feeding the result of each substitution into the
    next one.

    :param html: The HTML text to clean up.
    :return: The HTML text after all rules have been applied.
    '''
    for pattern, replacement in PDFTOHTML_RULES:
        html = pattern.sub(replacement, html)
    return html