process pdf input html output a bit.

This commit is contained in:
John Schember 2009-04-08 19:51:56 -04:00
parent 3b09d01701
commit 383fe33adb

View File

@ -6,7 +6,7 @@ __copyright__ = '2008, Kovid Goyal <kovid at kovidgoyal.net>, ' \
'2009, John Schember <john@nachtimwald.com>' '2009, John Schember <john@nachtimwald.com>'
__docformat__ = 'restructuredtext en' __docformat__ = 'restructuredtext en'
import errno, os, sys, subprocess import errno, os, re, sys, subprocess
from functools import partial from functools import partial
from calibre.ebooks import ConversionError, DRMError from calibre.ebooks import ConversionError, DRMError
@ -24,6 +24,32 @@ if iswindows and hasattr(sys, 'frozen'):
if islinux and getattr(sys, 'frozen_path', False): if islinux and getattr(sys, 'frozen_path', False):
PDFTOHTML = os.path.join(getattr(sys, 'frozen_path'), 'pdftohtml') PDFTOHTML = os.path.join(getattr(sys, 'frozen_path'), 'pdftohtml')
# Fix pdftohtml markup
PDFTOHTML_RULES = [
# Remove <hr> tags
(re.compile(r'<hr.*?>', re.IGNORECASE), lambda match: '<br />'),
# Remove page numbers
(re.compile(r'\d+<br>', re.IGNORECASE), lambda match: ''),
# Remove <br> and replace <br><br> with <p>
(re.compile(r'<br.*?>\s*<br.*?>', re.IGNORECASE), lambda match: '<p>'),
(re.compile(r'(.*)<br.*?>', re.IGNORECASE),
lambda match: match.group() if re.match('<', match.group(1).lstrip()) or len(match.group(1)) < 40
else match.group(1)),
# Remove hyphenation
(re.compile(r'-\n\r?'), lambda match: ''),
# Remove gray background
(re.compile(r'<BODY[^<>]+>'), lambda match : '<BODY>'),
# Remove non breaking spaces
(re.compile(ur'\u00a0'), lambda match : ' '),
# Add second <br /> after first to allow paragraphs to show better
(re.compile(r'<br.*?>'), lambda match : '<br /><br />'),
]
def pdftohtml(pdf_path): def pdftohtml(pdf_path):
''' '''
Convert the pdf into html using the pdftohtml app. Convert the pdf into html using the pdftohtml app.
@ -72,4 +98,9 @@ def pdftohtml(pdf_path):
if not '<br' in raw[:4000]: if not '<br' in raw[:4000]:
raise ConversionError(os.path.basename(pdf_path) + _(' is an image based PDF. Only conversion of text based PDFs is supported.'), True) raise ConversionError(os.path.basename(pdf_path) + _(' is an image based PDF. Only conversion of text based PDFs is supported.'), True)
return '<!-- created by calibre\'s pdftohtml -->\n' + raw return '<!-- created by calibre\'s pdftohtml -->\n' + processed_html(raw)
def processed_html(html):
for rule in PDFTOHTML_RULES:
html = rule[0].sub(rule[1], html)
return html