+ (re.compile(r' '),
+ (re.compile(r'(.*)
after first to allow paragraphs to show better
+ (re.compile(r'
'),
+
+ ]
+
+
def pdftohtml(pdf_path):
'''
Convert the pdf into html using the pdftohtml app.
@@ -72,4 +98,9 @@ def pdftohtml(pdf_path):
if not '
\n' + raw
+ return '\n' + processed_html(raw)
+
+def processed_html(html):
+ for rule in PDFTOHTML_RULES:
+ html = rule[0].sub(rule[1], html)
+ return html