diff --git a/src/calibre/ebooks/conversion/plugins/pdf_input.py b/src/calibre/ebooks/conversion/plugins/pdf_input.py index 3bbe33207d..20cc20d687 100644 --- a/src/calibre/ebooks/conversion/plugins/pdf_input.py +++ b/src/calibre/ebooks/conversion/plugins/pdf_input.py @@ -67,5 +67,12 @@ class PDFInput(InputFormatPlugin): log.debug('Rendering manifest...') with open(u'metadata.opf', 'wb') as opffile: opf.render(opffile) + if os.path.exists(u'toc.ncx'): + ncxid = opf.manifest.id_for_path('toc.ncx') + if ncxid: + with open(u'metadata.opf', 'r+b') as f: + raw = f.read().replace(b'\s*(?P([*#•✦=] *){3,})\s*
'), lambda match: '

\n

' + match.group('break') + '

'), - # Remove page links - (re.compile(r'', re.IGNORECASE), lambda match: ''), # Remove
tags (re.compile(r'', re.IGNORECASE), lambda match: ''), diff --git a/src/calibre/ebooks/pdf/pdftohtml.py b/src/calibre/ebooks/pdf/pdftohtml.py index c9e3c32395..fe5bbde73a 100644 --- a/src/calibre/ebooks/pdf/pdftohtml.py +++ b/src/calibre/ebooks/pdf/pdftohtml.py @@ -85,10 +85,6 @@ def pdftohtml(output_dir, pdf_path, no_images, as_xml=False): logf.flush() logf.close() out = open(logf.name, 'rb').read().strip() - try: - os.remove(pdfsrc) - except: - pass if ret != 0: raise ConversionError(b'return code: %d\n%s' % (ret, out)) if out: @@ -106,7 +102,46 @@ def pdftohtml(output_dir, pdf_path, no_images, as_xml=False): i.truncate() # versions of pdftohtml >= 0.20 output self closing
tags, this # breaks the pdf heuristics regexps, so replace them - i.write(raw.replace(b'
', b'
')) + raw = raw.replace(b'
', b'
') + raw = re.sub(br'