PDF input: Replace U+2029 with spaces. See #1917386 (search & replace doesn't work with "newlines" in PDF to MOBI conversion)

This commit is contained in:
Kovid Goyal 2021-03-02 08:50:22 +05:30
parent 3b79e215e8
commit 20f4e43044
No known key found for this signature in database
GPG Key ID: 06BC317B515ACE7C

View File

@@ -101,7 +101,7 @@ def pdftohtml(output_dir, pdf_path, no_images, as_xml=False):
raw = re.sub(r'<a id="(\d+)"', r'<a id="p\1"', raw, flags=re.I)
raw = re.sub(r'<a href="index.html#(\d+)"', r'<a href="#p\1"', raw, flags=re.I)
raw = xml_replace_entities(raw)
raw = raw.replace('\u00a0', ' ')
raw = re.sub('[\u00a0\u2029]', ' ', raw)
i.write(raw.encode('utf-8'))