PDF Input: Fix the < and > characters in the text not being handled correctly

This commit is contained in:
Kovid Goyal 2019-08-30 08:03:08 +05:30
parent b0fe64571e
commit 05834f0b42
No known key found for this signature in database
GPG Key ID: 06BC317B515ACE7C

View File

@@ -11,7 +11,7 @@ import shutil
import subprocess import subprocess
import sys import sys
from calibre import CurrentDir, replace_entities, prints from calibre import CurrentDir, xml_replace_entities, prints
from calibre.constants import ( from calibre.constants import (
filesystem_encoding, isbsd, islinux, isosx, ispy3, iswindows filesystem_encoding, isbsd, islinux, isosx, ispy3, iswindows
) )
@@ -106,7 +106,7 @@ def pdftohtml(output_dir, pdf_path, no_images, as_xml=False):
raw = re.sub(r'<a\s+name=(\d+)', r'<a id="\1"', raw, flags=re.I) raw = re.sub(r'<a\s+name=(\d+)', r'<a id="\1"', raw, flags=re.I)
raw = re.sub(r'<a id="(\d+)"', r'<a id="p\1"', raw, flags=re.I) raw = re.sub(r'<a id="(\d+)"', r'<a id="p\1"', raw, flags=re.I)
raw = re.sub(r'<a href="index.html#(\d+)"', r'<a href="#p\1"', raw, flags=re.I) raw = re.sub(r'<a href="index.html#(\d+)"', r'<a href="#p\1"', raw, flags=re.I)
raw = replace_entities(raw) raw = xml_replace_entities(raw)
raw = raw.replace('\u00a0', ' ') raw = raw.replace('\u00a0', ' ')
i.write(raw.encode('utf-8')) i.write(raw.encode('utf-8'))