mirror of
https://github.com/kovidgoyal/calibre.git
synced 2025-07-09 03:04:10 -04:00
Further speedup for polish parsing
This commit is contained in:
parent
b86ad609b5
commit
55c2ad77ce
@ -14,6 +14,12 @@ from calibre.ebooks.chardet import strip_encoding_declarations, xml_to_unicode
|
||||
from calibre.utils.cleantext import clean_xml_chars
|
||||
from calibre.utils.xml_parse import safe_xml_fromstring
|
||||
|
||||
try:
    # Prefer the compiled implementation when the extension module is available.
    from calibre_extensions.fast_html_entities import replace_all_entities
except ImportError:
    def replace_all_entities(raw, keep_xml_entities: bool = False):
        """Fallback used when the compiled entity replacer cannot be imported.

        Delegates to ``xml_replace_entities`` and returns its result, so
        callers can assign the return value back to their text variable just
        as they do with the compiled implementation.

        ``keep_xml_entities`` is accepted only so the signature matches the
        compiled function; this fallback does not consult it.
        """
        # The result must be returned: callers do ``raw = replace_all_entities(...)``
        # and then call string methods on ``raw``.
        return xml_replace_entities(raw)
|
||||
|
||||
XHTML_NS = 'http://www.w3.org/1999/xhtml'
|
||||
|
||||
|
||||
@ -21,7 +27,7 @@ def parse_html5(raw, decoder=None, log=None, discard_namespaces=False, line_numb
|
||||
if isinstance(raw, bytes):
|
||||
raw = xml_to_unicode(raw)[0] if decoder is None else decoder(raw)
|
||||
if replace_entities:
|
||||
raw = xml_replace_entities(raw)
|
||||
raw = replace_all_entities(raw, True)
|
||||
if fix_newlines:
|
||||
raw = raw.replace('\r\n', '\n').replace('\r', '\n')
|
||||
raw = clean_xml_chars(raw)
|
||||
@ -60,7 +66,7 @@ def parse(raw, decoder=None, log=None, line_numbers=True, linenumber_attribute=N
|
||||
raw = xml_to_unicode(raw)[0] if decoder is None else decoder(raw)
|
||||
raw = handle_private_entities(raw)
|
||||
if replace_entities:
|
||||
raw = xml_replace_entities(raw).replace('\0', '') # Handle &#0;
|
||||
raw = replace_all_entities(raw, True)
|
||||
raw = raw.replace('\r\n', '\n').replace('\r', '\n')
|
||||
|
||||
# Remove any preamble before the opening html tag as it can cause problems,
|
||||
|
Loading…
x
Reference in New Issue
Block a user