mirror of
https://github.com/kovidgoyal/calibre.git
synced 2025-07-09 03:04:10 -04:00
Revert previous changes, now looking for entities in unwrapping rule
This commit is contained in:
parent
66b443adc5
commit
569b84e1cb
@ -17,6 +17,8 @@ convert_entities = functools.partial(entity_to_unicode,
|
||||
result_exceptions = {
|
||||
u'<' : '&lt;',
|
||||
u'>' : '&gt;',
|
||||
u"'" : '&apos;',
|
||||
u'"' : '&quot;',
|
||||
u'&' : '&amp;',
|
||||
})
|
||||
_span_pat = re.compile('<span.*?</span>', re.DOTALL|re.IGNORECASE)
|
||||
@ -349,7 +351,7 @@ class HTMLPreProcessor(object):
|
||||
# print "The pdf line length returned is " + str(length)
|
||||
end_rules.append(
|
||||
# Unwrap using punctuation
|
||||
- (re.compile(r'(?<=.{%i}[a-z,;:)\-IA])\s*(?P<ital></(i|b|u)>)?\s*(<p.*?>\s*)+\s*(?=(<(i|b|u)>)?\s*[\w\d$(])' % length, re.UNICODE), wrap_lines),
|
||||
+ (re.compile(r'(?<=.{%i}([a-z,:)\-IA]|(?<!\&\w{4});))\s*(?P<ital></(i|b|u)>)?\s*(<p.*?>\s*)+\s*(?=(<(i|b|u)>)?\s*[\w\d$(])' % length, re.UNICODE), wrap_lines),
|
||||
)
|
||||
|
||||
for rule in self.PREPROCESS + start_rules:
|
||||
|
Loading…
x
Reference in New Issue
Block a user