mirror of
https://github.com/kovidgoyal/calibre.git
synced 2025-07-09 03:04:10 -04:00
...
This commit is contained in:
commit
80c062c62b
@ -559,7 +559,7 @@ class HTMLPreProcessor(object):
|
|||||||
end_rules.append((re.compile(u'(?<=.{%i}[–—])\s*<p>\s*(?=[[a-z\d])' % length), lambda match: ''))
|
end_rules.append((re.compile(u'(?<=.{%i}[–—])\s*<p>\s*(?=[[a-z\d])' % length), lambda match: ''))
|
||||||
end_rules.append(
|
end_rules.append(
|
||||||
# Un wrap using punctuation
|
# Un wrap using punctuation
|
||||||
(re.compile(u'(?<=.{%i}([a-zäëïöüàèìòùáćéíĺóŕńśúýâêîôûçąężıãõñæøþðßěľščťžňďřů,:“”)\IA\u00DF]|(?<!\&\w{4});))\s*(?P<ital></(i|b|u)>)?\s*(</p>\s*<p>\s*)+\s*(?=(<(i|b|u)>)?\s*[\w\d$(])' % length, re.UNICODE), wrap_lines),
|
(re.compile(u'(?<=.{%i}([a-zäëïöüàèìòùáćéíĺóŕńśúýâêîôûçąężıãõñæøþðßěľščťžňďřů,:)\IA\u00DF]|(?<!\&\w{4});))\s*(?P<ital></(i|b|u)>)?\s*(</p>\s*<p>\s*)+\s*(?=(<(i|b|u)>)?\s*[\w\d$(])' % length, re.UNICODE), wrap_lines),
|
||||||
)
|
)
|
||||||
|
|
||||||
for rule in self.PREPROCESS + start_rules:
|
for rule in self.PREPROCESS + start_rules:
|
||||||
|
@ -316,10 +316,18 @@ class HeuristicProcessor(object):
|
|||||||
'''
|
'''
|
||||||
Unwraps lines based on line length and punctuation
|
Unwraps lines based on line length and punctuation
|
||||||
supports a range of html markup and text files
|
supports a range of html markup and text files
|
||||||
|
|
||||||
|
the lookahead regex below is meant to look for any non-full stop characters - punctuation
|
||||||
|
characters which can be used as a full stop should *not* be added below - e.g. ?!“”. etc
|
||||||
|
the reason for this is to prevent false positive wrapping. False positives are more
|
||||||
|
difficult to detect than false negatives during a manual review of the doc
|
||||||
|
|
||||||
|
This function intentionally leaves hyphenated content alone as that is handled by the
|
||||||
|
dehyphenate routine in a separate step
|
||||||
'''
|
'''
|
||||||
# define the pieces of the regex
|
|
||||||
|
|
||||||
lookahead = "(?<=.{"+str(length)+u"}([a-zäëïöüàèìòùáćéíĺóŕńśúýâêîôûçąężıãõñæøþðßěľščťžňďřů,:“”)\IA\u00DF]|(?<!\&\w{4});))" # (?<!\&\w{4});) is a semicolon not part of an entity
|
# define the pieces of the regex
|
||||||
|
lookahead = "(?<=.{"+str(length)+u"}([a-zäëïöüàèìòùáćéíĺóŕńśúýâêîôûçąężıãõñæøþðßěľščťžňďřů,:)\IA\u00DF]|(?<!\&\w{4});))" # (?<!\&\w{4});) is a semicolon not part of an entity
|
||||||
em_en_lookahead = "(?<=.{"+str(length)+u"}[\u2013\u2014])"
|
em_en_lookahead = "(?<=.{"+str(length)+u"}[\u2013\u2014])"
|
||||||
soft_hyphen = u"\xad"
|
soft_hyphen = u"\xad"
|
||||||
line_ending = "\s*</(span|[iubp]|div)>\s*(</(span|[iubp]|div)>)?"
|
line_ending = "\s*</(span|[iubp]|div)>\s*(</(span|[iubp]|div)>)?"
|
||||||
|
Loading…
x
Reference in New Issue
Block a user