mirror of
https://github.com/kovidgoyal/calibre.git
synced 2025-07-09 03:04:10 -04:00
preserve style changes across line breaks in heuristics, fixes bug 1066507
This commit is contained in:
parent
92afbd1ab5
commit
67deb3adcd
@ -335,14 +335,26 @@ class HeuristicProcessor(object):
|
||||
This function intentionally leaves hyphenated content alone as that is handled by the
|
||||
dehyphenate routine in a separate step
|
||||
'''
|
||||
def style_unwrap(match):
    '''
    Build the replacement text for one line break matched by the unwrap regexes.

    match is a regex match object that must define the named groups
    'style_close' (an inline closing tag found before the break) and
    'style_open' (an inline opening tag found after the break).

    Returns a single space, preceded by the closing tag when one was matched
    and followed by the opening tag when one was matched, so that inline
    styling on either side of the removed break is kept.
    '''
    # An optional named group that took no part in the match comes back as
    # None; treat that (and an empty string) as "no tag on this side".
    closing_tag = match.group('style_close') or ''
    opening_tag = match.group('style_open') or ''
    return closing_tag + ' ' + opening_tag
|
||||
|
||||
|
||||
# define the pieces of the regex
|
||||
lookahead = "(?<=.{"+str(length)+u"}([a-zäëïöüàèìòùáćéíĺóŕńśúýâêîôûçąężıãõñæøþðßěľščťžňďřů,:)\IA\u00DF]|(?<!\&\w{4});))" # (?<!\&\w{4});) is a semicolon not part of an entity
|
||||
em_en_lookahead = "(?<=.{"+str(length)+u"}[\u2013\u2014])"
|
||||
soft_hyphen = u"\xad"
|
||||
line_ending = "\s*</(span|[iubp]|div)>\s*(</(span|[iubp]|div)>)?"
|
||||
line_ending = "\s*(?P<style_close></(span|[iub])>)?\s*(</(p|div)>)?"
|
||||
blanklines = "\s*(?P<up2threeblanks><(p|span|div)[^>]*>\s*(<(p|span|div)[^>]*>\s*</(span|p|div)>\s*)</(span|p|div)>\s*){0,3}\s*"
|
||||
line_opening = "<(span|[iubp]|div)[^>]*>\s*(<(span|[iubp]|div)[^>]*>)?\s*"
|
||||
line_opening = "<(p|div)[^>]*>\s*(?P<style_open><(span|[iub])[^>]*>)?\s*"
|
||||
txt_line_wrap = u"((\u0020|\u0009)*\n){1,4}"
|
||||
|
||||
unwrap_regex = lookahead+line_ending+blanklines+line_opening
|
||||
@ -353,14 +365,17 @@ class HeuristicProcessor(object):
|
||||
unwrap_regex = lookahead+txt_line_wrap
|
||||
em_en_unwrap_regex = em_en_lookahead+txt_line_wrap
|
||||
shy_unwrap_regex = soft_hyphen+txt_line_wrap
|
||||
content = unwrap.sub(' ', content)
|
||||
content = em_en_unwrap.sub('', content)
|
||||
content = shy_unwrap.sub('', content)
|
||||
else:
|
||||
unwrap = re.compile(u"%s" % unwrap_regex, re.UNICODE)
|
||||
em_en_unwrap = re.compile(u"%s" % em_en_unwrap_regex, re.UNICODE)
|
||||
shy_unwrap = re.compile(u"%s" % shy_unwrap_regex, re.UNICODE)
|
||||
content = unwrap.sub(style_unwrap, content)
|
||||
content = em_en_unwrap.sub(style_unwrap, content)
|
||||
content = shy_unwrap.sub(style_unwrap, content)
|
||||
|
||||
unwrap = re.compile(u"%s" % unwrap_regex, re.UNICODE)
|
||||
em_en_unwrap = re.compile(u"%s" % em_en_unwrap_regex, re.UNICODE)
|
||||
shy_unwrap = re.compile(u"%s" % shy_unwrap_regex, re.UNICODE)
|
||||
|
||||
content = unwrap.sub(' ', content)
|
||||
content = em_en_unwrap.sub('', content)
|
||||
content = shy_unwrap.sub('', content)
|
||||
return content
|
||||
|
||||
def txt_process(self, match):
|
||||
|
Loading…
x
Reference in New Issue
Block a user