mirror of
https://github.com/kovidgoyal/calibre.git
synced 2025-07-09 03:04:10 -04:00
slightly better method
This commit is contained in:
parent
070df88d20
commit
80cbd6d89f
@ -28,8 +28,8 @@ class HeuristicProcessor(object):
|
|||||||
self.linereg = re.compile('(?<=<p).*?(?=</p>)', re.IGNORECASE|re.DOTALL)
|
self.linereg = re.compile('(?<=<p).*?(?=</p>)', re.IGNORECASE|re.DOTALL)
|
||||||
self.blankreg = re.compile(r'\s*(?P<openline><p(?!\sclass=\"(softbreak|whitespace)\")[^>]*>)\s*(?P<closeline></p>)', re.IGNORECASE)
|
self.blankreg = re.compile(r'\s*(?P<openline><p(?!\sclass=\"(softbreak|whitespace)\")[^>]*>)\s*(?P<closeline></p>)', re.IGNORECASE)
|
||||||
self.anyblank = re.compile(r'\s*(?P<openline><p[^>]*>)\s*(?P<closeline></p>)', re.IGNORECASE)
|
self.anyblank = re.compile(r'\s*(?P<openline><p[^>]*>)\s*(?P<closeline></p>)', re.IGNORECASE)
|
||||||
self.multi_blank = re.compile(r'(\s*(<div[^>]*>\s*)?<p[^>]*>\s*</p>(\s*</div>)?(\s*<div[^>]*>\s*</div>\s*)*){2,}(?!\s*<h\d)', re.IGNORECASE)
|
self.multi_blank = re.compile(r'(\s*<p[^>]*>\s*</p>(\s*<div[^>]*>\s*</div>\s*)*){2,}(?!\s*<h\d)', re.IGNORECASE)
|
||||||
self.any_multi_blank = re.compile(r'(\s*(<div[^>]*>\s*)?<p[^>]*>\s*</p>(\s*</div>)?(\s*<div[^>]*>\s*</div>\s*)*){2,}', re.IGNORECASE)
|
self.any_multi_blank = re.compile(r'(\s*<p[^>]*>\s*</p>(\s*<div[^>]*>\s*</div>\s*)*){2,}', re.IGNORECASE)
|
||||||
self.line_open = "<(?P<outer>p|div)[^>]*>\s*(<(?P<inner1>font|span|[ibu])[^>]*>)?\s*(<(?P<inner2>font|span|[ibu])[^>]*>)?\s*(<(?P<inner3>font|span|[ibu])[^>]*>)?\s*"
|
self.line_open = "<(?P<outer>p|div)[^>]*>\s*(<(?P<inner1>font|span|[ibu])[^>]*>)?\s*(<(?P<inner2>font|span|[ibu])[^>]*>)?\s*(<(?P<inner3>font|span|[ibu])[^>]*>)?\s*"
|
||||||
self.line_close = "(</(?P=inner3)>)?\s*(</(?P=inner2)>)?\s*(</(?P=inner1)>)?\s*</(?P=outer)>"
|
self.line_close = "(</(?P=inner3)>)?\s*(</(?P=inner2)>)?\s*(</(?P=inner1)>)?\s*</(?P=outer)>"
|
||||||
self.single_blank = re.compile(r'(\s*<p[^>]*>\s*</p>)', re.IGNORECASE)
|
self.single_blank = re.compile(r'(\s*<p[^>]*>\s*</p>)', re.IGNORECASE)
|
||||||
@ -379,6 +379,8 @@ class HeuristicProcessor(object):
|
|||||||
html = re.sub('(?i)</?st1:\w+>', '', html)
|
html = re.sub('(?i)</?st1:\w+>', '', html)
|
||||||
# Re-open self closing paragraph tags
|
# Re-open self closing paragraph tags
|
||||||
html = re.sub('<p[^>/]*/>', '<p> </p>', html)
|
html = re.sub('<p[^>/]*/>', '<p> </p>', html)
|
||||||
|
# delete surrounding divs from empty paragraphs
|
||||||
|
html = re.sub('<div[^>]*>\s*<p[^>]*>\s*</p>\s*</div>', '<p> </p>', html)
|
||||||
# Get rid of empty span, bold, font, em, & italics tags
|
# Get rid of empty span, bold, font, em, & italics tags
|
||||||
html = re.sub(r"\s*<span[^>]*>\s*(<span[^>]*>\s*</span>){0,2}\s*</span>\s*", " ", html)
|
html = re.sub(r"\s*<span[^>]*>\s*(<span[^>]*>\s*</span>){0,2}\s*</span>\s*", " ", html)
|
||||||
html = re.sub(r"\s*<(font|[ibu]|em|strong)[^>]*>\s*(<(font|[ibu]|em|strong)[^>]*>\s*</(font|[ibu]|em|strong)>\s*){0,2}\s*</(font|[ibu]|em|strong)>", " ", html)
|
html = re.sub(r"\s*<(font|[ibu]|em|strong)[^>]*>\s*(<(font|[ibu]|em|strong)[^>]*>\s*</(font|[ibu]|em|strong)>\s*){0,2}\s*</(font|[ibu]|em|strong)>", " ", html)
|
||||||
@ -637,6 +639,7 @@ class HeuristicProcessor(object):
|
|||||||
blanks_count = len(self.any_multi_blank.findall(html))
|
blanks_count = len(self.any_multi_blank.findall(html))
|
||||||
if blanks_count >= 1:
|
if blanks_count >= 1:
|
||||||
html = self.merge_blanks(html, blanks_count)
|
html = self.merge_blanks(html, blanks_count)
|
||||||
|
self.dump(html, 'before_after_merge_blanks')
|
||||||
scene_break_regex = self.line_open+'(?![\w\'\"])(?P<break>((?P<break_char>((?!\s)\W))\s*(?P=break_char)?)+)\s*'+self.line_close
|
scene_break_regex = self.line_open+'(?![\w\'\"])(?P<break>((?P<break_char>((?!\s)\W))\s*(?P=break_char)?)+)\s*'+self.line_close
|
||||||
scene_break = re.compile(r'%s' % scene_break_regex, re.IGNORECASE|re.UNICODE)
|
scene_break = re.compile(r'%s' % scene_break_regex, re.IGNORECASE|re.UNICODE)
|
||||||
# If the user has enabled scene break replacement, then either softbreaks
|
# If the user has enabled scene break replacement, then either softbreaks
|
||||||
|
Loading…
x
Reference in New Issue
Block a user