mirror of
https://github.com/kovidgoyal/calibre.git
synced 2025-07-09 03:04:10 -04:00
preserve soft breaks when deleting blank paragraphs
This commit is contained in:
parent
65f9eff665
commit
f467498638
@ -294,8 +294,8 @@ class PreProcessor(object):
|
|||||||
# If more than 40% of the lines are empty paragraphs and the user has enabled delete
|
# If more than 40% of the lines are empty paragraphs and the user has enabled delete
|
||||||
# blank paragraphs then delete blank lines to clean up spacing
|
# blank paragraphs then delete blank lines to clean up spacing
|
||||||
linereg = re.compile('(?<=<p).*?(?=</p>)', re.IGNORECASE|re.DOTALL)
|
linereg = re.compile('(?<=<p).*?(?=</p>)', re.IGNORECASE|re.DOTALL)
|
||||||
blankreg = re.compile(r'\s*(?P<openline><p[^>]*>)\s*(?P<closeline></p>)', re.IGNORECASE)
|
blankreg = re.compile(r'\s*(?P<openline><p(?!\sid=\"softbreak\")[^>]*>)\s*(?P<closeline></p>)', re.IGNORECASE)
|
||||||
#multi_blank = re.compile(r'(\s*<p[^>]*>\s*(<(b|i|u)>)?\s*(</(b|i|u)>)?\s*</p>){2,}', re.IGNORECASE)
|
multi_blank = re.compile(r'(\s*<p[^>]*>\s*</p>){2,}', re.IGNORECASE)
|
||||||
blanklines = blankreg.findall(html)
|
blanklines = blankreg.findall(html)
|
||||||
lines = linereg.findall(html)
|
lines = linereg.findall(html)
|
||||||
blanks_between_paragraphs = False
|
blanks_between_paragraphs = False
|
||||||
@ -303,11 +303,8 @@ class PreProcessor(object):
|
|||||||
if len(lines) > 1:
|
if len(lines) > 1:
|
||||||
self.log("There are " + unicode(len(blanklines)) + " blank lines. " +
|
self.log("There are " + unicode(len(blanklines)) + " blank lines. " +
|
||||||
unicode(float(len(blanklines)) / float(len(lines))) + " percent blank")
|
unicode(float(len(blanklines)) / float(len(lines))) + " percent blank")
|
||||||
if float(len(blanklines)) / float(len(lines)) > 0.40 and getattr(self.extra_opts,
|
|
||||||
'delete_blank_paragraphs', False):
|
if float(len(blanklines)) / float(len(lines)) > 0.40:
|
||||||
self.log("deleting blank lines")
|
|
||||||
html = blankreg.sub('', html)
|
|
||||||
elif float(len(blanklines)) / float(len(lines)) > 0.40:
|
|
||||||
blanks_between_paragraphs = True
|
blanks_between_paragraphs = True
|
||||||
print "blanks between paragraphs is marked True"
|
print "blanks between paragraphs is marked True"
|
||||||
else:
|
else:
|
||||||
@ -319,7 +316,12 @@ class PreProcessor(object):
|
|||||||
|
|
||||||
html = self.markup_chapters(html, totalwords, blanks_between_paragraphs)
|
html = self.markup_chapters(html, totalwords, blanks_between_paragraphs)
|
||||||
|
|
||||||
|
if blanks_between_paragraphs and getattr(self.extra_opts,
|
||||||
|
'delete_blank_paragraphs', False):
|
||||||
|
self.log("deleting blank lines")
|
||||||
|
html = multi_blank.sub('\n<p id="softbreak" style="margin-top:1.5em; margin-bottom:1.5em"> </p>', html)
|
||||||
|
html = blankreg.sub('', html)
|
||||||
|
|
||||||
###### Unwrap lines ######
|
###### Unwrap lines ######
|
||||||
#
|
#
|
||||||
# Some OCR sourced files have line breaks in the html using a combination of span & p tags
|
# Some OCR sourced files have line breaks in the html using a combination of span & p tags
|
||||||
|
Loading…
x
Reference in New Issue
Block a user