font|span|[ibu])[^>]*>)?\s*"
self.line_close = "(</(?P=inner3)>)?\s*(</(?P=inner2)>)?\s*(</(?P=inner1)>)?\s*</(?P=outer)>"
- self.single_blank = re.compile(r'(\s*<p[^>]*>\s*</p>)', re.IGNORECASE)
+ self.single_blank = re.compile(r'(\s*<(p|div)[^>]*>\s*</(p|div)>)', re.IGNORECASE)
self.scene_break_open = ''
self.common_in_text_endings = u'[\"\'—’”,\.!\?\…\)„\w]'
self.common_in_text_beginnings = u'[\w\'\"“‘‛]'
@@ -451,8 +451,8 @@ class HeuristicProcessor(object):
return html
def detect_whitespace(self, html):
- blanks_around_headings = re.compile(r'(?P<initparas>(<p[^>]*>\s*</p>\s*){1,}\s*)?(?P<heading><h(?P<hnum>\d+)[^>]*>.*?</h(?P=hnum)>)(?P<endparas>\s*(<p[^>]*>\s*</p>\s*){1,})?', re.IGNORECASE)
- blanks_n_nopunct = re.compile(r'(?P<initparas>(<p[^>]*>\s*</p>\s*){1,}\s*)?<p[^>]*>\s*(<(span|[ibu]|em|strong|font)[^>]*>\s*)*.{1,100}?[^\W](</(span|[ibu]|em|strong|font)>\s*)*</p>(?P<endparas>\s*(<p[^>]*>\s*</p>\s*){1,})?', re.IGNORECASE)
+ blanks_around_headings = re.compile(r'(?P<initparas>(<(p|div)[^>]*>\s*</(p|div)>\s*){1,}\s*)?(?P<heading><h(?P<hnum>\d+)[^>]*>.*?</h(?P=hnum)>)(?P<endparas>\s*(<(p|div)[^>]*>\s*</(p|div)>\s*){1,})?', re.IGNORECASE|re.DOTALL)
+ blanks_n_nopunct = re.compile(r'(?P<initparas>(<p[^>]*>\s*</p>\s*){1,}\s*)?<p[^>]*>\s*(<(span|[ibu]|em|strong|font)[^>]*>\s*)*.{1,100}?[^\W](</(span|[ibu]|em|strong|font)>\s*)*</p>(?P<endparas>\s*(<p[^>]*>\s*</p>\s*){1,})?', re.IGNORECASE|re.DOTALL)
def merge_header_whitespace(match):
initblanks = match.group('initparas')
@@ -485,6 +485,21 @@ class HeuristicProcessor(object):
return html
def detect_soft_breaks(self, html):
+ line = '(?P<initline>'+self.line_open+'\s*(?P<init_content>.*?)'+self.line_close+')'
+ line_two = '(?P<line_two>'+re.sub('(ou|in|cha)', 'linetwo_', self.line_open)+'\s*(?P<line_two_content>.*?)'+re.sub('(ou|in|cha)', 'linetwo_', self.line_close)+')'
+ div_break_candidate_pattern = line+'\s*<div[^>]*>\s*</div>\s*'+line_two
+ div_break_candidate = re.compile(r'%s' % div_break_candidate_pattern, re.IGNORECASE|re.UNICODE)
+
+ def convert_div_softbreaks(match):
+ init_is_paragraph = self.check_paragraph(match.group('init_content'))
+ line_two_is_paragraph = self.check_paragraph(match.group('line_two_content'))
+ if init_is_paragraph and line_two_is_paragraph:
+ return match.group('initline')+'\n
\n'+match.group('line_two')
+ else:
+ return match.group(0)
+
+ html = div_break_candidate.sub(convert_div_softbreaks, html)
+
if not self.blanks_deleted and self.blanks_between_paragraphs:
html = self.multi_blank.sub('\n
', html)
else:
@@ -523,6 +538,14 @@ class HeuristicProcessor(object):
return scene_break
+ def check_paragraph(self, content):
+ content = re.sub('\s*</?span[^>]*>\s*', '', content)
+ if re.match('.*[\"\'.!?:]$', content):
+ #print "detected this as a paragraph"
+ return True
+ else:
+ return False
+
def abbyy_processor(self, html):
abbyy_line = re.compile('((?P[^\"]*?);?">)(?P.*?)(?P
)|(?P
]*>))', re.IGNORECASE)
empty_paragraph = '\n
\n'
@@ -530,14 +553,6 @@ class HeuristicProcessor(object):
self.previous_was_paragraph = False
html = re.sub('</?a[^>]*>', '', html)
- def check_paragraph(content):
- content = re.sub('\s*</?span[^>]*>\s*', '', content)
- if re.match('.*[\"\'.!?:]$', content):
- #print "detected this as a paragraph"
- return True
- else:
- return False
-
def convert_styles(match):
#print "raw styles are: "+match.group('styles')
content = match.group('content')
@@ -565,7 +580,7 @@ class HeuristicProcessor(object):
return blockquote_close_loop+'\n'+image+'\n'
else:
styles = match.group('styles').split(';')
- is_paragraph = check_paragraph(content)
+ is_paragraph = self.check_paragraph(content)
#print "styles for this line are: "+str(styles)
split_styles = []
for style in styles: