diff --git a/src/calibre/ebooks/conversion/utils.py b/src/calibre/ebooks/conversion/utils.py index 7488df4609..9962335da3 100644 --- a/src/calibre/ebooks/conversion/utils.py +++ b/src/calibre/ebooks/conversion/utils.py @@ -451,27 +451,33 @@ class HeuristicProcessor(object): return html def detect_whitespace(self, html): - blanks_around_headings = re.compile(r'(?P(<(p|div)[^>]*>\s*\s*){1,}\s*)?(?P\d+)[^>]*>.*?)(?P\s*(<(p|div)[^>]*>\s*\s*){1,})?', re.IGNORECASE|re.DOTALL) + blanks_around_headings = re.compile(r'(?P(<(p|div)[^>]*>\s*\s*){1,}\s*)?(?P\d+)[^>]*>.*?)(?P\s*(<(p|div)[^>]*>\s*\s*){1,})?', re.IGNORECASE|re.DOTALL) + blanks_around_scene_breaks = re.compile(r'(?P(<(p|div)[^>]*>\s*\s*){1,}\s*)?(?P

]*>.*?

)(?P\s*(<(p|div)[^>]*>\s*\s*){1,})?', re.IGNORECASE|re.DOTALL) blanks_n_nopunct = re.compile(r'(?P(]*>\s*

\s*){1,}\s*)?]*>\s*(<(span|[ibu]|em|strong|font)[^>]*>\s*)*.{1,100}?[^\W](\s*)*

(?P\s*(]*>\s*

\s*){1,})?', re.IGNORECASE|re.DOTALL) def merge_header_whitespace(match): initblanks = match.group('initparas') - endblanks = match.group('initparas') - heading = match.group('heading') + endblanks = match.group('endparas') + content = match.group('content') top_margin = '' bottom_margin = '' if initblanks is not None: + print "initial blanks are:\n"+initblanks top_margin = 'margin-top:'+str(len(self.single_blank.findall(initblanks)))+'em;' if endblanks is not None: - bottom_margin = 'margin-bottom:'+str(len(self.single_blank.findall(initblanks)))+'em;' + print "endblanks blanks are:\n"+endblanks + bottom_margin = 'margin-bottom:'+str(len(self.single_blank.findall(endblanks)))+'em;' if initblanks == None and endblanks == None: - return heading + return content + elif content.find('scenebreak') != -1: + return content else: - heading = re.sub('(?i)\d+)[^>]*>', '\n\n'+' style="'+top_margin+bottom_margin+'">', heading) - return heading + content = re.sub('(?i)\d+)[^>]*>', '\n\n'+' style="'+top_margin+bottom_margin+'">', content) + return content html = blanks_around_headings.sub(merge_header_whitespace, html) + html = blanks_around_scene_breaks.sub(merge_header_whitespace, html) def markup_whitespaces(match): blanks = match.group(0) @@ -506,6 +512,12 @@ class HeuristicProcessor(object): html = self.blankreg.sub('\n

', html) return html + def detect_scene_breaks(self, html): + scene_break_regex = self.line_open+'(?!('+self.common_in_text_beginnings+'|.*?'+self.common_in_text_endings+'<))(?P((?P((?!\s)\W))\s*(?P=break_char)?)+)\s*'+self.line_close + scene_breaks = re.compile(r'%s' % scene_break_regex, re.IGNORECASE|re.UNICODE) + html = scene_breaks.sub(self.scene_break_open+'\g'+'

', html) + return html + def markup_user_break(self, replacement_break): ''' Takes string a user supplies and wraps it in markup that will be centered with @@ -765,25 +777,25 @@ class HeuristicProcessor(object): # If non-blank scene breaks exist they are center aligned and styled with appropriate margins. if getattr(self.extra_opts, 'format_scene_breaks', False): html = re.sub('(?i)]*>\s*\s*', '

', html) + html = self.detect_scene_breaks(html) html = self.detect_whitespace(html) html = self.detect_soft_breaks(html) blanks_count = len(self.any_multi_blank.findall(html)) if blanks_count >= 1: html = self.merge_blanks(html, blanks_count) - scene_break_regex = self.line_open+'(?!('+self.common_in_text_beginnings+'|.*?'+self.common_in_text_endings+'<))(?P((?P((?!\s)\W))\s*(?P=break_char)?)+)\s*'+self.line_close - scene_break = re.compile(r'%s' % scene_break_regex, re.IGNORECASE|re.UNICODE) + detected_scene_break = re.compile(r'

]*>.*?

') + scene_break_count = len(detected_scene_break.findall(html)) # If the user has enabled scene break replacement, then either softbreaks # or 'hard' scene breaks are replaced, depending on which is in use # Otherwise separator lines are centered, use a bit larger margin in this case replacement_break = getattr(self.extra_opts, 'replace_scene_breaks', None) if replacement_break: replacement_break = self.markup_user_break(replacement_break) - if len(scene_break.findall(html)) >= 1: - html = scene_break.sub(replacement_break, html) + if scene_break_count >= 1: + html = detected_scene_break.sub(replacement_break, html) + html = re.sub(']*>\s*

', replacement_break, html) else: html = re.sub(']*>\s*

', replacement_break, html) - else: - html = scene_break.sub(self.scene_break_open+'\g'+'

', html) if self.deleted_nbsps: # put back non-breaking spaces in empty paragraphs so they render correctly