From cdb0c9afe286bdb48b15ed1c87664243ec9e49c4 Mon Sep 17 00:00:00 2001 From: Lee Date: Tue, 22 Feb 2011 10:56:01 +0800 Subject: [PATCH] handle soft breaks as defined in some mobi files --- src/calibre/ebooks/conversion/utils.py | 39 ++++++++++++++++++-------- 1 file changed, 27 insertions(+), 12 deletions(-) diff --git a/src/calibre/ebooks/conversion/utils.py b/src/calibre/ebooks/conversion/utils.py index 3559f13440..52fc537e3c 100644 --- a/src/calibre/ebooks/conversion/utils.py +++ b/src/calibre/ebooks/conversion/utils.py @@ -33,7 +33,7 @@ class HeuristicProcessor(object): self.any_multi_blank = re.compile(r'(\s*]*>\s*

(\s*]*>\s*\s*)*){2,}', re.IGNORECASE) self.line_open = "<(?Pp|div)[^>]*>\s*(<(?Pfont|span|[ibu])[^>]*>)?\s*(<(?Pfont|span|[ibu])[^>]*>)?\s*(<(?Pfont|span|[ibu])[^>]*>)?\s*" self.line_close = "()?\s*()?\s*()?\s*" - self.single_blank = re.compile(r'(\s*]*>\s*

)', re.IGNORECASE) + self.single_blank = re.compile(r'(\s*<(p|div)[^>]*>\s*)', re.IGNORECASE) self.scene_break_open = '

' self.common_in_text_endings = u'[\"\'—’”,\.!\?\…\)„\w]' self.common_in_text_beginnings = u'[\w\'\"“‘‛]' @@ -451,8 +451,8 @@ class HeuristicProcessor(object): return html def detect_whitespace(self, html): - blanks_around_headings = re.compile(r'(?P(]*>\s*

\s*){1,}\s*)?(?P\d+)[^>]*>.*?)(?P\s*(]*>\s*

\s*){1,})?', re.IGNORECASE) - blanks_n_nopunct = re.compile(r'(?P(]*>\s*

\s*){1,}\s*)?]*>\s*(<(span|[ibu]|em|strong|font)[^>]*>\s*)*.{1,100}?[^\W](\s*)*

(?P\s*(]*>\s*

\s*){1,})?', re.IGNORECASE) + blanks_around_headings = re.compile(r'(?P(<(p|div)[^>]*>\s*\s*){1,}\s*)?(?P\d+)[^>]*>.*?)(?P\s*(<(p|div)[^>]*>\s*\s*){1,})?', re.IGNORECASE|re.DOTALL) + blanks_n_nopunct = re.compile(r'(?P(]*>\s*

\s*){1,}\s*)?]*>\s*(<(span|[ibu]|em|strong|font)[^>]*>\s*)*.{1,100}?[^\W](\s*)*

(?P\s*(]*>\s*

\s*){1,})?', re.IGNORECASE|re.DOTALL) def merge_header_whitespace(match): initblanks = match.group('initparas') @@ -485,6 +485,21 @@ class HeuristicProcessor(object): return html def detect_soft_breaks(self, html): + line = '(?P'+self.line_open+'\s*(?P.*?)'+self.line_close+')' + line_two = '(?P'+re.sub('(ou|in|cha)', 'linetwo_', self.line_open)+'\s*(?P.*?)'+re.sub('(ou|in|cha)', 'linetwo_', self.line_close)+')' + div_break_candidate_pattern = line+'\s*]*>\s*\s*'+line_two + div_break_candidate = re.compile(r'%s' % div_break_candidate_pattern, re.IGNORECASE|re.UNICODE) + + def convert_div_softbreaks(match): + init_is_paragraph = self.check_paragraph(match.group('init_content')) + line_two_is_paragraph = self.check_paragraph(match.group('line_two_content')) + if init_is_paragraph and line_two_is_paragraph: + return match.group('initline')+'\n

\n'+match.group('line_two') + else: + return match.group(0) + + html = div_break_candidate.sub(convert_div_softbreaks, html) + if not self.blanks_deleted and self.blanks_between_paragraphs: html = self.multi_blank.sub('\n

', html) else: @@ -523,6 +538,14 @@ class HeuristicProcessor(object): return scene_break + def check_paragraph(self, content): + content = re.sub('\s*]*>\s*', '', content) + if re.match('.*[\"\'.!?:]$', content): + #print "detected this as a paragraph" + return True + else: + return False + def abbyy_processor(self, html): abbyy_line = re.compile('((?P[^\"]*?);?">)(?P.*?)(?P

)|(?P]*>))', re.IGNORECASE) empty_paragraph = '\n

\n' @@ -530,14 +553,6 @@ class HeuristicProcessor(object): self.previous_was_paragraph = False html = re.sub(']*>', '', html) - def check_paragraph(content): - content = re.sub('\s*]*>\s*', '', content) - if re.match('.*[\"\'.!?:]$', content): - #print "detected this as a paragraph" - return True - else: - return False - def convert_styles(match): #print "raw styles are: "+match.group('styles') content = match.group('content') @@ -565,7 +580,7 @@ class HeuristicProcessor(object): return blockquote_close_loop+'\n'+image+'\n' else: styles = match.group('styles').split(';') - is_paragraph = check_paragraph(content) + is_paragraph = self.check_paragraph(content) #print "styles for this line are: "+str(styles) split_styles = [] for style in styles: