From 7485c9a5e200c20285ead3795025781f4d9ef31e Mon Sep 17 00:00:00 2001 From: ldolse Date: Mon, 24 Jan 2011 16:30:29 +0800 Subject: [PATCH] fixed sigil integration to strip html from chapter titles, fixed softbreak handling and enabled integration with extra_css --- src/calibre/ebooks/conversion/utils.py | 16 ++++++++++------ 1 file changed, 10 insertions(+), 6 deletions(-) diff --git a/src/calibre/ebooks/conversion/utils.py b/src/calibre/ebooks/conversion/utils.py index d9350e6adb..f6e259b6f9 100644 --- a/src/calibre/ebooks/conversion/utils.py +++ b/src/calibre/ebooks/conversion/utils.py @@ -25,13 +25,15 @@ class HeuristicProcessor(object): self.chapters_with_title = 0 self.blanks_deleted = False self.linereg = re.compile('(?<=)', re.IGNORECASE|re.DOTALL) - self.blankreg = re.compile(r'\s*(?P]*>)\s*(?P

)', re.IGNORECASE) + self.blankreg = re.compile(r'\s*(?P]*>)\s*(?P

)', re.IGNORECASE) + self.softbreak = re.compile(r'\s*(?P]*>)\s*(?P

)', re.IGNORECASE) self.multi_blank = re.compile(r'(\s*]*>\s*

){2,}', re.IGNORECASE) def is_pdftohtml(self, src): return '' in src[:1000] def chapter_head(self, match): + from calibre.utils.html2text import html2text chap = match.group('chap') title = match.group('title') if not title: @@ -40,10 +42,12 @@ class HeuristicProcessor(object): " chapters. - " + unicode(chap)) return '

'+chap+'

\n' else: + txt_chap = html2text(chap) + txt_title = html2text(title) self.html_preprocess_sections = self.html_preprocess_sections + 1 self.log.debug("marked " + unicode(self.html_preprocess_sections) + " chapters & titles. - " + unicode(chap) + ", " + unicode(title)) - return '

'+chap+'

\n

'+title+'

\n' + return '

'+chap+'

\n

'+title+'

\n' def chapter_break(self, match): chap = match.group('section') @@ -469,7 +473,7 @@ class HeuristicProcessor(object): if blanks_between_paragraphs and getattr(self.extra_opts, 'delete_blank_paragraphs', False): self.log.debug("deleting blank lines") self.blanks_deleted = True - html = self.multi_blank.sub('\n

', html) + html = self.multi_blank.sub('\n

', html) html = self.blankreg.sub('', html) # Determine line ending type @@ -524,11 +528,11 @@ class HeuristicProcessor(object): # Center separator lines html = re.sub(u'<(?Pp|div)[^>]*>\s*(<(?Pfont|span|[ibu])[^>]*>)?\s*(<(?Pfont|span|[ibu])[^>]*>)?\s*(<(?Pfont|span|[ibu])[^>]*>)?\s*(?P([*#•=✦]+\s*)+)\s*()?\s*()?\s*()?\s*', '

' + '\g' + '

', html) if not self.blanks_deleted: - html = self.multi_blank.sub('\n

', html) - html = re.sub(']*>\s*

', '

', html) + html = self.multi_blank.sub('\n

', html) + html = re.sub(']*>\s*

', '

', html) if self.deleted_nbsps: # put back non-breaking spaces in empty paragraphs to preserve original formatting html = self.blankreg.sub('\n'+r'\g'+u'\u00a0'+r'\g', html) - + html = self.softbreak.sub('\n'+r'\g'+u'\u00a0'+r'\g', html) return html