From 6f252bb1050a6a7d66dcad365fb3992088f9fe86 Mon Sep 17 00:00:00 2001
From: ldolse
Date: Sat, 15 Jan 2011 13:34:35 +0800
Subject: [PATCH 1/4] tied all the new heuristics options to preprocess.utils

---
 src/calibre/ebooks/conversion/plumber.py |   2 +-
 src/calibre/ebooks/conversion/utils.py   | 177 ++++++++++++-----------
 2 files changed, 97 insertions(+), 82 deletions(-)

diff --git a/src/calibre/ebooks/conversion/plumber.py b/src/calibre/ebooks/conversion/plumber.py
index a12dbd48e1..48b965f624 100644
--- a/src/calibre/ebooks/conversion/plumber.py
+++ b/src/calibre/ebooks/conversion/plumber.py
@@ -523,7 +523,7 @@ OptionRecommendation(name='delete_blank_paragraphs',
 
 OptionRecommendation(name='format_scene_breaks',
             recommended_value=False, level=OptionRecommendation.LOW,
-            help=_('Detects left aligned scene break markers and center aligns them. '
+            help=_('Left aligned scene break markers are center aligned. '
                 'Replace soft scene breaks that use multiple blank lines with '
                 'horizontal rules.')),
 
diff --git a/src/calibre/ebooks/conversion/utils.py b/src/calibre/ebooks/conversion/utils.py
index 417f3a1e5b..68afc464a0 100644
--- a/src/calibre/ebooks/conversion/utils.py
+++ b/src/calibre/ebooks/conversion/utils.py
@@ -18,6 +18,11 @@ class PreProcessor(object):
         self.html_preprocess_sections = 0
         self.found_indents = 0
         self.extra_opts = extra_opts
+        self.deleted_nbsps = False
+        self.min_chapters = 1
+        self.linereg = re.compile('(?<=<p).*?(?=</p>)', re.IGNORECASE|re.DOTALL)
+        self.blankreg = re.compile(r'\s*(?P<openline><p[^>]*>)\s*(?P<closeline></p>)', re.IGNORECASE)
+        self.multi_blank = re.compile(r'(\s*<p[^>]*>\s*</p>){2,}', re.IGNORECASE)
 
     def is_pdftohtml(self, src):
         return '<!-- created by calibre\'s pdftohtml -->' in src[:1000]
 
@@ -120,7 +125,6 @@ class PreProcessor(object):
         '''
         # Typical chapters are between 2000 and 7000 words, use the larger number to decide the
        # minimum of chapters to search for
-        self.min_chapters = 1
         if wordcount > 7000:
             self.min_chapters = int(ceil(wordcount / 7000.))
         #print "minimum chapters required are: "+str(self.min_chapters)
 
@@ -192,21 +196,28 @@ class PreProcessor(object):
     def punctuation_unwrap(self, length, content, format):
         '''
         Unwraps lines based on line length and punctuation
-        supports range of potential html markup and text files
+        supports a range of html markup and text files
         '''
         # define the pieces of the regex
         lookahead = "(?<=.{"+str(length)+"}([a-zäëïöüàèìòùáćéíóńśúâêîôûçąężıãõñæøþðß,:)\IA\u00DF]|(?<!\&\w{4});))"
-        line_ending = "\s*</(span|p|div)>\s*(</(p|span|div)>)?"
+        em_en_lookahead = "(?<=.{"+str(length)+"}[\u2013\u2014])"
+        line_ending = "\s*</(span|[iubp]|div)>\s*(</(p|span|div)>)?"
         blanklines = "\s*(?P<up2threeblanks><(p|span|div)[^>]*>\s*(<(p|span|div)[^>]*>\s*</(span|p|div)>\s*)</(span|p|div)>\s*){0,3}\s*"
-        line_opening = "<(span|div|p)[^>]*>\s*(<(span|div|p)[^>]*>)?\s*"
+        line_opening = "<(span|[iubp]|div)[^>]*>\s*(<(span|[iubp]|div)[^>]*>)?\s*"
         txt_line_wrap = u"((\u0020|\u0009)*\n){1,4}"
 
         unwrap_regex = lookahead+line_ending+blanklines+line_opening
+        em_en_unwrap_regex = em_en_lookahead+line_ending+blanklines+line_opening
+
         if format == 'txt':
             unwrap_regex = lookahead+txt_line_wrap
+            em_en_unwrap_regex = em_en_lookahead+txt_line_wrap
 
         unwrap = re.compile(u"%s" % unwrap_regex, re.UNICODE)
+        em_en_unwrap = re.compile(u"%s" % em_en_unwrap_regex, re.UNICODE)
+
         content = unwrap.sub(' ', content)
+        content = em_en_unwrap.sub('', content)
         return content
 
@@ -253,8 +264,38 @@ class PreProcessor(object):
         html = re.sub(r"\s*<span[^>]*>\s*(<span[^>]*>\s*</span>\s*){0,2}\s*</span>\s*", " ", html)
         html = re.sub(r"\s*<[ibu][^>]*>\s*(<[ibu][^>]*>\s*</[ibu]>\s*){0,2}\s*</[ibu]>", " ", html)
         html = re.sub(r"\s*<span[^>]*>\s*(<span[^>]>\s*){0,2}\s*</span>\s*", " ", html)
+        self.deleted_nbsps = True
         return html
 
+    def analyze_line_endings(self, html):
+        '''
+        determines the type of html line ending used most commonly in a document
+        use before calling docanalysis functions
+        '''
+        paras_reg = re.compile('<p[^>]*>', re.IGNORECASE)
+        spans_reg = re.compile('<span[^>]*>', re.IGNORECASE)
+        paras = len(paras_reg.findall(html))
+        spans = len(spans_reg.findall(html))
+        if spans > 1:
+            if float(paras) / float(spans) < 0.75:
+                return 'spanned_html'
+            else:
+                return 'html'
+        else:
+            return 'html'
+
+    def analyze_blanks(self, html):
+        blanklines = self.blankreg.findall(html)
+        lines = self.linereg.findall(html)
+        if len(lines) > 1:
+            self.log("There are " + unicode(len(blanklines)) + " blank lines. " +
+                    unicode(float(len(blanklines)) / float(len(lines))) + " percent blank")
+
+            if float(len(blanklines)) / float(len(lines)) > 0.40:
+                return True
+            else:
+                return False
+
     def __call__(self, html):
         self.log("********* Preprocessing HTML *********")
 
         # check if content is in pre tags, use txt processor to mark up if so
         html = self.text_process_pre(html)
 
-        ###### Mark Indents/Cleanup ######
-        #
         # Replace series of non-breaking spaces with text-indent
-        html = self.fix_nbsp_indents(html)
+        if getattr(self.extra_opts, 'fix_indents', True):
+            html = self.fix_nbsp_indents(html)
 
         html = self.cleanup_markup(html)
 
         # ADE doesn't render <br>, change to empty paragraphs
         #html = re.sub('<br[^>]*>', u'<p>\u00a0</p>', html)
 
-        # If more than 40% of the lines are empty paragraphs and the user has enabled delete
-        # blank paragraphs then delete blank lines to clean up spacing
-        linereg = re.compile('(?<=<p).*?(?=</p>)', re.IGNORECASE|re.DOTALL)
-        blankreg = re.compile(r'\s*(?P<openline><p[^>]*>)\s*(?P<closeline></p>)', re.IGNORECASE)
-        multi_blank = re.compile(r'(\s*<p[^>]*>\s*</p>){2,}', re.IGNORECASE)
-        blanklines = blankreg.findall(html)
-        lines = linereg.findall(html)
-        blanks_between_paragraphs = False
-        print "delete blank paragraphs is "+str(getattr(self.extra_opts, 'delete_blank_paragraphs', False))
-        if len(lines) > 1:
-            self.log("There are " + unicode(len(blanklines)) + " blank lines. " +
-                    unicode(float(len(blanklines)) / float(len(lines))) + " percent blank")
-
-            if float(len(blanklines)) / float(len(lines)) > 0.40:
-                blanks_between_paragraphs = True
-                print "blanks between paragraphs is marked True"
-            else:
-                blanks_between_paragraphs = False
+        # Determine whether the document uses interleaved blank lines
+        blanks_between_paragraphs = self.analyze_blanks(html)
 
         #self.dump(html, 'before_chapter_markup')
 
         # detect chapters/sections to match xpath or splitting logic
-        #
-        html = self.markup_chapters(html, totalwords, blanks_between_paragraphs)
+        if getattr(self.extra_opts, 'markup_chapter_headings', True):
+            html = self.markup_chapters(html, totalwords, blanks_between_paragraphs)
 
+        # If more than 40% of the lines are empty paragraphs and the user has enabled delete
+        # blank paragraphs then delete blank lines to clean up spacing
         if blanks_between_paragraphs and getattr(self.extra_opts, 'delete_blank_paragraphs', False):
             self.log("deleting blank lines")
-            html = multi_blank.sub('\n<p> </p>', html)
-            html = blankreg.sub('', html)
+            html = self.multi_blank.sub('\n<p> </p>', html)
+            html = self.blankreg.sub('', html)
 
         ###### Unwrap lines ######
-        #
-        # Some OCR sourced files have line breaks in the html using a combination of span & p tags
-        # span are used for hard line breaks, p for new paragraphs.  Determine which is used so
-        # that lines can be un-wrapped across page boundaries
-        paras_reg = re.compile('<p[^>]*>', re.IGNORECASE)
-        spans_reg = re.compile('<span[^>]*>', re.IGNORECASE)
-        paras = len(paras_reg.findall(html))
-        spans = len(spans_reg.findall(html))
-        if spans > 1:
-            if float(paras) / float(spans) < 0.75:
-                format = 'spanned_html'
-            else:
-                format = 'html'
-        else:
-            format = 'html'
-        # Check Line histogram to determine if the document uses hard line breaks, If 50% or
-        # more of the lines break in the same region of the document then unwrapping is required
-        docanalysis = DocAnalysis(format, html)
-        hardbreaks = docanalysis.line_histogram(.50)
-        self.log("Hard line breaks check returned "+unicode(hardbreaks))
-        # Calculate Length
-        unwrap_factor = getattr(self.extra_opts, 'html_unwrap_factor', 0.4)
-        length = docanalysis.line_length(unwrap_factor)
-        self.log("Median line length is " + unicode(length) + ", calculated with " + format + " format")
-        # only go through unwrapping code if the histogram shows unwrapping is required or if the user decreased the default unwrap_factor
-        if hardbreaks or unwrap_factor < 0.4:
-            self.log("Unwrapping required, unwrapping Lines")
-            # Unwrap em/en dashes
-            html = re.sub(u'(?<=.{%i}[\u2013\u2014])\s*(?=<)(</span>\s*(</[iubp]>\s*<[iubp][^>]*>\s*)?<span[^>]*>|</[iubp]>\s*<[iubp][^>]*>)?\s*(?=[[a-z\d])' % length, '', html)
-            # Dehyphenate
-            self.log("Unwrapping/Removing hyphens")
-            dehyphenator = Dehyphenator()
-            html = dehyphenator(html,'html', length)
-            self.log("Done dehyphenating")
-            # Unwrap lines using punctation and line length
-            #unwrap_quotes = re.compile(u"(?<=.{%i}\"')\s*</(span|p|div)>\s*(</(p|span|div)>)?\s*(?P<up2threeblanks><(p|span|div)[^>]*>\s*(<(p|span|div)[^>]*>\s*</(span|p|div)>\s*)</(span|p|div)>\s*){0,3}\s*<(span|div|p)[^>]*>\s*(<(span|div|p)[^>]*>)?\s*(?=[a-z])" % length, re.UNICODE)
-            html = self.punctuation_unwrap(length, html, 'html')
-            #check any remaining hyphens, but only unwrap if there is a match
-            dehyphenator = Dehyphenator()
-            html = dehyphenator(html,'html_cleanup', length)
-        else:
-            # dehyphenate in cleanup mode to fix anything previous conversions/editing missed
-            self.log("Cleaning up hyphenation")
-            dehyphenator = Dehyphenator()
-            html = dehyphenator(html,'html_cleanup', length)
-            self.log("Done dehyphenating")
+        if getattr(self.extra_opts, 'unwrap_lines', True):
+            # Determine line ending type
+            # Some OCR sourced files have line breaks in the html using a combination of span & p tags
+            # span are used for hard line breaks, p for new paragraphs. Determine which is used so
+            # that lines can be un-wrapped across page boundaries
+            format = self.analyze_line_endings(html)
 
-        # delete soft hyphens
-        html = re.sub(u'\xad\s*(</span>\s*(</[iubp]>\s*<[iubp][^>]*>\s*)?<span[^>]*>|</[iubp]>\s*<[iubp][^>]*>)?\s*', '', html)
+            # Check Line histogram to determine if the document uses hard line breaks, If 50% or
+            # more of the lines break in the same region of the document then unwrapping is required
+            docanalysis = DocAnalysis(format, html)
+            hardbreaks = docanalysis.line_histogram(.50)
+            self.log("Hard line breaks check returned "+unicode(hardbreaks))
+
+            # Calculate Length
+            unwrap_factor = getattr(self.extra_opts, 'html_unwrap_factor', 0.4)
+            length = docanalysis.line_length(unwrap_factor)
+            self.log("Median line length is " + unicode(length) + ", calculated with " + format + " format")
+
+            # only go through unwrapping code if the histogram shows unwrapping is required or if the user decreased the default unwrap_factor
+            if hardbreaks or unwrap_factor < 0.4:
+                self.log("Unwrapping required, unwrapping Lines")
+                # Dehyphenate with line length limiters
+                dehyphenator = Dehyphenator()
+                html = dehyphenator(html,'html', length)
+                html = self.punctuation_unwrap(length, html, 'html')
+                #check any remaining hyphens, but only unwrap if there is a match
+                dehyphenator = Dehyphenator()
+                html = dehyphenator(html,'html_cleanup', length)
+
+        if getattr(self.extra_opts, 'dehyphenate', True):
+            # dehyphenate in cleanup mode to fix anything previous conversions/editing missed
+            self.log("Fixing hyphenated content")
+            dehyphenator = Dehyphenator()
+            html = dehyphenator(html,'html_cleanup', length)
+            # delete soft hyphens
+            html = re.sub(u'\xad\s*(</span>\s*(</[iubp]>\s*<[iubp][^>]*>\s*)?<span[^>]*>|</[iubp]>\s*<[iubp][^>]*>)?\s*', '', html)
 
         # If still no sections after unwrapping mark split points on lines with no punctuation
         if self.html_preprocess_sections < self.min_chapters:
             self.log("Looking for more split points based on punctuation,"
                     " currently have " + unicode(self.html_preprocess_sections))
             chapdetect3 = re.compile(r'<(?P<styles>(p|div)[^>]*)>\s*(?P<section>(<span[^>]*>)?\s*(?!([*#•]+\s*)+)(<[ibu][^>]*>){0,2}\s*(<span[^>]*>)?\s*(<[ibu][^>]*>){0,2}\s*(<span[^>]*>)?\s*.?(?=[a-z#\-*\s]+<)([a-z#-*]+\s*){1,5}\s*\s*(</span>)?(</[ibu]>){0,2}\s*(</span>)?\s*(</[ibu]>){0,2}\s*(</span>)?\s*</(p|div)>)', re.IGNORECASE)
             html = chapdetect3.sub(self.chapter_break, html)
 
         # search for places where a first or second level heading is immediately followed by another
         # top level heading. demote the second heading to h3 to prevent splitting between chapter
         # headings and titles, images, etc
         doubleheading = re.compile(r'(?P<firsthead><h(1|2)[^>]*>.+?</h(1|2)>\s*(<(?!h\d)[^>]*>\s*)*)<h(1|2)(?P<secondhead>[^>]*>.+?)</h(1|2)>', re.IGNORECASE)
         html = doubleheading.sub('\g<firsthead>'+'\n<h3'+'\g<secondhead>'+'</h3>', html)
 
-        # put back non-breaking spaces in empty paragraphs to preserve original formatting
-        html = blankreg.sub('\n'+r'\g<openline>'+u'\u00a0'+r'\g<closeline>', html)
+        if getattr(self.extra_opts, 'dehyphenate', True):
+            # Center separator lines
+            html = re.sub(u'<(?P<outer>p|div)[^>]*>\s*(<(?P<inner1>font|span|[ibu])[^>]*>)?\s*(<(?P<inner2>font|span|[ibu])[^>]*>)?\s*(<(?P<inner3>font|span|[ibu])[^>]*>)?\s*(?P<break>([*#•]+\s*)+)\s*(</(?P=inner3)>)?\s*(</(?P=inner2)>)?\s*(</(?P=inner1)>)?\s*</(?P=outer)>', '<p>' + '\g<break>' + '</p>', html)
 
-        # Center separator lines
-        html = re.sub(u'<(?P<outer>p|div)[^>]*>\s*(<(?P<inner1>font|span|[ibu])[^>]*>)?\s*(<(?P<inner2>font|span|[ibu])[^>]*>)?\s*(<(?P<inner3>font|span|[ibu])[^>]*>)?\s*(?P<break>([*#•]+\s*)+)\s*(</(?P=inner3)>)?\s*(</(?P=inner2)>)?\s*(</(?P=inner1)>)?\s*</(?P=outer)>', '<p>' + '\g<break>' + '</p>', html)
+        if self.deleted_nbsps:
+            # put back non-breaking spaces in empty paragraphs to preserve original formatting
+            html = self.blankreg.sub('\n'+r'\g<openline>'+u'\u00a0'+r'\g<closeline>', html)
 
         return html
 

From e581c8c5dedf0e68fa5c3ca5a06d660546e9996c Mon Sep 17 00:00:00 2001
From: ldolse
Date: Sat, 15 Jan 2011 15:40:59 +0800
Subject: [PATCH 2/4] created sub-functions for text processing, added soft hyphens to punctuation unwrap

---
 src/calibre/ebooks/conversion/utils.py | 43 +++++++++++++++-----------
 1 file changed, 25 insertions(+), 18 deletions(-)

diff --git a/src/calibre/ebooks/conversion/utils.py b/src/calibre/ebooks/conversion/utils.py
index 68afc464a0..99685e90d1 100644
--- a/src/calibre/ebooks/conversion/utils.py
+++ b/src/calibre/ebooks/conversion/utils.py
@@ -19,6 +19,7 @@ class PreProcessor(object):
         self.found_indents = 0
         self.extra_opts = extra_opts
         self.deleted_nbsps = False
+        self.totalwords = 0
         self.min_chapters = 1
         self.linereg = re.compile('(?<=<p).*?(?=</p>)', re.IGNORECASE|re.DOTALL)
         self.blankreg = re.compile(r'\s*(?P<openline><p[^>]*>)\s*(?P<closeline></p>)', re.IGNORECASE)
         self.multi_blank = re.compile(r'(\s*<p[^>]*>\s*</p>){2,}', re.IGNORECASE)
 
@@ -201,6 +202,7 @@ class PreProcessor(object):
         # define the pieces of the regex
         lookahead = "(?<=.{"+str(length)+"}([a-zäëïöüàèìòùáćéíóńśúâêîôûçąężıãõñæøþðß,:)\IA\u00DF]|(?<!\&\w{4});))"
         em_en_lookahead = "(?<=.{"+str(length)+"}[\u2013\u2014])"
+        soft_hyphen = u"((\u00ad)+)"
         line_ending = "\s*</(span|[iubp]|div)>\s*(</(p|span|div)>)?"
         blanklines = "\s*(?P<up2threeblanks><(p|span|div)[^>]*>\s*(<(p|span|div)[^>]*>\s*</(span|p|div)>\s*)</(span|p|div)>\s*){0,3}\s*"
         line_opening = "<(span|[iubp]|div)[^>]*>\s*(<(span|[iubp]|div)[^>]*>)?\s*"
@@ -208,10 +210,12 @@ class PreProcessor(object):
         unwrap_regex = lookahead+line_ending+blanklines+line_opening
         em_en_unwrap_regex = em_en_lookahead+line_ending+blanklines+line_opening
+        shy_unwrap_regex = soft_hyphen+line_ending+blanklines+line_opening
 
         if format == 'txt':
             unwrap_regex = lookahead+txt_line_wrap
             em_en_unwrap_regex = em_en_lookahead+txt_line_wrap
+            shy_unwrap_regex = soft_hyphen+txt_line_wrap
 
         unwrap = re.compile(u"%s" % unwrap_regex, re.UNICODE)
         em_en_unwrap = re.compile(u"%s" % em_en_unwrap_regex, re.UNICODE)
 
@@ -220,18 +224,21 @@ class PreProcessor(object):
         content = em_en_unwrap.sub('', content)
         return content
 
+    def txt_process(self, match):
+        from calibre.ebooks.txt.processor import convert_basic, preserve_spaces, \
+        separate_paragraphs_single_line
+        content = match.group('text')
+        content = separate_paragraphs_single_line(content)
+        content = preserve_spaces(content)
+        content = convert_basic(content, epub_split_size_kb=0)
+        return content
 
-    def text_process_pre(self, html):
+    def markup_pre(self, html):
         pre = re.compile(r'<pre>', re.IGNORECASE)
-        if len(pre.findall(html)) == 1:
+        if len(pre.findall(html)) >= 1:
             self.log("Running Text Processing")
-            from calibre.ebooks.txt.processor import convert_basic, preserve_spaces, \
-            separate_paragraphs_single_line
             outerhtml = re.compile(r'.*?(?<=<pre>)(?P<text>.*)(?=</pre>).*', re.IGNORECASE|re.DOTALL)
-            html = outerhtml.sub('\g<text>', html)
-            html = separate_paragraphs_single_line(html)
-            html = preserve_spaces(html)
-            html = convert_basic(html, epub_split_size_kb=0)
+            html = outerhtml.sub(self.txt_process, html)
         else:
             # Add markup naively
             # TODO - find out if there are cases where there are more than one <pre> tag or
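
The rewrite above swaps whole-document string surgery for a substitution callback: outerhtml captures everything between the pre tags and txt_process converts only that capture. Below is a minimal standalone sketch of the same callback pattern, in Python 3 rather than the Python 2 of these patches; fake_txt_process is an invented stand-in for calibre's txt processor, not the real API:

    import re

    # Stand-in for calibre's txt_process(); wraps blank-line-separated blocks
    # in <p> tags so the substitution below has something visible to do.
    def fake_txt_process(match):
        text = match.group('text')
        blocks = [b.strip() for b in text.split('\n\n') if b.strip()]
        return ''.join('<p>%s</p>' % b for b in blocks)

    # Same shape as the outerhtml pattern above: capture everything between
    # <pre> and </pre>, and hand only that capture to the processor.
    outer = re.compile(r'.*?(?<=<pre>)(?P<text>.*)(?=</pre>).*', re.IGNORECASE | re.DOTALL)

    html = '<html><body><pre>Line one.\n\nLine two.</pre></body></html>'
    print(outer.sub(fake_txt_process, html))  # -> <p>Line one.</p><p>Line two.</p>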
@@ -302,25 +309,26 @@ class PreProcessor(object):
 
         # Count the words in the document to estimate how many chapters to look for and whether
         # other types of processing are attempted
-        totalwords = 0
-        totalwords = self.get_word_count(html)
+        try:
+            self.totalwords = self.get_word_count(html)
+        except:
+            self.log("Can't get wordcount")
 
-        if totalwords < 50:
+        if 0 < self.totalwords < 50:
             self.log("flow is too short, not running heuristics")
             return html
 
         # Arrange line feeds and </p> tags so the line_length and no_markup functions work correctly
         html = self.arrange_htm_line_endings(html)
 
-        ###### Check Markup ######
         #
         # some lit files don't have any <p> tags or equivalent (generally just plain text between
         # <br> tags), check and mark up line endings if required before proceeding
         if self.no_markup(html, 0.1):
             self.log("not enough paragraph markers, adding now")
-            # check if content is in pre tags, use txt processor to mark up if so
-            html = self.text_process_pre(html)
+            # markup using text processing
+            html = self.markup_pre(html)
 
         # Replace series of non-breaking spaces with text-indent
         if getattr(self.extra_opts, 'fix_indents', True):
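
This hunk is the heart of the series' stated goal of tying every heuristic to an option on extra_opts: each step runs only when its option is enabled, and getattr supplies a default when a conversion pipeline never defined the attribute at all. A self-contained illustration of that gating pattern, with invented toy regexes standing in for the real heuristics (Python 3):

    import re
    from types import SimpleNamespace

    def preprocess(html, opts):
        # Heuristics run only when enabled; getattr supplies the default
        # when a conversion didn't define the option at all.
        if getattr(opts, 'fix_indents', True):
            # toy stand-in for fix_nbsp_indents()
            html = re.sub(r'<p>(?:&nbsp;|\s)+', '<p>', html)
        if getattr(opts, 'delete_blank_paragraphs', False):
            html = re.sub(r'<p>\s*</p>', '', html)
        return html

    opts = SimpleNamespace(delete_blank_paragraphs=True)  # fix_indents unset
    print(preprocess('<p>&nbsp;&nbsp;text</p><p> </p>', opts))  # -> <p>text</p>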
@@ -338,7 +346,7 @@ class PreProcessor(object):
         # detect chapters/sections to match xpath or splitting logic
 
         if getattr(self.extra_opts, 'markup_chapter_headings', True):
-            html = self.markup_chapters(html, totalwords, blanks_between_paragraphs)
+            html = self.markup_chapters(html, self.totalwords, blanks_between_paragraphs)
 
         # If more than 40% of the lines are empty paragraphs and the user has enabled delete
         # blank paragraphs then delete blank lines to clean up spacing
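
The 40% figure referenced above is computed by analyze_blanks from patch 1: when a large share of paragraph blocks are empty, the source evidently used blank paragraphs as spacing, so deleting them is safe. A standalone sketch of that ratio test, with the two regexes modeled on linereg and blankreg and the threshold taken from the patch (Python 3):

    import re

    LINES  = re.compile(r'(?<=<p).*?(?=</p>)', re.IGNORECASE | re.DOTALL)
    BLANKS = re.compile(r'\s*<p[^>]*>\s*</p>', re.IGNORECASE)

    def blanks_between_paragraphs(html, threshold=0.40):
        lines = LINES.findall(html)
        if len(lines) <= 1:
            return False
        return float(len(BLANKS.findall(html))) / len(lines) > threshold

    sample = '<p>one</p><p> </p><p>two</p><p> </p>'
    print(blanks_between_paragraphs(sample))  # True: 2 of 4 paragraphs are empty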
@@ -383,8 +391,6 @@ class PreProcessor(object):
             self.log("Fixing hyphenated content")
             dehyphenator = Dehyphenator()
             html = dehyphenator(html,'html_cleanup', length)
-            # delete soft hyphens
-            html = re.sub(u'\xad\s*(\s*(\s*<[iubp][^>]*>\s*)?]*>|\s*<[iubp][^>]*>)?\s*', '', html)
 
         # If still no sections after unwrapping mark split points on lines with no punctuation
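
The two deleted lines above stripped every soft hyphen in the document in one pass; after patch 2, soft hyphens are instead handled by the shy_unwrap_regex pieces inside punctuation_unwrap, so only a word actually broken across a line boundary is rejoined. A minimal sketch of that narrower behavior, simplified here to a single p-tag boundary rather than the patch's full markup alternation (Python 3):

    import re

    # Join a word split by a soft hyphen (U+00AD) at a paragraph boundary,
    # leaving soft hyphens inside running text alone.
    shy_unwrap = re.compile(u'\u00ad' + r'\s*</p>\s*<p[^>]*>\s*')

    wrapped = u'<p>pre\u00ad</p>\n<p>processing continues</p>'
    print(shy_unwrap.sub('', wrapped))  # -> <p>preprocessing continues</p>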
         if self.html_preprocess_sections < self.min_chapters:
@@ -392,13 +398,14 @@ class PreProcessor(object):
                     " currently have " + unicode(self.html_preprocess_sections))
             chapdetect3 = re.compile(r'<(?P(p|div)[^>]*)>\s*(?P
(]*>)?\s*(?!([*#•]+\s*)+)(<[ibu][^>]*>){0,2}\s*(]*>)?\s*(<[ibu][^>]*>){0,2}\s*(]*>)?\s*.?(?=[a-z#\-*\s]+<)([a-z#-*]+\s*){1,5}\s*\s*()?(){0,2}\s*()?\s*(){0,2}\s*()?\s*)', re.IGNORECASE) html = chapdetect3.sub(self.chapter_break, html) + # search for places where a first or second level heading is immediately followed by another # top level heading. demote the second heading to h3 to prevent splitting between chapter # headings and titles, images, etc doubleheading = re.compile(r'(?P]*>.+?\s*(<(?!h\d)[^>]*>\s*)*)[^>]*>.+?)', re.IGNORECASE) html = doubleheading.sub('\g'+'\n'+'', html) - if getattr(self.extra_opts, 'dehyphenate', True): + if getattr(self.extra_opts, 'format_scene_breaks', True): # Center separator lines html = re.sub(u'<(?Pp|div)[^>]*>\s*(<(?Pfont|span|[ibu])[^>]*>)?\s*(<(?Pfont|span|[ibu])[^>]*>)?\s*(<(?Pfont|span|[ibu])[^>]*>)?\s*(?P([*#•]+\s*)+)\s*()?\s*()?\s*()?\s*', '

' + '\g' + '

', html) From a44d29e840acd0eb14b43093e0a4c178da4a69a6 Mon Sep 17 00:00:00 2001 From: ldolse Date: Sat, 15 Jan 2011 20:13:51 +0800 Subject: [PATCH 3/4] only run cleanup_markup when required, begin markup_chapters rewrite --- src/calibre/ebooks/conversion/utils.py | 35 +++++++++++++++----------- 1 file changed, 21 insertions(+), 14 deletions(-) diff --git a/src/calibre/ebooks/conversion/utils.py b/src/calibre/ebooks/conversion/utils.py index 99685e90d1..ec175061cc 100644 --- a/src/calibre/ebooks/conversion/utils.py +++ b/src/calibre/ebooks/conversion/utils.py @@ -25,6 +25,16 @@ class PreProcessor(object): self.blankreg = re.compile(r'\s*(?P]*>)\s*(?P

)', re.IGNORECASE) self.multi_blank = re.compile(r'(\s*]*>\s*

){2,}', re.IGNORECASE) + self.chapter_types = [ + [r"[^'\"]?(Introduction|Synopsis|Acknowledgements|Chapter|Kapitel|Epilogue|Volume\s|Prologue|Book\s|Part\s|Dedication|Preface)\s*([\d\w-]+\:?\'?\s*){0,5}", True, "Searching for common Chapter Headings", 'common'], + [r"]*>\s*(]*>)?\s*(?!([*#•=]+\s*)+)(\s*(?=[\d.\w#\-*\s]+<)([\d.\w#-*]+\s*){1,5}\s*)(?!\.)()?\s*", True, "Searching for emphasized lines", 'emphasized'], # Emphasized lines + [r"[^'\"]?(\d+(\.|:)|CHAPTER)\s*([\dA-Z\-\'\"#,]+\s*){0,7}\s*", True, "Searching for numeric chapter headings", 'numeric'], # Numeric Chapters + [r"([A-Z]\s+){3,}\s*([\d\w-]+\s*){0,3}\s*", True, "Searching for letter spaced headings", 'letter_spaced'], # Spaced Lettering + [r"[^'\"]?(\d+\.?\s+([\d\w-]+\:?\'?-?\s?){0,5})\s*", True, "Searching for numeric chapters with titles", 'numeric_title'], # Numeric Titles + [r"[^'\"]?(\d+|CHAPTER)\s*([\dA-Z\-\'\"\?!#,]+\s*){0,7}\s*", True, "Searching for simple numeric chapter headings", 'plain_number'], # Numeric Chapters, no dot or colon + [r"\s*[^'\"]?([A-Z#]+(\s|-){0,3}){1,5}\s*", False, "Searching for chapters with Uppercase Characters", 'uppercase' ] # Uppercase Chapters + ] + def is_pdftohtml(self, src): return '' in src[:1000] @@ -163,18 +173,8 @@ class PreProcessor(object): default_title = r"(<[ibu][^>]*>)?\s{0,3}([\w\'’\"-]+\s{0,3}){1,5}?(]*>)?(?=<)" - chapter_types = [ - [r"[^'\"]?(Introduction|Synopsis|Acknowledgements|Chapter|Kapitel|Epilogue|Volume\s|Prologue|Book\s|Part\s|Dedication|Preface)\s*([\d\w-]+\:?\'?\s*){0,5}", True, "Searching for common Chapter Headings"], - [r"]*>\s*(]*>)?\s*(?!([*#•]+\s*)+)(\s*(?=[\d.\w#\-*\s]+<)([\d.\w#-*]+\s*){1,5}\s*)(?!\.)()?\s*", True, "Searching for emphasized lines"], # Emphasized lines - [r"[^'\"]?(\d+(\.|:)|CHAPTER)\s*([\dA-Z\-\'\"#,]+\s*){0,7}\s*", True, "Searching for numeric chapter headings"], # Numeric Chapters - [r"([A-Z]\s+){3,}\s*([\d\w-]+\s*){0,3}\s*", True, "Searching for letter spaced headings"], # Spaced Lettering - [r"[^'\"]?(\d+\.?\s+([\d\w-]+\:?\'?-?\s?){0,5})\s*", True, "Searching for numeric chapters with titles"], # Numeric Titles - [r"[^'\"]?(\d+|CHAPTER)\s*([\dA-Z\-\'\"\?!#,]+\s*){0,7}\s*", True, "Searching for simple numeric chapter headings"], # Numeric Chapters, no dot or colon - [r"\s*[^'\"]?([A-Z#]+(\s|-){0,3}){1,5}\s*", False, "Searching for chapters with Uppercase Characters" ] # Uppercase Chapters - ] - # Start with most typical chapter headings, get more aggressive until one works - for [chapter_type, lookahead_ignorecase, log_message] in chapter_types: + for [chapter_type, lookahead_ignorecase, log_message, type_name] in self.chapter_types: if self.html_preprocess_sections >= self.min_chapters: break full_chapter_line = chapter_line_open+chapter_header_open+chapter_type+chapter_header_close+chapter_line_close @@ -303,6 +303,12 @@ class PreProcessor(object): else: return False + def cleanup_required(self): + for option in ['unwrap_lines', 'markup_chapter_headings', 'format_scene_breaks', 'delete_blank_paragraphs']: + if getattr(self.extra_opts, option, False): + return True + return False + def __call__(self, html): self.log("********* Preprocessing HTML *********") @@ -333,8 +339,9 @@ class PreProcessor(object): # Replace series of non-breaking spaces with text-indent if getattr(self.extra_opts, 'fix_indents', True): html = self.fix_nbsp_indents(html) - - html = self.cleanup_markup(html) + + if self.cleanup_required(): + html = self.cleanup_markup(html) # ADE doesn't render
, change to empty paragraphs #html = re.sub(']*>', u'

\u00a0

', html) @@ -393,7 +400,7 @@ class PreProcessor(object): html = dehyphenator(html,'html_cleanup', length) # If still no sections after unwrapping mark split points on lines with no punctuation - if self.html_preprocess_sections < self.min_chapters: + if self.html_preprocess_sections < self.min_chapters and getattr(self.extra_opts, 'markup_chapter_headings', False): self.log("Looking for more split points based on punctuation," " currently have " + unicode(self.html_preprocess_sections)) chapdetect3 = re.compile(r'<(?P(p|div)[^>]*)>\s*(?P
(]*>)?\s*(?!([*#•]+\s*)+)(<[ibu][^>]*>){0,2}\s*(]*>)?\s*(<[ibu][^>]*>){0,2}\s*(]*>)?\s*.?(?=[a-z#\-*\s]+<)([a-z#-*]+\s*){1,5}\s*\s*()?(){0,2}\s*()?\s*(){0,2}\s*()?\s*)', re.IGNORECASE) From 1301fe69d16e452644944efbd2447f61fd6fe4fb Mon Sep 17 00:00:00 2001 From: ldolse Date: Sat, 15 Jan 2011 20:53:16 +0800 Subject: [PATCH 4/4] started multi-pass chapter analysis --- src/calibre/ebooks/conversion/utils.py | 71 +++++++++++++++++--------- 1 file changed, 47 insertions(+), 24 deletions(-) diff --git a/src/calibre/ebooks/conversion/utils.py b/src/calibre/ebooks/conversion/utils.py index ec175061cc..2a88d371cc 100644 --- a/src/calibre/ebooks/conversion/utils.py +++ b/src/calibre/ebooks/conversion/utils.py @@ -21,20 +21,12 @@ class PreProcessor(object): self.deleted_nbsps = False self.totalwords = 0 self.min_chapters = 1 + self.chapters_no_title = 0 + self.chapters_with_title = 0 self.linereg = re.compile('(?<=)', re.IGNORECASE|re.DOTALL) self.blankreg = re.compile(r'\s*(?P]*>)\s*(?P

)', re.IGNORECASE) self.multi_blank = re.compile(r'(\s*]*>\s*

){2,}', re.IGNORECASE) - self.chapter_types = [ - [r"[^'\"]?(Introduction|Synopsis|Acknowledgements|Chapter|Kapitel|Epilogue|Volume\s|Prologue|Book\s|Part\s|Dedication|Preface)\s*([\d\w-]+\:?\'?\s*){0,5}", True, "Searching for common Chapter Headings", 'common'], - [r"]*>\s*(]*>)?\s*(?!([*#•=]+\s*)+)(\s*(?=[\d.\w#\-*\s]+<)([\d.\w#-*]+\s*){1,5}\s*)(?!\.)()?\s*", True, "Searching for emphasized lines", 'emphasized'], # Emphasized lines - [r"[^'\"]?(\d+(\.|:)|CHAPTER)\s*([\dA-Z\-\'\"#,]+\s*){0,7}\s*", True, "Searching for numeric chapter headings", 'numeric'], # Numeric Chapters - [r"([A-Z]\s+){3,}\s*([\d\w-]+\s*){0,3}\s*", True, "Searching for letter spaced headings", 'letter_spaced'], # Spaced Lettering - [r"[^'\"]?(\d+\.?\s+([\d\w-]+\:?\'?-?\s?){0,5})\s*", True, "Searching for numeric chapters with titles", 'numeric_title'], # Numeric Titles - [r"[^'\"]?(\d+|CHAPTER)\s*([\dA-Z\-\'\"\?!#,]+\s*){0,7}\s*", True, "Searching for simple numeric chapter headings", 'plain_number'], # Numeric Chapters, no dot or colon - [r"\s*[^'\"]?([A-Z#]+(\s|-){0,3}){1,5}\s*", False, "Searching for chapters with Uppercase Characters", 'uppercase' ] # Uppercase Chapters - ] - def is_pdftohtml(self, src): return '' in src[:1000] @@ -60,6 +52,14 @@ class PreProcessor(object): " section markers based on punctuation. - " + unicode(chap)) return '<'+styles+' style="page-break-before:always">'+chap + def analyze_title_matches(self, match): + chap = match.group('chap') + title = match.group('title') + if not title: + self.chapters_no_title = self.chapters_no_title + 1 + else: + self.chapters_with_title = self.chapters_with_title + 1 + def insert_indent(self, match): pstyle = match.group('formatting') span = match.group('span') @@ -173,20 +173,43 @@ class PreProcessor(object): default_title = r"(<[ibu][^>]*>)?\s{0,3}([\w\'’\"-]+\s{0,3}){1,5}?(]*>)?(?=<)" - # Start with most typical chapter headings, get more aggressive until one works - for [chapter_type, lookahead_ignorecase, log_message, type_name] in self.chapter_types: - if self.html_preprocess_sections >= self.min_chapters: - break - full_chapter_line = chapter_line_open+chapter_header_open+chapter_type+chapter_header_close+chapter_line_close - n_lookahead = re.sub("(ou|in|cha)", "lookahead_", full_chapter_line) - self.log("Marked " + unicode(self.html_preprocess_sections) + " headings, " + log_message) - if lookahead_ignorecase: - chapter_marker = init_lookahead+full_chapter_line+blank_lines+n_lookahead_open+n_lookahead+n_lookahead_close+opt_title_open+title_line_open+title_header_open+default_title+title_header_close+title_line_close+opt_title_close - chapdetect = re.compile(r'%s' % chapter_marker, re.IGNORECASE) - else: - chapter_marker = init_lookahead+full_chapter_line+blank_lines+opt_title_open+title_line_open+title_header_open+default_title+title_header_close+title_line_close+opt_title_close+n_lookahead_open+n_lookahead+n_lookahead_close - chapdetect = re.compile(r'%s' % chapter_marker, re.UNICODE) - html = chapdetect.sub(self.chapter_head, html) + chapter_types = [ + [r"[^'\"]?(CHAPTER|Kapitel)\s*([\dA-Z\-\'\"\?!#,]+\s*){0,7}\s*", True, "Searching for most common chapter headings", 'chapter'], # Highest frequency headings which include titles + [r"[^'\"]?(Introduction|Synopsis|Acknowledgements|Epilogue|Volume\s|Prologue|Book\s|Part\s|Dedication|Preface)\s*([\d\w-]+\:?\'?\s*){0,5}", True, "Searching for common section headings", 'common'], + [r"]*>\s*(]*>)?\s*(?!([*#•=]+\s*)+)(\s*(?=[\d.\w#\-*\s]+<)([\d.\w#-*]+\s*){1,5}\s*)(?!\.)()?\s*", True, "Searching for 
emphasized lines", 'emphasized'], # Emphasized lines + [r"[^'\"]?(\d+(\.|:))\s*([\dA-Z\-\'\"#,]+\s*){0,7}\s*", True, "Searching for numeric chapter headings", 'numeric'], # Numeric Chapters + [r"([A-Z]\s+){3,}\s*([\d\w-]+\s*){0,3}\s*", True, "Searching for letter spaced headings", 'letter_spaced'], # Spaced Lettering + [r"[^'\"]?(\d+\.?\s+([\d\w-]+\:?\'?-?\s?){0,5})\s*", True, "Searching for numeric chapters with titles", 'numeric_title'], # Numeric Titles + [r"[^'\"]?(\d+)\s*([\dA-Z\-\'\"\?!#,]+\s*){0,7}\s*", True, "Searching for simple numeric headings", 'plain_number'], # Numeric Chapters, no dot or colon + [r"\s*[^'\"]?([A-Z#]+(\s|-){0,3}){1,5}\s*", False, "Searching for chapters with Uppercase Characters", 'uppercase' ] # Uppercase Chapters + ] + + def recurse_patterns(html, analyze): + # Start with most typical chapter headings, get more aggressive until one works + for [chapter_type, lookahead_ignorecase, log_message, type_name] in chapter_types: + if self.html_preprocess_sections >= self.min_chapters: + break + full_chapter_line = chapter_line_open+chapter_header_open+chapter_type+chapter_header_close+chapter_line_close + n_lookahead = re.sub("(ou|in|cha)", "lookahead_", full_chapter_line) + self.log("Marked " + unicode(self.html_preprocess_sections) + " headings, " + log_message) + if lookahead_ignorecase: + chapter_marker = init_lookahead+full_chapter_line+blank_lines+n_lookahead_open+n_lookahead+n_lookahead_close+opt_title_open+title_line_open+title_header_open+default_title+title_header_close+title_line_close+opt_title_close + chapdetect = re.compile(r'%s' % chapter_marker, re.IGNORECASE) + else: + chapter_marker = init_lookahead+full_chapter_line+blank_lines+opt_title_open+title_line_open+title_header_open+default_title+title_header_close+title_line_close+opt_title_close+n_lookahead_open+n_lookahead+n_lookahead_close + chapdetect = re.compile(r'%s' % chapter_marker, re.UNICODE) + if analyze: + hits = len(chapdetect.findall(html)) + print unicode(type_name)+" had "+unicode(hits)+" hits" + chapdetect.sub(self.analyze_title_matches, html) + print unicode(self.chapters_no_title)+" chapters with no title" + print unicode(self.chapters_with_title)+" chapters with titles" + else: + html = chapdetect.sub(self.chapter_head, html) + return html + + recurse_patterns(html, True) + html = recurse_patterns(html, False) words_per_chptr = wordcount if words_per_chptr > 0 and self.html_preprocess_sections > 0:
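
The analyze pass introduced here only counts how often each pattern hits and whether a title paragraph follows (via analyze_title_matches); the marking pass then rewrites the document. A rough standalone sketch of that two-pass idea, with one invented pattern in place of the chapter_types table (Python 3; not the calibre regexes):

    import re

    # One invented pattern standing in for the chapter_types table: a bare
    # number paragraph, optionally followed by a title paragraph.
    patterns = [
        ('numeric', re.compile(
            r'<p>\s*(?P<chap>\d+\.?)\s*</p>\s*(<p>\s*(?P<title>[\w ]+?)\s*</p>)?')),
    ]

    def analyze(html):
        # Pass 1: count hits and how often a title follows, as
        # analyze_title_matches does, without touching the html.
        stats = {}
        for name, pat in patterns:
            hits = list(pat.finditer(html))
            titled = sum(1 for m in hits if m.group('title'))
            stats[name] = (len(hits), titled)
        return stats

    def mark_up(html, name):
        # Pass 2: only now rewrite, using whichever pattern analysis chose.
        def head(m):
            title = m.group('title')
            tail = '<h3>%s</h3>' % title if title else ''
            return '<h2>%s</h2>%s' % (m.group('chap'), tail)
        return dict(patterns)[name].sub(head, html)

    sample = '<p>1.</p><p>The Beginning</p><p>text</p>'
    print(analyze(sample))             # {'numeric': (1, 1)}
    print(mark_up(sample, 'numeric'))  # <h2>1.</h2><h3>The Beginning</h3><p>text</p>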