diff --git a/src/calibre/ebooks/conversion/plumber.py b/src/calibre/ebooks/conversion/plumber.py
index a12dbd48e1..48b965f624 100644
--- a/src/calibre/ebooks/conversion/plumber.py
+++ b/src/calibre/ebooks/conversion/plumber.py
@@ -523,7 +523,7 @@ OptionRecommendation(name='delete_blank_paragraphs',
 OptionRecommendation(name='format_scene_breaks',
             recommended_value=False, level=OptionRecommendation.LOW,
-            help=_('Detects left aligned scene break markers and center aligns them. '
+            help=_('Left aligned scene break markers are center aligned. '
                 'Replace soft scene breaks that use multiple blank lines with '
                 'horizontal rules.')),

diff --git a/src/calibre/ebooks/conversion/utils.py b/src/calibre/ebooks/conversion/utils.py
index 417f3a1e5b..2a88d371cc 100644
--- a/src/calibre/ebooks/conversion/utils.py
+++ b/src/calibre/ebooks/conversion/utils.py
@@ -18,6 +18,14 @@ class PreProcessor(object):
         self.html_preprocess_sections = 0
         self.found_indents = 0
         self.extra_opts = extra_opts
+        self.deleted_nbsps = False
+        self.totalwords = 0
+        self.min_chapters = 1
+        self.chapters_no_title = 0
+        self.chapters_with_title = 0
+        self.linereg = re.compile('(?<=<p).*?(?=</p>)', re.IGNORECASE|re.DOTALL)
+        self.blankreg = re.compile(r'\s*(?P<openline><p[^>]*>)\s*(?P<closeline></p>)', re.IGNORECASE)
+        self.multi_blank = re.compile(r'(\s*<p[^>]*>\s*</p>){2,}', re.IGNORECASE)

     def is_pdftohtml(self, src):
         return '<!-- created by calibre\'s pdftohtml -->' in src[:1000]

@@ -44,6 +52,14 @@ class PreProcessor(object):
                     " section markers based on punctuation. - " + unicode(chap))
         return '<'+styles+' style="page-break-before:always">'+chap

+    def analyze_title_matches(self, match):
+        chap = match.group('chap')
+        title = match.group('title')
+        if not title:
+            self.chapters_no_title = self.chapters_no_title + 1
+        else:
+            self.chapters_with_title = self.chapters_with_title + 1
+
     def insert_indent(self, match):
         pstyle = match.group('formatting')
         span = match.group('span')
@@ -120,7 +136,6 @@ class PreProcessor(object):
         '''
         # Typical chapters are between 2000 and 7000 words, use the larger number to decide the
         # minimum of chapters to search for
-        self.min_chapters = 1
         if wordcount > 7000:
             self.min_chapters = int(ceil(wordcount / 7000.))
         #print "minimum chapters required are: "+str(self.min_chapters)
@@ -159,29 +174,42 @@ class PreProcessor(object):
         default_title = r"(<[ibu][^>]*>)?\s{0,3}([\w\'’\"-]+\s{0,3}){1,5}?(</[ibu][^>]*>)?(?=<)"

         chapter_types = [
-            [r"[^'\"]?(Introduction|Synopsis|Acknowledgements|Chapter|Kapitel|Epilogue|Volume\s|Prologue|Book\s|Part\s|Dedication|Preface)\s*([\d\w-]+\:?\'?\s*){0,5}", True, "Searching for common Chapter Headings"],
-            [r"<b[^>]*>\s*(<span[^>]*>)?\s*(?!([*#•]+\s*)+)(\s*(?=[\d.\w#\-*\s]+<)([\d.\w#-*]+\s*){1,5}\s*)(?!\.)(</span>)?\s*</b>", True, "Searching for emphasized lines"], # Emphasized lines
-            [r"[^'\"]?(\d+(\.|:)|CHAPTER)\s*([\dA-Z\-\'\"#,]+\s*){0,7}\s*", True, "Searching for numeric chapter headings"], # Numeric Chapters
-            [r"([A-Z]\s+){3,}\s*([\d\w-]+\s*){0,3}\s*", True, "Searching for letter spaced headings"], # Spaced Lettering
-            [r"[^'\"]?(\d+\.?\s+([\d\w-]+\:?\'?-?\s?){0,5})\s*", True, "Searching for numeric chapters with titles"], # Numeric Titles
-            [r"[^'\"]?(\d+|CHAPTER)\s*([\dA-Z\-\'\"\?!#,]+\s*){0,7}\s*", True, "Searching for simple numeric chapter headings"], # Numeric Chapters, no dot or colon
-            [r"\s*[^'\"]?([A-Z#]+(\s|-){0,3}){1,5}\s*", False, "Searching for chapters with Uppercase Characters" ] # Uppercase Chapters
+            [r"[^'\"]?(CHAPTER|Kapitel)\s*([\dA-Z\-\'\"\?!#,]+\s*){0,7}\s*", True, "Searching for most common chapter headings", 'chapter'], # Highest frequency headings which include titles
+            [r"[^'\"]?(Introduction|Synopsis|Acknowledgements|Epilogue|Volume\s|Prologue|Book\s|Part\s|Dedication|Preface)\s*([\d\w-]+\:?\'?\s*){0,5}", True, "Searching for common section headings", 'common'],
+            [r"<b[^>]*>\s*(<span[^>]*>)?\s*(?!([*#•=]+\s*)+)(\s*(?=[\d.\w#\-*\s]+<)([\d.\w#-*]+\s*){1,5}\s*)(?!\.)(</span>)?\s*</b>", True, "Searching for emphasized lines", 'emphasized'], # Emphasized lines
+            [r"[^'\"]?(\d+(\.|:))\s*([\dA-Z\-\'\"#,]+\s*){0,7}\s*", True, "Searching for numeric chapter headings", 'numeric'], # Numeric Chapters
+            [r"([A-Z]\s+){3,}\s*([\d\w-]+\s*){0,3}\s*", True, "Searching for letter spaced headings", 'letter_spaced'], # Spaced Lettering
+            [r"[^'\"]?(\d+\.?\s+([\d\w-]+\:?\'?-?\s?){0,5})\s*", True, "Searching for numeric chapters with titles", 'numeric_title'], # Numeric Titles
+            [r"[^'\"]?(\d+)\s*([\dA-Z\-\'\"\?!#,]+\s*){0,7}\s*", True, "Searching for simple numeric headings", 'plain_number'], # Numeric Chapters, no dot or colon
+            [r"\s*[^'\"]?([A-Z#]+(\s|-){0,3}){1,5}\s*", False, "Searching for chapters with Uppercase Characters", 'uppercase'] # Uppercase Chapters
         ]

-        # Start with most typical chapter headings, get more aggressive until one works
-        for [chapter_type, lookahead_ignorecase, log_message] in chapter_types:
-            if self.html_preprocess_sections >= self.min_chapters:
-                break
-            full_chapter_line = chapter_line_open+chapter_header_open+chapter_type+chapter_header_close+chapter_line_close
-            n_lookahead = re.sub("(ou|in|cha)", "lookahead_", full_chapter_line)
-            self.log("Marked " + unicode(self.html_preprocess_sections) + " headings, " + log_message)
-            if lookahead_ignorecase:
-                chapter_marker = init_lookahead+full_chapter_line+blank_lines+n_lookahead_open+n_lookahead+n_lookahead_close+opt_title_open+title_line_open+title_header_open+default_title+title_header_close+title_line_close+opt_title_close
-                chapdetect = re.compile(r'%s' % chapter_marker, re.IGNORECASE)
-            else:
-                chapter_marker = init_lookahead+full_chapter_line+blank_lines+opt_title_open+title_line_open+title_header_open+default_title+title_header_close+title_line_close+opt_title_close+n_lookahead_open+n_lookahead+n_lookahead_close
-                chapdetect = re.compile(r'%s' % chapter_marker, re.UNICODE)
-            html = chapdetect.sub(self.chapter_head, html)
+        def recurse_patterns(html, analyze):
+            # Start with most typical chapter headings, get more aggressive until one works
+            for [chapter_type, lookahead_ignorecase, log_message, type_name] in chapter_types:
+                if self.html_preprocess_sections >= self.min_chapters:
+                    break
+                full_chapter_line = chapter_line_open+chapter_header_open+chapter_type+chapter_header_close+chapter_line_close
+                n_lookahead = re.sub("(ou|in|cha)", "lookahead_", full_chapter_line)
+                self.log("Marked " + unicode(self.html_preprocess_sections) + " headings, " + log_message)
+                if lookahead_ignorecase:
+                    chapter_marker = init_lookahead+full_chapter_line+blank_lines+n_lookahead_open+n_lookahead+n_lookahead_close+opt_title_open+title_line_open+title_header_open+default_title+title_header_close+title_line_close+opt_title_close
+                    chapdetect = re.compile(r'%s' % chapter_marker, re.IGNORECASE)
+                else:
+                    chapter_marker = init_lookahead+full_chapter_line+blank_lines+opt_title_open+title_line_open+title_header_open+default_title+title_header_close+title_line_close+opt_title_close+n_lookahead_open+n_lookahead+n_lookahead_close
+                    chapdetect = re.compile(r'%s' % chapter_marker, re.UNICODE)
+                if analyze:
+                    hits = len(chapdetect.findall(html))
+                    self.log(unicode(type_name)+" had "+unicode(hits)+" hits")
+                    chapdetect.sub(self.analyze_title_matches, html)
+                    self.log(unicode(self.chapters_no_title)+" chapters with no title")
+                    self.log(unicode(self.chapters_with_title)+" chapters with titles")
+                else:
+                    html = chapdetect.sub(self.chapter_head, html)
+            return html
+
+        recurse_patterns(html, True)
+        html = recurse_patterns(html, False)

         words_per_chptr = wordcount
         if words_per_chptr > 0 and self.html_preprocess_sections > 0:
@@ -192,35 +220,48 @@ class PreProcessor(object):
     def punctuation_unwrap(self, length, content, format):
         '''
         Unwraps lines based on line length and punctuation
-        supports range of potential html markup and text files
+        supports a range of html markup and text files
         '''
         # define the pieces of the regex
         lookahead = "(?<=.{"+str(length)+"}([a-zäëïöüàèìòùáćéíóńśúâêîôûçąężıãõñæøþðß,:)\IA\u00DF]|(?<!\&\w{4});))"
-        line_ending = "\s*</(span|p|div)>\s*(</(p|span|div)>)?"
+        em_en_lookahead = "(?<=.{"+str(length)+"}[\u2013\u2014])"
+        soft_hyphen = "\xad"
+        line_ending = "\s*</(span|[iubp])>\s*(</(p|span|div)>)?"
         blanklines = "\s*(?P<up2threeblanks><(p|span|div)[^>]*>\s*(<(p|span|div)[^>]*>\s*</(span|p|div)>\s*)</(span|p|div)>\s*){0,3}\s*"
-        line_opening = "<(span|div|p)[^>]*>\s*(<(span|div|p)[^>]*>)?\s*"
+        line_opening = "<(span|[iubp]|div)[^>]*>\s*(<(span|[iubp]|div)[^>]*>)?\s*"
         txt_line_wrap = u"((\u0020|\u0009)*\n){1,4}"

         unwrap_regex = lookahead+line_ending+blanklines+line_opening
+        em_en_unwrap_regex = em_en_lookahead+line_ending+blanklines+line_opening
+        shy_unwrap_regex = soft_hyphen+line_ending+blanklines+line_opening
+
         if format == 'txt':
             unwrap_regex = lookahead+txt_line_wrap
+            em_en_unwrap_regex = em_en_lookahead+txt_line_wrap
+            shy_unwrap_regex = soft_hyphen+txt_line_wrap
+
         unwrap = re.compile(u"%s" % unwrap_regex, re.UNICODE)
+        em_en_unwrap = re.compile(u"%s" % em_en_unwrap_regex, re.UNICODE)
+        shy_unwrap = re.compile(u"%s" % shy_unwrap_regex, re.UNICODE)
+
         content = unwrap.sub(' ', content)
+        content = em_en_unwrap.sub('', content)
+        content = shy_unwrap.sub('', content)
         return content
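
Note: punctuation_unwrap builds one large expression by concatenating the pieces defined above. A minimal sketch of that assembly with simplified stand-in patterns (not the exact ones used here), just to show the mechanics:

    import re

    length = 20  # stand-in for the median line length DocAnalysis computes
    lookahead = "(?<=.{" + str(length) + "}[a-z,:)])"  # line ends without terminal punctuation
    line_ending = r"\s*</p>"                           # simplified closing markup
    blanklines = r"(\s*<p[^>]*>\s*</p>){0,3}\s*"       # up to three blank paragraphs
    line_opening = r"<p[^>]*>\s*"                      # next line's opening markup

    unwrap = re.compile(lookahead + line_ending + blanklines + line_opening, re.UNICODE)
    sample = "<p>the quick brown fox jumped over the</p>\n<p>lazy dog</p>"
    print(unwrap.sub(' ', sample))
    # -> <p>the quick brown fox jumped over the lazy dog</p>
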
blanklines = "\s*(?P<(p|span|div)[^>]*>\s*(<(p|span|div)[^>]*>\s*\s*)\s*){0,3}\s*" - line_opening = "<(span|div|p)[^>]*>\s*(<(span|div|p)[^>]*>)?\s*" + line_opening = "<(span|[iubp]|div)[^>]*>\s*(<(span|[iubp]|div)[^>]*>)?\s*" txt_line_wrap = u"((\u0020|\u0009)*\n){1,4}" unwrap_regex = lookahead+line_ending+blanklines+line_opening + em_en_unwrap_regex = em_en_lookahead+line_ending+blanklines+line_opening + shy_unwrap_regex = soft_hyphen+line_ending+blanklines+line_opening + if format == 'txt': unwrap_regex = lookahead+txt_line_wrap + em_en_unwrap_regex = em_en_lookahead+txt_line_wrap + shy_unwrap_regex = soft_hyphen+txt_line_wrap unwrap = re.compile(u"%s" % unwrap_regex, re.UNICODE) + em_en_unwrap = re.compile(u"%s" % em_en_unwrap_regex, re.UNICODE) + content = unwrap.sub(' ', content) + content = em_en_unwrap.sub('', content) return content + def txt_process(self, match): + from calibre.ebooks.txt.processor import convert_basic, preserve_spaces, \ + separate_paragraphs_single_line + content = match.group('text') + content = separate_paragraphs_single_line(content) + content = preserve_spaces(content) + content = convert_basic(content, epub_split_size_kb=0) + return content - def text_process_pre(self, html): + def markup_pre(self, html): pre = re.compile(r'
', re.IGNORECASE)
-        if len(pre.findall(html)) == 1:
+        if len(pre.findall(html)) >= 1:
             self.log("Running Text Processing")
-            from calibre.ebooks.txt.processor import convert_basic, preserve_spaces, \
-            separate_paragraphs_single_line
             outerhtml = re.compile(r'.*?(?<=<pre>)(?P<text>.*)(?=</pre>).*', re.IGNORECASE|re.DOTALL)
-            html = outerhtml.sub('\g<text>', html)
-            html = separate_paragraphs_single_line(html)
-            html = preserve_spaces(html)
-            html = convert_basic(html, epub_split_size_kb=0)
+            html = outerhtml.sub(self.txt_process, html)
         else:
             # Add markup naively
             # TODO - find out if there are cases where there are more than one <pre> tag or
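
Note: markup_pre now hands re.sub a bound method instead of converting the whole flow in place, so each <pre> block is converted independently. A toy version of the callback pattern, with an invented converter standing in for calibre's txt processor:

    import re

    def fake_txt_process(match):
        # stand-in for txt_process(): wrap each non-blank line in <p> tags
        lines = [l.strip() for l in match.group('text').splitlines() if l.strip()]
        return ''.join('<p>%s</p>' % l for l in lines)

    pre_block = re.compile(r'(?<=<pre>)(?P<text>.*?)(?=</pre>)', re.IGNORECASE | re.DOTALL)
    print(pre_block.sub(fake_txt_process, '<pre>first line\nsecond line</pre>'))
    # -> <pre><p>first line</p><p>second line</p></pre>
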
@@ -253,142 +294,153 @@ class PreProcessor(object):
         html = re.sub(r"\s*]*>\s*(]*>\s*){0,2}\s*\s*", " ", html)
         html = re.sub(r"\s*<[ibu][^>]*>\s*(<[ibu][^>]*>\s*\s*){0,2}\s*", " ", html)
         html = re.sub(r"\s*]*>\s*(]>\s*){0,2}\s*\s*", " ", html)
+        self.deleted_nbsps = True
         return html
 
+    def analyze_line_endings(self, html):
+        '''
+        Determines the type of html line ending used most commonly in a document.
+        Use before calling the DocAnalysis functions.
+        '''
+        paras_reg = re.compile('<p[^>]*>', re.IGNORECASE)
+        spans_reg = re.compile('<span[^>]*>', re.IGNORECASE)
+        paras = len(paras_reg.findall(html))
+        spans = len(spans_reg.findall(html))
+        if spans > 1:
+            if float(paras) / float(spans) < 0.75:
+                return 'spanned_html'
+            else:
+                return 'html'
+        else:
+            return 'html'
+
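
Note: the 0.75 paragraph-to-span ratio is the whole heuristic here. A quick check of it on invented markup:

    import re

    def classify(html):
        paras = len(re.findall(r'<p[^>]*>', html, re.IGNORECASE))
        spans = len(re.findall(r'<span[^>]*>', html, re.IGNORECASE))
        if spans > 1 and float(paras) / float(spans) < 0.75:
            return 'spanned_html'
        return 'html'

    ocr_like = "<p><span>one</span><span>two</span><span>three</span></p>"
    print(classify(ocr_like))            # -> spanned_html (1 paragraph / 3 spans)
    print(classify("<p>a</p><p>b</p>"))  # -> html (no spans at all)
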
+    def analyze_blanks(self, html):
+        blanklines = self.blankreg.findall(html)
+        lines = self.linereg.findall(html)
+        if len(lines) > 1:
+            self.log("There are " + unicode(len(blanklines)) + " blank lines. " +
+                    unicode(100. * len(blanklines) / len(lines)) + " percent blank")
+
+            if float(len(blanklines)) / float(len(lines)) > 0.40:
+                return True
+            else:
+                return False
+        return False
+
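
Note: a worked instance of the 40% threshold, using the same patterns as self.blankreg and self.linereg on toy markup where three of five paragraphs are empty:

    import re

    html = "<p>one</p><p></p><p>two</p><p></p><p></p>"
    blanks = len(re.findall(r'<p[^>]*>\s*</p>', html, re.IGNORECASE))
    lines = len(re.findall('(?<=<p).*?(?=</p>)', html, re.IGNORECASE | re.DOTALL))
    print("%d blank of %d lines" % (blanks, lines))  # -> 3 blank of 5 lines
    print(float(blanks) / float(lines) > 0.40)       # -> True
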
+    def cleanup_required(self):
+        for option in ['unwrap_lines', 'markup_chapter_headings', 'format_scene_breaks', 'delete_blank_paragraphs']:
+            if getattr(self.extra_opts, option, False):
+                return True
+        return False
+
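
Note: every new guard reads its option defensively with a getattr default, so options objects that predate these flags keep working. A minimal sketch with an invented options object:

    class Opts(object):
        unwrap_lines = True
        delete_blank_paragraphs = False

    opts = Opts()
    for name in ['unwrap_lines', 'markup_chapter_headings', 'delete_blank_paragraphs']:
        print("%s -> %s" % (name, getattr(opts, name, False)))
    # markup_chapter_headings is not defined on Opts, so the getattr default is used
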
 
     def __call__(self, html):
         self.log("*********  Preprocessing HTML  *********")
 
         # Count the words in the document to estimate how many chapters to look for and whether
         # other types of processing are attempted
-        totalwords = 0
-        totalwords = self.get_word_count(html)
+        try:
+            self.totalwords = self.get_word_count(html)
+        except Exception:
+            self.log("Can't get wordcount")
 
-        if totalwords < 50:
+        if 0 < self.totalwords < 50:
             self.log("flow is too short, not running heuristics")
             return html
 
         # Arrange line feeds and </p> tags so the line_length and no_markup functions work correctly
         html = self.arrange_htm_line_endings(html)
-        ###### Check Markup ######
-        #
         # some lit files don't have any <p> tags or equivalent (generally just plain text between
         # <pre> tags), check and mark up line endings if required before proceeding
         if self.no_markup(html, 0.1):
             self.log("not enough paragraph markers, adding now")
-            # check if content is in pre tags, use txt processor to mark up if so
-            html = self.text_process_pre(html)
+            # mark up the flow, using the txt processor when content is inside <pre> tags
+            html = self.markup_pre(html)
 
-        ###### Mark Indents/Cleanup ######
-        #
         # Replace series of non-breaking spaces with text-indent
-        html = self.fix_nbsp_indents(html)
-        
-        html = self.cleanup_markup(html)
+        if getattr(self.extra_opts, 'fix_indents', True):
+            html = self.fix_nbsp_indents(html)
+
+        if self.cleanup_required():
+            html = self.cleanup_markup(html)
 
         # ADE doesn't render <br />, change to empty paragraphs
         #html = re.sub('<br[^>]*>', u'<p>\u00a0</p>', html)

-        # If more than 40% of the lines are empty paragraphs and the user has enabled delete
-        # blank paragraphs then delete blank lines to clean up spacing
-        linereg = re.compile('(?<=<p).*?(?=</p>)', re.IGNORECASE|re.DOTALL)
-        blankreg = re.compile(r'\s*(?P<openline><p[^>]*>)\s*(?P<closeline></p>)', re.IGNORECASE)
-        multi_blank = re.compile(r'(\s*<p[^>]*>\s*</p>){2,}', re.IGNORECASE)
-        blanklines = blankreg.findall(html)
-        lines = linereg.findall(html)
-        blanks_between_paragraphs = False
-        print "delete blank paragraphs is "+str(getattr(self.extra_opts, 'delete_blank_paragraphs', False))
-        if len(lines) > 1:
-            self.log("There are " + unicode(len(blanklines)) + " blank lines. " +
-                    unicode(float(len(blanklines)) / float(len(lines))) + " percent blank")
-
-            if float(len(blanklines)) / float(len(lines)) > 0.40:
-                blanks_between_paragraphs = True
-                print "blanks between paragraphs is marked True"
-            else:
-                blanks_between_paragraphs = False
+        # Determine whether the document uses interleaved blank lines
+        blanks_between_paragraphs = self.analyze_blanks(html)

         #self.dump(html, 'before_chapter_markup')

         # detect chapters/sections to match xpath or splitting logic
-        #
-        html = self.markup_chapters(html, totalwords, blanks_between_paragraphs)
+        if getattr(self.extra_opts, 'markup_chapter_headings', True):
+            html = self.markup_chapters(html, self.totalwords, blanks_between_paragraphs)

+        # If more than 40% of the lines are empty paragraphs and the user has enabled delete
+        # blank paragraphs then delete blank lines to clean up spacing
         if blanks_between_paragraphs and getattr(self.extra_opts, 'delete_blank_paragraphs', False):
             self.log("deleting blank lines")
-            html = multi_blank.sub('\n<p></p>', html)
-            html = blankreg.sub('', html)
+            html = self.multi_blank.sub('\n<p></p>', html)
+            html = self.blankreg.sub('', html)

         ###### Unwrap lines ######
-        #
-        # Some OCR sourced files have line breaks in the html using a combination of span & p tags
-        # span are used for hard line breaks, p for new paragraphs.  Determine which is used so
-        # that lines can be un-wrapped across page boundaries
-        paras_reg = re.compile('<p[^>]*>', re.IGNORECASE)
-        spans_reg = re.compile('<span[^>]*>', re.IGNORECASE)
-        paras = len(paras_reg.findall(html))
-        spans = len(spans_reg.findall(html))
-        if spans > 1:
-            if float(paras) / float(spans) < 0.75:
-                format = 'spanned_html'
-            else:
-                format = 'html'
-        else:
-            format = 'html'
-        # Check Line histogram to determine if the document uses hard line breaks, If 50% or
-        # more of the lines break in the same region of the document then unwrapping is required
-        docanalysis = DocAnalysis(format, html)
-        hardbreaks = docanalysis.line_histogram(.50)
-        self.log("Hard line breaks check returned "+unicode(hardbreaks))
-        # Calculate Length
-        unwrap_factor = getattr(self.extra_opts, 'html_unwrap_factor', 0.4)
-        length = docanalysis.line_length(unwrap_factor)
-        self.log("Median line length is " + unicode(length) + ", calculated with " + format + " format")
-        # only go through unwrapping code if the histogram shows unwrapping is required or if the user decreased the default unwrap_factor
-        if hardbreaks or unwrap_factor < 0.4:
-            self.log("Unwrapping required, unwrapping Lines")
-            # Unwrap em/en dashes
-            html = re.sub(u'(?<=.{%i}[\u2013\u2014])\s*(?=<)(</span>\s*(</[iubp]>\s*<[iubp][^>]*>\s*)?<span[^>]*>|</[iubp]>\s*<[iubp][^>]*>)?\s*(?=[[a-z\d])' % length, '', html)
-            # Dehyphenate
-            self.log("Unwrapping/Removing hyphens")
-            dehyphenator = Dehyphenator()
-            html = dehyphenator(html,'html', length)
-            self.log("Done dehyphenating")
-            # Unwrap lines using punctation and line length
-            #unwrap_quotes = re.compile(u"(?<=.{%i}\"')\s*</(span|p|div)>\s*(</(p|span|div)>)?\s*(?P<up2threeblanks><(p|span|div)[^>]*>\s*(<(p|span|div)[^>]*>\s*</(span|p|div)>\s*)</(span|p|div)>\s*){0,3}\s*<(span|div|p)[^>]*>\s*(<(span|div|p)[^>]*>)?\s*(?=[a-z])" % length, re.UNICODE)
-            html = self.punctuation_unwrap(length, html, 'html')
-            #check any remaining hyphens, but only unwrap if there is a match
-            dehyphenator = Dehyphenator()
-            html = dehyphenator(html,'html_cleanup', length)
-        else:
-            # dehyphenate in cleanup mode to fix anything previous conversions/editing missed
-            self.log("Cleaning up hyphenation")
-            dehyphenator = Dehyphenator()
-            html = dehyphenator(html,'html_cleanup', length)
-            self.log("Done dehyphenating")
+        if getattr(self.extra_opts, 'unwrap_lines', True):
+            # Determine line ending type
+            # Some OCR sourced files have line breaks in the html using a combination of span & p tags
+            # span are used for hard line breaks, p for new paragraphs.  Determine which is used so
+            # that lines can be un-wrapped across page boundaries
+            format = self.analyze_line_endings(html)

-        # delete soft hyphens
-        html = re.sub(u'\xad\s*(</span>\s*(</[iubp]>\s*<[iubp][^>]*>\s*)?<span[^>]*>|</[iubp]>\s*<[iubp][^>]*>)?\s*', '', html)
+            # Check Line histogram to determine if the document uses hard line breaks, If 50% or
+            # more of the lines break in the same region of the document then unwrapping is required
+            docanalysis = DocAnalysis(format, html)
+            hardbreaks = docanalysis.line_histogram(.50)
+            self.log("Hard line breaks check returned "+unicode(hardbreaks))
+
+            # Calculate Length
+            unwrap_factor = getattr(self.extra_opts, 'html_unwrap_factor', 0.4)
+            length = docanalysis.line_length(unwrap_factor)
+            self.log("Median line length is " + unicode(length) + ", calculated with " + format + " format")
+
+            # only go through unwrapping code if the histogram shows unwrapping is required or if the user decreased the default unwrap_factor
+            if hardbreaks or unwrap_factor < 0.4:
+                self.log("Unwrapping required, unwrapping lines")
+                # Dehyphenate with line length limiters
+                dehyphenator = Dehyphenator()
+                html = dehyphenator(html,'html', length)
+                html = self.punctuation_unwrap(length, html, 'html')
+                #check any remaining hyphens, but only unwrap if there is a match
+                dehyphenator = Dehyphenator()
+                html = dehyphenator(html,'html_cleanup', length)
+
+            if getattr(self.extra_opts, 'dehyphenate', True):
+                # dehyphenate in cleanup mode to fix anything previous conversions/editing missed
+                self.log("Fixing hyphenated content")
+                dehyphenator = Dehyphenator()
+                html = dehyphenator(html,'html_cleanup', length)

         # If still no sections after unwrapping mark split points on lines with no punctuation
-        if self.html_preprocess_sections < self.min_chapters:
+        if self.html_preprocess_sections < self.min_chapters and getattr(self.extra_opts, 'markup_chapter_headings', False):
             self.log("Looking for more split points based on punctuation,"
                     " currently have " + unicode(self.html_preprocess_sections))
             chapdetect3 = re.compile(r'<(?P<styles>(p|div)[^>]*)>\s*(?P<section>(<span[^>]*>)?\s*(?!([*#•]+\s*)+)(<[ibu][^>]*>){0,2}\s*(<span[^>]*>)?\s*(<[ibu][^>]*>){0,2}\s*(<span[^>]*>)?\s*.?(?=[a-z#\-*\s]+<)([a-z#-*]+\s*){1,5}\s*\s*(</span>)?(</[ibu]>){0,2}\s*(</span>)?\s*(</[ibu]>){0,2}\s*(</span>)?\s*</(p|div)>)', re.IGNORECASE)
             html = chapdetect3.sub(self.chapter_break, html)
+
         # search for places where a first or second level heading is immediately followed by another
         # top level heading. demote the second heading to h3 to prevent splitting between chapter
         # headings and titles, images, etc
         doubleheading = re.compile(r'(?P<firsthead><h(1|2)[^>]*>.+?</h(1|2)>\s*(<(?!h\d)[^>]*>\s*)*)<h(1|2)(?P<secondhead>[^>]*>.+?</h(1|2)>)', re.IGNORECASE)
         html = doubleheading.sub('\g<firsthead>'+'\n<h3'+'\g<secondhead>', html)

-        # put back non-breaking spaces in empty paragraphs to preserve original formatting
-        html = blankreg.sub('\n'+r'\g<openline>'+u'\u00a0'+r'\g<closeline>', html)
+        if getattr(self.extra_opts, 'format_scene_breaks', True):
+            # Center separator lines
+            html = re.sub(u'<(?P<outer>p|div)[^>]*>\s*(<(?P<inner1>font|span|[ibu])[^>]*>)?\s*(<(?P<inner2>font|span|[ibu])[^>]*>)?\s*(<(?P<inner3>font|span|[ibu])[^>]*>)?\s*(?P<break>([*#•]+\s*)+)\s*(</(?P=inner3)>)?\s*(</(?P=inner2)>)?\s*(</(?P=inner1)>)?\s*</(?P=outer)>', '<p style="text-align:center; margin-top:1.25em; margin-bottom:1.25em">' + '\g<break>' + '</p>', html)

-        # Center separator lines
-        html = re.sub(u'<(?P<outer>p|div)[^>]*>\s*(<(?P<inner1>font|span|[ibu])[^>]*>)?\s*(<(?P<inner2>font|span|[ibu])[^>]*>)?\s*(<(?P<inner3>font|span|[ibu])[^>]*>)?\s*(?P<break>([*#•]+\s*)+)\s*(</(?P=inner3)>)?\s*(</(?P=inner2)>)?\s*(</(?P=inner1)>)?\s*</(?P=outer)>', '<p style="text-align:center; margin-top:1.25em; margin-bottom:1.25em">' + '\g<break>' + '</p>', html)
+        if self.deleted_nbsps:
+            # put back non-breaking spaces in empty paragraphs to preserve original formatting
+            html = self.blankreg.sub('\n'+r'\g<openline>'+u'\u00a0'+r'\g<closeline>', html)

         return html
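
Note: the scene-break step in action, with a simplified pattern (the real one above also tolerates nested font/span/[ibu] wrappers around the markers):

    import re

    html = '<p>* * *</p>'
    pattern = r'<p[^>]*>\s*(?P<break>([*#]+\s*)+)\s*</p>'
    replacement = ('<p style="text-align:center; margin-top:1.25em; '
                   'margin-bottom:1.25em">' + r'\g<break>' + '</p>')
    print(re.sub(pattern, replacement, html))
    # -> <p style="text-align:center; margin-top:1.25em; margin-bottom:1.25em">* * *</p>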