From 74243ac0b9277f5468bcdb58e54b397862da76e1 Mon Sep 17 00:00:00 2001 From: ldolse Date: Mon, 10 Jan 2011 17:07:40 +0800 Subject: [PATCH 001/118] preprocess tweaks --- src/calibre/ebooks/conversion/preprocess.py | 2 +- src/calibre/ebooks/conversion/utils.py | 4 ++-- src/calibre/ebooks/txt/processor.py | 2 +- 3 files changed, 4 insertions(+), 4 deletions(-) diff --git a/src/calibre/ebooks/conversion/preprocess.py b/src/calibre/ebooks/conversion/preprocess.py index 08a46cb8d9..f994888f19 100644 --- a/src/calibre/ebooks/conversion/preprocess.py +++ b/src/calibre/ebooks/conversion/preprocess.py @@ -360,7 +360,7 @@ class HTMLPreProcessor(object): (re.compile(r'((?<=)\s*file:////?[A-Z].*
|file:////?[A-Z].*
(?=\s*
))', re.IGNORECASE), lambda match: ''), # Center separator lines - (re.compile(u'
\s*(?P([*#•✦]+\s*)+)\s*
'), lambda match: '

\n

' + match.group(1) + '

'), + (re.compile(u'
\s*(?P([*#•✦=]+\s*)+)\s*
'), lambda match: '

\n

' + match.group(1) + '

'), # Remove page links (re.compile(r'', re.IGNORECASE), lambda match: ''), diff --git a/src/calibre/ebooks/conversion/utils.py b/src/calibre/ebooks/conversion/utils.py index 52d1bcc619..9177b5e53b 100644 --- a/src/calibre/ebooks/conversion/utils.py +++ b/src/calibre/ebooks/conversion/utils.py @@ -155,9 +155,9 @@ class PreProcessor(object): chapter_types = [ [r"[^'\"]?(Introduction|Synopsis|Acknowledgements|Chapter|Kapitel|Epilogue|Volume\s|Prologue|Book\s|Part\s|Dedication|Preface)\s*([\d\w-]+\:?\'?\s*){0,5}", True, "Searching for common Chapter Headings"], + [r"([A-Z]\s+){3,}\s*([\d\w-]+\s*){0,3}\s*", True, "Searching for letter spaced headings"], # Spaced Lettering [r"]*>\s*(]*>)?\s*(?!([*#•]+\s*)+)(\s*(?=[\d.\w#\-*\s]+<)([\d.\w#-*]+\s*){1,5}\s*)(?!\.)()?\s*", True, "Searching for emphasized lines"], # Emphasized lines [r"[^'\"]?(\d+(\.|:)|CHAPTER)\s*([\dA-Z\-\'\"#,]+\s*){0,7}\s*", True, "Searching for numeric chapter headings"], # Numeric Chapters - [r"([A-Z]\s+){3,}\s*([\d\w-]+\s*){0,3}\s*", True, "Searching for letter spaced headings"], # Spaced Lettering [r"[^'\"]?(\d+\.?\s+([\d\w-]+\:?\'?-?\s?){0,5})\s*", True, "Searching for numeric chapters with titles"], # Numeric Titles [r"[^'\"]?(\d+|CHAPTER)\s*([\dA-Z\-\'\"\?!#,]+\s*){0,7}\s*", True, "Searching for simple numeric chapter headings"], # Numeric Chapters, no dot or colon [r"\s*[^'\"]?([A-Z#]+(\s|-){0,3}){1,5}\s*", False, "Searching for chapters with Uppercase Characters" ] # Uppercase Chapters @@ -357,6 +357,6 @@ class PreProcessor(object): html = blankreg.sub('\n'+r'\g'+u'\u00a0'+r'\g', html) # Center separator lines - html = re.sub(u'<(?Pp|div)[^>]*>\s*(<(?Pfont|span|[ibu])[^>]*>)?\s*(<(?Pfont|span|[ibu])[^>]*>)?\s*(<(?Pfont|span|[ibu])[^>]*>)?\s*(?P([*#•]+\s*)+)\s*()?\s*()?\s*()?\s*', '

' + '\g' + '

', html) + html = re.sub(u'<(?Pp|div)[^>]*>\s*(<(?Pfont|span|[ibu])[^>]*>)?\s*(<(?Pfont|span|[ibu])[^>]*>)?\s*(<(?Pfont|span|[ibu])[^>]*>)?\s*(?P([*#•=✦]+\s*)+)\s*()?\s*()?\s*()?\s*', '

' + '\g' + '

', html) return html diff --git a/src/calibre/ebooks/txt/processor.py b/src/calibre/ebooks/txt/processor.py index 6a1a106681..ef9920185f 100644 --- a/src/calibre/ebooks/txt/processor.py +++ b/src/calibre/ebooks/txt/processor.py @@ -147,7 +147,7 @@ def detect_paragraph_type(txt): if .15 <= print_percent <= .75: return 'print' elif .15 <= block_percent <= .75: - return 'block' + return 'block' # Assume unformatted text with hardbreaks if nothing else matches return 'unformatted' From 9832b7118b592679541ab357de02e426e1f48a19 Mon Sep 17 00:00:00 2001 From: ldolse Date: Tue, 11 Jan 2011 11:27:25 +0800 Subject: [PATCH 002/118] chapter detection tweaks --- src/calibre/ebooks/conversion/utils.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/src/calibre/ebooks/conversion/utils.py b/src/calibre/ebooks/conversion/utils.py index 9177b5e53b..cfa57a28c3 100644 --- a/src/calibre/ebooks/conversion/utils.py +++ b/src/calibre/ebooks/conversion/utils.py @@ -151,11 +151,11 @@ class PreProcessor(object): n_lookahead_open = "\s+(?!" n_lookahead_close = ")" - default_title = r"(<[ibu][^>]*>)?\s{0,3}([\w\'\"-]+\s{0,3}){1,5}?(]*>)?(?=<)" + default_title = r"(<[ibu][^>]*>)?\s{0,3}([\w\:\'\"-]+\s{0,3}){1,5}?(]*>)?(?=<)" chapter_types = [ [r"[^'\"]?(Introduction|Synopsis|Acknowledgements|Chapter|Kapitel|Epilogue|Volume\s|Prologue|Book\s|Part\s|Dedication|Preface)\s*([\d\w-]+\:?\'?\s*){0,5}", True, "Searching for common Chapter Headings"], - [r"([A-Z]\s+){3,}\s*([\d\w-]+\s*){0,3}\s*", True, "Searching for letter spaced headings"], # Spaced Lettering + [r"([A-Z-]\s+){3,}\s*([\d\w-]+\s*){0,3}\s*", True, "Searching for letter spaced headings"], # Spaced Lettering [r"]*>\s*(]*>)?\s*(?!([*#•]+\s*)+)(\s*(?=[\d.\w#\-*\s]+<)([\d.\w#-*]+\s*){1,5}\s*)(?!\.)()?\s*", True, "Searching for emphasized lines"], # Emphasized lines [r"[^'\"]?(\d+(\.|:)|CHAPTER)\s*([\dA-Z\-\'\"#,]+\s*){0,7}\s*", True, "Searching for numeric chapter headings"], # Numeric Chapters [r"[^'\"]?(\d+\.?\s+([\d\w-]+\:?\'?-?\s?){0,5})\s*", True, "Searching for numeric chapters with titles"], # Numeric Titles From 60b5d3853fc41cd17111287bfa2fff9b6bcab096 Mon Sep 17 00:00:00 2001 From: Hiroshi Miura Date: Fri, 14 Jan 2011 07:24:07 +0900 Subject: [PATCH 003/118] fix nikkei_sub economy --- resources/recipes/nikkei_sub_economy.recipe | 3 +++ 1 file changed, 3 insertions(+) diff --git a/resources/recipes/nikkei_sub_economy.recipe b/resources/recipes/nikkei_sub_economy.recipe index 2dd8f1add8..8e7a68dfe7 100644 --- a/resources/recipes/nikkei_sub_economy.recipe +++ b/resources/recipes/nikkei_sub_economy.recipe @@ -27,6 +27,9 @@ class NikkeiNet_sub_economy(BasicNewsRecipe): {'class':"JSID_basePageMove JSID_baseAsyncSubmit cmn-form_area JSID_optForm_utoken"}, {'class':"cmn-article_keyword cmn-clearfix"}, {'class':"cmn-print_headline cmn-clearfix"}, + {'class':"cmn-article_list"}, + dict(id="ABOUT-NIKKEI"), + {'class':"cmn-sub_market"}, ] remove_tags_after = {'class':"cmn-pr_list"} From b0a9c9659cda37d6cda41b22cd765713fb29f308 Mon Sep 17 00:00:00 2001 From: John Schember Date: Thu, 13 Jan 2011 19:58:09 -0500 Subject: [PATCH 004/118] Add heuristic options. Remove options that they replace. --- src/calibre/ebooks/conversion/cli.py | 23 ++++- src/calibre/ebooks/conversion/plumber.py | 117 ++++++++++++++--------- 2 files changed, 92 insertions(+), 48 deletions(-) diff --git a/src/calibre/ebooks/conversion/cli.py b/src/calibre/ebooks/conversion/cli.py index 3178fe1b43..f825776c9c 100644 --- a/src/calibre/ebooks/conversion/cli.py +++ b/src/calibre/ebooks/conversion/cli.py @@ -126,8 +126,21 @@ def add_pipeline_options(parser, plumber): 'margin_top', 'margin_left', 'margin_right', 'margin_bottom', 'change_justification', 'insert_blank_line', 'remove_paragraph_spacing','remove_paragraph_spacing_indent_size', - 'asciiize', 'remove_header', 'header_regex', - 'remove_footer', 'footer_regex', + 'asciiize', + ] + ), + + 'HEURISTICS' : ( + _('Modify the document text and strucutre using common patterns.'), + [ + 'enable_heuristics', 'markup_chapter_headings', + 'italicize_common_cases', 'fix_indents', + 'html_unwrap_factor', 'unwrap_lines', + 'delete_blank_paragraphs', 'format_scene_breaks', + 'dehyphenate', + 'sr1_search', 'sr1_replace', + 'sr2_search', 'sr2_replace', + 'sr3_search', 'sr3_replace', ] ), @@ -137,7 +150,6 @@ def add_pipeline_options(parser, plumber): 'chapter', 'chapter_mark', 'prefer_metadata_cover', 'remove_first_image', 'insert_metadata', 'page_breaks_before', - 'preprocess_html', 'html_unwrap_factor', ] ), @@ -164,8 +176,9 @@ def add_pipeline_options(parser, plumber): } - group_order = ['', 'LOOK AND FEEL', 'STRUCTURE DETECTION', - 'TABLE OF CONTENTS', 'METADATA', 'DEBUG'] + group_order = ['', 'LOOK AND FEEL', 'HEURISTICS', + 'STRUCTURE DETECTION', 'TABLE OF CONTENTS', + 'METADATA', 'DEBUG'] for group in group_order: desc, options = groups[group] diff --git a/src/calibre/ebooks/conversion/plumber.py b/src/calibre/ebooks/conversion/plumber.py index 9b22fb46ec..3ec4e104f9 100644 --- a/src/calibre/ebooks/conversion/plumber.py +++ b/src/calibre/ebooks/conversion/plumber.py @@ -376,23 +376,6 @@ OptionRecommendation(name='insert_metadata', ) ), -OptionRecommendation(name='preprocess_html', - recommended_value=False, level=OptionRecommendation.LOW, - help=_('Attempt to detect and correct hard line breaks and other ' - 'problems in the source file. This may make things worse, so use ' - 'with care.' - ) - ), - -OptionRecommendation(name='html_unwrap_factor', - recommended_value=0.40, level=OptionRecommendation.LOW, - help=_('Scale used to determine the length at which a line should ' - 'be unwrapped if preprocess is enabled. Valid values are a decimal between 0 and 1. The ' - 'default is 0.40, just below the median line length. This will unwrap typical books ' - ' with hard line breaks, but should be reduced if the line length is variable.' - ) - ), - OptionRecommendation(name='smarten_punctuation', recommended_value=False, level=OptionRecommendation.LOW, help=_('Convert plain quotes, dashes and ellipsis to their ' @@ -401,32 +384,6 @@ OptionRecommendation(name='smarten_punctuation', ) ), -OptionRecommendation(name='remove_header', - recommended_value=False, level=OptionRecommendation.LOW, - help=_('Use a regular expression to try and remove the header.' - ) - ), - -OptionRecommendation(name='header_regex', - recommended_value='(?i)(?<=
)((\s*(()*
\s*)?\d+
\s*.*?\s*)|(\s*(()*
\s*)?.*?
\s*\d+))(?=
)', - level=OptionRecommendation.LOW, - help=_('The regular expression to use to remove the header.' - ) - ), - -OptionRecommendation(name='remove_footer', - recommended_value=False, level=OptionRecommendation.LOW, - help=_('Use a regular expression to try and remove the footer.' - ) - ), - -OptionRecommendation(name='footer_regex', - recommended_value='(?i)(?<=
)((\s*(()*
\s*)?\d+
\s*.*?\s*)|(\s*(()*
\s*)?.*?
\s*\d+))(?=
)', - level=OptionRecommendation.LOW, - help=_('The regular expression to use to remove the footer.' - ) - ), - OptionRecommendation(name='read_metadata_from_opf', recommended_value=None, level=OptionRecommendation.LOW, short_switch='m', @@ -526,7 +483,81 @@ OptionRecommendation(name='pubdate', OptionRecommendation(name='timestamp', recommended_value=None, level=OptionRecommendation.LOW, help=_('Set the book timestamp (used by the date column in calibre).')), + +OptionRecommendation(name='enable_heuristics', + recommended_value=False, level=OptionRecommendation.LOW, + help=_('Enable heurisic processing. This option must be set for any ' + 'heuristic processing to take place.')), +OptionRecommendation(name='markup_chapter_headings', + recommended_value=False, level=OptionRecommendation.LOW, + help=_('Detect chapter headings and sub headings. Change ' + 'them to h1 and h2 tags.')), + +OptionRecommendation(name='italicize_common_cases', + recommended_value=False, level=OptionRecommendation.LOW, + help=_('Look for common words and patterns that denote ' + 'italics and italicize them.')), + +OptionRecommendation(name='fix_indents', + recommended_value=True, level=OptionRecommendation.LOW, + help=_('Turn indentation created from multiple   entities ' + 'into CSS indents.')), + +OptionRecommendation(name='html_unwrap_factor', + recommended_value=0.40, level=OptionRecommendation.LOW, + help=_('Scale used to determine the length at which a line should ' + 'be unwrapped. Valid values are a decimal between 0 and 1. The ' + 'default is 0.4, just below the median line length.')), + +OptionRecommendation(name='unwrap_lines', + recommended_value=False, level=OptionRecommendation.LOW, + help=_('Unwrap lines.')), + +OptionRecommendation(name='delete_blank_paragraphs', + recommended_value=True, level=OptionRecommendation.LOW, + help=_('Remove empyt paragraphs from the document')), + +OptionRecommendation(name='format_scene_breaks', + recommended_value=False, level=OptionRecommendation.LOW, + help=_('Replace soft scene breaks that use multiple blank lines ' + 'with horizontal rules.')), + +OptionRecommendation(name='dehyphenate', + recommended_value=True, level=OptionRecommendation.LOW, + help=_('Combine words that are separated by a hyphen. ' + 'This is for cases where a word is hyphenated across ' + 'two lines to denote the characters from a single word.')), + +OptionRecommendation(name='sr1_search', + recommended_value='', level=OptionRecommendation.LOW, + help=_('Search pattern (regular expression) to be replaced with ' + 'sr1-replace.')), + +OptionRecommendation(name='sr1_replace', + recommended_value='', level=OptionRecommendation.LOW, + help=_('Replace characters (can be lambda expression) to ' + 'replace the text found with sr1-search.')), + +OptionRecommendation(name='sr2_search', + recommended_value='', level=OptionRecommendation.LOW, + help=_('Search pattern (regular expression) to be replaced with ' + 'sr2-replace.')), + +OptionRecommendation(name='sr2_replace', + recommended_value='', level=OptionRecommendation.LOW, + help=_('Replace characters (can be lambda expression) to ' + 'replace the text found with sr2-search.')), + +OptionRecommendation(name='sr3_search', + recommended_value='', level=OptionRecommendation.LOW, + help=_('Search pattern (regular expression) to be replaced with ' + 'sr3-replace.')), + +OptionRecommendation(name='sr3_replace', + recommended_value='', level=OptionRecommendation.LOW, + help=_('Replace characters (can be lambda expression) to ' + 'replace the text found with sr3-search.')), ] # }}} From 8676ddd30fba0df90eb62e7c1c84c3fd3dc13f39 Mon Sep 17 00:00:00 2001 From: ldolse Date: Fri, 14 Jan 2011 18:12:17 +0800 Subject: [PATCH 005/118] updated heuristics help messages --- src/calibre/ebooks/conversion/plumber.py | 24 ++++++++++++++---------- 1 file changed, 14 insertions(+), 10 deletions(-) diff --git a/src/calibre/ebooks/conversion/plumber.py b/src/calibre/ebooks/conversion/plumber.py index 3ec4e104f9..50d0646c7d 100644 --- a/src/calibre/ebooks/conversion/plumber.py +++ b/src/calibre/ebooks/conversion/plumber.py @@ -491,8 +491,8 @@ OptionRecommendation(name='enable_heuristics', OptionRecommendation(name='markup_chapter_headings', recommended_value=False, level=OptionRecommendation.LOW, - help=_('Detect chapter headings and sub headings. Change ' - 'them to h1 and h2 tags.')), + help=_('Detect unformatted chapter headings and sub headings. Change ' + 'them to h2 and h3 tags.')), OptionRecommendation(name='italicize_common_cases', recommended_value=False, level=OptionRecommendation.LOW, @@ -508,26 +508,30 @@ OptionRecommendation(name='html_unwrap_factor', recommended_value=0.40, level=OptionRecommendation.LOW, help=_('Scale used to determine the length at which a line should ' 'be unwrapped. Valid values are a decimal between 0 and 1. The ' - 'default is 0.4, just below the median line length.')), + 'default is 0.4, just below the median line length. If only a ' + 'few lines in the document require unwrapping this value should ' + 'be reduced')), OptionRecommendation(name='unwrap_lines', recommended_value=False, level=OptionRecommendation.LOW, - help=_('Unwrap lines.')), + help=_('Unwrap lines using punctuation and other formatting clues.')), OptionRecommendation(name='delete_blank_paragraphs', recommended_value=True, level=OptionRecommendation.LOW, - help=_('Remove empyt paragraphs from the document')), + help=_('Remove empty paragraphs from the document when they exist between ' + 'every other paragraph')), OptionRecommendation(name='format_scene_breaks', recommended_value=False, level=OptionRecommendation.LOW, - help=_('Replace soft scene breaks that use multiple blank lines ' - 'with horizontal rules.')), + help=_('Detects left aligned scene break markers and center aligns them. ' + 'Replace soft scene breaks that use multiple blank lines with' + 'horizontal rules.')), OptionRecommendation(name='dehyphenate', recommended_value=True, level=OptionRecommendation.LOW, - help=_('Combine words that are separated by a hyphen. ' - 'This is for cases where a word is hyphenated across ' - 'two lines to denote the characters from a single word.')), + help=_('Analyses hyphenated words throughout the document. The ' + 'document itself is used as a dictionary to determine whether hyphens ' + 'should be retained or removed.')), OptionRecommendation(name='sr1_search', recommended_value='', level=OptionRecommendation.LOW, From 90c978bb1076e7afb7843df96959cc365d17332d Mon Sep 17 00:00:00 2001 From: ldolse Date: Fri, 14 Jan 2011 21:33:47 +0800 Subject: [PATCH 006/118] tied enable heuristics to preprocess, moved various pieces to functions --- src/calibre/customize/conversion.py | 2 +- src/calibre/ebooks/conversion/plumber.py | 4 +- src/calibre/ebooks/conversion/utils.py | 117 ++++++++++++++--------- src/calibre/ebooks/lit/input.py | 2 +- 4 files changed, 78 insertions(+), 47 deletions(-) diff --git a/src/calibre/customize/conversion.py b/src/calibre/customize/conversion.py index ec83600a49..a9e573ffa0 100644 --- a/src/calibre/customize/conversion.py +++ b/src/calibre/customize/conversion.py @@ -160,7 +160,7 @@ class InputFormatPlugin(Plugin): ''' raise NotImplementedError() - def preprocess_html(self, opts, html): + def heuristics(self, opts, html): ''' This method is called by the conversion pipeline on all HTML before it is parsed. It is meant to be used to do any required preprocessing on diff --git a/src/calibre/ebooks/conversion/plumber.py b/src/calibre/ebooks/conversion/plumber.py index 50d0646c7d..a40c17a743 100644 --- a/src/calibre/ebooks/conversion/plumber.py +++ b/src/calibre/ebooks/conversion/plumber.py @@ -1012,8 +1012,8 @@ def create_oebbook(log, path_or_stream, opts, input_plugin, reader=None, Create an OEBBook. ''' from calibre.ebooks.oeb.base import OEBBook - html_preprocessor = HTMLPreProcessor(input_plugin.preprocess_html, - opts.preprocess_html, opts) + html_preprocessor = HTMLPreProcessor(input_plugin.heuristics, + opts.enable_heuristics, opts) if not encoding: encoding = None oeb = OEBBook(log, html_preprocessor, diff --git a/src/calibre/ebooks/conversion/utils.py b/src/calibre/ebooks/conversion/utils.py index dac93fa2e2..44d4235b6c 100644 --- a/src/calibre/ebooks/conversion/utils.py +++ b/src/calibre/ebooks/conversion/utils.py @@ -113,6 +113,11 @@ class PreProcessor(object): return wordcount.words def markup_chapters(self, html, wordcount, blanks_between_paragraphs): + ''' + Searches for common chapter headings throughout the document + attempts multiple patterns based on likelihood of a match + with minimum false positives. Exits after finding a successful pattern + ''' # Typical chapters are between 2000 and 7000 words, use the larger number to decide the # minimum of chapters to search for self.min_chapters = 1 @@ -185,6 +190,10 @@ class PreProcessor(object): return html def punctuation_unwrap(self, length, content, format): + ''' + Unwraps lines based on line length and punctuation + supports range of potential html markup and text files + ''' # define the pieces of the regex lookahead = "(?<=.{"+str(length)+"}([a-zäëïöüàèìòùáćéíóńśúâêîôûçąężıãõñæøþðß,:)\IA\u00DF]|(?\s*()?" @@ -201,53 +210,38 @@ class PreProcessor(object): return content - def __call__(self, html): - self.log("********* Preprocessing HTML *********") + def text_process_pre(self, html): + pre = re.compile(r'
', re.IGNORECASE)
+        if len(pre.findall(html)) == 1:
+            self.log("Running Text Processing")
+            from calibre.ebooks.txt.processor import convert_basic, preserve_spaces, \
+            separate_paragraphs_single_line
+            outerhtml = re.compile(r'.*?(?<=
)(?P.*)(?=
).*', re.IGNORECASE|re.DOTALL) + html = outerhtml.sub('\g', html) + html = separate_paragraphs_single_line(html) + html = preserve_spaces(html) + html = convert_basic(html, epub_split_size_kb=0) + else: + # Add markup naively + # TODO - find out if there are cases where there are more than one
 tag or
+            # other types of unmarked html and handle them in some better fashion
+            add_markup = re.compile('(?)(\n)')
+            html = add_markup.sub('

\n

', html) + return html - # Count the words in the document to estimate how many chapters to look for and whether - # other types of processing are attempted - totalwords = 0 - totalwords = self.get_word_count(html) - - if totalwords < 50: - self.log("not enough text, not preprocessing") - return html - - # Arrange line feeds and

tags so the line_length and no_markup functions work correctly + def arrange_htm_line_endings(self, html): html = re.sub(r"\s*p|div)>", ""+">\n", html) html = re.sub(r"\s*<(?Pp|div)(?P