From 06cbaca2e6160b83467c16c5737462a4c312816a Mon Sep 17 00:00:00 2001 From: ldolse Date: Wed, 19 Jan 2011 22:21:11 +0800 Subject: [PATCH 1/5] start at enabling some heuristics options by default --- src/calibre/ebooks/conversion/cli.py | 1 + src/calibre/ebooks/conversion/plumber.py | 10 +++++----- 2 files changed, 6 insertions(+), 5 deletions(-) diff --git a/src/calibre/ebooks/conversion/cli.py b/src/calibre/ebooks/conversion/cli.py index b5c057b0f9..8cd4f124d5 100644 --- a/src/calibre/ebooks/conversion/cli.py +++ b/src/calibre/ebooks/conversion/cli.py @@ -69,6 +69,7 @@ def option_recommendation_to_cli_option(add_option, rec): opt = rec.option switches = ['-'+opt.short_switch] if opt.short_switch else [] switches.append('--'+opt.long_switch) + flip_switches = ['italicize_common_cases', 'markup_chapter_headings', 'unwrap_lines', 'dehyphenate', 'fix_indents'] attrs = dict(dest=opt.name, help=opt.help, choices=opt.choices, default=rec.recommended_value) if isinstance(rec.recommended_value, type(True)): diff --git a/src/calibre/ebooks/conversion/plumber.py b/src/calibre/ebooks/conversion/plumber.py index 04ee892c19..7dd977cd7b 100644 --- a/src/calibre/ebooks/conversion/plumber.py +++ b/src/calibre/ebooks/conversion/plumber.py @@ -490,19 +490,19 @@ OptionRecommendation(name='enable_heuristics', 'heuristic processing to take place.')), OptionRecommendation(name='markup_chapter_headings', - recommended_value=False, level=OptionRecommendation.LOW, + recommended_value=True, level=OptionRecommendation.LOW, help=_('Detect unformatted chapter headings and sub headings. Change ' 'them to h2 and h3 tags. 
This setting will not create a TOC, ' 'but can be used in conjunction with structure detection to create ' 'one.')), OptionRecommendation(name='italicize_common_cases', - recommended_value=False, level=OptionRecommendation.LOW, + recommended_value=True, level=OptionRecommendation.LOW, help=_('Look for common words and patterns that denote ' 'italics and italicize them.')), OptionRecommendation(name='fix_indents', - recommended_value=False, level=OptionRecommendation.LOW, + recommended_value=True, level=OptionRecommendation.LOW, help=_('Turn indentation created from multiple non-breaking space entities ' 'into CSS indents.')), @@ -515,7 +515,7 @@ OptionRecommendation(name='html_unwrap_factor', 'be reduced')), OptionRecommendation(name='unwrap_lines', - recommended_value=False, level=OptionRecommendation.LOW, + recommended_value=True, level=OptionRecommendation.LOW, help=_('Unwrap lines using punctuation and other formatting clues.')), OptionRecommendation(name='delete_blank_paragraphs', @@ -530,7 +530,7 @@ OptionRecommendation(name='format_scene_breaks', 'horizontal rules.')), OptionRecommendation(name='dehyphenate', - recommended_value=False, level=OptionRecommendation.LOW, + recommended_value=True, level=OptionRecommendation.LOW, help=_('Analyze hyphenated words throughout the document. The ' 'document itself is used as a dictionary to determine whether hyphens ' 'should be retained or removed.')), From 27b83959356754d8c0516d9d1da174acb3b0cf2f Mon Sep 17 00:00:00 2001 From: ldolse Date: Wed, 19 Jan 2011 23:09:18 +0800 Subject: [PATCH 2/5] document updates --- src/calibre/manual/conversion.rst | 30 ++++++++++++++++-------------- 1 file changed, 16 insertions(+), 14 deletions(-) diff --git a/src/calibre/manual/conversion.rst b/src/calibre/manual/conversion.rst index de27a5f5bb..6ec986f26a 100644 --- a/src/calibre/manual/conversion.rst +++ b/src/calibre/manual/conversion.rst @@ -260,11 +260,11 @@ The Output profile also controls the screen size. 
This will cause, for example, Heuristic Processing --------------------- -Heuristic Processing provides a variety of functions which can be used that try to detect and correct +Heuristic Processing provides a variety of functions which can be used to try and detect and correct common problems in poorly formatted input documents. Use these functions if your input document suffers -from bad formatting. Because these functions rely on common patterns, be aware that in some cases an +from poor formatting. Because these functions rely on common patterns, be aware that in some cases an option may lead to worse results, so use with care. As an example, several of these options will -remove all non-breaking-space entities. +remove all non-breaking-space entities, or may include false positive matches relating to the function. :guilabel:`Enable heuristic processing` This option activates |app|'s Heuristic Processing stage of the conversion pipeline. @@ -283,7 +283,7 @@ remove all non-breaking-space entities. correction, then this value should be reduced to somewhere between 0.1 and 0.2. :guilabel:`Detect and markup unformatted chapter headings and sub headings` - If your document does not have Chapter Markers and titles formatted differently from the rest of the text, + If your document does not have chapter headings and titles formatted differently from the rest of the text, |app| can use this option to attempt detection them and surround them with heading tags.

<h2> tags are used for chapter headings;

<h3> tags are used for any titles that are detected. @@ -331,21 +331,23 @@ remove all non-breaking-space entities. Some documents use a convention of defining text indents using non-breaking space entities. When this option is enabled |app| will attempt to detect this sort of formatting and convert them to a 3% text indent using css. -.. search-replace: +.. _search-replace: Search & Replace --------------------- -These options are useful primarily for conversion of PDF documents. Often, the conversion leaves -behind page headers and footers in the text. These options use regular expressions to try and detect -the headers and footers and remove them. Remember that they operate on the intermediate XHTML produced -by the conversion pipeline. There is also a wizard to help you customize the regular expressions for -your document. These options can also be used for generic search and replace of any content by additionally -specifying a replacement expression. +These options are useful primarily for conversion of PDF documents or OCR conversions, though they can +also be used to fix many document-specific problems. As an example, some conversions can leave behind page +headers and footers in the text. These options use regular expressions to try and detect headers, footers, +or other arbitrary text and remove or replace them. Remember that they operate on the intermediate XHTML produced +by the conversion pipeline. There is a wizard to help you customize the regular expressions for +your document. Click the magic wand beside the expression box, and click the 'Test' button after composing +your search expression. Successful matches will be highlighted in yellow. -The search works by using a python regular expression. All matched text is simply removed from -the document or replaced using the replacement pattern. You can learn more about regular expressions and -their syntax at :ref:`regexptutorial`. +The search works by using a python regular expression. 
All matched text is simply removed from +the document or replaced using the replacement pattern. The replacement pattern is optional, if left blank +then text matching the search pattern will be deleted from the document. You can learn more about regular expressions +and their syntax at :ref:`regexptutorial`. .. _structure-detection: From 260484f5152e0892a393be74dff1f2cdee891d36 Mon Sep 17 00:00:00 2001 From: ldolse Date: Thu, 20 Jan 2011 10:06:28 +0800 Subject: [PATCH 3/5] reduce false positives in dehyphenate --- src/calibre/ebooks/conversion/preprocess.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/calibre/ebooks/conversion/preprocess.py b/src/calibre/ebooks/conversion/preprocess.py index f728bec52b..5fceeb7aed 100644 --- a/src/calibre/ebooks/conversion/preprocess.py +++ b/src/calibre/ebooks/conversion/preprocess.py @@ -201,7 +201,7 @@ class Dehyphenator(object): lookupword = self.removesuffixes.sub('', dehyphenated) else: lookupword = dehyphenated - if len(firsthalf) > 3 and self.prefixes.match(firsthalf) is None: + if len(firsthalf) > 4 and self.prefixes.match(firsthalf) is None: lookupword = self.removeprefix.sub('', lookupword) if self.verbose > 2: self.log("lookup word is: "+str(lookupword)+", orig is: " + str(hyphenated)) From 3a063ee644de603d3e3c8c327c58b1f95abfc416 Mon Sep 17 00:00:00 2001 From: ldolse Date: Thu, 20 Jan 2011 12:10:34 +0800 Subject: [PATCH 4/5] updated text input heuristics option to preserve text indents, fixed false positive case in dehyphenate --- src/calibre/ebooks/txt/input.py | 51 +++++++++++++++++---------------- 1 file changed, 27 insertions(+), 24 deletions(-) diff --git a/src/calibre/ebooks/txt/input.py b/src/calibre/ebooks/txt/input.py index 5b99b19e74..6ec1edb65c 100644 --- a/src/calibre/ebooks/txt/input.py +++ b/src/calibre/ebooks/txt/input.py @@ -71,21 +71,41 @@ class TXTInput(InputFormatPlugin): txt = txt.decode(ienc, 'replace') txt = _ent_pat.sub(xml_entity_to_unicode, txt) + + # Normalize line 
endings + txt = normalize_line_endings(txt) + + if options.formatting_type == 'auto': + options.formatting_type = detect_formatting_type(txt) + + if options.formatting_type == 'heuristic': + setattr(options, 'enable_heuristics', True) + setattr(options, 'markup_chapter_headings', True) + setattr(options, 'italicize_common_cases', True) + setattr(options, 'fix_indents', True) + setattr(options, 'preserve_spaces', True) + setattr(options, 'delete_blank_paragraphs', True) + setattr(options, 'format_scene_breaks', True) + setattr(options, 'dehyphenate', True) + + # Determine the paragraph type of the document. + if options.paragraph_type == 'auto': + options.paragraph_type = detect_paragraph_type(txt) + if options.paragraph_type == 'unknown': + log.debug('Could not reliably determine paragraph type using block') + options.paragraph_type = 'block' + else: + log.debug('Auto detected paragraph type as %s' % options.paragraph_type) + # Preserve spaces will replace multiple spaces to a space # followed by the   entity. if options.preserve_spaces: txt = preserve_spaces(txt) - # Normalize line endings - txt = normalize_line_endings(txt) - # Get length for hyphen removal and punctuation unwrap docanalysis = DocAnalysis('txt', txt) length = docanalysis.line_length(.5) - if options.formatting_type == 'auto': - options.formatting_type = detect_formatting_type(txt) - if options.formatting_type == 'markdown': log.debug('Running text though markdown conversion...') try: @@ -96,16 +116,8 @@ class TXTInput(InputFormatPlugin): elif options.formatting_type == 'textile': log.debug('Running text though textile conversion...') html = convert_textile(txt) - else: - # Determine the paragraph type of the document. 
- if options.paragraph_type == 'auto': - options.paragraph_type = detect_paragraph_type(txt) - if options.paragraph_type == 'unknown': - log.debug('Could not reliably determine paragraph type using block') - options.paragraph_type = 'block' - else: - log.debug('Auto detected paragraph type as %s' % options.paragraph_type) + else: # Dehyphenate dehyphenator = Dehyphenator(options.verbose, log=self.log) txt = dehyphenator(txt,'txt', length) @@ -129,15 +141,6 @@ class TXTInput(InputFormatPlugin): flow_size = getattr(options, 'flow_size', 0) html = convert_basic(txt, epub_split_size_kb=flow_size) - if options.formatting_type == 'heuristic': - setattr(options, 'enable_heuristics', True) - setattr(options, 'markup_chapter_headings', True) - setattr(options, 'italicize_common_cases', True) - setattr(options, 'fix_indents', True) - setattr(options, 'delete_blank_paragraphs', True) - setattr(options, 'format_scene_breaks', True) - setattr(options, 'dehyphenate', True) - from calibre.customize.ui import plugin_for_input_format html_input = plugin_for_input_format('html') for opt in html_input.options: From 2f5fa30b8680ef69608552aa38190beb71cfaa35 Mon Sep 17 00:00:00 2001 From: ldolse Date: Thu, 20 Jan 2011 12:17:25 +0800 Subject: [PATCH 5/5] ... --- src/calibre/ebooks/conversion/cli.py | 1 - 1 file changed, 1 deletion(-) diff --git a/src/calibre/ebooks/conversion/cli.py b/src/calibre/ebooks/conversion/cli.py index a95bb23126..33ae61f16a 100644 --- a/src/calibre/ebooks/conversion/cli.py +++ b/src/calibre/ebooks/conversion/cli.py @@ -75,7 +75,6 @@ def option_recommendation_to_cli_option(add_option, rec): opt = rec.option switches = ['-'+opt.short_switch] if opt.short_switch else [] switches.append('--'+opt.long_switch) - flip_switches = ['italicize_common_cases', 'markup_chapter_headings', 'unwrap_lines', 'dehyphenate', 'fix_indents'] attrs = dict(dest=opt.name, help=opt.help, choices=opt.choices, default=rec.recommended_value) if isinstance(rec.recommended_value, type(True)):