From ed2b94ac9d98be1ed3564c36071b62e6335ea60d Mon Sep 17 00:00:00 2001 From: John Schember Date: Sat, 5 Feb 2011 10:46:32 -0500 Subject: [PATCH 1/6] Heuristics: Tweak italicize patterns to make them more robust. --- src/calibre/ebooks/conversion/utils.py | 22 +++++++++++----------- 1 file changed, 11 insertions(+), 11 deletions(-) diff --git a/src/calibre/ebooks/conversion/utils.py b/src/calibre/ebooks/conversion/utils.py index 63eca10714..e8e2a82949 100644 --- a/src/calibre/ebooks/conversion/utils.py +++ b/src/calibre/ebooks/conversion/utils.py @@ -149,17 +149,17 @@ class HeuristicProcessor(object): ] ITALICIZE_STYLE_PATS = [ - r'(?msu)(?<=\s)_(?P<words>\S[^_]{0,40}?\S)?_(?=[\s\.,\!\?])', - r'(?msu)(?<=\s)/(?P<words>\S[^/]{0,40}?\S)?/(?=[\s\.,\!\?])', - r'(?msu)(?<=\s)~~(?P<words>\S[^~]{0,40}?\S)?~~(?=[\s\.,\!\?])', - r'(?msu)(?<=\s)\*(?P<words>\S[^\*]{0,40}?\S)?\*(?=[\s\.,\!\?])', - r'(?msu)(?<=\s)~(?P<words>\S[^~]{0,40}?\S)?~(?=[\s\.,\!\?])', - r'(?msu)(?<=\s)_/(?P<words>\S[^/_]{0,40}?\S)?/_(?=[\s\.,\!\?])', - r'(?msu)(?<=\s)_\*(?P<words>\S[^\*_]{0,40}?\S)?\*_(?=[\s\.,\!\?])', - r'(?msu)(?<=\s)\*/(?P<words>\S[^/\*]{0,40}?\S)?/\*(?=[\s\.,\!\?])', - r'(?msu)(?<=\s)_\*/(?P<words>\S[^\*_]{0,40}?\S)?/\*_(?=[\s\.,\!\?])', - r'(?msu)(?<=\s)/:(?P<words>\S[^:/]{0,40}?\S)?:/(?=[\s\.,\!\?])', - r'(?msu)(?<=\s)\|:(?P<words>\S[^:\|]{0,40}?\S)?:\|(?=[\s\.,\!\?])', + r'(?msu)(?<=[\s>"])_(?P<words>[^_]+)?_', + r'(?msu)(?<=[\s>"])/(?P<words>[^/]+)?/', + r'(?msu)(?<=[\s>"])~~(?P<words>[^~]+)?~~', + r'(?msu)(?<=[\s>"])\*(?P<words>[^\*]+)?\*', + r'(?msu)(?<=[\s>"])~(?P<words>[^~]+)?~', + r'(?msu)(?<=[\s>"])_/(?P<words>[^/_]+)?/_', + r'(?msu)(?<=[\s>"])_\*(?P<words>[^\*_]+)?\*_', + r'(?msu)(?<=[\s>"])\*/(?P<words>[^/\*]+)?/\*', + r'(?msu)(?<=[\s>"])_\*/(?P<words>[^\*_]+)?/\*_', + r'(?msu)(?<=[\s>"])/:(?P<words>[^:/]+)?:/', + r'(?msu)(?<=[\s>"])\|:(?P<words>[^:\|]+)?:\|', ] for word in ITALICIZE_WORDS: From deee20d8f85010059019acfdfd4d6c719711ec73 Mon Sep 17 00:00:00 2001 From: John Schember Date: Sat, 5 Feb 2011 11:02:00 -0500 Subject: [PATCH 2/6] TXT Output: Fix inline toc not showing all items. 
--- src/calibre/ebooks/txt/txtml.py | 6 ++++-- 1 file changed, 4 insertions(+), 2 deletions(-) diff --git a/src/calibre/ebooks/txt/txtml.py b/src/calibre/ebooks/txt/txtml.py index c2ee3f37c5..fa7bfbb380 100644 --- a/src/calibre/ebooks/txt/txtml.py +++ b/src/calibre/ebooks/txt/txtml.py @@ -55,6 +55,7 @@ class TXTMLizer(object): self.log.info('Converting XHTML to TXT...') self.oeb_book = oeb_book self.opts = opts + self.toc_titles = [] self.toc_ids = [] self.last_was_heading = False @@ -94,8 +95,8 @@ class TXTMLizer(object): if getattr(self.opts, 'inline_toc', None): self.log.debug('Generating table of contents...') toc.append(u'%s\n\n' % _(u'Table of Contents:')) - for item in self.oeb_book.toc: - toc.append(u'* %s\n\n' % item.title) + for item in self.toc_titles: + toc.append(u'* %s\n\n' % item) return ''.join(toc) def create_flat_toc(self, nodes): @@ -103,6 +104,7 @@ class TXTMLizer(object): Turns a hierarchical list of TOC href's into a flat list. ''' for item in nodes: + self.toc_titles.append(item.title) self.toc_ids.append(item.href) self.create_flat_toc(item.nodes) From 9cdad92468b25f289f5531be56be0ec0ee32e01d Mon Sep 17 00:00:00 2001 From: John Schember Date: Sat, 5 Feb 2011 12:51:47 -0500 Subject: [PATCH 3/6] TXT Input: Restructure to run dehyphenator when auto and heuristic formatting options are used. This causes textile and markdown to be dehyphenated. 
--- src/calibre/ebooks/txt/input.py | 47 ++++++++++++++++----------------- 1 file changed, 23 insertions(+), 24 deletions(-) diff --git a/src/calibre/ebooks/txt/input.py b/src/calibre/ebooks/txt/input.py index e1392ef732..85bd781ff8 100644 --- a/src/calibre/ebooks/txt/input.py +++ b/src/calibre/ebooks/txt/input.py @@ -77,20 +77,6 @@ class TXTInput(InputFormatPlugin): # Normalize line endings txt = normalize_line_endings(txt) - # Detect formatting - if options.formatting_type == 'auto': - options.formatting_type = detect_formatting_type(txt) - log.debug('Auto detected formatting as %s' % options.formatting_type) - - if options.formatting_type == 'heuristic': - setattr(options, 'enable_heuristics', True) - setattr(options, 'markup_chapter_headings', True) - setattr(options, 'italicize_common_cases', True) - setattr(options, 'fix_indents', True) - setattr(options, 'delete_blank_paragraphs', True) - setattr(options, 'format_scene_breaks', True) - setattr(options, 'dehyphenate', True) - # Determine the paragraph type of the document. if options.paragraph_type == 'auto': options.paragraph_type = detect_paragraph_type(txt) @@ -99,16 +85,27 @@ class TXTInput(InputFormatPlugin): options.paragraph_type = 'block' else: log.debug('Auto detected paragraph type as %s' % options.paragraph_type) + + dehyphenate = False + if options.formatting_type in ('auto', 'heuristic'): + # Set this here because we want it to run over all + # formatting types if auto is used. + dehyphenate = True + + # Detect formatting + if options.formatting_type == 'auto': + options.formatting_type = detect_formatting_type(txt) + log.debug('Auto detected formatting as %s' % options.formatting_type) + + if options.formatting_type == 'heuristic': + setattr(options, 'enable_heuristics', True) + setattr(options, 'unwrap_lines', False) # Preserve spaces will replace multiple spaces to a space # followed by the &nbsp; entity. 
if options.preserve_spaces: txt = preserve_spaces(txt) - # Get length for hyphen removal and punctuation unwrap - docanalysis = DocAnalysis('txt', txt) - length = docanalysis.line_length(.5) - # Reformat paragraphs to block formatting based on the detected type. # We don't check for block because the processor assumes block. # single and print at transformed to block for processing. @@ -119,9 +116,17 @@ class TXTInput(InputFormatPlugin): elif options.paragraph_type == 'unformatted': from calibre.ebooks.conversion.utils import HeuristicProcessor # unwrap lines based on punctuation + docanalysis = DocAnalysis('txt', txt) + length = docanalysis.line_length(.5) preprocessor = HeuristicProcessor(options, log=getattr(self, 'log', None)) txt = preprocessor.punctuation_unwrap(length, txt, 'txt') + if dehyphenate: + docanalysis = DocAnalysis('txt', txt) + length = docanalysis.line_length(.5) + dehyphenator = Dehyphenator(options.verbose, log=self.log) + txt = dehyphenator(txt,'txt', length) + # Process the text using the appropriate text processor. html = '' if options.formatting_type == 'markdown': @@ -134,14 +139,8 @@ class TXTInput(InputFormatPlugin): elif options.formatting_type == 'textile': log.debug('Running text through textile conversion...') html = convert_textile(txt) - else: log.debug('Running text through basic conversion...') - if options.formatting_type == 'heuristic': - # Dehyphenate - dehyphenator = Dehyphenator(options.verbose, log=self.log) - txt = dehyphenator(txt,'txt', length) - flow_size = getattr(options, 'flow_size', 0) html = convert_basic(txt, epub_split_size_kb=flow_size) From 2796960f420cf26ad621f137845e6db84bc3019d Mon Sep 17 00:00:00 2001 From: John Schember Date: Sat, 5 Feb 2011 13:04:32 -0500 Subject: [PATCH 4/6] Heuristics: Fix issue with invalid markup from italicize patterns. 
--- src/calibre/ebooks/conversion/utils.py | 22 +++++++++++----------- 1 file changed, 11 insertions(+), 11 deletions(-) diff --git a/src/calibre/ebooks/conversion/utils.py b/src/calibre/ebooks/conversion/utils.py index e8e2a82949..c0c2ee8978 100644 --- a/src/calibre/ebooks/conversion/utils.py +++ b/src/calibre/ebooks/conversion/utils.py @@ -149,17 +149,17 @@ class HeuristicProcessor(object): ] ITALICIZE_STYLE_PATS = [ - r'(?msu)(?<=[\s>"])_(?P<words>[^_]+)?_', - r'(?msu)(?<=[\s>"])/(?P<words>[^/]+)?/', - r'(?msu)(?<=[\s>"])~~(?P<words>[^~]+)?~~', - r'(?msu)(?<=[\s>"])\*(?P<words>[^\*]+)?\*', - r'(?msu)(?<=[\s>"])~(?P<words>[^~]+)?~', - r'(?msu)(?<=[\s>"])_/(?P<words>[^/_]+)?/_', - r'(?msu)(?<=[\s>"])_\*(?P<words>[^\*_]+)?\*_', - r'(?msu)(?<=[\s>"])\*/(?P<words>[^/\*]+)?/\*', - r'(?msu)(?<=[\s>"])_\*/(?P<words>[^\*_]+)?/\*_', - r'(?msu)(?<=[\s>"])/:(?P<words>[^:/]+)?:/', - r'(?msu)(?<=[\s>"])\|:(?P<words>[^:\|]+)?:\|', + r'(?msu)(?<=[\s>])_(?P<words>[^_]+)?_', + r'(?msu)(?<=[\s>])/(?P<words>[^/]+)?/', + r'(?msu)(?<=[\s>])~~(?P<words>[^~]+)?~~', + r'(?msu)(?<=[\s>])\*(?P<words>[^\*]+)?\*', + r'(?msu)(?<=[\s>])~(?P<words>[^~]+)?~', + r'(?msu)(?<=[\s>])_/(?P<words>[^/_]+)?/_', + r'(?msu)(?<=[\s>])_\*(?P<words>[^\*_]+)?\*_', + r'(?msu)(?<=[\s>])\*/(?P<words>[^/\*]+)?/\*', + r'(?msu)(?<=[\s>])_\*/(?P<words>[^\*_]+)?/\*_', + r'(?msu)(?<=[\s>])/:(?P<words>[^:/]+)?:/', + r'(?msu)(?<=[\s>])\|:(?P<words>[^:\|]+)?:\|', ] for word in ITALICIZE_WORDS: From 1f708746d0e6e62234f9a2a9a96cd5bcf73bfc94 Mon Sep 17 00:00:00 2001 From: John Schember Date: Sat, 5 Feb 2011 13:35:41 -0500 Subject: [PATCH 5/6] TXT Input: Fix bug where spaces were not retained properly. Fix bug where spaces were replaced with entities (this should only have happened at the beginning of lines). Add option to remove indents. 
--- src/calibre/ebooks/txt/input.py | 8 +++++++- src/calibre/ebooks/txt/processor.py | 12 ++++++++---- 2 files changed, 15 insertions(+), 5 deletions(-) diff --git a/src/calibre/ebooks/txt/input.py b/src/calibre/ebooks/txt/input.py index 85bd781ff8..b1374bbeec 100644 --- a/src/calibre/ebooks/txt/input.py +++ b/src/calibre/ebooks/txt/input.py @@ -12,7 +12,7 @@ from calibre.ebooks.chardet import detect from calibre.ebooks.txt.processor import convert_basic, convert_markdown, \ separate_paragraphs_single_line, separate_paragraphs_print_formatted, \ preserve_spaces, detect_paragraph_type, detect_formatting_type, \ - normalize_line_endings, convert_textile + normalize_line_endings, convert_textile, remove_indents from calibre import _ent_pat, xml_entity_to_unicode class TXTInput(InputFormatPlugin): @@ -47,6 +47,9 @@ class TXTInput(InputFormatPlugin): OptionRecommendation(name='preserve_spaces', recommended_value=False, help=_('Normally extra spaces are condensed into a single space. ' 'With this option all spaces will be displayed.')), + OptionRecommendation(name='txt_in_remove_indents', recommended_value=False, + help=_('Normally extra space at the beginning of lines is retained. ' + 'With this option they will be removed.')), OptionRecommendation(name="markdown_disable_toc", recommended_value=False, help=_('Do not insert a Table of Contents into the output text.')), ]) @@ -101,6 +104,9 @@ class TXTInput(InputFormatPlugin): setattr(options, 'enable_heuristics', True) setattr(options, 'unwrap_lines', False) + if options.txt_in_remove_indents: + txt = remove_indents(txt) + # Preserve spaces will replace multiple spaces to a space # followed by the &nbsp; entity. if options.preserve_spaces: diff --git a/src/calibre/ebooks/txt/processor.py b/src/calibre/ebooks/txt/processor.py index 546d3f1842..987d7cdc73 100644 --- a/src/calibre/ebooks/txt/processor.py +++ b/src/calibre/ebooks/txt/processor.py @@ -24,14 +24,14 @@ def clean_txt(txt): # all line breaks with \n. 
txt = '\n'.join([line.rstrip() for line in txt.splitlines()]) - # Replace whitespace at the beginning of the list with &nbsp; - txt = re.sub('(?m)(?P<space>[ ]+)', lambda mo: '&nbsp;' * mo.groups('space').count(' '), txt) - txt = re.sub('(?m)(?P<space>[\t]+)', lambda mo: '&nbsp;' * 4 * mo.groups('space').count('\t'), txt) + # Replace whitespace at the beginning of the line with &nbsp; + txt = re.sub('(?m)(?P<space>^[ ]+)(?=.)', lambda mo: '&nbsp;' * mo.groups('space').count(' '), txt) + txt = re.sub('(?m)(?P<space>^[\t]+)(?=.)', lambda mo: '&nbsp;' * 4 * mo.groups('space').count('\t'), txt) # Condense redundant spaces txt = re.sub('[ ]{2,}', ' ', txt) - # Remove blank lines from the beginning and end of the document. + # Remove blank space from the beginning and end of the document. txt = re.sub('^\s+(?=.)', '', txt) txt = re.sub('(?<=.)\s+$', '', txt) # Remove excessive line breaks. @@ -107,6 +107,10 @@ def preserve_spaces(txt): txt = txt.replace('\t', '&nbsp;&nbsp;&nbsp;&nbsp;') return txt +def remove_indents(txt): + txt = re.sub('(?miu)^\s+', '', txt) + return txt + def opf_writer(path, opf_name, manifest, spine, mi): opf = OPFCreator(path, mi) opf.create_manifest(manifest) From a0fd28d9660f56ed2a37abe7b185c53f65e1dff7 Mon Sep 17 00:00:00 2001 From: John Schember Date: Sat, 5 Feb 2011 13:46:15 -0500 Subject: [PATCH 6/6] TXT Input GUI: Add remove indents option. Restructure options to make them grouped cleaner. 
--- src/calibre/gui2/convert/txt_input.py | 3 +- src/calibre/gui2/convert/txt_input.ui | 127 ++++++++++++++++++-------- 2 files changed, 91 insertions(+), 39 deletions(-) diff --git a/src/calibre/gui2/convert/txt_input.py b/src/calibre/gui2/convert/txt_input.py index 62672cc0f9..acdf5f43c0 100644 --- a/src/calibre/gui2/convert/txt_input.py +++ b/src/calibre/gui2/convert/txt_input.py @@ -16,7 +16,8 @@ class PluginWidget(Widget, Ui_Form): def __init__(self, parent, get_option, get_help, db=None, book_id=None): Widget.__init__(self, parent, - ['paragraph_type', 'formatting_type', 'markdown_disable_toc', 'preserve_spaces']) + ['paragraph_type', 'formatting_type', 'markdown_disable_toc', + 'preserve_spaces', 'txt_in_remove_indents']) self.db, self.book_id = db, book_id for x in get_option('paragraph_type').option.choices: self.opt_paragraph_type.addItem(x) diff --git a/src/calibre/gui2/convert/txt_input.ui b/src/calibre/gui2/convert/txt_input.ui index 6cbd68135f..211b03294a 100644 --- a/src/calibre/gui2/convert/txt_input.ui +++ b/src/calibre/gui2/convert/txt_input.ui @@ -7,57 +7,95 @@ 0 0 518 - 300 + 353 Form - - - - - Paragraph style: + + + + + Structure + + + + + + 0 + 0 + + + + Paragraph style: + + + + + + + + 0 + 0 + + + + + + + + + 0 + 0 + + + + Formatting style: + + + + + + + + 0 + 0 + + + + + - - - - - - - Preserve &spaces + + + + Common + + + + + Preserve &spaces + + + + + + + Remove indents at the beginning of lines + + + + - - - - Qt::Vertical - - - - 20 - 213 - - - - - - - - - - - Formatting style: - - - - + - Markdown Options + Markdown @@ -83,6 +121,19 @@ + + + + Qt::Vertical + + + + 20 + 213 + + + +