diff --git a/src/calibre/ebooks/conversion/utils.py b/src/calibre/ebooks/conversion/utils.py index 63eca10714..c0c2ee8978 100644 --- a/src/calibre/ebooks/conversion/utils.py +++ b/src/calibre/ebooks/conversion/utils.py @@ -149,17 +149,17 @@ class HeuristicProcessor(object): ] ITALICIZE_STYLE_PATS = [ - r'(?msu)(?<=\s)_(?P\S[^_]{0,40}?\S)?_(?=[\s\.,\!\?])', - r'(?msu)(?<=\s)/(?P\S[^/]{0,40}?\S)?/(?=[\s\.,\!\?])', - r'(?msu)(?<=\s)~~(?P\S[^~]{0,40}?\S)?~~(?=[\s\.,\!\?])', - r'(?msu)(?<=\s)\*(?P\S[^\*]{0,40}?\S)?\*(?=[\s\.,\!\?])', - r'(?msu)(?<=\s)~(?P\S[^~]{0,40}?\S)?~(?=[\s\.,\!\?])', - r'(?msu)(?<=\s)_/(?P\S[^/_]{0,40}?\S)?/_(?=[\s\.,\!\?])', - r'(?msu)(?<=\s)_\*(?P\S[^\*_]{0,40}?\S)?\*_(?=[\s\.,\!\?])', - r'(?msu)(?<=\s)\*/(?P\S[^/\*]{0,40}?\S)?/\*(?=[\s\.,\!\?])', - r'(?msu)(?<=\s)_\*/(?P\S[^\*_]{0,40}?\S)?/\*_(?=[\s\.,\!\?])', - r'(?msu)(?<=\s)/:(?P\S[^:/]{0,40}?\S)?:/(?=[\s\.,\!\?])', - r'(?msu)(?<=\s)\|:(?P\S[^:\|]{0,40}?\S)?:\|(?=[\s\.,\!\?])', + r'(?msu)(?<=[\s>])_(?P[^_]+)?_', + r'(?msu)(?<=[\s>])/(?P[^/]+)?/', + r'(?msu)(?<=[\s>])~~(?P[^~]+)?~~', + r'(?msu)(?<=[\s>])\*(?P[^\*]+)?\*', + r'(?msu)(?<=[\s>])~(?P[^~]+)?~', + r'(?msu)(?<=[\s>])_/(?P[^/_]+)?/_', + r'(?msu)(?<=[\s>])_\*(?P[^\*_]+)?\*_', + r'(?msu)(?<=[\s>])\*/(?P[^/\*]+)?/\*', + r'(?msu)(?<=[\s>])_\*/(?P[^\*_]+)?/\*_', + r'(?msu)(?<=[\s>])/:(?P[^:/]+)?:/', + r'(?msu)(?<=[\s>])\|:(?P[^:\|]+)?:\|', ] for word in ITALICIZE_WORDS: diff --git a/src/calibre/ebooks/txt/input.py b/src/calibre/ebooks/txt/input.py index e1392ef732..b1374bbeec 100644 --- a/src/calibre/ebooks/txt/input.py +++ b/src/calibre/ebooks/txt/input.py @@ -12,7 +12,7 @@ from calibre.ebooks.chardet import detect from calibre.ebooks.txt.processor import convert_basic, convert_markdown, \ separate_paragraphs_single_line, separate_paragraphs_print_formatted, \ preserve_spaces, detect_paragraph_type, detect_formatting_type, \ - normalize_line_endings, convert_textile + normalize_line_endings, convert_textile, remove_indents from calibre import _ent_pat, xml_entity_to_unicode class TXTInput(InputFormatPlugin): @@ -47,6 +47,9 @@ class TXTInput(InputFormatPlugin): OptionRecommendation(name='preserve_spaces', recommended_value=False, help=_('Normally extra spaces are condensed into a single space. ' 'With this option all spaces will be displayed.')), + OptionRecommendation(name='txt_in_remove_indents', recommended_value=False, + help=_('Normally extra space at the beginning of lines is retained. ' + 'With this option they will be removed.')), OptionRecommendation(name="markdown_disable_toc", recommended_value=False, help=_('Do not insert a Table of Contents into the output text.')), ]) @@ -77,20 +80,6 @@ class TXTInput(InputFormatPlugin): # Normalize line endings txt = normalize_line_endings(txt) - # Detect formatting - if options.formatting_type == 'auto': - options.formatting_type = detect_formatting_type(txt) - log.debug('Auto detected formatting as %s' % options.formatting_type) - - if options.formatting_type == 'heuristic': - setattr(options, 'enable_heuristics', True) - setattr(options, 'markup_chapter_headings', True) - setattr(options, 'italicize_common_cases', True) - setattr(options, 'fix_indents', True) - setattr(options, 'delete_blank_paragraphs', True) - setattr(options, 'format_scene_breaks', True) - setattr(options, 'dehyphenate', True) - # Determine the paragraph type of the document. if options.paragraph_type == 'auto': options.paragraph_type = detect_paragraph_type(txt) @@ -99,16 +88,30 @@ class TXTInput(InputFormatPlugin): options.paragraph_type = 'block' else: log.debug('Auto detected paragraph type as %s' % options.paragraph_type) + + dehyphenate = False + if options.formatting_type in ('auto', 'heuristic'): + # Set this here because we want it to run over all + # formatting types if auto is used. + dehyphenate = True + + # Detect formatting + if options.formatting_type == 'auto': + options.formatting_type = detect_formatting_type(txt) + log.debug('Auto detected formatting as %s' % options.formatting_type) + + if options.formatting_type == 'heuristic': + setattr(options, 'enable_heuristics', True) + setattr(options, 'unwrap_lines', False) + + if options.txt_in_remove_indents: + txt = remove_indents(txt) # Preserve spaces will replace multiple spaces to a space # followed by the   entity. if options.preserve_spaces: txt = preserve_spaces(txt) - # Get length for hyphen removal and punctuation unwrap - docanalysis = DocAnalysis('txt', txt) - length = docanalysis.line_length(.5) - # Reformat paragraphs to block formatting based on the detected type. # We don't check for block because the processor assumes block. # single and print at transformed to block for processing. @@ -119,9 +122,17 @@ class TXTInput(InputFormatPlugin): elif options.paragraph_type == 'unformatted': from calibre.ebooks.conversion.utils import HeuristicProcessor # unwrap lines based on punctuation + docanalysis = DocAnalysis('txt', txt) + length = docanalysis.line_length(.5) preprocessor = HeuristicProcessor(options, log=getattr(self, 'log', None)) txt = preprocessor.punctuation_unwrap(length, txt, 'txt') + if dehyphenate: + docanalysis = DocAnalysis('txt', txt) + length = docanalysis.line_length(.5) + dehyphenator = Dehyphenator(options.verbose, log=self.log) + txt = dehyphenator(txt,'txt', length) + # Process the text using the appropriate text processor. html = '' if options.formatting_type == 'markdown': @@ -134,14 +145,8 @@ class TXTInput(InputFormatPlugin): elif options.formatting_type == 'textile': log.debug('Running text through textile conversion...') html = convert_textile(txt) - else: log.debug('Running text through basic conversion...') - if options.formatting_type == 'heuristic': - # Dehyphenate - dehyphenator = Dehyphenator(options.verbose, log=self.log) - txt = dehyphenator(txt,'txt', length) - flow_size = getattr(options, 'flow_size', 0) html = convert_basic(txt, epub_split_size_kb=flow_size) diff --git a/src/calibre/ebooks/txt/processor.py b/src/calibre/ebooks/txt/processor.py index 546d3f1842..987d7cdc73 100644 --- a/src/calibre/ebooks/txt/processor.py +++ b/src/calibre/ebooks/txt/processor.py @@ -24,14 +24,14 @@ def clean_txt(txt): # all line breaks with \n. txt = '\n'.join([line.rstrip() for line in txt.splitlines()]) - # Replace whitespace at the beginning of the list with   - txt = re.sub('(?m)(?P[ ]+)', lambda mo: ' ' * mo.groups('space').count(' '), txt) - txt = re.sub('(?m)(?P[\t]+)', lambda mo: ' ' * 4 * mo.groups('space').count('\t'), txt) + # Replace whitespace at the beginning of the line with   + txt = re.sub('(?m)(?P^[ ]+)(?=.)', lambda mo: ' ' * mo.groups('space').count(' '), txt) + txt = re.sub('(?m)(?P^[\t]+)(?=.)', lambda mo: ' ' * 4 * mo.groups('space').count('\t'), txt) # Condense redundant spaces txt = re.sub('[ ]{2,}', ' ', txt) - # Remove blank lines from the beginning and end of the document. + # Remove blank space from the beginning and end of the document. txt = re.sub('^\s+(?=.)', '', txt) txt = re.sub('(?<=.)\s+$', '', txt) # Remove excessive line breaks. @@ -107,6 +107,10 @@ def preserve_spaces(txt): txt = txt.replace('\t', '    ') return txt +def remove_indents(txt): + txt = re.sub('(?miu)^\s+', '', txt) + return txt + def opf_writer(path, opf_name, manifest, spine, mi): opf = OPFCreator(path, mi) opf.create_manifest(manifest) diff --git a/src/calibre/ebooks/txt/txtml.py b/src/calibre/ebooks/txt/txtml.py index c2ee3f37c5..fa7bfbb380 100644 --- a/src/calibre/ebooks/txt/txtml.py +++ b/src/calibre/ebooks/txt/txtml.py @@ -55,6 +55,7 @@ class TXTMLizer(object): self.log.info('Converting XHTML to TXT...') self.oeb_book = oeb_book self.opts = opts + self.toc_titles = [] self.toc_ids = [] self.last_was_heading = False @@ -94,8 +95,8 @@ class TXTMLizer(object): if getattr(self.opts, 'inline_toc', None): self.log.debug('Generating table of contents...') toc.append(u'%s\n\n' % _(u'Table of Contents:')) - for item in self.oeb_book.toc: - toc.append(u'* %s\n\n' % item.title) + for item in self.toc_titles: + toc.append(u'* %s\n\n' % item) return ''.join(toc) def create_flat_toc(self, nodes): @@ -103,6 +104,7 @@ class TXTMLizer(object): Turns a hierarchical list of TOC href's into a flat list. ''' for item in nodes: + self.toc_titles.append(item.title) self.toc_ids.append(item.href) self.create_flat_toc(item.nodes) diff --git a/src/calibre/gui2/convert/txt_input.py b/src/calibre/gui2/convert/txt_input.py index 62672cc0f9..acdf5f43c0 100644 --- a/src/calibre/gui2/convert/txt_input.py +++ b/src/calibre/gui2/convert/txt_input.py @@ -16,7 +16,8 @@ class PluginWidget(Widget, Ui_Form): def __init__(self, parent, get_option, get_help, db=None, book_id=None): Widget.__init__(self, parent, - ['paragraph_type', 'formatting_type', 'markdown_disable_toc', 'preserve_spaces']) + ['paragraph_type', 'formatting_type', 'markdown_disable_toc', + 'preserve_spaces', 'txt_in_remove_indents']) self.db, self.book_id = db, book_id for x in get_option('paragraph_type').option.choices: self.opt_paragraph_type.addItem(x) diff --git a/src/calibre/gui2/convert/txt_input.ui b/src/calibre/gui2/convert/txt_input.ui index 6cbd68135f..211b03294a 100644 --- a/src/calibre/gui2/convert/txt_input.ui +++ b/src/calibre/gui2/convert/txt_input.ui @@ -7,57 +7,95 @@ 0 0 518 - 300 + 353 Form - - - - - Paragraph style: + + + + + Structure + + + + + + 0 + 0 + + + + Paragraph style: + + + + + + + + 0 + 0 + + + + + + + + + 0 + 0 + + + + Formatting style: + + + + + + + + 0 + 0 + + + + + - - - - - - - Preserve &spaces + + + + Common + + + + + Preserve &spaces + + + + + + + Remove indents at the beginning of lines + + + + - - - - Qt::Vertical - - - - 20 - 213 - - - - - - - - - - - Formatting style: - - - - + - Markdown Options + Markdown @@ -83,6 +121,19 @@ + + + + Qt::Vertical + + + + 20 + 213 + + + +