From ed3b2866cfb846c7ccbb39e64711fb37d56e927c Mon Sep 17 00:00:00 2001 From: John Schember Date: Sun, 6 Feb 2011 08:35:07 -0500 Subject: [PATCH 1/5] Sync ldolse preprocessing changes. --- src/calibre/ebooks/txt/input.py | 7 +++++-- 1 file changed, 5 insertions(+), 2 deletions(-) diff --git a/src/calibre/ebooks/txt/input.py b/src/calibre/ebooks/txt/input.py index 75bafc7cef..a07b423ebb 100644 --- a/src/calibre/ebooks/txt/input.py +++ b/src/calibre/ebooks/txt/input.py @@ -57,6 +57,7 @@ class TXTInput(InputFormatPlugin): def convert(self, stream, options, file_ext, log, accelerators): self.log = log + length = None log.debug('Reading text from file...') txt = stream.read() @@ -109,7 +110,7 @@ class TXTInput(InputFormatPlugin): # Reformat paragraphs to block formatting based on the detected type. # We don't check for block because the processor assumes block. # single and print at transformed to block for processing. - if options.paragraph_type == 'single' or options.paragraph_type == 'unformatted': + if options.paragraph_type == 'single': txt = separate_paragraphs_single_line(txt) elif options.paragraph_type == 'print': txt = separate_paragraphs_print_formatted(txt) @@ -120,10 +121,12 @@ class TXTInput(InputFormatPlugin): length = docanalysis.line_length(.5) preprocessor = HeuristicProcessor(options, log=getattr(self, 'log', None)) txt = preprocessor.punctuation_unwrap(length, txt, 'txt') + txt = separate_paragraphs_single_line(txt) if getattr(options, 'enable_heuristics', False) and getattr(options, 'dehyphenate', False): docanalysis = DocAnalysis('txt', txt) - length = docanalysis.line_length(.5) + if not length: + length = docanalysis.line_length(.5) dehyphenator = Dehyphenator(options.verbose, log=self.log) txt = dehyphenator(txt,'txt', length) From 92ee46cdb9bc070aa9fa71df2c59bae77855b044 Mon Sep 17 00:00:00 2001 From: John Schember Date: Sun, 6 Feb 2011 08:35:55 -0500 Subject: [PATCH 2/5] TXT Input: Retain indents with print formatted paragraphs. 
Move remove indents to keep print formatting working. --- src/calibre/ebooks/txt/input.py | 17 +++++++++-------- src/calibre/ebooks/txt/processor.py | 2 +- 2 files changed, 10 insertions(+), 9 deletions(-) diff --git a/src/calibre/ebooks/txt/input.py b/src/calibre/ebooks/txt/input.py index a07b423ebb..7253596801 100644 --- a/src/calibre/ebooks/txt/input.py +++ b/src/calibre/ebooks/txt/input.py @@ -99,14 +99,6 @@ class TXTInput(InputFormatPlugin): setattr(options, 'enable_heuristics', True) setattr(options, 'unwrap_lines', False) - if options.txt_in_remove_indents: - txt = remove_indents(txt) - - # Preserve spaces will replace multiple spaces to a space - # followed by the   entity. - if options.preserve_spaces: - txt = preserve_spaces(txt) - # Reformat paragraphs to block formatting based on the detected type. # We don't check for block because the processor assumes block. # single and print at transformed to block for processing. @@ -130,6 +122,15 @@ class TXTInput(InputFormatPlugin): dehyphenator = Dehyphenator(options.verbose, log=self.log) txt = dehyphenator(txt,'txt', length) + # User requested transformation on the text. + if options.txt_in_remove_indents: + txt = remove_indents(txt) + + # Preserve spaces will replace multiple spaces to a space + # followed by the   entity. + if options.preserve_spaces: + txt = preserve_spaces(txt) + # Process the text using the appropriate text processor. 
html = '' if options.formatting_type == 'markdown': diff --git a/src/calibre/ebooks/txt/processor.py b/src/calibre/ebooks/txt/processor.py index 987d7cdc73..ebbdc9eb07 100644 --- a/src/calibre/ebooks/txt/processor.py +++ b/src/calibre/ebooks/txt/processor.py @@ -99,7 +99,7 @@ def separate_paragraphs_single_line(txt): return txt def separate_paragraphs_print_formatted(txt): - txt = re.sub(u'(?miu)^(\t+|[ ]{2,})(?=.)', '\n\t', txt) + txt = re.sub(u'(?miu)^(?P<indent>\t+|[ ]{2,})(?=.)', lambda mo: '%s\n\t' % mo.group('indent'), txt) return txt def preserve_spaces(txt): From 2d4fc57ddcacb7b04d2ab4d89a1c08125d8e5df8 Mon Sep 17 00:00:00 2001 From: John Schember Date: Sun, 6 Feb 2011 08:45:50 -0500 Subject: [PATCH 3/5] TXT Input: convert_basic changed at some point to require single line paragraphs. Add function to turn block formatted paragraphs to single so they are processed correctly. --- src/calibre/ebooks/txt/input.py | 5 ++++- src/calibre/ebooks/txt/processor.py | 6 +++++- 2 files changed, 9 insertions(+), 2 deletions(-) diff --git a/src/calibre/ebooks/txt/input.py b/src/calibre/ebooks/txt/input.py index 7253596801..60adf4bd7a 100644 --- a/src/calibre/ebooks/txt/input.py +++ b/src/calibre/ebooks/txt/input.py @@ -12,7 +12,7 @@ from calibre.ebooks.chardet import detect from calibre.ebooks.txt.processor import convert_basic, convert_markdown, \ separate_paragraphs_single_line, separate_paragraphs_print_formatted, \ preserve_spaces, detect_paragraph_type, detect_formatting_type, \ - normalize_line_endings, convert_textile, remove_indents + normalize_line_endings, convert_textile, remove_indents, block_to_single_line from calibre import _ent_pat, xml_entity_to_unicode class TXTInput(InputFormatPlugin): @@ -106,6 +106,7 @@ class TXTInput(InputFormatPlugin): txt = separate_paragraphs_single_line(txt) elif options.paragraph_type == 'print': txt = separate_paragraphs_print_formatted(txt) + txt = block_to_single_line(txt) elif options.paragraph_type == 'unformatted': from
calibre.ebooks.conversion.utils import HeuristicProcessor # unwrap lines based on punctuation @@ -114,6 +115,8 @@ class TXTInput(InputFormatPlugin): preprocessor = HeuristicProcessor(options, log=getattr(self, 'log', None)) txt = preprocessor.punctuation_unwrap(length, txt, 'txt') txt = separate_paragraphs_single_line(txt) + else: + txt = block_to_single_line(txt) if getattr(options, 'enable_heuristics', False) and getattr(options, 'dehyphenate', False): docanalysis = DocAnalysis('txt', txt) diff --git a/src/calibre/ebooks/txt/processor.py b/src/calibre/ebooks/txt/processor.py index ebbdc9eb07..e4e7772ce7 100644 --- a/src/calibre/ebooks/txt/processor.py +++ b/src/calibre/ebooks/txt/processor.py @@ -99,7 +99,11 @@ def separate_paragraphs_single_line(txt): return txt def separate_paragraphs_print_formatted(txt): - txt = re.sub(u'(?miu)^(?P<indent>\t+|[ ]{2,})(?=.)', lambda mo: '%s\n\t' % mo.group('indent'), txt) + txt = re.sub(u'(?miu)^(?P<indent>\t+|[ ]{2,})(?=.)', lambda mo: '\n%s' % mo.group('indent'), txt) + return txt + +def block_to_single_line(txt): + txt = re.sub(r'(?<=.)\n(?=.)', ' ', txt) return txt def preserve_spaces(txt): From 0916a9dc348a73f665eb7de6e0cba1f725f5f356 Mon Sep 17 00:00:00 2001 From: John Schember Date: Sun, 6 Feb 2011 08:47:17 -0500 Subject: [PATCH 4/5] ... --- src/calibre/ebooks/txt/processor.py | 3 +++ 1 file changed, 3 insertions(+) diff --git a/src/calibre/ebooks/txt/processor.py b/src/calibre/ebooks/txt/processor.py index e4e7772ce7..b91191e9fe 100644 --- a/src/calibre/ebooks/txt/processor.py +++ b/src/calibre/ebooks/txt/processor.py @@ -59,6 +59,9 @@ def split_txt(txt, epub_split_size_kb=0): return txt def convert_basic(txt, title='', epub_split_size_kb=0): + ''' + Requires paragraphs to be in single line format.
+ ''' txt = clean_txt(txt) txt = split_txt(txt, epub_split_size_kb) From b1b4e7bac58881c9970034048247e2bd8c288ce6 Mon Sep 17 00:00:00 2001 From: John Schember Date: Sun, 6 Feb 2011 10:12:43 -0500 Subject: [PATCH 5/5] TXT Processing: Comments. --- src/calibre/ebooks/txt/processor.py | 41 +++++++++++++++++++++++++++-- 1 file changed, 39 insertions(+), 2 deletions(-) diff --git a/src/calibre/ebooks/txt/processor.py b/src/calibre/ebooks/txt/processor.py index b91191e9fe..f7b6cce234 100644 --- a/src/calibre/ebooks/txt/processor.py +++ b/src/calibre/ebooks/txt/processor.py @@ -18,6 +18,10 @@ from calibre.utils.cleantext import clean_ascii_chars HTML_TEMPLATE = u'%s\n%s\n' def clean_txt(txt): + ''' + Run transformations on the text to put it into + consistent state. + ''' if isbytestring(txt): txt = txt.decode('utf-8', 'replace') # Strip whitespace from the end of the line. Also replace @@ -42,6 +46,15 @@ def clean_txt(txt): return txt def split_txt(txt, epub_split_size_kb=0): + ''' + Ensure there are split points for converting + to EPUB. A misdetected paragraph type can + result in the entire document being one giant + paragraph. In this case the EPUB parser will not + be able to determine where to split the file + to accommodate the EPUB file size limitation + and will fail. + ''' #Takes care if there is no point to split if epub_split_size_kb > 0: if isinstance(txt, unicode): @@ -60,6 +73,9 @@ def split_txt(txt, epub_split_size_kb=0): def convert_basic(txt, title='', epub_split_size_kb=0): ''' + Converts plain text to html by putting all paragraphs in + <p>
tags. It condense and retains blank lines when necessary. + Requires paragraphs to be in single line format. ''' txt = clean_txt(txt) @@ -110,11 +126,17 @@ def block_to_single_line(txt): return txt def preserve_spaces(txt): + ''' + Replaces spaces multiple spaces with   entities. + ''' txt = re.sub('(?P[ ]{2,})', lambda mo: ' ' + (' ' * (len(mo.group('space')) - 1)), txt) txt = txt.replace('\t', '    ') return txt def remove_indents(txt): + ''' + Remove whitespace at the beginning of each line. + ''' txt = re.sub('(?miu)^\s+', '', txt) return txt @@ -125,7 +147,10 @@ def opf_writer(path, opf_name, manifest, spine, mi): with open(os.path.join(path, opf_name), 'wb') as opffile: opf.render(opffile) -def split_string_separator(txt, size) : +def split_string_separator(txt, size): + ''' + Splits the text by putting \n\n at the point size. + ''' if len(txt) > size: txt = ''.join([re.sub(u'\.(?P[^.]*)$', '.\n\n\g', txt[i:i+size], 1) for i in @@ -134,7 +159,7 @@ def split_string_separator(txt, size) : def detect_paragraph_type(txt): ''' - Tries to determine the formatting of the document. + Tries to determine the paragraph type of the document. block: Paragraphs are separated by a blank line. single: Each line is a paragraph. @@ -177,6 +202,16 @@ def detect_paragraph_type(txt): def detect_formatting_type(txt): + ''' + Tries to determine the formatting of the document. + + markdown: Markdown formatting is used. + textile: Textile formatting is used. + heuristic: When none of the above formatting types are + detected heuristic is returned. + ''' + # Keep a count of the number of format specific object + # that are found in the text. markdown_count = 0 textile_count = 0 @@ -200,6 +235,8 @@ def detect_formatting_type(txt): # Links textile_count += len(re.findall(r'"(?=".*?\()(\(.+?\))*[^\(]+?(\(.+?\))*":[^\s]+', txt)) + # Decide if either markdown or textile is used in the text + # based on the number of unique formatting elements found. 
if markdown_count > 5 or textile_count > 5: if markdown_count > textile_count: return 'markdown'