From c5d6f9665b5a83e088d3bbe5f5412c6a6bce9b6b Mon Sep 17 00:00:00 2001 From: John Schember Date: Sat, 12 Feb 2011 14:33:32 -0500 Subject: [PATCH 1/5] TXT Input: Rename none formatting-type to plain to correspond to the output option. --- src/calibre/ebooks/txt/input.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/src/calibre/ebooks/txt/input.py b/src/calibre/ebooks/txt/input.py index 8ab1524b02..7d218a36cf 100644 --- a/src/calibre/ebooks/txt/input.py +++ b/src/calibre/ebooks/txt/input.py @@ -38,10 +38,10 @@ class TXTInput(InputFormatPlugin): 'starts a paragraph.' '* unformatted: Most lines have hard line breaks, few/no blank lines or indents.')), OptionRecommendation(name='formatting_type', recommended_value='auto', - choices=['auto', 'none', 'heuristic', 'textile', 'markdown'], + choices=['auto', 'plain', 'heuristic', 'textile', 'markdown'], help=_('Formatting used within the document.' '* auto: Automatically decide which formatting processor to use.\n' - '* none: Do not process the document formatting. Everything is a ' + '* plain: Do not process the document formatting. Everything is a ' 'paragraph and no styling is applied.\n' '* heuristic: Process using heuristics to determine formatting such ' 'as chapter headings and italic text.\n' From 95892f204b22f0ea2aef5a06eea99c5a6d18c82e Mon Sep 17 00:00:00 2001 From: John Schember Date: Sat, 12 Feb 2011 14:38:55 -0500 Subject: [PATCH 2/5] TXT Input: New paragraph-type option (off) to disable modifying the paragraph structure. 
--- src/calibre/ebooks/txt/input.py | 13 ++++++++----- 1 file changed, 8 insertions(+), 5 deletions(-) diff --git a/src/calibre/ebooks/txt/input.py b/src/calibre/ebooks/txt/input.py index 7d218a36cf..6d958b0e7c 100644 --- a/src/calibre/ebooks/txt/input.py +++ b/src/calibre/ebooks/txt/input.py @@ -28,15 +28,18 @@ class TXTInput(InputFormatPlugin): options = set([ OptionRecommendation(name='paragraph_type', recommended_value='auto', - choices=['auto', 'block', 'single', 'print', 'unformatted'], + choices=['auto', 'block', 'single', 'print', 'unformatted', 'off'], help=_('Paragraph structure.\n' - 'choices are [\'auto\', \'block\', \'single\', \'print\', \'unformatted\']\n' + 'choices are [\'auto\', \'block\', \'single\', \'print\', \'unformatted\', \'off\']\n' '* auto: Try to auto detect paragraph type.\n' '* block: Treat a blank line as a paragraph break.\n' '* single: Assume every line is a paragraph.\n' '* print: Assume every line starting with 2+ spaces or a tab ' - 'starts a paragraph.' - '* unformatted: Most lines have hard line breaks, few/no blank lines or indents.')), + 'starts a paragraph.\n' + '* unformatted: Most lines have hard line breaks, few/no blank lines or indents. ' + 'Tries to determine structure and reformat the differentiate elements.\n' + '* off: Don\'t modify the paragraph structure. This is useful when combined with ' + 'Markdown or Textile formatting to ensure no formatting is lost.')), OptionRecommendation(name='formatting_type', recommended_value='auto', choices=['auto', 'plain', 'heuristic', 'textile', 'markdown'], help=_('Formatting used within the document.' 
@@ -134,7 +137,7 @@ class TXTInput(InputFormatPlugin): preprocessor = HeuristicProcessor(options, log=getattr(self, 'log', None)) txt = preprocessor.punctuation_unwrap(length, txt, 'txt') txt = separate_paragraphs_single_line(txt) - else: + elif options.paragraph_type == 'block': txt = separate_hard_scene_breaks(txt) txt = block_to_single_line(txt) From 04b80eb9eef7452d3ff70ceece2144e0b718f976 Mon Sep 17 00:00:00 2001 From: John Schember Date: Sat, 12 Feb 2011 17:34:29 -0500 Subject: [PATCH 3/5] TXTZ Input: Include images in manifest. They were not always being included by the HTML input plugin. --- src/calibre/ebooks/txt/input.py | 31 ++++++++++++++++++++++++------- 1 file changed, 24 insertions(+), 7 deletions(-) diff --git a/src/calibre/ebooks/txt/input.py b/src/calibre/ebooks/txt/input.py index 6d958b0e7c..9952845fdf 100644 --- a/src/calibre/ebooks/txt/input.py +++ b/src/calibre/ebooks/txt/input.py @@ -4,13 +4,15 @@ __license__ = 'GPL 3' __copyright__ = '2009, John Schember ' __docformat__ = 'restructuredtext en' -import glob +import mimetypes import os +import shutil -from calibre import _ent_pat, xml_entity_to_unicode +from calibre import _ent_pat, walk, xml_entity_to_unicode from calibre.customize.conversion import InputFormatPlugin, OptionRecommendation from calibre.ebooks.conversion.preprocess import DocAnalysis, Dehyphenator from calibre.ebooks.chardet import detect +from calibre.ebooks.oeb.base import OEB_IMAGES from calibre.ebooks.txt.processor import convert_basic, convert_markdown, \ separate_paragraphs_single_line, separate_paragraphs_print_formatted, \ preserve_spaces, detect_paragraph_type, detect_formatting_type, \ @@ -67,6 +69,8 @@ class TXTInput(InputFormatPlugin): txt = '' log.debug('Reading text from file...') length = 0 + # [(u'path', mime),] + images = [] # Extract content from zip archive. 
if file_ext == 'txtz': @@ -75,10 +79,19 @@ class TXTInput(InputFormatPlugin): zf = ZipFile(stream) zf.extractall(tdir) - txts = glob.glob(os.path.join(tdir, '*.txt')) - for t in txts: - with open(t, 'rb') as tf: - txt += tf.read() + for x in walk(tdir): + if not os.path.isfile(x): + continue + if os.path.splitext(x)[1].lower() == '.txt': + with open(x, 'rb') as tf: + txt += tf.read() + '\n\n' + if mimetypes.guess_type(x)[0] in OEB_IMAGES: + path = os.path.relpath(x, tdir) + dir = os.path.join(os.getcwd(), os.path.dirname(path)) + if not os.path.exists(dir): + os.makedirs(dir) + shutil.copy(x, os.path.join(os.getcwd(), path)) + images.append((path, mimetypes.guess_type(x)[0])) else: txt = stream.read() @@ -193,9 +206,13 @@ class TXTInput(InputFormatPlugin): htmlfile.write(html.encode('utf-8')) odi = options.debug_pipeline options.debug_pipeline = None - # Generate oeb from htl conversion. + # Generate oeb from html conversion. oeb = html_input.convert(open(htmlfile.name, 'rb'), options, 'html', log, {}) + # Add images from txtz archive to oeb. + for image, mime in images: + id, href = oeb.manifest.generate(id='image', href=image) + oeb.manifest.add(id, href, mime) options.debug_pipeline = odi os.remove(htmlfile.name) From ffaed91cdcb10b88fb9c13787f4b347ff1fc9a22 Mon Sep 17 00:00:00 2001 From: John Schember Date: Sun, 13 Feb 2011 14:23:03 -0500 Subject: [PATCH 4/5] TXT Input: Add _ to set of characters that can make up a hard scene break. 
--- src/calibre/ebooks/txt/processor.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/calibre/ebooks/txt/processor.py b/src/calibre/ebooks/txt/processor.py index 55213381c9..7e161f63bd 100644 --- a/src/calibre/ebooks/txt/processor.py +++ b/src/calibre/ebooks/txt/processor.py @@ -126,7 +126,7 @@ def separate_hard_scene_breaks(txt): return '\n%s\n' % line else: return line - txt = re.sub(u'(?miu)^[ \t-=~\/]+$', lambda mo: sep_break(mo.group()), txt) + txt = re.sub(u'(?miu)^[ \t-=~\/_]+$', lambda mo: sep_break(mo.group()), txt) return txt def block_to_single_line(txt): From aed26584c1c25172a219aacf7390ad0fb9150a79 Mon Sep 17 00:00:00 2001 From: John Schember Date: Sun, 13 Feb 2011 14:29:46 -0500 Subject: [PATCH 5/5] Fix bug #8960: None inserted before ___ in text input. Tweak italicize common cases regex patterns. --- src/calibre/ebooks/conversion/utils.py | 22 +++++++++++----------- 1 file changed, 11 insertions(+), 11 deletions(-) diff --git a/src/calibre/ebooks/conversion/utils.py b/src/calibre/ebooks/conversion/utils.py index 95f832c76a..2e26f927f5 100644 --- a/src/calibre/ebooks/conversion/utils.py +++ b/src/calibre/ebooks/conversion/utils.py @@ -156,17 +156,17 @@ class HeuristicProcessor(object): ] ITALICIZE_STYLE_PATS = [ - r'(?msu)(?<=[\s>])_(?P[^_]+)?_', - r'(?msu)(?<=[\s>])/(?P[^/]+)?/', - r'(?msu)(?<=[\s>])~~(?P[^~]+)?~~', - r'(?msu)(?<=[\s>])\*(?P[^\*]+)?\*', - r'(?msu)(?<=[\s>])~(?P[^~]+)?~', - r'(?msu)(?<=[\s>])_/(?P[^/_]+)?/_', - r'(?msu)(?<=[\s>])_\*(?P[^\*_]+)?\*_', - r'(?msu)(?<=[\s>])\*/(?P[^/\*]+)?/\*', - r'(?msu)(?<=[\s>])_\*/(?P[^\*_]+)?/\*_', - r'(?msu)(?<=[\s>])/:(?P[^:/]+)?:/', - r'(?msu)(?<=[\s>])\|:(?P[^:\|]+)?:\|', + r'(?msu)(?<=[\s>])_(?P[^_]+)_', + r'(?msu)(?<=[\s>])/(?P[^/]+)/', + r'(?msu)(?<=[\s>])~~(?P[^~]+)~~', + r'(?msu)(?<=[\s>])\*(?P[^\*]+)\*', + r'(?msu)(?<=[\s>])~(?P[^~]+)~', + r'(?msu)(?<=[\s>])_/(?P[^/_]+)/_', + r'(?msu)(?<=[\s>])_\*(?P[^\*_]+)\*_', + r'(?msu)(?<=[\s>])\*/(?P[^/\*]+)/\*', + 
r'(?msu)(?<=[\s>])_\*/(?P[^\*_]+)/\*_', + r'(?msu)(?<=[\s>])/:(?P[^:/]+):/', + r'(?msu)(?<=[\s>])\|:(?P[^:\|]+):\|', ] for word in ITALICIZE_WORDS: