diff --git a/src/calibre/ebooks/pdb/input.py b/src/calibre/ebooks/pdb/input.py index b8b4b93ca1..3688abff3f 100644 --- a/src/calibre/ebooks/pdb/input.py +++ b/src/calibre/ebooks/pdb/input.py @@ -19,15 +19,20 @@ class PDBInput(InputFormatPlugin): file_types = set(['pdb']) options = set([ - OptionRecommendation(name='paragraph_format', recommended_value='auto', - choices=['auto', 'block', 'single', 'print', 'markdown'], - help=_('How calibre splits text into paragraphs.\n' + OptionRecommendation(name='paragraph_type', recommended_value='auto', + choices=['auto', 'block', 'single', 'print'], + help=_('Paragraph structure.\n' 'choices are [\'auto\', \'block\', \'single\', \'print\', \'markdown\']\n' - '* auto: Try to auto detect paragraph format.\n' + '* auto: Try to auto detect paragraph type.\n' '* block: Treat a blank line as a paragraph break.\n' '* single: Assume every line is a paragraph.\n' '* print: Assume every line starting with 2+ spaces or a tab ' - 'starts a paragraph.\n' + 'starts a paragraph.')), + OptionRecommendation(name='formatting_type', recommended_value='auto', + choices=['auto', 'none', 'markdown'], + help=_('Formatting used within the document.' + '* auto: Try to auto detect the document formatting.\n' + '* none: Do not modify the paragraph formatting. Everything is a paragraph.\n' '* markdown: Run the input though the markdown pre-processor. ' 'To learn more about markdown see')+' http://daringfireball.net/projects/markdown/'), OptionRecommendation(name='preserve_spaces', recommended_value=False, diff --git a/src/calibre/ebooks/tcr/input.py b/src/calibre/ebooks/tcr/input.py index 5f9554665b..e4118c1c0a 100644 --- a/src/calibre/ebooks/tcr/input.py +++ b/src/calibre/ebooks/tcr/input.py @@ -17,15 +17,20 @@ class TCRInput(InputFormatPlugin): file_types = set(['tcr']) options = set([ - OptionRecommendation(name='paragraph_format', recommended_value='auto', - choices=['auto', 'block', 'single', 'print', 'markdown'], - help=_('How calibre splits text into paragraphs.\n' + OptionRecommendation(name='paragraph_type', recommended_value='auto', + choices=['auto', 'block', 'single', 'print'], + help=_('Paragraph structure.\n' 'choices are [\'auto\', \'block\', \'single\', \'print\', \'markdown\']\n' - '* auto: Try to auto detect paragraph format.\n' + '* auto: Try to auto detect paragraph type.\n' '* block: Treat a blank line as a paragraph break.\n' '* single: Assume every line is a paragraph.\n' '* print: Assume every line starting with 2+ spaces or a tab ' - 'starts a paragraph.\n' + 'starts a paragraph.')), + OptionRecommendation(name='formatting_type', recommended_value='auto', + choices=['auto', 'none', 'markdown'], + help=_('Formatting used within the document.' + '* auto: Try to auto detect the document formatting.\n' + '* none: Do not modify the paragraph formatting. Everything is a paragraph.\n' '* markdown: Run the input though the markdown pre-processor. ' 'To learn more about markdown see')+' http://daringfireball.net/projects/markdown/'), OptionRecommendation(name='preserve_spaces', recommended_value=False, diff --git a/src/calibre/ebooks/txt/input.py b/src/calibre/ebooks/txt/input.py index e68c47e9b3..47e92a45a9 100644 --- a/src/calibre/ebooks/txt/input.py +++ b/src/calibre/ebooks/txt/input.py @@ -10,7 +10,7 @@ from calibre.customize.conversion import InputFormatPlugin, OptionRecommendation from calibre.ebooks.chardet import detect from calibre.ebooks.txt.processor import convert_basic, convert_markdown, \ separate_paragraphs_single_line, separate_paragraphs_print_formatted, \ - preserve_spaces, detect_paragraph_formatting + preserve_spaces, detect_paragraph_type, detect_formatting_type from calibre import _ent_pat, xml_entity_to_unicode class TXTInput(InputFormatPlugin): @@ -21,15 +21,20 @@ class TXTInput(InputFormatPlugin): file_types = set(['txt']) options = set([ - OptionRecommendation(name='paragraph_format', recommended_value='auto', - choices=['auto', 'block', 'single', 'print', 'markdown'], - help=_('How calibre splits text into paragraphs.\n' + OptionRecommendation(name='paragraph_type', recommended_value='auto', + choices=['auto', 'block', 'single', 'print'], + help=_('Paragraph structure.\n' 'choices are [\'auto\', \'block\', \'single\', \'print\', \'markdown\']\n' - '* auto: Try to auto detect paragraph format.\n' + '* auto: Try to auto detect paragraph type.\n' '* block: Treat a blank line as a paragraph break.\n' '* single: Assume every line is a paragraph.\n' '* print: Assume every line starting with 2+ spaces or a tab ' - 'starts a paragraph.\n' + 'starts a paragraph.')), + OptionRecommendation(name='formatting_type', recommended_value='auto', + choices=['auto', 'none', 'markdown'], + help=_('Formatting used within the document.' + '* auto: Try to auto detect the document formatting.\n' + '* none: Do not modify the paragraph formatting. Everything is a paragraph.\n' '* markdown: Run the input though the markdown pre-processor. ' 'To learn more about markdown see')+' http://daringfireball.net/projects/markdown/'), OptionRecommendation(name='preserve_spaces', recommended_value=False, @@ -57,29 +62,16 @@ class TXTInput(InputFormatPlugin): log.debug('No input encoding specified and could not auto detect using %s' % ienc) txt = txt.decode(ienc, 'replace') - # Determine the formatting of the document. - if options.paragraph_format == 'auto': - options.paragraph_format = detect_paragraph_formatting(txt) - if options.paragraph_format == 'unknown': - log.debug('Could not reliably determine paragraph format using block format') - options.paragraph_format = 'block' - else: - log.debug('Auto detected paragraph format as %s' % options.paragraph_format) - - # We don't check for block because the processor assumes block. - # single and print at transformed to block for processing. - if options.paragraph_format == 'single': - txt = separate_paragraphs_single_line(txt) - elif options.paragraph_format == 'print': - txt = separate_paragraphs_print_formatted(txt) - txt = _ent_pat.sub(xml_entity_to_unicode, txt) # Preserve spaces will replace multiple spaces to a space # followed by the   entity. if options.preserve_spaces: txt = preserve_spaces(txt) + + if options.formatting_type == 'auto': + options.formatting_type = detect_formatting_type(txt) - if options.paragraph_format == 'markdown': + if options.formatting_type == 'markdown': log.debug('Running text though markdown conversion...') try: html = convert_markdown(txt, disable_toc=options.markdown_disable_toc) @@ -87,6 +79,22 @@ class TXTInput(InputFormatPlugin): raise ValueError('This txt file has malformed markup, it cannot be' ' converted by calibre. See http://daringfireball.net/projects/markdown/syntax') else: + # Determine the paragraph type of the document. + if options.paragraph_type == 'auto': + options.paragraph_type = detect_paragraph_type(txt) + if options.paragraph_type == 'unknown': + log.debug('Could not reliably determine paragraph type using block') + options.paragraph_type = 'block' + else: + log.debug('Auto detected paragraph type as %s' % options.paragraph_type) + + # We don't check for block because the processor assumes block. + # single and print at transformed to block for processing. + if options.paragraph_type == 'single': + txt = separate_paragraphs_single_line(txt) + elif options.paragraph_type == 'print': + txt = separate_paragraphs_print_formatted(txt) + flow_size = getattr(options, 'flow_size', 0) html = convert_basic(txt, epub_split_size_kb=flow_size) diff --git a/src/calibre/ebooks/txt/processor.py b/src/calibre/ebooks/txt/processor.py index e1014b0c7b..f6d628e7c5 100644 --- a/src/calibre/ebooks/txt/processor.py +++ b/src/calibre/ebooks/txt/processor.py @@ -93,7 +93,7 @@ def split_string_separator(txt, size) : xrange(0, len(txt), size)]) return txt -def detect_paragraph_formatting(txt): +def detect_paragraph_type(txt): ''' Tries to determine the formatting of the document. @@ -109,6 +109,20 @@ def detect_paragraph_formatting(txt): txt = txt.replace('\r', '\n') txt_line_count = len(re.findall('(?mu)^\s*.+$', txt)) + # Check for print + tab_line_count = len(re.findall('(?mu)^(\t|\s{2,}).+$', txt)) + if tab_line_count / float(txt_line_count) >= .25: + return 'print' + + # Check for block + empty_line_count = len(re.findall('(?mu)^\s*$', txt)) + if empty_line_count / float(txt_line_count) >= .25: + return 'block' + + # Nothing else matched to assume single. + return 'single' + +def detect_formatting_type(txt): # Check for markdown # Headings if len(re.findall('(?mu)^#+', txt)) >= 5: @@ -129,16 +143,4 @@ def detect_paragraph_formatting(txt): if txt.count('\\'+c) > 10: return 'markdown' - # Check for print - tab_line_count = len(re.findall('(?mu)^(\t|\s{2,}).+$', txt)) - if tab_line_count / float(txt_line_count) >= .25: - return 'print' - - # Check for block - empty_line_count = len(re.findall('(?mu)^\s*$', txt)) - if empty_line_count / float(txt_line_count) >= .25: - return 'block' - - # Nothing else matched to assume single. - return 'single' - + return 'none' diff --git a/src/calibre/gui2/convert/pdb_input.py b/src/calibre/gui2/convert/pdb_input.py index 655f4025a7..16ff1ff236 100644 --- a/src/calibre/gui2/convert/pdb_input.py +++ b/src/calibre/gui2/convert/pdb_input.py @@ -16,8 +16,10 @@ class PluginWidget(Widget, Ui_Form): def __init__(self, parent, get_option, get_help, db=None, book_id=None): Widget.__init__(self, parent, - ['paragraph_format', 'markdown_disable_toc', 'preserve_spaces']) + ['paragraph_type', 'formatting_type', 'markdown_disable_toc', 'preserve_spaces']) self.db, self.book_id = db, book_id - for x in get_option('paragraph_format').option.choices: - self.opt_paragraph_format.addItem(x) + for x in get_option('paragraph_type').option.choices: + self.opt_paragraph_type.addItem(x) + for x in get_option('formatting_type').option.choices: + self.opt_formatting_type.addItem(x) self.initialize_options(get_option, get_help, db, book_id) diff --git a/src/calibre/gui2/convert/tcr_input.py b/src/calibre/gui2/convert/tcr_input.py index 2aa877ce4d..366643ad5b 100644 --- a/src/calibre/gui2/convert/tcr_input.py +++ b/src/calibre/gui2/convert/tcr_input.py @@ -16,8 +16,10 @@ class PluginWidget(Widget, Ui_Form): def __init__(self, parent, get_option, get_help, db=None, book_id=None): Widget.__init__(self, parent, - ['paragraph_format', 'markdown_disable_toc', 'preserve_spaces']) + ['paragraph_type', 'formatting_type', 'markdown_disable_toc', 'preserve_spaces']) self.db, self.book_id = db, book_id - for x in get_option('paragraph_format').option.choices: - self.opt_paragraph_format.addItem(x) + for x in get_option('paragraph_type').option.choices: + self.opt_paragraph_type.addItem(x) + for x in get_option('formatting_type').option.choices: + self.opt_formatting_type.addItem(x) self.initialize_options(get_option, get_help, db, book_id) diff --git a/src/calibre/gui2/convert/txt_input.py b/src/calibre/gui2/convert/txt_input.py index 99d04fe2f4..62672cc0f9 100644 --- a/src/calibre/gui2/convert/txt_input.py +++ b/src/calibre/gui2/convert/txt_input.py @@ -16,8 +16,10 @@ class PluginWidget(Widget, Ui_Form): def __init__(self, parent, get_option, get_help, db=None, book_id=None): Widget.__init__(self, parent, - ['paragraph_format', 'markdown_disable_toc', 'preserve_spaces']) + ['paragraph_type', 'formatting_type', 'markdown_disable_toc', 'preserve_spaces']) self.db, self.book_id = db, book_id - for x in get_option('paragraph_format').option.choices: - self.opt_paragraph_format.addItem(x) + for x in get_option('paragraph_type').option.choices: + self.opt_paragraph_type.addItem(x) + for x in get_option('formatting_type').option.choices: + self.opt_formatting_type.addItem(x) self.initialize_options(get_option, get_help, db, book_id) diff --git a/src/calibre/gui2/convert/txt_input.ui b/src/calibre/gui2/convert/txt_input.ui index b45297fdf2..6cbd68135f 100644 --- a/src/calibre/gui2/convert/txt_input.ui +++ b/src/calibre/gui2/convert/txt_input.ui @@ -6,7 +6,7 @@ 0 0 - 488 + 518 300 @@ -17,41 +17,21 @@ - Document structure detection + Paragraph style: - + - - - - <p>Markdown is a simple markup language for text files, that allows for advanced formatting. To learn more visit <a href="http://daringfireball.net/projects/markdown">markdown</a>. - - - true - - - true - - - - - - - Do not insert Table of Contents into output text when using markdown - - - - + Preserve &spaces - + Qt::Vertical @@ -64,6 +44,45 @@ + + + + + + + Formatting style: + + + + + + + Markdown Options + + + + + + <p>Markdown is a simple markup language for text files, that allows for advanced formatting. To learn more visit <a href="http://daringfireball.net/projects/markdown">markdown</a>. + + + true + + + true + + + + + + + Do not insert Table of Contents into output text when using markdown + + + + + +