diff --git a/src/calibre/ebooks/conversion/preprocess.py b/src/calibre/ebooks/conversion/preprocess.py index cb2564ec0a..029b9752e1 100644 --- a/src/calibre/ebooks/conversion/preprocess.py +++ b/src/calibre/ebooks/conversion/preprocess.py @@ -223,16 +223,7 @@ class HTMLPreProcessor(object): elif self.is_book_designer(html): rules = self.BOOK_DESIGNER elif self.is_pdftohtml(html): - end_rules = [] - if getattr(self.extra_opts, 'unwrap_factor', None): - length = line_length(html, getattr(self.extra_opts, 'unwrap_factor')) - if length: - end_rules.append( - # Un wrap using punctuation - (re.compile(r'(?<=.{%i}[a-z\.,;:)-IA])\s*(?P)?\s*()\s*(?=(<(i|b|u)>)?\s*[\w\d(])' % length, re.UNICODE), wrap_lines), - ) - - rules = self.PDFTOHTML + end_rules + rules = self.PDFTOHTML else: rules = [] @@ -246,7 +237,16 @@ class HTMLPreProcessor(object): (re.compile(getattr(self.extra_opts, 'footer_regex')), lambda match : '') ) - for rule in self.PREPROCESS + pre_rules + rules: + end_rules = [] + if getattr(self.extra_opts, 'unwrap_factor', None): + length = line_length(html, getattr(self.extra_opts, 'unwrap_factor')) + if length: + end_rules.append( + # Un wrap using punctuation + (re.compile(r'(?<=.{%i}[a-z\.,;:)-IA])\s*(?P)?\s*()\s*(?=(<(i|b|u)>)?\s*[\w\d(])' % length, re.UNICODE), wrap_lines), + ) + + for rule in self.PREPROCESS + pre_rules + rules + end_rules: html = rule[0].sub(rule[1], html) # Handle broken XHTML w/ SVG (ugh) diff --git a/src/calibre/ebooks/html/input.py b/src/calibre/ebooks/html/input.py index 92c2df9690..7b7bfdf3aa 100644 --- a/src/calibre/ebooks/html/input.py +++ b/src/calibre/ebooks/html/input.py @@ -262,7 +262,7 @@ class HTMLInput(InputFormatPlugin): ) ), - OptionRecommendation(name='pdf_line_length', recommended_value=0.5, + OptionRecommendation(name='unwrap_factor', recommended_value=0.5, help=_('Average line length for line breaking if the HTML is from a ' 'previous partial conversion of a PDF file.')), diff --git a/src/calibre/ebooks/oeb/base.py b/src/calibre/ebooks/oeb/base.py index 3f9e6a4d4a..2e06fffe4e 100644 --- a/src/calibre/ebooks/oeb/base.py +++ b/src/calibre/ebooks/oeb/base.py @@ -934,7 +934,7 @@ class Manifest(object): self.oeb.log.debug('Converting', self.href, '...') - from calibre.ebooks.txt.processor import txt_to_markdown + from calibre.ebooks.txt.processor import convert_markdown title = self.oeb.metadata.title if title: @@ -942,7 +942,7 @@ class Manifest(object): else: title = _('Unknown') - return self._parse_xhtml(txt_to_markdown(data, title)) + return self._parse_xhtml(convert_markdown(data, title)) def _parse_css(self, data): diff --git a/src/calibre/ebooks/pdb/palmdoc/reader.py b/src/calibre/ebooks/pdb/palmdoc/reader.py index e1935db566..8992382597 100644 --- a/src/calibre/ebooks/pdb/palmdoc/reader.py +++ b/src/calibre/ebooks/pdb/palmdoc/reader.py @@ -13,8 +13,8 @@ import struct from calibre.ebooks.compression.palmdoc import decompress_doc from calibre.ebooks.pdb.formatreader import FormatReader -from calibre.ebooks.txt.processor import opf_writer -from calibre.ebooks.txt.processor import txt_to_markdown +from calibre.ebooks.txt.processor import convert_basic, separate_paragraphs, \ + opf_writer class HeaderRecord(object): ''' @@ -62,7 +62,9 @@ class Reader(FormatReader): txt += self.decompress_text(i) self.log.info('Converting text to OEB...') - html = txt_to_markdown(txt, single_line_paras=self.single_line_paras) + if self.single_line_paras: + txt = separate_paragraphs(txt) + html = convert_basic(txt) with open(os.path.join(output_dir, 'index.html'), 'wb') as index: index.write(html.encode('utf-8')) diff --git a/src/calibre/ebooks/pdb/ztxt/reader.py b/src/calibre/ebooks/pdb/ztxt/reader.py index 86c5abfe82..664f498bee 100644 --- a/src/calibre/ebooks/pdb/ztxt/reader.py +++ b/src/calibre/ebooks/pdb/ztxt/reader.py @@ -12,7 +12,8 @@ import os, struct, zlib from calibre.ebooks.pdb.formatreader import FormatReader from calibre.ebooks.pdb.ztxt import zTXTError -from calibre.ebooks.txt.processor import txt_to_markdown, opf_writer +from calibre.ebooks.txt.processor import convert_basic, separate_paragraphs, \ + opf_writer SUPPORTED_VERSION = (1, 40) @@ -77,7 +78,9 @@ class Reader(FormatReader): txt += self.decompress_text(i) self.log.info('Converting text to OEB...') - html = txt_to_markdown(txt, single_line_paras=self.single_line_paras) + if self.single_line_paras: + txt = separate_paragraphs(txt) + html = convert_basic(txt) with open(os.path.join(output_dir, 'index.html'), 'wb') as index: index.write(html.encode('utf-8')) diff --git a/src/calibre/ebooks/txt/input.py b/src/calibre/ebooks/txt/input.py index 5d84a1bde1..2b0245c98b 100644 --- a/src/calibre/ebooks/txt/input.py +++ b/src/calibre/ebooks/txt/input.py @@ -7,7 +7,8 @@ __docformat__ = 'restructuredtext en' import os from calibre.customize.conversion import InputFormatPlugin, OptionRecommendation -from calibre.ebooks.txt.processor import txt_to_markdown +from calibre.ebooks.txt.processor import convert_basic, convert_markdown, \ + separate_paragraphs class TXTInput(InputFormatPlugin): @@ -21,6 +22,8 @@ class TXTInput(InputFormatPlugin): help=_('Normally calibre treats blank lines as paragraph markers. ' 'With this option it will assume that every line represents ' 'a paragraph instead.')), + OptionRecommendation(name='markdown', recommended_value=False, + help=_('Run the text input though the markdown processor.')), ]) def convert(self, stream, options, file_ext, log, @@ -31,12 +34,18 @@ class TXTInput(InputFormatPlugin): log.debug('Reading text from file...') txt = stream.read().decode(ienc, 'replace') - log.debug('Running text though markdown conversion...') - try: - html = txt_to_markdown(txt, single_line_paras=options.single_line_paras) - except RuntimeError: - raise ValueError('This txt file has malformed markup, it cannot be' - 'converted by calibre. See http://daringfireball.net/projects/markdown/syntax') + if options.single_line_paras: + txt = separate_paragraphs(txt) + + if options.markdown: + log.debug('Running text though markdown conversion...') + try: + html = convert_markdown(txt) + except RuntimeError: + raise ValueError('This txt file has malformed markup, it cannot be' + 'converted by calibre. See http://daringfireball.net/projects/markdown/syntax') + else: + html = convert_basic(txt) from calibre.customize.ui import plugin_for_input_format html_input = plugin_for_input_format('html') diff --git a/src/calibre/ebooks/txt/processor.py b/src/calibre/ebooks/txt/processor.py index 3005d633b8..f6503c0bc5 100644 --- a/src/calibre/ebooks/txt/processor.py +++ b/src/calibre/ebooks/txt/processor.py @@ -5,7 +5,9 @@ Read content from txt file. ''' import os +import re +from calibre import prepare_string_for_xml from calibre.ebooks.markdown import markdown from calibre.ebooks.metadata.opf2 import OPFCreator @@ -13,18 +15,41 @@ __license__ = 'GPL v3' __copyright__ = '2009, John Schember ' __docformat__ = 'restructuredtext en' -def txt_to_markdown(txt, title='', single_line_paras=False): - if single_line_paras: - txt = txt.replace('\r\n', '\n') - txt = txt.replace('\r', '\n') - txt = txt.replace('\n', '\n\n') +HTML_TEMPLATE = u'%s\n%s\n' + +def convert_basic(txt, title=''): + lines = [] + # Strip whitespace from the beginning and end of the line. Also replace + # all line breaks with \n. + for line in txt.splitlines(): + lines.append(line.strip()) + txt = '\n'.join(lines) + + # Remove blank lines from the beginning and end of the document. + txt = re.sub('^\s+(?=.)', '', txt) + txt = re.sub('(?<=.)\s+$', '', txt) + # Remove excessive line breaks. + txt = re.sub('\n{3,}', '\n\n', txt) + + lines = [] + # Split into paragraphs based on having a blank line between text. + for line in txt.split('\n\n'): + if line.strip(): + lines.append('

%s

' % prepare_string_for_xml(line.replace('\n', ' '))) + + return HTML_TEMPLATE % (title, '\n'.join(lines)) + +def convert_markdown(txt, title=''): md = markdown.Markdown( extensions=['footnotes', 'tables', 'toc'], safe_mode=False,) - html = u'%s%s' % (title, - md.convert(txt)) + return HTML_TEMPLATE % (title, md.convert(txt)) - return html +def separate_paragraphs(txt): + txt = txt.replace('\r\n', '\n') + txt = txt.replace('\r', '\n') + txt = re.sub(u'(?<=.)\n(?=.)', u'\n\n', txt) + return txt def opf_writer(path, opf_name, manifest, spine, mi): opf = OPFCreator(path, mi) diff --git a/src/calibre/gui2/convert/txt_input.py b/src/calibre/gui2/convert/txt_input.py index 71dbbe1fe2..3d17eefe0d 100644 --- a/src/calibre/gui2/convert/txt_input.py +++ b/src/calibre/gui2/convert/txt_input.py @@ -14,6 +14,6 @@ class PluginWidget(Widget, Ui_Form): def __init__(self, parent, get_option, get_help, db=None, book_id=None): Widget.__init__(self, parent, 'txt_input', - ['single_line_paras']) + ['single_line_paras', 'markdown']) self.db, self.book_id = db, book_id self.initialize_options(get_option, get_help, db, book_id) diff --git a/src/calibre/gui2/convert/txt_input.ui b/src/calibre/gui2/convert/txt_input.ui index 191e749833..8c22ff721e 100644 --- a/src/calibre/gui2/convert/txt_input.ui +++ b/src/calibre/gui2/convert/txt_input.ui @@ -14,7 +14,7 @@ Form - + Qt::Vertical @@ -34,6 +34,23 @@ + + + + Process using markdown + + + + + + + <p>Markdown is a simple markup language for text files, that allows for advanced formatting. To learn more visit <a href="http://daringfireball.net/projects/markdown">markdown</a>. + + + true + + +