Implement #3359 (Make markdown processing optional)

2025-07-09 03:04:10 -04:00 · 2009-09-01 17:43:10 -06:00 · 2009-09-01 17:43:10 -06:00 · 5f6c330901
commit 5f6c330901
parent 1a8bb2f142
9 changed files with 92 additions and 36 deletions
--- a/src/calibre/ebooks/conversion/preprocess.py
+++ b/src/calibre/ebooks/conversion/preprocess.py
@@ -223,16 +223,7 @@ class HTMLPreProcessor(object):
        elif self.is_book_designer(html):
            rules = self.BOOK_DESIGNER
        elif self.is_pdftohtml(html):
-            end_rules = []
+            rules = self.PDFTOHTML
            if getattr(self.extra_opts, 'unwrap_factor', None):
                length = line_length(html, getattr(self.extra_opts, 'unwrap_factor'))
                if length:
                    end_rules.append(
                        # Un wrap using punctuation
                        (re.compile(r'(?<=.{%i}[a-z\.,;:)-IA])\s*(?P<ital></(i|b|u)>)?\s*(<p.*?>)\s*(?=(<(i|b|u)>)?\s*[\w\d(])' % length, re.UNICODE), wrap_lines),
                    )
            rules = self.PDFTOHTML + end_rules
        else:
            rules = []
@@ -246,7 +237,16 @@ class HTMLPreProcessor(object):
                (re.compile(getattr(self.extra_opts, 'footer_regex')), lambda match : '')
            )
-        for rule in self.PREPROCESS + pre_rules + rules:
+            end_rules = []
            if getattr(self.extra_opts, 'unwrap_factor', None):
                length = line_length(html, getattr(self.extra_opts, 'unwrap_factor'))
                if length:
                    end_rules.append(
                        # Un wrap using punctuation
                        (re.compile(r'(?<=.{%i}[a-z\.,;:)-IA])\s*(?P<ital></(i|b|u)>)?\s*(<p.*?>)\s*(?=(<(i|b|u)>)?\s*[\w\d(])' % length, re.UNICODE), wrap_lines),
                    )
        for rule in self.PREPROCESS + pre_rules + rules + end_rules:
            html = rule[0].sub(rule[1], html)
        # Handle broken XHTML w/ SVG (ugh)
--- a/src/calibre/ebooks/html/input.py
+++ b/src/calibre/ebooks/html/input.py
@@ -262,7 +262,7 @@ class HTMLInput(InputFormatPlugin):
                )
        ),
-        OptionRecommendation(name='pdf_line_length', recommended_value=0.5,
+        OptionRecommendation(name='unwrap_factor', recommended_value=0.5,
            help=_('Average line length for line breaking if the HTML is from a '
                'previous partial conversion of a PDF file.')),
--- a/src/calibre/ebooks/oeb/base.py
+++ b/src/calibre/ebooks/oeb/base.py
@@ -934,7 +934,7 @@ class Manifest(object):
            self.oeb.log.debug('Converting', self.href, '...')
-            from calibre.ebooks.txt.processor import txt_to_markdown
+            from calibre.ebooks.txt.processor import convert_markdown
            title = self.oeb.metadata.title
            if title:
@@ -942,7 +942,7 @@ class Manifest(object):
            else:
                title = _('Unknown')
-            return self._parse_xhtml(txt_to_markdown(data, title))
+            return self._parse_xhtml(convert_markdown(data, title))
        def _parse_css(self, data):
--- a/src/calibre/ebooks/pdb/palmdoc/reader.py
+++ b/src/calibre/ebooks/pdb/palmdoc/reader.py
@@ -13,8 +13,8 @@ import struct
 from calibre.ebooks.compression.palmdoc import decompress_doc
 from calibre.ebooks.pdb.formatreader import FormatReader
-from calibre.ebooks.txt.processor import opf_writer
+from calibre.ebooks.txt.processor import convert_basic, separate_paragraphs, \
-from calibre.ebooks.txt.processor import txt_to_markdown
+    opf_writer
 class HeaderRecord(object):
    '''
@@ -62,7 +62,9 @@ class Reader(FormatReader):
            txt += self.decompress_text(i)
        self.log.info('Converting text to OEB...')
-        html = txt_to_markdown(txt, single_line_paras=self.single_line_paras)
+        if self.single_line_paras:
            txt = separate_paragraphs(txt)
        html = convert_basic(txt)
        with open(os.path.join(output_dir, 'index.html'), 'wb') as index:
            index.write(html.encode('utf-8'))
--- a/src/calibre/ebooks/pdb/ztxt/reader.py
+++ b/src/calibre/ebooks/pdb/ztxt/reader.py
@@ -12,7 +12,8 @@ import os, struct, zlib
 from calibre.ebooks.pdb.formatreader import FormatReader
 from calibre.ebooks.pdb.ztxt import zTXTError
-from calibre.ebooks.txt.processor import txt_to_markdown, opf_writer
+from calibre.ebooks.txt.processor import convert_basic, separate_paragraphs, \
    opf_writer
 SUPPORTED_VERSION = (1, 40)
@@ -77,7 +78,9 @@ class Reader(FormatReader):
            txt += self.decompress_text(i)
        self.log.info('Converting text to OEB...')
-        html = txt_to_markdown(txt, single_line_paras=self.single_line_paras)
+        if self.single_line_paras:
            txt = separate_paragraphs(txt)
        html = convert_basic(txt)
        with open(os.path.join(output_dir, 'index.html'), 'wb') as index:
            index.write(html.encode('utf-8'))
--- a/src/calibre/ebooks/txt/input.py
+++ b/src/calibre/ebooks/txt/input.py
@@ -7,7 +7,8 @@ __docformat__ = 'restructuredtext en'
 import os
 from calibre.customize.conversion import InputFormatPlugin, OptionRecommendation
-from calibre.ebooks.txt.processor import txt_to_markdown
+from calibre.ebooks.txt.processor import convert_basic, convert_markdown, \
    separate_paragraphs
 class TXTInput(InputFormatPlugin):
@@ -21,6 +22,8 @@ class TXTInput(InputFormatPlugin):
            help=_('Normally calibre treats blank lines as paragraph markers. '
                'With this option it will assume that every line represents '
                'a paragraph instead.')),
        OptionRecommendation(name='markdown', recommended_value=False,
            help=_('Run the text input though the markdown processor.')),
    ])
    def convert(self, stream, options, file_ext, log,
@@ -31,12 +34,18 @@ class TXTInput(InputFormatPlugin):
        log.debug('Reading text from file...')
        txt = stream.read().decode(ienc, 'replace')
-        log.debug('Running text though markdown conversion...')
+        if options.single_line_paras:
-        try:
+            txt = separate_paragraphs(txt)
-            html = txt_to_markdown(txt, single_line_paras=options.single_line_paras)
+
-        except RuntimeError:
+        if options.markdown:
-            raise ValueError('This txt file has malformed markup, it cannot be'
+            log.debug('Running text though markdown conversion...')
-                'converted by calibre. See http://daringfireball.net/projects/markdown/syntax')
+            try:
                html = convert_markdown(txt)
            except RuntimeError:
                raise ValueError('This txt file has malformed markup, it cannot be'
                    'converted by calibre. See http://daringfireball.net/projects/markdown/syntax')
        else:
            html = convert_basic(txt)
        from calibre.customize.ui import plugin_for_input_format
        html_input = plugin_for_input_format('html')
--- a/src/calibre/ebooks/txt/processor.py
+++ b/src/calibre/ebooks/txt/processor.py
@@ -5,7 +5,9 @@ Read content from txt file.
 '''
 import os
 import re
 from calibre import prepare_string_for_xml
 from calibre.ebooks.markdown import markdown
 from calibre.ebooks.metadata.opf2 import OPFCreator
@@ -13,18 +15,41 @@ __license__   = 'GPL v3'
 __copyright__ = '2009, John Schember <john@nachtimwald.com>'
 __docformat__ = 'restructuredtext en'
-def txt_to_markdown(txt, title='', single_line_paras=False):
+HTML_TEMPLATE = u'<html><head><meta http-equiv="Content-Type" content="text/html; charset=utf-8"/><title>%s</title></head><body>\n%s\n</body></html>'
-    if single_line_paras:
+
-        txt = txt.replace('\r\n', '\n')
+def convert_basic(txt, title=''):
-        txt = txt.replace('\r', '\n')
+    lines = []
-        txt = txt.replace('\n', '\n\n')
+    # Strip whitespace from the beginning and end of the line. Also replace
    # all line breaks with \n.
    for line in txt.splitlines():
        lines.append(line.strip())
    txt = '\n'.join(lines)
    # Remove blank lines from the beginning and end of the document.
    txt = re.sub('^\s+(?=.)', '', txt)
    txt = re.sub('(?<=.)\s+$', '', txt)
    # Remove excessive line breaks.
    txt = re.sub('\n{3,}', '\n\n', txt)
    lines = []
    # Split into paragraphs based on having a blank line between text.
    for line in txt.split('\n\n'):
        if line.strip():
            lines.append('<p>%s</p>' % prepare_string_for_xml(line.replace('\n', ' ')))
    return HTML_TEMPLATE % (title, '\n'.join(lines))
 def convert_markdown(txt, title=''):
    md = markdown.Markdown(
        extensions=['footnotes', 'tables', 'toc'],
        safe_mode=False,)
-    html = u'<html><head><meta http-equiv="Content-Type" content="text/html; charset=utf-8"/><title>%s</title></head><body>%s</body></html>' % (title,
+    return HTML_TEMPLATE % (title, md.convert(txt))
        md.convert(txt))
-    return html
+def separate_paragraphs(txt):
    txt = txt.replace('\r\n', '\n')
    txt = txt.replace('\r', '\n')
    txt = re.sub(u'(?<=.)\n(?=.)', u'\n\n', txt)
    return txt
 def opf_writer(path, opf_name, manifest, spine, mi):
    opf = OPFCreator(path, mi)
--- a/src/calibre/gui2/convert/txt_input.py
+++ b/src/calibre/gui2/convert/txt_input.py
@@ -14,6 +14,6 @@ class PluginWidget(Widget, Ui_Form):
    def __init__(self, parent, get_option, get_help, db=None, book_id=None):
        Widget.__init__(self, parent, 'txt_input',
-            ['single_line_paras'])
+            ['single_line_paras', 'markdown'])
        self.db, self.book_id = db, book_id
        self.initialize_options(get_option, get_help, db, book_id)
--- a/src/calibre/gui2/convert/txt_input.ui
+++ b/src/calibre/gui2/convert/txt_input.ui
@@ -14,7 +14,7 @@
   <string>Form</string>
  </property>
  <layout class="QGridLayout" name="gridLayout">
-   <item row="1" column="0">
+   <item row="3" column="0">
    <spacer name="verticalSpacer">
     <property name="orientation">
      <enum>Qt::Vertical</enum>
@@ -34,6 +34,23 @@
     </property>
    </widget>
   </item>
   <item row="1" column="0">
    <widget class="QCheckBox" name="opt_markdown">
     <property name="text">
      <string>Process using markdown</string>
     </property>
    </widget>
   </item>
   <item row="2" column="0">
    <widget class="QLabel" name="label">
     <property name="text">
      <string>&lt;p&gt;Markdown is a simple markup language for text files, that allows for advanced formatting. To learn more visit &lt;a href=&quot;http://daringfireball.net/projects/markdown&quot;&gt;markdown&lt;/a&gt;.</string>
     </property>
     <property name="wordWrap">
      <bool>true</bool>
     </property>
    </widget>
   </item>
  </layout>
 </widget>
 <resources/>