Implement #3359 (Make markdown processing optional)

This commit is contained in:
Kovid Goyal 2009-09-01 17:43:10 -06:00
parent 1a8bb2f142
commit 5f6c330901
9 changed files with 92 additions and 36 deletions

View File

@ -223,16 +223,7 @@ class HTMLPreProcessor(object):
elif self.is_book_designer(html): elif self.is_book_designer(html):
rules = self.BOOK_DESIGNER rules = self.BOOK_DESIGNER
elif self.is_pdftohtml(html): elif self.is_pdftohtml(html):
end_rules = [] rules = self.PDFTOHTML
if getattr(self.extra_opts, 'unwrap_factor', None):
length = line_length(html, getattr(self.extra_opts, 'unwrap_factor'))
if length:
end_rules.append(
# Un wrap using punctuation
(re.compile(r'(?<=.{%i}[a-z\.,;:)-IA])\s*(?P<ital></(i|b|u)>)?\s*(<p.*?>)\s*(?=(<(i|b|u)>)?\s*[\w\d(])' % length, re.UNICODE), wrap_lines),
)
rules = self.PDFTOHTML + end_rules
else: else:
rules = [] rules = []
@ -246,7 +237,16 @@ class HTMLPreProcessor(object):
(re.compile(getattr(self.extra_opts, 'footer_regex')), lambda match : '') (re.compile(getattr(self.extra_opts, 'footer_regex')), lambda match : '')
) )
for rule in self.PREPROCESS + pre_rules + rules: end_rules = []
if getattr(self.extra_opts, 'unwrap_factor', None):
length = line_length(html, getattr(self.extra_opts, 'unwrap_factor'))
if length:
end_rules.append(
# Un wrap using punctuation
(re.compile(r'(?<=.{%i}[a-z\.,;:)-IA])\s*(?P<ital></(i|b|u)>)?\s*(<p.*?>)\s*(?=(<(i|b|u)>)?\s*[\w\d(])' % length, re.UNICODE), wrap_lines),
)
for rule in self.PREPROCESS + pre_rules + rules + end_rules:
html = rule[0].sub(rule[1], html) html = rule[0].sub(rule[1], html)
# Handle broken XHTML w/ SVG (ugh) # Handle broken XHTML w/ SVG (ugh)

View File

@ -262,7 +262,7 @@ class HTMLInput(InputFormatPlugin):
) )
), ),
OptionRecommendation(name='pdf_line_length', recommended_value=0.5, OptionRecommendation(name='unwrap_factor', recommended_value=0.5,
help=_('Average line length for line breaking if the HTML is from a ' help=_('Average line length for line breaking if the HTML is from a '
'previous partial conversion of a PDF file.')), 'previous partial conversion of a PDF file.')),

View File

@ -934,7 +934,7 @@ class Manifest(object):
self.oeb.log.debug('Converting', self.href, '...') self.oeb.log.debug('Converting', self.href, '...')
from calibre.ebooks.txt.processor import txt_to_markdown from calibre.ebooks.txt.processor import convert_markdown
title = self.oeb.metadata.title title = self.oeb.metadata.title
if title: if title:
@ -942,7 +942,7 @@ class Manifest(object):
else: else:
title = _('Unknown') title = _('Unknown')
return self._parse_xhtml(txt_to_markdown(data, title)) return self._parse_xhtml(convert_markdown(data, title))
def _parse_css(self, data): def _parse_css(self, data):

View File

@ -13,8 +13,8 @@ import struct
from calibre.ebooks.compression.palmdoc import decompress_doc from calibre.ebooks.compression.palmdoc import decompress_doc
from calibre.ebooks.pdb.formatreader import FormatReader from calibre.ebooks.pdb.formatreader import FormatReader
from calibre.ebooks.txt.processor import opf_writer from calibre.ebooks.txt.processor import convert_basic, separate_paragraphs, \
from calibre.ebooks.txt.processor import txt_to_markdown opf_writer
class HeaderRecord(object): class HeaderRecord(object):
''' '''
@ -62,7 +62,9 @@ class Reader(FormatReader):
txt += self.decompress_text(i) txt += self.decompress_text(i)
self.log.info('Converting text to OEB...') self.log.info('Converting text to OEB...')
html = txt_to_markdown(txt, single_line_paras=self.single_line_paras) if self.single_line_paras:
txt = separate_paragraphs(txt)
html = convert_basic(txt)
with open(os.path.join(output_dir, 'index.html'), 'wb') as index: with open(os.path.join(output_dir, 'index.html'), 'wb') as index:
index.write(html.encode('utf-8')) index.write(html.encode('utf-8'))

View File

@ -12,7 +12,8 @@ import os, struct, zlib
from calibre.ebooks.pdb.formatreader import FormatReader from calibre.ebooks.pdb.formatreader import FormatReader
from calibre.ebooks.pdb.ztxt import zTXTError from calibre.ebooks.pdb.ztxt import zTXTError
from calibre.ebooks.txt.processor import txt_to_markdown, opf_writer from calibre.ebooks.txt.processor import convert_basic, separate_paragraphs, \
opf_writer
SUPPORTED_VERSION = (1, 40) SUPPORTED_VERSION = (1, 40)
@ -77,7 +78,9 @@ class Reader(FormatReader):
txt += self.decompress_text(i) txt += self.decompress_text(i)
self.log.info('Converting text to OEB...') self.log.info('Converting text to OEB...')
html = txt_to_markdown(txt, single_line_paras=self.single_line_paras) if self.single_line_paras:
txt = separate_paragraphs(txt)
html = convert_basic(txt)
with open(os.path.join(output_dir, 'index.html'), 'wb') as index: with open(os.path.join(output_dir, 'index.html'), 'wb') as index:
index.write(html.encode('utf-8')) index.write(html.encode('utf-8'))

View File

@ -7,7 +7,8 @@ __docformat__ = 'restructuredtext en'
import os import os
from calibre.customize.conversion import InputFormatPlugin, OptionRecommendation from calibre.customize.conversion import InputFormatPlugin, OptionRecommendation
from calibre.ebooks.txt.processor import txt_to_markdown from calibre.ebooks.txt.processor import convert_basic, convert_markdown, \
separate_paragraphs
class TXTInput(InputFormatPlugin): class TXTInput(InputFormatPlugin):
@ -21,6 +22,8 @@ class TXTInput(InputFormatPlugin):
help=_('Normally calibre treats blank lines as paragraph markers. ' help=_('Normally calibre treats blank lines as paragraph markers. '
'With this option it will assume that every line represents ' 'With this option it will assume that every line represents '
'a paragraph instead.')), 'a paragraph instead.')),
OptionRecommendation(name='markdown', recommended_value=False,
help=_('Run the text input through the markdown processor.')),
]) ])
def convert(self, stream, options, file_ext, log, def convert(self, stream, options, file_ext, log,
@ -31,12 +34,18 @@ class TXTInput(InputFormatPlugin):
log.debug('Reading text from file...') log.debug('Reading text from file...')
txt = stream.read().decode(ienc, 'replace') txt = stream.read().decode(ienc, 'replace')
log.debug('Running text though markdown conversion...') if options.single_line_paras:
try: txt = separate_paragraphs(txt)
html = txt_to_markdown(txt, single_line_paras=options.single_line_paras)
except RuntimeError: if options.markdown:
raise ValueError('This txt file has malformed markup, it cannot be' log.debug('Running text though markdown conversion...')
'converted by calibre. See http://daringfireball.net/projects/markdown/syntax') try:
html = convert_markdown(txt)
except RuntimeError:
raise ValueError('This txt file has malformed markup, it cannot be'
' converted by calibre. See http://daringfireball.net/projects/markdown/syntax')
else:
html = convert_basic(txt)
from calibre.customize.ui import plugin_for_input_format from calibre.customize.ui import plugin_for_input_format
html_input = plugin_for_input_format('html') html_input = plugin_for_input_format('html')

View File

@ -5,7 +5,9 @@ Read content from txt file.
''' '''
import os import os
import re
from calibre import prepare_string_for_xml
from calibre.ebooks.markdown import markdown from calibre.ebooks.markdown import markdown
from calibre.ebooks.metadata.opf2 import OPFCreator from calibre.ebooks.metadata.opf2 import OPFCreator
@ -13,18 +15,41 @@ __license__ = 'GPL v3'
__copyright__ = '2009, John Schember <john@nachtimwald.com>' __copyright__ = '2009, John Schember <john@nachtimwald.com>'
__docformat__ = 'restructuredtext en' __docformat__ = 'restructuredtext en'
def txt_to_markdown(txt, title='', single_line_paras=False): HTML_TEMPLATE = u'<html><head><meta http-equiv="Content-Type" content="text/html; charset=utf-8"/><title>%s</title></head><body>\n%s\n</body></html>'
if single_line_paras:
txt = txt.replace('\r\n', '\n') def convert_basic(txt, title=''):
txt = txt.replace('\r', '\n') lines = []
txt = txt.replace('\n', '\n\n') # Strip whitespace from the beginning and end of the line. Also replace
# all line breaks with \n.
for line in txt.splitlines():
lines.append(line.strip())
txt = '\n'.join(lines)
# Remove blank lines from the beginning and end of the document.
txt = re.sub('^\s+(?=.)', '', txt)
txt = re.sub('(?<=.)\s+$', '', txt)
# Remove excessive line breaks.
txt = re.sub('\n{3,}', '\n\n', txt)
lines = []
# Split into paragraphs based on having a blank line between text.
for line in txt.split('\n\n'):
if line.strip():
lines.append('<p>%s</p>' % prepare_string_for_xml(line.replace('\n', ' ')))
return HTML_TEMPLATE % (title, '\n'.join(lines))
def convert_markdown(txt, title=''):
md = markdown.Markdown( md = markdown.Markdown(
extensions=['footnotes', 'tables', 'toc'], extensions=['footnotes', 'tables', 'toc'],
safe_mode=False,) safe_mode=False,)
html = u'<html><head><meta http-equiv="Content-Type" content="text/html; charset=utf-8"/><title>%s</title></head><body>%s</body></html>' % (title, return HTML_TEMPLATE % (title, md.convert(txt))
md.convert(txt))
return html def separate_paragraphs(txt):
txt = txt.replace('\r\n', '\n')
txt = txt.replace('\r', '\n')
txt = re.sub(u'(?<=.)\n(?=.)', u'\n\n', txt)
return txt
def opf_writer(path, opf_name, manifest, spine, mi): def opf_writer(path, opf_name, manifest, spine, mi):
opf = OPFCreator(path, mi) opf = OPFCreator(path, mi)

View File

@ -14,6 +14,6 @@ class PluginWidget(Widget, Ui_Form):
def __init__(self, parent, get_option, get_help, db=None, book_id=None): def __init__(self, parent, get_option, get_help, db=None, book_id=None):
Widget.__init__(self, parent, 'txt_input', Widget.__init__(self, parent, 'txt_input',
['single_line_paras']) ['single_line_paras', 'markdown'])
self.db, self.book_id = db, book_id self.db, self.book_id = db, book_id
self.initialize_options(get_option, get_help, db, book_id) self.initialize_options(get_option, get_help, db, book_id)

View File

@ -14,7 +14,7 @@
<string>Form</string> <string>Form</string>
</property> </property>
<layout class="QGridLayout" name="gridLayout"> <layout class="QGridLayout" name="gridLayout">
<item row="1" column="0"> <item row="3" column="0">
<spacer name="verticalSpacer"> <spacer name="verticalSpacer">
<property name="orientation"> <property name="orientation">
<enum>Qt::Vertical</enum> <enum>Qt::Vertical</enum>
@ -34,6 +34,23 @@
</property> </property>
</widget> </widget>
</item> </item>
<item row="1" column="0">
<widget class="QCheckBox" name="opt_markdown">
<property name="text">
<string>Process using markdown</string>
</property>
</widget>
</item>
<item row="2" column="0">
<widget class="QLabel" name="label">
<property name="text">
<string>&lt;p&gt;Markdown is a simple markup language for text files, that allows for advanced formatting. To learn more visit &lt;a href=&quot;http://daringfireball.net/projects/markdown&quot;&gt;markdown&lt;/a&gt;.</string>
</property>
<property name="wordWrap">
<bool>true</bool>
</property>
</widget>
</item>
</layout> </layout>
</widget> </widget>
<resources/> <resources/>