mirror of
https://github.com/kovidgoyal/calibre.git
synced 2025-07-09 03:04:10 -04:00
Implement #3359 (Make markdown processing optional)
This commit is contained in:
parent
1a8bb2f142
commit
5f6c330901
@ -223,16 +223,7 @@ class HTMLPreProcessor(object):
|
|||||||
elif self.is_book_designer(html):
|
elif self.is_book_designer(html):
|
||||||
rules = self.BOOK_DESIGNER
|
rules = self.BOOK_DESIGNER
|
||||||
elif self.is_pdftohtml(html):
|
elif self.is_pdftohtml(html):
|
||||||
end_rules = []
|
rules = self.PDFTOHTML
|
||||||
if getattr(self.extra_opts, 'unwrap_factor', None):
|
|
||||||
length = line_length(html, getattr(self.extra_opts, 'unwrap_factor'))
|
|
||||||
if length:
|
|
||||||
end_rules.append(
|
|
||||||
# Un wrap using punctuation
|
|
||||||
(re.compile(r'(?<=.{%i}[a-z\.,;:)-IA])\s*(?P<ital></(i|b|u)>)?\s*(<p.*?>)\s*(?=(<(i|b|u)>)?\s*[\w\d(])' % length, re.UNICODE), wrap_lines),
|
|
||||||
)
|
|
||||||
|
|
||||||
rules = self.PDFTOHTML + end_rules
|
|
||||||
else:
|
else:
|
||||||
rules = []
|
rules = []
|
||||||
|
|
||||||
@ -246,7 +237,16 @@ class HTMLPreProcessor(object):
|
|||||||
(re.compile(getattr(self.extra_opts, 'footer_regex')), lambda match : '')
|
(re.compile(getattr(self.extra_opts, 'footer_regex')), lambda match : '')
|
||||||
)
|
)
|
||||||
|
|
||||||
for rule in self.PREPROCESS + pre_rules + rules:
|
end_rules = []
|
||||||
|
if getattr(self.extra_opts, 'unwrap_factor', None):
|
||||||
|
length = line_length(html, getattr(self.extra_opts, 'unwrap_factor'))
|
||||||
|
if length:
|
||||||
|
end_rules.append(
|
||||||
|
# Un wrap using punctuation
|
||||||
|
(re.compile(r'(?<=.{%i}[a-z\.,;:)-IA])\s*(?P<ital></(i|b|u)>)?\s*(<p.*?>)\s*(?=(<(i|b|u)>)?\s*[\w\d(])' % length, re.UNICODE), wrap_lines),
|
||||||
|
)
|
||||||
|
|
||||||
|
for rule in self.PREPROCESS + pre_rules + rules + end_rules:
|
||||||
html = rule[0].sub(rule[1], html)
|
html = rule[0].sub(rule[1], html)
|
||||||
|
|
||||||
# Handle broken XHTML w/ SVG (ugh)
|
# Handle broken XHTML w/ SVG (ugh)
|
||||||
|
@ -262,7 +262,7 @@ class HTMLInput(InputFormatPlugin):
|
|||||||
)
|
)
|
||||||
),
|
),
|
||||||
|
|
||||||
OptionRecommendation(name='pdf_line_length', recommended_value=0.5,
|
OptionRecommendation(name='unwrap_factor', recommended_value=0.5,
|
||||||
help=_('Average line length for line breaking if the HTML is from a '
|
help=_('Average line length for line breaking if the HTML is from a '
|
||||||
'previous partial conversion of a PDF file.')),
|
'previous partial conversion of a PDF file.')),
|
||||||
|
|
||||||
|
@ -934,7 +934,7 @@ class Manifest(object):
|
|||||||
|
|
||||||
self.oeb.log.debug('Converting', self.href, '...')
|
self.oeb.log.debug('Converting', self.href, '...')
|
||||||
|
|
||||||
from calibre.ebooks.txt.processor import txt_to_markdown
|
from calibre.ebooks.txt.processor import convert_markdown
|
||||||
|
|
||||||
title = self.oeb.metadata.title
|
title = self.oeb.metadata.title
|
||||||
if title:
|
if title:
|
||||||
@ -942,7 +942,7 @@ class Manifest(object):
|
|||||||
else:
|
else:
|
||||||
title = _('Unknown')
|
title = _('Unknown')
|
||||||
|
|
||||||
return self._parse_xhtml(txt_to_markdown(data, title))
|
return self._parse_xhtml(convert_markdown(data, title))
|
||||||
|
|
||||||
|
|
||||||
def _parse_css(self, data):
|
def _parse_css(self, data):
|
||||||
|
@ -13,8 +13,8 @@ import struct
|
|||||||
|
|
||||||
from calibre.ebooks.compression.palmdoc import decompress_doc
|
from calibre.ebooks.compression.palmdoc import decompress_doc
|
||||||
from calibre.ebooks.pdb.formatreader import FormatReader
|
from calibre.ebooks.pdb.formatreader import FormatReader
|
||||||
from calibre.ebooks.txt.processor import opf_writer
|
from calibre.ebooks.txt.processor import convert_basic, separate_paragraphs, \
|
||||||
from calibre.ebooks.txt.processor import txt_to_markdown
|
opf_writer
|
||||||
|
|
||||||
class HeaderRecord(object):
|
class HeaderRecord(object):
|
||||||
'''
|
'''
|
||||||
@ -62,7 +62,9 @@ class Reader(FormatReader):
|
|||||||
txt += self.decompress_text(i)
|
txt += self.decompress_text(i)
|
||||||
|
|
||||||
self.log.info('Converting text to OEB...')
|
self.log.info('Converting text to OEB...')
|
||||||
html = txt_to_markdown(txt, single_line_paras=self.single_line_paras)
|
if self.single_line_paras:
|
||||||
|
txt = separate_paragraphs(txt)
|
||||||
|
html = convert_basic(txt)
|
||||||
with open(os.path.join(output_dir, 'index.html'), 'wb') as index:
|
with open(os.path.join(output_dir, 'index.html'), 'wb') as index:
|
||||||
index.write(html.encode('utf-8'))
|
index.write(html.encode('utf-8'))
|
||||||
|
|
||||||
|
@ -12,7 +12,8 @@ import os, struct, zlib
|
|||||||
|
|
||||||
from calibre.ebooks.pdb.formatreader import FormatReader
|
from calibre.ebooks.pdb.formatreader import FormatReader
|
||||||
from calibre.ebooks.pdb.ztxt import zTXTError
|
from calibre.ebooks.pdb.ztxt import zTXTError
|
||||||
from calibre.ebooks.txt.processor import txt_to_markdown, opf_writer
|
from calibre.ebooks.txt.processor import convert_basic, separate_paragraphs, \
|
||||||
|
opf_writer
|
||||||
|
|
||||||
SUPPORTED_VERSION = (1, 40)
|
SUPPORTED_VERSION = (1, 40)
|
||||||
|
|
||||||
@ -77,7 +78,9 @@ class Reader(FormatReader):
|
|||||||
txt += self.decompress_text(i)
|
txt += self.decompress_text(i)
|
||||||
|
|
||||||
self.log.info('Converting text to OEB...')
|
self.log.info('Converting text to OEB...')
|
||||||
html = txt_to_markdown(txt, single_line_paras=self.single_line_paras)
|
if self.single_line_paras:
|
||||||
|
txt = separate_paragraphs(txt)
|
||||||
|
html = convert_basic(txt)
|
||||||
with open(os.path.join(output_dir, 'index.html'), 'wb') as index:
|
with open(os.path.join(output_dir, 'index.html'), 'wb') as index:
|
||||||
index.write(html.encode('utf-8'))
|
index.write(html.encode('utf-8'))
|
||||||
|
|
||||||
|
@ -7,7 +7,8 @@ __docformat__ = 'restructuredtext en'
|
|||||||
import os
|
import os
|
||||||
|
|
||||||
from calibre.customize.conversion import InputFormatPlugin, OptionRecommendation
|
from calibre.customize.conversion import InputFormatPlugin, OptionRecommendation
|
||||||
from calibre.ebooks.txt.processor import txt_to_markdown
|
from calibre.ebooks.txt.processor import convert_basic, convert_markdown, \
|
||||||
|
separate_paragraphs
|
||||||
|
|
||||||
class TXTInput(InputFormatPlugin):
|
class TXTInput(InputFormatPlugin):
|
||||||
|
|
||||||
@ -21,6 +22,8 @@ class TXTInput(InputFormatPlugin):
|
|||||||
help=_('Normally calibre treats blank lines as paragraph markers. '
|
help=_('Normally calibre treats blank lines as paragraph markers. '
|
||||||
'With this option it will assume that every line represents '
|
'With this option it will assume that every line represents '
|
||||||
'a paragraph instead.')),
|
'a paragraph instead.')),
|
||||||
|
OptionRecommendation(name='markdown', recommended_value=False,
|
||||||
|
help=_('Run the text input through the markdown processor.')),
|
||||||
])
|
])
|
||||||
|
|
||||||
def convert(self, stream, options, file_ext, log,
|
def convert(self, stream, options, file_ext, log,
|
||||||
@ -31,12 +34,18 @@ class TXTInput(InputFormatPlugin):
|
|||||||
log.debug('Reading text from file...')
|
log.debug('Reading text from file...')
|
||||||
txt = stream.read().decode(ienc, 'replace')
|
txt = stream.read().decode(ienc, 'replace')
|
||||||
|
|
||||||
log.debug('Running text though markdown conversion...')
|
if options.single_line_paras:
|
||||||
try:
|
txt = separate_paragraphs(txt)
|
||||||
html = txt_to_markdown(txt, single_line_paras=options.single_line_paras)
|
|
||||||
except RuntimeError:
|
if options.markdown:
|
||||||
raise ValueError('This txt file has malformed markup, it cannot be'
|
log.debug('Running text though markdown conversion...')
|
||||||
'converted by calibre. See http://daringfireball.net/projects/markdown/syntax')
|
try:
|
||||||
|
html = convert_markdown(txt)
|
||||||
|
except RuntimeError:
|
||||||
|
raise ValueError('This txt file has malformed markup, it cannot be'
|
||||||
|
'converted by calibre. See http://daringfireball.net/projects/markdown/syntax')
|
||||||
|
else:
|
||||||
|
html = convert_basic(txt)
|
||||||
|
|
||||||
from calibre.customize.ui import plugin_for_input_format
|
from calibre.customize.ui import plugin_for_input_format
|
||||||
html_input = plugin_for_input_format('html')
|
html_input = plugin_for_input_format('html')
|
||||||
|
@ -5,7 +5,9 @@ Read content from txt file.
|
|||||||
'''
|
'''
|
||||||
|
|
||||||
import os
|
import os
|
||||||
|
import re
|
||||||
|
|
||||||
|
from calibre import prepare_string_for_xml
|
||||||
from calibre.ebooks.markdown import markdown
|
from calibre.ebooks.markdown import markdown
|
||||||
from calibre.ebooks.metadata.opf2 import OPFCreator
|
from calibre.ebooks.metadata.opf2 import OPFCreator
|
||||||
|
|
||||||
@ -13,18 +15,41 @@ __license__ = 'GPL v3'
|
|||||||
__copyright__ = '2009, John Schember <john@nachtimwald.com>'
|
__copyright__ = '2009, John Schember <john@nachtimwald.com>'
|
||||||
__docformat__ = 'restructuredtext en'
|
__docformat__ = 'restructuredtext en'
|
||||||
|
|
||||||
def txt_to_markdown(txt, title='', single_line_paras=False):
|
HTML_TEMPLATE = u'<html><head><meta http-equiv="Content-Type" content="text/html; charset=utf-8"/><title>%s</title></head><body>\n%s\n</body></html>'
|
||||||
if single_line_paras:
|
|
||||||
txt = txt.replace('\r\n', '\n')
|
def convert_basic(txt, title=''):
|
||||||
txt = txt.replace('\r', '\n')
|
lines = []
|
||||||
txt = txt.replace('\n', '\n\n')
|
# Strip whitespace from the beginning and end of the line. Also replace
|
||||||
|
# all line breaks with \n.
|
||||||
|
for line in txt.splitlines():
|
||||||
|
lines.append(line.strip())
|
||||||
|
txt = '\n'.join(lines)
|
||||||
|
|
||||||
|
# Remove blank lines from the beginning and end of the document.
|
||||||
|
txt = re.sub('^\s+(?=.)', '', txt)
|
||||||
|
txt = re.sub('(?<=.)\s+$', '', txt)
|
||||||
|
# Remove excessive line breaks.
|
||||||
|
txt = re.sub('\n{3,}', '\n\n', txt)
|
||||||
|
|
||||||
|
lines = []
|
||||||
|
# Split into paragraphs based on having a blank line between text.
|
||||||
|
for line in txt.split('\n\n'):
|
||||||
|
if line.strip():
|
||||||
|
lines.append('<p>%s</p>' % prepare_string_for_xml(line.replace('\n', ' ')))
|
||||||
|
|
||||||
|
return HTML_TEMPLATE % (title, '\n'.join(lines))
|
||||||
|
|
||||||
|
def convert_markdown(txt, title=''):
|
||||||
md = markdown.Markdown(
|
md = markdown.Markdown(
|
||||||
extensions=['footnotes', 'tables', 'toc'],
|
extensions=['footnotes', 'tables', 'toc'],
|
||||||
safe_mode=False,)
|
safe_mode=False,)
|
||||||
html = u'<html><head><meta http-equiv="Content-Type" content="text/html; charset=utf-8"/><title>%s</title></head><body>%s</body></html>' % (title,
|
return HTML_TEMPLATE % (title, md.convert(txt))
|
||||||
md.convert(txt))
|
|
||||||
|
|
||||||
return html
|
def separate_paragraphs(txt):
|
||||||
|
txt = txt.replace('\r\n', '\n')
|
||||||
|
txt = txt.replace('\r', '\n')
|
||||||
|
txt = re.sub(u'(?<=.)\n(?=.)', u'\n\n', txt)
|
||||||
|
return txt
|
||||||
|
|
||||||
def opf_writer(path, opf_name, manifest, spine, mi):
|
def opf_writer(path, opf_name, manifest, spine, mi):
|
||||||
opf = OPFCreator(path, mi)
|
opf = OPFCreator(path, mi)
|
||||||
|
@ -14,6 +14,6 @@ class PluginWidget(Widget, Ui_Form):
|
|||||||
|
|
||||||
def __init__(self, parent, get_option, get_help, db=None, book_id=None):
|
def __init__(self, parent, get_option, get_help, db=None, book_id=None):
|
||||||
Widget.__init__(self, parent, 'txt_input',
|
Widget.__init__(self, parent, 'txt_input',
|
||||||
['single_line_paras'])
|
['single_line_paras', 'markdown'])
|
||||||
self.db, self.book_id = db, book_id
|
self.db, self.book_id = db, book_id
|
||||||
self.initialize_options(get_option, get_help, db, book_id)
|
self.initialize_options(get_option, get_help, db, book_id)
|
||||||
|
@ -14,7 +14,7 @@
|
|||||||
<string>Form</string>
|
<string>Form</string>
|
||||||
</property>
|
</property>
|
||||||
<layout class="QGridLayout" name="gridLayout">
|
<layout class="QGridLayout" name="gridLayout">
|
||||||
<item row="1" column="0">
|
<item row="3" column="0">
|
||||||
<spacer name="verticalSpacer">
|
<spacer name="verticalSpacer">
|
||||||
<property name="orientation">
|
<property name="orientation">
|
||||||
<enum>Qt::Vertical</enum>
|
<enum>Qt::Vertical</enum>
|
||||||
@ -34,6 +34,23 @@
|
|||||||
</property>
|
</property>
|
||||||
</widget>
|
</widget>
|
||||||
</item>
|
</item>
|
||||||
|
<item row="1" column="0">
|
||||||
|
<widget class="QCheckBox" name="opt_markdown">
|
||||||
|
<property name="text">
|
||||||
|
<string>Process using markdown</string>
|
||||||
|
</property>
|
||||||
|
</widget>
|
||||||
|
</item>
|
||||||
|
<item row="2" column="0">
|
||||||
|
<widget class="QLabel" name="label">
|
||||||
|
<property name="text">
|
||||||
|
<string>&lt;p&gt;Markdown is a simple markup language for text files, that allows for advanced formatting. To learn more visit &lt;a href=&quot;http://daringfireball.net/projects/markdown&quot;&gt;markdown&lt;/a&gt;.</string>
|
||||||
|
</property>
|
||||||
|
<property name="wordWrap">
|
||||||
|
<bool>true</bool>
|
||||||
|
</property>
|
||||||
|
</widget>
|
||||||
|
</item>
|
||||||
</layout>
|
</layout>
|
||||||
</widget>
|
</widget>
|
||||||
<resources/>
|
<resources/>
|
||||||
|
Loading…
x
Reference in New Issue
Block a user