mirror of
https://github.com/kovidgoyal/calibre.git
synced 2025-07-09 03:04:10 -04:00
TXT Output: Add option to generate Markdown output. Turn <br> tags into spaces.
This commit is contained in:
commit
f2f81550db
63
src/calibre/ebooks/txt/markdownml.py
Normal file
63
src/calibre/ebooks/txt/markdownml.py
Normal file
@ -0,0 +1,63 @@
|
||||
# -*- coding: utf-8 -*-
|
||||
|
||||
__license__ = 'GPL 3'
|
||||
__copyright__ = '2009, John Schember <john@nachtimwald.com>'
|
||||
__docformat__ = 'restructuredtext en'
|
||||
|
||||
'''
|
||||
Transform OEB content into Markdown formatted plain text
|
||||
'''
|
||||
|
||||
import re
|
||||
|
||||
from lxml import etree
|
||||
|
||||
from calibre.utils.html2text import html2text
|
||||
|
||||
class MarkdownMLizer(object):
    '''
    Convert the spine of an OEB book into Markdown formatted plain text.

    Each spine item is serialized to markup, optionally stripped of link
    and image tags, run through html2text and then joined into a single
    string with a blank line guaranteed between sections.
    '''

    # Opening tags. The lookahead requires the tag name to end right after
    # "a" / "img", so elements such as <abbr>, <address> or <article> are
    # not mistaken for links.
    _LINK_OPEN_RE = re.compile(r'<\s*a(?=[\s/>])[^>]*>')
    _LINK_CLOSE_RE = re.compile(r'<\s*/\s*a\s*>')
    _IMG_OPEN_RE = re.compile(r'<\s*img(?=[\s/>])[^>]*>')
    _IMG_CLOSE_RE = re.compile(r'<\s*/\s*img\s*>')

    def __init__(self, log):
        '''
        :param log: Logger object; its ``info`` and ``debug`` methods are
                    used to report progress.
        '''
        self.log = log

    def extract_content(self, oeb_book, opts):
        '''
        Convert ``oeb_book`` to Markdown formatted text.

        :param oeb_book: Book whose ``spine`` items are converted.
        :param opts: Conversion options; ``keep_links`` and
                     ``keep_image_references`` are read.
        :return: The converted text of every spine item, concatenated.
        '''
        self.log.info('Converting XHTML to Markdown formatted TXT...')
        self.oeb_book = oeb_book
        self.opts = opts

        return self.mlize_spine()

    def mlize_spine(self):
        '''
        Convert every item in the spine and join the results.

        :return: A single string containing the text of all sections.
        '''
        output = []

        for item in self.oeb_book.spine:
            self.log.debug('Converting %s to Markdown formatted TXT...' % item.href)

            html = unicode(etree.tostring(item.data, encoding=unicode))

            if not self.opts.keep_links:
                html = self._strip_links(html)
            if not self.opts.keep_image_references:
                html = self._strip_images(html)

            text = html2text(html)

            # Append the whole section as one element; extending the list
            # with a string would add it one character at a time.
            output.append(self._terminate_section(text))

        return u''.join(output)

    def _strip_links(self, html):
        '''Remove opening and closing <a> tags, keeping the link text.'''
        html = self._LINK_OPEN_RE.sub('', html)
        return self._LINK_CLOSE_RE.sub('', html)

    def _strip_images(self, html):
        '''Remove <img> tags (opening, self-closing and closing forms).'''
        html = self._IMG_OPEN_RE.sub('', html)
        return self._IMG_CLOSE_RE.sub('', html)

    def _terminate_section(self, text):
        '''
        Ensure a section ends with at least two newline characters.

        This prevents the last paragraph of a section from being combined
        into the first paragraph of the next one. Empty text is returned
        unchanged because there is no paragraph to separate.
        '''
        if not text:
            return text

        # Only the tail matters. Normalize CRLF and bare CR to LF so that
        # every newline style is counted the same way.
        tail = text[-4:].replace('\r\n', '\n').replace('\r', '\n')

        if tail.endswith('\n\n'):
            return text
        if tail.endswith('\n'):
            return text + '\n'
        return text + '\n\n'
|
@ -8,6 +8,7 @@ import os
|
||||
|
||||
from calibre.customize.conversion import OutputFormatPlugin, \
|
||||
OptionRecommendation
|
||||
from calibre.ebooks.txt.markdownml import MarkdownMLizer
|
||||
from calibre.ebooks.txt.txtml import TXTMLizer
|
||||
from calibre.ebooks.txt.newlines import TxtNewlines, specified_newlines
|
||||
|
||||
@ -44,10 +45,27 @@ class TXTOutput(OutputFormatPlugin):
|
||||
recommended_value=False, level=OptionRecommendation.LOW,
|
||||
help=_('Force splitting on the max-line-length value when no space '
|
||||
'is present. Also allows max-line-length to be below the minimum')),
|
||||
OptionRecommendation(name='markdown_format',
|
||||
recommended_value=False, level=OptionRecommendation.LOW,
|
||||
help=_('Produce Markdown formatted text.')),
|
||||
OptionRecommendation(name='keep_links',
|
||||
recommended_value=False, level=OptionRecommendation.LOW,
|
||||
help=_('Do not remove links within the document. This is only ' \
|
||||
'useful when paired with the markdown-format option because' \
|
||||
'links are always removed with plain text output.')),
|
||||
OptionRecommendation(name='keep_image_references',
|
||||
recommended_value=False, level=OptionRecommendation.LOW,
|
||||
help=_('Do not remove image references within the document. This is only ' \
|
||||
'useful when paired with the markdown-format option because' \
|
||||
'image references are always removed with plain text output.')),
|
||||
])
|
||||
|
||||
def convert(self, oeb_book, output_path, input_plugin, opts, log):
|
||||
writer = TXTMLizer(log)
|
||||
if opts.markdown_format:
|
||||
writer = MarkdownMLizer(log)
|
||||
else:
|
||||
writer = TXTMLizer(log)
|
||||
|
||||
txt = writer.extract_content(oeb_book, opts)
|
||||
|
||||
log.debug('\tReplacing newlines with selected type...')
|
||||
|
@ -35,6 +35,7 @@ BLOCK_STYLES = [
|
||||
|
||||
SPACE_TAGS = [
|
||||
'td',
|
||||
'br',
|
||||
]
|
||||
|
||||
class TXTMLizer(object):
|
||||
|
@ -21,7 +21,7 @@ class PluginWidget(Widget, Ui_Form):
|
||||
def __init__(self, parent, get_option, get_help, db=None, book_id=None):
|
||||
Widget.__init__(self, parent,
|
||||
['newline', 'max_line_length', 'force_max_line_length',
|
||||
'inline_toc'])
|
||||
'inline_toc', 'markdown_format', 'keep_links', 'keep_image_references'])
|
||||
self.db, self.book_id = db, book_id
|
||||
self.initialize_options(get_option, get_help, db, book_id)
|
||||
|
||||
|
@ -6,7 +6,7 @@
|
||||
<rect>
|
||||
<x>0</x>
|
||||
<y>0</y>
|
||||
<width>400</width>
|
||||
<width>477</width>
|
||||
<height>300</height>
|
||||
</rect>
|
||||
</property>
|
||||
@ -27,7 +27,7 @@
|
||||
<item row="0" column="1">
|
||||
<widget class="QComboBox" name="opt_newline"/>
|
||||
</item>
|
||||
<item row="4" column="0">
|
||||
<item row="7" column="0">
|
||||
<spacer name="verticalSpacer">
|
||||
<property name="orientation">
|
||||
<enum>Qt::Vertical</enum>
|
||||
@ -67,6 +67,27 @@
|
||||
</property>
|
||||
</widget>
|
||||
</item>
|
||||
<item row="4" column="0">
|
||||
<widget class="QCheckBox" name="opt_markdown_format">
|
||||
<property name="text">
|
||||
<string>Apply Markdown formatting to text</string>
|
||||
</property>
|
||||
</widget>
|
||||
</item>
|
||||
<item row="5" column="0">
|
||||
<widget class="QCheckBox" name="opt_keep_links">
|
||||
<property name="text">
|
||||
<string>Do not remove links (<a> tags) before processing</string>
|
||||
</property>
|
||||
</widget>
|
||||
</item>
|
||||
<item row="6" column="0">
|
||||
<widget class="QCheckBox" name="opt_keep_image_references">
|
||||
<property name="text">
|
||||
<string>Do not remove image references before processing</string>
|
||||
</property>
|
||||
</widget>
|
||||
</item>
|
||||
</layout>
|
||||
</widget>
|
||||
<resources/>
|
||||
|
Loading…
x
Reference in New Issue
Block a user