mirror of
https://github.com/kovidgoyal/calibre.git
synced 2025-07-08 10:44:09 -04:00
TXT Input: Add support for embedded images that use relative URLs when converting markdown or textile. Note that this will only work if you are converting using the ebook-convert command line tool as the main calibre program moves files around, so relative references will not be valid.
This commit is contained in:
parent
49c048a028
commit
bff950e52d
@ -67,10 +67,40 @@ class TXTInput(InputFormatPlugin):
|
|||||||
help=_('Enable extensions to markdown syntax. Extensions are formatting that is not part '
|
help=_('Enable extensions to markdown syntax. Extensions are formatting that is not part '
|
||||||
'of the standard markdown format. The extensions enabled by default: %default.\n'
|
'of the standard markdown format. The extensions enabled by default: %default.\n'
|
||||||
'To learn more about markdown extensions, see https://pythonhosted.org/Markdown/extensions/index.html\n'
|
'To learn more about markdown extensions, see https://pythonhosted.org/Markdown/extensions/index.html\n'
|
||||||
'This should be a comma separated list of extensions to enable:\n') +
|
'This should be a comma separated list of extensions to enable:\n'
|
||||||
'\n'.join('* %s: %s' % (k, MD_EXTENSIONS[k]) for k in sorted(MD_EXTENSIONS))),
|
) + '\n'.join('* %s: %s' % (k, MD_EXTENSIONS[k]) for k in sorted(MD_EXTENSIONS))),
|
||||||
])
|
])
|
||||||
|
|
||||||
|
def shift_file(self, base_dir, fname, data):
    """Write ``data`` to a new file in ``base_dir`` without clobbering anything.

    The file is named ``<name>-<N><ext>``, where ``<name>``/``<ext>`` come
    from splitting ``fname`` and ``N`` is the smallest integer >= 1 for
    which that path does not exist yet.

    The created path is appended to ``self.shifted_files`` when the instance
    has that list, so the cleanup loop in ``convert()`` (which removes every
    entry of ``self.shifted_files``) actually deletes the files created
    here. Previously nothing was ever recorded, so the files were left
    behind.

    :param base_dir: directory in which the file is created
    :param fname: desired file name; only used as a template for the result
    :param data: bytes to write
    :return: path of the file that was written
    """
    name, ext = os.path.splitext(fname)
    c = 1
    while True:
        ans = os.path.join(base_dir, '{}-{}{}'.format(name, c, ext))
        if not os.path.exists(ans):
            break
        c += 1
    with open(ans, 'wb') as f:
        # Register only once the file exists on disk, so cleanup never
        # tries to remove a path that was not created.
        tracked = getattr(self, 'shifted_files', None)
        if tracked is not None:
            tracked.append(ans)
        f.write(data)
    return ans
|
||||||
|
|
||||||
|
def fix_resources(self, html, base_dir):
    """Copy locally referenced images next to the generated HTML.

    Every ``<img src=...>`` whose ``src`` is neither an absolute path nor
    prefixed with ``file:``, ``http:``, ``https:`` or ``ftp:`` is resolved
    against ``base_dir``. If it names a readable regular file, the file is
    copied via ``self.shift_file()`` and the ``src`` attribute is rewritten
    to the base name of the copy.

    :param html: HTML markup produced by the markdown/textile converter
    :param base_dir: directory that relative image references are resolved
        against and into which the copies are written
    :return: the re-serialized markup if any ``src`` was rewritten,
        otherwise ``html`` unchanged
    """
    from html5_parser import parse
    root = parse(html)
    changed = False
    for img in root.xpath('//img[@src]'):
        src = img.get('src')
        prefix = src.split(':', 1)[0].lower()
        if prefix in ('file', 'http', 'https', 'ftp') or os.path.isabs(src):
            continue
        path = os.path.join(base_dir, src)
        # os.access() alone also succeeds for directories (e.g. src="" resolves
        # to base_dir itself); open() would then raise and abort the whole
        # conversion, so require a regular file as well.
        if not (os.path.isfile(path) and os.access(path, os.R_OK)):
            continue
        with open(path, 'rb') as f:
            data = f.read()
        copied = self.shift_file(base_dir, os.path.basename(path), data)
        img.set('src', os.path.basename(copied))
        changed = True
    if changed:
        from lxml import etree
        html = etree.tostring(root, encoding='unicode')
    return html
|
||||||
|
|
||||||
def convert(self, stream, options, file_ext, log,
|
def convert(self, stream, options, file_ext, log,
|
||||||
accelerators):
|
accelerators):
|
||||||
from calibre.ebooks.conversion.preprocess import DocAnalysis, Dehyphenator
|
from calibre.ebooks.conversion.preprocess import DocAnalysis, Dehyphenator
|
||||||
@ -87,6 +117,7 @@ class TXTInput(InputFormatPlugin):
|
|||||||
txt = ''
|
txt = ''
|
||||||
log.debug('Reading text from file...')
|
log.debug('Reading text from file...')
|
||||||
length = 0
|
length = 0
|
||||||
|
base_dir = os.getcwdu()
|
||||||
|
|
||||||
# Extract content from zip archive.
|
# Extract content from zip archive.
|
||||||
if file_ext == 'txtz':
|
if file_ext == 'txtz':
|
||||||
@ -98,6 +129,8 @@ class TXTInput(InputFormatPlugin):
|
|||||||
with open(x, 'rb') as tf:
|
with open(x, 'rb') as tf:
|
||||||
txt += tf.read() + '\n\n'
|
txt += tf.read() + '\n\n'
|
||||||
else:
|
else:
|
||||||
|
if getattr(stream, 'name', None):
|
||||||
|
base_dir = os.path.dirname(stream.name)
|
||||||
txt = stream.read()
|
txt = stream.read()
|
||||||
if file_ext in {'md', 'textile', 'markdown'}:
|
if file_ext in {'md', 'textile', 'markdown'}:
|
||||||
options.formatting_type = {'md': 'markdown'}.get(file_ext, file_ext)
|
options.formatting_type = {'md': 'markdown'}.get(file_ext, file_ext)
|
||||||
@ -194,47 +227,42 @@ class TXTInput(InputFormatPlugin):
|
|||||||
txt = preserve_spaces(txt)
|
txt = preserve_spaces(txt)
|
||||||
|
|
||||||
# Process the text using the appropriate text processor.
|
# Process the text using the appropriate text processor.
|
||||||
html = ''
|
self.shifted_files = []
|
||||||
input_mi = None
|
try:
|
||||||
if options.formatting_type == 'markdown':
|
html = ''
|
||||||
log.debug('Running text through markdown conversion...')
|
input_mi = None
|
||||||
try:
|
if options.formatting_type == 'markdown':
|
||||||
input_mi, html = convert_markdown_with_metadata(txt, extensions=[x.strip() for x in options.markdown_extensions.split(',') if x.strip()])
|
log.debug('Running text through markdown conversion...')
|
||||||
except RuntimeError:
|
try:
|
||||||
raise ValueError('This txt file has malformed markup, it cannot be'
|
input_mi, html = convert_markdown_with_metadata(txt, extensions=[x.strip() for x in options.markdown_extensions.split(',') if x.strip()])
|
||||||
' converted by calibre. See https://daringfireball.net/projects/markdown/syntax')
|
except RuntimeError:
|
||||||
elif options.formatting_type == 'textile':
|
raise ValueError('This txt file has malformed markup, it cannot be'
|
||||||
log.debug('Running text through textile conversion...')
|
' converted by calibre. See https://daringfireball.net/projects/markdown/syntax')
|
||||||
html = convert_textile(txt)
|
html = self.fix_resources(html, base_dir)
|
||||||
else:
|
elif options.formatting_type == 'textile':
|
||||||
log.debug('Running text through basic conversion...')
|
log.debug('Running text through textile conversion...')
|
||||||
flow_size = getattr(options, 'flow_size', 0)
|
html = convert_textile(txt)
|
||||||
html = convert_basic(txt, epub_split_size_kb=flow_size)
|
html = self.fix_resources(html, base_dir)
|
||||||
|
else:
|
||||||
|
log.debug('Running text through basic conversion...')
|
||||||
|
flow_size = getattr(options, 'flow_size', 0)
|
||||||
|
html = convert_basic(txt, epub_split_size_kb=flow_size)
|
||||||
|
|
||||||
# Run the HTMLized text through the html processing plugin.
|
# Run the HTMLized text through the html processing plugin.
|
||||||
from calibre.customize.ui import plugin_for_input_format
|
from calibre.customize.ui import plugin_for_input_format
|
||||||
html_input = plugin_for_input_format('html')
|
html_input = plugin_for_input_format('html')
|
||||||
for opt in html_input.options:
|
for opt in html_input.options:
|
||||||
setattr(options, opt.option.name, opt.recommended_value)
|
setattr(options, opt.option.name, opt.recommended_value)
|
||||||
options.input_encoding = 'utf-8'
|
options.input_encoding = 'utf-8'
|
||||||
base = os.getcwdu()
|
htmlfile = self.shift_file(base_dir, 'index.html', html.encode('utf-8'))
|
||||||
if file_ext != 'txtz' and hasattr(stream, 'name'):
|
odi = options.debug_pipeline
|
||||||
base = os.path.dirname(stream.name)
|
options.debug_pipeline = None
|
||||||
fname = os.path.join(base, 'index.html')
|
# Generate oeb from html conversion.
|
||||||
c = 0
|
oeb = html_input.convert(open(htmlfile, 'rb'), options, 'html', log, {})
|
||||||
while os.path.exists(fname):
|
options.debug_pipeline = odi
|
||||||
c += 1
|
finally:
|
||||||
fname = 'index%d.html'%c
|
for x in self.shifted_files:
|
||||||
htmlfile = open(fname, 'wb')
|
os.remove(x)
|
||||||
with htmlfile:
|
|
||||||
htmlfile.write(html.encode('utf-8'))
|
|
||||||
odi = options.debug_pipeline
|
|
||||||
options.debug_pipeline = None
|
|
||||||
# Generate oeb from html conversion.
|
|
||||||
oeb = html_input.convert(open(htmlfile.name, 'rb'), options, 'html', log,
|
|
||||||
{})
|
|
||||||
options.debug_pipeline = odi
|
|
||||||
os.remove(htmlfile.name)
|
|
||||||
|
|
||||||
# Set metadata from file.
|
# Set metadata from file.
|
||||||
if input_mi is None:
|
if input_mi is None:
|
||||||
|
Loading…
x
Reference in New Issue
Block a user