From bff950e52d79cc674cc9f5ad4a457e8f4b2975d0 Mon Sep 17 00:00:00 2001 From: Kovid Goyal Date: Mon, 30 Apr 2018 11:53:48 +0530 Subject: [PATCH] TXT Input: Add support for embedded images that use relative URLs when converting markdown or textile. Note that this will only work if you are converting using the ebook-convert command line tool as the main calibre program moves files around, so relative references will not be valid. --- .../ebooks/conversion/plugins/txt_input.py | 112 +++++++++++------- 1 file changed, 70 insertions(+), 42 deletions(-) diff --git a/src/calibre/ebooks/conversion/plugins/txt_input.py b/src/calibre/ebooks/conversion/plugins/txt_input.py index f8026ca194..d33b2cd4c3 100644 --- a/src/calibre/ebooks/conversion/plugins/txt_input.py +++ b/src/calibre/ebooks/conversion/plugins/txt_input.py @@ -67,10 +67,40 @@ class TXTInput(InputFormatPlugin): help=_('Enable extensions to markdown syntax. Extensions are formatting that is not part ' 'of the standard markdown format. The extensions enabled by default: %default.\n' 'To learn more about markdown extensions, see https://pythonhosted.org/Markdown/extensions/index.html\n' - 'This should be a comma separated list of extensions to enable:\n') + - '\n'.join('* %s: %s' % (k, MD_EXTENSIONS[k]) for k in sorted(MD_EXTENSIONS))), + 'This should be a comma separated list of extensions to enable:\n' + ) + '\n'.join('* %s: %s' % (k, MD_EXTENSIONS[k]) for k in sorted(MD_EXTENSIONS))), ]) + def shift_file(self, base_dir, fname, data): + name, ext = os.path.splitext(fname) + c = 1 + while os.path.exists(os.path.join(base_dir, '{}-{}{}'.format(name, c, ext))): + c += 1 + ans = os.path.join(base_dir, '{}-{}{}'.format(name, c, ext)) + with open(ans, 'wb') as f: + f.write(data) + return f.name + + def fix_resources(self, html, base_dir): + from html5_parser import parse + root = parse(html) + changed = False + for img in root.xpath('//img[@src]'): + src = img.get('src') + prefix = src.split(':', 1)[0].lower() + if prefix not in ('file', 'http', 'https', 'ftp') and not os.path.isabs(src): + src = os.path.join(base_dir, src) + if os.access(src, os.R_OK): + with open(src, 'rb') as f: + data = f.read() + f = self.shift_file(base_dir, os.path.basename(src), data) + changed = True + img.set('src', os.path.basename(f)) + if changed: + from lxml import etree + html = etree.tostring(root, encoding='unicode') + return html + def convert(self, stream, options, file_ext, log, accelerators): from calibre.ebooks.conversion.preprocess import DocAnalysis, Dehyphenator @@ -87,6 +117,7 @@ class TXTInput(InputFormatPlugin): txt = '' log.debug('Reading text from file...') length = 0 + base_dir = os.getcwdu() # Extract content from zip archive. if file_ext == 'txtz': @@ -98,6 +129,8 @@ class TXTInput(InputFormatPlugin): with open(x, 'rb') as tf: txt += tf.read() + '\n\n' else: + if getattr(stream, 'name', None): + base_dir = os.path.dirname(stream.name) txt = stream.read() if file_ext in {'md', 'textile', 'markdown'}: options.formatting_type = {'md': 'markdown'}.get(file_ext, file_ext) @@ -194,47 +227,42 @@ class TXTInput(InputFormatPlugin): txt = preserve_spaces(txt) # Process the text using the appropriate text processor. - html = '' - input_mi = None - if options.formatting_type == 'markdown': - log.debug('Running text through markdown conversion...') - try: - input_mi, html = convert_markdown_with_metadata(txt, extensions=[x.strip() for x in options.markdown_extensions.split(',') if x.strip()]) - except RuntimeError: - raise ValueError('This txt file has malformed markup, it cannot be' - ' converted by calibre. See https://daringfireball.net/projects/markdown/syntax') - elif options.formatting_type == 'textile': - log.debug('Running text through textile conversion...') - html = convert_textile(txt) - else: - log.debug('Running text through basic conversion...') - flow_size = getattr(options, 'flow_size', 0) - html = convert_basic(txt, epub_split_size_kb=flow_size) + self.shifted_files = [] + try: + html = '' + input_mi = None + if options.formatting_type == 'markdown': + log.debug('Running text through markdown conversion...') + try: + input_mi, html = convert_markdown_with_metadata(txt, extensions=[x.strip() for x in options.markdown_extensions.split(',') if x.strip()]) + except RuntimeError: + raise ValueError('This txt file has malformed markup, it cannot be' + ' converted by calibre. See https://daringfireball.net/projects/markdown/syntax') + html = self.fix_resources(html, base_dir) + elif options.formatting_type == 'textile': + log.debug('Running text through textile conversion...') + html = convert_textile(txt) + html = self.fix_resources(html, base_dir) + else: + log.debug('Running text through basic conversion...') + flow_size = getattr(options, 'flow_size', 0) + html = convert_basic(txt, epub_split_size_kb=flow_size) - # Run the HTMLized text through the html processing plugin. - from calibre.customize.ui import plugin_for_input_format - html_input = plugin_for_input_format('html') - for opt in html_input.options: - setattr(options, opt.option.name, opt.recommended_value) - options.input_encoding = 'utf-8' - base = os.getcwdu() - if file_ext != 'txtz' and hasattr(stream, 'name'): - base = os.path.dirname(stream.name) - fname = os.path.join(base, 'index.html') - c = 0 - while os.path.exists(fname): - c += 1 - fname = 'index%d.html'%c - htmlfile = open(fname, 'wb') - with htmlfile: - htmlfile.write(html.encode('utf-8')) - odi = options.debug_pipeline - options.debug_pipeline = None - # Generate oeb from html conversion. - oeb = html_input.convert(open(htmlfile.name, 'rb'), options, 'html', log, - {}) - options.debug_pipeline = odi - os.remove(htmlfile.name) + # Run the HTMLized text through the html processing plugin. + from calibre.customize.ui import plugin_for_input_format + html_input = plugin_for_input_format('html') + for opt in html_input.options: + setattr(options, opt.option.name, opt.recommended_value) + options.input_encoding = 'utf-8' + htmlfile = self.shift_file(base_dir, 'index.html', html.encode('utf-8')) + odi = options.debug_pipeline + options.debug_pipeline = None + # Generate oeb from html conversion. + oeb = html_input.convert(open(htmlfile, 'rb'), options, 'html', log, {}) + options.debug_pipeline = odi + finally: + for x in self.shifted_files: + os.remove(x) # Set metadata from file. if input_mi is None: