def _get_image_references(self, txt, base_dir):
    """Return the image paths referenced by *txt* that exist under *base_dir*.

    Method of ``TXT2TXTZ``. The Textile/Markdown scanning is delegated to
    ``calibre.ebooks.txt.processor.get_images_from_polyglot_text``.

    :param txt: The text to scan for image references.
    :param base_dir: Directory against which relative image paths are
        resolved.
    :return: A list of the referenced image paths, without duplicates.
    """
    # Function-local import, as in the original wrapper.
    from calibre.ebooks.txt.processor import get_images_from_polyglot_text
    # The helper returns a set, whereas the implementation this wrapper
    # replaces returned ``list(set(images))``. Converting here keeps the
    # list return type for existing callers.
    return list(get_images_from_polyglot_text(txt, base_dir))
def get_images_from_polyglot_text(txt: str, base_dir: str = '') -> set:
    """Collect local image files referenced by text that may be Textile or Markdown.

    Textile image markup is located with a regular expression. Markdown
    images are located by converting *txt* to HTML with ``Markdown`` and
    reading the ``src`` attribute of every ``img`` element in the parsed
    result.

    :param txt: The text to scan.
    :param base_dir: Directory against which relative image paths are
        resolved. The current working directory is used when this is empty.
    :return: The set of referenced paths, as written in *txt*, that are
        non-empty, not absolute, whose guessed type is in ``OEB_IMAGES``
        and that exist under *base_dir*.
    """
    from calibre.ebooks.oeb.base import OEB_IMAGES
    from calibre import guess_type
    if not base_dir:
        base_dir = os.getcwd()
    images = set()

    def check_path(path: str) -> None:
        # Keep only relative references to existing files of an accepted
        # image type. A missing ``src`` (None) is rejected by the first test.
        if not path or os.path.isabs(path):
            return
        if guess_type(path)[0] not in OEB_IMAGES:
            return
        if os.path.exists(os.path.join(base_dir, path)):
            images.add(path)

    # Textile: !path!, optionally with alignment/attribute prefixes, a
    # parenthesised title and a trailing :link. The group must be named
    # ``path`` because it is read back by that name below.
    textile_image = r'(?mu)(?:[\[{])?\!(?:\. )?(?P<path>[^\s(!]+)\s?(?:\(([^\)]+)\))?\!(?::(\S+))?(?:[\]}]|(?=\s|$))'
    for m in re.finditer(textile_image, txt):
        check_path(m.group('path'))

    # Markdown: convert to HTML and read the image sources from the parsed
    # tree rather than pattern-matching the Markdown source.
    from markdown import Markdown
    html = HTML_TEMPLATE % ('', Markdown().convert(txt))
    from html5_parser import parse
    root = parse(html)
    for img in root.iterdescendants('img'):
        check_path(img.get('src'))
    return images