Make HTML filename sanitization more robust

This commit is contained in:
Kovid Goyal 2021-03-17 09:23:39 +05:30
parent 73a8699e0a
commit 4955d632f3
No known key found for this signature in database
GPG Key ID: 06BC317B515ACE7C

View File

@ -6,21 +6,27 @@ __license__ = 'GPL v3'
__copyright__ = '2012, Kovid Goyal <kovid@kovidgoyal.net>' __copyright__ = '2012, Kovid Goyal <kovid@kovidgoyal.net>'
__docformat__ = 'restructuredtext en' __docformat__ = 'restructuredtext en'
import re, tempfile, os import os
import re
import tempfile
from functools import partial from functools import partial
from urllib.parse import quote
from calibre.constants import islinux, isbsd from calibre.constants import isbsd, islinux
from calibre.customize.conversion import (InputFormatPlugin, from calibre.customize.conversion import InputFormatPlugin, OptionRecommendation
OptionRecommendation)
from calibre.utils.localization import get_lang
from calibre.utils.filenames import ascii_filename from calibre.utils.filenames import ascii_filename
from calibre.utils.imghdr import what from calibre.utils.imghdr import what
from polyglot.builtins import unicode_type, zip, getcwd, as_unicode from calibre.utils.localization import get_lang
from polyglot.builtins import as_unicode, getcwd, unicode_type, zip
def sanitize_file_name(x):
    """Return a cleaned-up version of the file name *x*.

    Processing steps, in order:

    1. *x* is passed through ``ascii_filename`` (presumably an ASCII
       transliteration helper -- confirm against calibre.utils.filenames)
       and every run of whitespace is collapsed to a single space.
    2. Each of the characters ``? & = ; # / \\`` is replaced by ``_``, both
       in literal form and in percent-encoded form (``%3F``, ``%2f`` ...),
       the latter matched case-insensitively.
    3. Surrounding whitespace and trailing dots are removed.
    4. The name is split at its last dot; whitespace around the stem and
       the extension is stripped and the two parts are re-joined with a
       single dot.

    :param x: The file name to sanitize.
    :return: The sanitized file name.
    """
    ans = re.sub(r'\s+', ' ', ascii_filename(x))
    for ch in '?&=;#/\\':
        ans = ans.replace(ch, '_')
        # Also neutralise the URL-quoted spelling of the character. The
        # pattern is built with re.escape() rather than a hand-written
        # backslash inside an f-string, which is an invalid string escape
        # ("\{") and triggers a warning on modern Python versions.
        q = quote(ch, safe='')
        ans = re.sub(re.escape(q), '_', ans, flags=re.I)
    ans = ans.strip().rstrip('.')
    # NOTE(review): when the name contains no dot, rpartition() returns an
    # empty stem, so the result is the name prefixed with '.'. This is kept
    # as-is to preserve existing behaviour -- confirm whether it is intended.
    ans, ext = ans.rpartition('.')[::2]
    return (ans.strip() + '.' + ext.strip()).rstrip('.')
@ -98,18 +104,20 @@ class HTMLInput(InputFormatPlugin):
return self._is_case_sensitive return self._is_case_sensitive
def create_oebbook(self, htmlpath, basedir, opts, log, mi): def create_oebbook(self, htmlpath, basedir, opts, log, mi):
import css_parser
import logging
import uuid import uuid
from calibre.ebooks.conversion.plumber import create_oebbook
from calibre.ebooks.oeb.base import (DirContainer,
rewrite_links, urlnormalize, urldefrag, BINARY_MIME, OEB_STYLES,
xpath, urlquote)
from calibre import guess_type from calibre import guess_type
from calibre.ebooks.oeb.transforms.metadata import \ from calibre.ebooks.conversion.plumber import create_oebbook
meta_info_to_oeb_metadata
from calibre.ebooks.html.input import get_filelist from calibre.ebooks.html.input import get_filelist
from calibre.ebooks.metadata import string_to_authors from calibre.ebooks.metadata import string_to_authors
from calibre.ebooks.oeb.base import (
BINARY_MIME, OEB_STYLES, DirContainer, rewrite_links, urldefrag,
urlnormalize, urlquote, xpath
)
from calibre.ebooks.oeb.transforms.metadata import meta_info_to_oeb_metadata
from calibre.utils.localization import canonicalize_lang from calibre.utils.localization import canonicalize_lang
import css_parser, logging
css_parser.log.setLevel(logging.WARN) css_parser.log.setLevel(logging.WARN)
self.OEB_STYLES = OEB_STYLES self.OEB_STYLES = OEB_STYLES
oeb = create_oebbook(log, None, opts, self, oeb = create_oebbook(log, None, opts, self,