diff --git a/src/calibre/__init__.py b/src/calibre/__init__.py index 716e3913fb..5ba1aa42de 100644 --- a/src/calibre/__init__.py +++ b/src/calibre/__init__.py @@ -61,8 +61,9 @@ def osx_version(): if m: return int(m.group(1)), int(m.group(2)), int(m.group(3)) - _filename_sanitize = re.compile(r'[\xae\0\\|\?\*<":>\+/]') +_filename_sanitize_unicode = frozenset([u'\\', u'|', u'?', u'*', u'<', + u'"', u':', u'>', u'+', u'/'] + list(map(unichr, xrange(32)))) def sanitize_file_name(name, substitute='_', as_unicode=False): ''' @@ -85,6 +86,31 @@ def sanitize_file_name(name, substitute='_', as_unicode=False): # Windows doesn't like path components that end with a period if one.endswith('.'): one = one[:-1]+'_' + # Names starting with a period are hidden on Unix + if one.startswith('.'): + one = '_' + one[1:] + return one + +def sanitize_file_name_unicode(name, substitute='_'): + ''' + Sanitize the filename `name`. All invalid characters are replaced by `substitute`. + The set of invalid characters is the union of the invalid characters in Windows, + OS X and Linux. Also removes leading and trailing whitespace. + **WARNING:** This function also replaces path separators, so only pass file names + and not full paths to it. + ''' + if not isinstance(name, unicode): + return sanitize_file_name(name, substitute=substitute, as_unicode=True) + chars = [substitute if c in _filename_sanitize_unicode else c for c in + name] + one = u''.join(chars) + one = re.sub(r'\s', ' ', one).strip() + one = re.sub(r'^\.+$', '_', one) + one = one.replace('..', substitute) + # Windows doesn't like path components that end with a period or space + if one and one[-1] in ('.', ' '): + one = one[:-1]+'_' + # Names starting with a period are hidden on Unix if one.startswith('.'): one = '_' + one[1:] return one diff --git a/src/calibre/library/save_to_disk.py b/src/calibre/library/save_to_disk.py index de586048b7..96c42e6e0e 100644 --- a/src/calibre/library/save_to_disk.py +++ b/src/calibre/library/save_to_disk.py @@ -12,13 +12,13 @@ from calibre.constants import DEBUG from calibre.utils.config import Config, StringConfig, tweaks from calibre.utils.formatter import TemplateFormatter from calibre.utils.filenames import shorten_components_to, supports_long_names, \ - ascii_filename, sanitize_file_name + ascii_filename from calibre.ebooks.metadata.opf2 import metadata_to_opf from calibre.ebooks.metadata.meta import set_metadata -from calibre.constants import preferred_encoding, filesystem_encoding +from calibre.constants import preferred_encoding from calibre.ebooks.metadata import fmt_sidx from calibre.ebooks.metadata import title_sort -from calibre import strftime, prints +from calibre import strftime, prints, sanitize_file_name_unicode plugboard_any_device_value = 'any device' plugboard_any_format_value = 'any format' @@ -197,12 +197,10 @@ def get_components(template, mi, id, timefmt='%b %Y', length=250, format_args[key] = '' components = SafeFormat().safe_format(template, format_args, 'G_C-EXCEPTION!', mi) - components = [x.strip() for x in components.split('/') if x.strip()] + components = [x.strip() for x in components.split('/')] components = [sanitize_func(x) for x in components if x] if not components: components = [str(id)] - components = [x.encode(filesystem_encoding, 'replace') if isinstance(x, - unicode) else x for x in components] if to_lowercase: components = [x.lower() for x in components] if replace_whitespace: @@ -247,7 +245,7 @@ def do_save_book_to_disk(id_, mi, cover, plugboards, return True, id_, mi.title components = get_components(opts.template, mi, id_, opts.timefmt, length, - ascii_filename if opts.asciiize else sanitize_file_name, + ascii_filename if opts.asciiize else sanitize_file_name_unicode, to_lowercase=opts.to_lowercase, replace_whitespace=opts.replace_whitespace) base_path = os.path.join(root, *components) @@ -329,8 +327,6 @@ def do_save_book_to_disk(id_, mi, cover, plugboards, def _sanitize_args(root, opts): if opts is None: opts = config().parse() - if isinstance(root, unicode): - root = root.encode(filesystem_encoding) root = os.path.abspath(root) opts.template = preprocess_template(opts.template) diff --git a/src/calibre/startup.py b/src/calibre/startup.py index 41b20f3946..c883c43e8a 100644 --- a/src/calibre/startup.py +++ b/src/calibre/startup.py @@ -72,47 +72,6 @@ if not _run_once: pass ################################################################################ - # Improve builtin path functions to handle unicode sensibly - - _abspath = os.path.abspath - def my_abspath(path, encoding=sys.getfilesystemencoding()): - ''' - Work around for buggy os.path.abspath. This function accepts either byte strings, - in which it calls os.path.abspath, or unicode string, in which case it first converts - to byte strings using `encoding`, calls abspath and then decodes back to unicode. - ''' - to_unicode = False - if encoding is None: - encoding = preferred_encoding - if isinstance(path, unicode): - path = path.encode(encoding) - to_unicode = True - res = _abspath(path) - if to_unicode: - res = res.decode(encoding) - return res - - os.path.abspath = my_abspath - - _join = os.path.join - def my_join(a, *p): - encoding=sys.getfilesystemencoding() - if not encoding: - encoding = preferred_encoding - p = [a] + list(p) - _unicode = False - for i in p: - if isinstance(i, unicode): - _unicode = True - break - p = [i.encode(encoding) if isinstance(i, unicode) else i for i in p] - - res = _join(*p) - if _unicode: - res = res.decode(encoding) - return res - - os.path.join = my_join def local_open(name, mode='r', bufsize=-1): '''