Edit book: Check book: Detect case mismatches between links and the names of the files they point to, and offer to auto-correct the links

The detection works on both case-insensitive and case-sensitive file systems.
2025-07-09 03:04:10 -04:00 · 2014-02-02 21:15:20 +05:30 · 2014-02-02 21:15:20 +05:30 · 91e12f10ca
commit 91e12f10ca
parent 6ae332ff87
3 changed files with 107 additions and 4 deletions
--- a/src/calibre/ebooks/oeb/polish/check/links.py
+++ b/src/calibre/ebooks/oeb/polish/check/links.py
@ -6,12 +6,13 @@ from __future__ import (unicode_literals, division, absolute_import,
 __license__ = 'GPL v3'
 __copyright__ = '2013, Kovid Goyal <kovid at kovidgoyal.net>'

+import os
 from collections import defaultdict
 from urlparse import urlparse

 from calibre.ebooks.oeb.base import OEB_DOCS, OEB_STYLES
 from calibre.ebooks.oeb.polish.container import OEB_FONTS
-from calibre.ebooks.oeb.polish.utils import guess_type
+from calibre.ebooks.oeb.polish.utils import guess_type, actual_case_for_name, corrected_case_for_name
 from calibre.ebooks.oeb.polish.check.base import BaseError, WARN, INFO

 class BadLink(BaseError):
@ -20,6 +21,34 @@ class BadLink(BaseError):
             ' either fix, or remove the link.')
    level = WARN

+class CaseMismatch(BadLink):
+
+    def __init__(self, href, corrected_name, name, lnum, col):
+        BadLink.__init__(self, _('The linked to resource {0} does not exist').format(href), name, line=lnum, col=col)
+        self.HELP = _('The case of the link {0} and the case of the actual file it points to {1}'
+                      ' do not agree. You should change either the case of the link or rename the file.').format(
+                          href, corrected_name)
+        self.INDIVIDUAL_FIX = _('Change the case of the link to match the actual file')
+        self.corrected_name = corrected_name
+        self.href = href
+
+    def __call__(self, container):
+        frag = urlparse(self.href).fragment
+        nhref = container.name_to_href(self.corrected_name, self.name)
+        if frag:
+            nhref += '#' + frag
+        orig_href = self.href
+        class LinkReplacer(object):
+            replaced = False
+            def __call__(self, url):
+                if url != orig_href:
+                    return url
+                self.replaced = True
+                return nhref
+        replacer = LinkReplacer()
+        container.replace_links(self.name, replacer)
+        return replacer.replaced
+
 class FileLink(BadLink):

    HELP = _('This link uses the file:// URL scheme. This does not work with many ebook readers.'
@ -128,9 +157,20 @@ def check_links(container):
                        if tname in container.mime_map:
                            links_map[name].add(tname)
                        else:
-                            a(BadLink(_('The linked resource %s is a directory') % fl(href), name, lnum, col))
+                            # Filesystem says the file exists, but it is not in
+                            # the mime_map, so either there is a case mismatch
+                            # or the link is a directory
+                            apath = container.name_to_abspath(tname)
+                            if os.path.isdir(apath):
+                                a(BadLink(_('The linked resource %s is a directory') % fl(href), name, lnum, col))
+                            else:
+                                a(CaseMismatch(href, actual_case_for_name(container, tname), name, lnum, col))
                    else:
-                        a(BadLink(_('The linked resource %s does not exist') % fl(href), name, lnum, col))
+                        cname = corrected_case_for_name(container, tname)
+                        if cname is not None:
+                            a(CaseMismatch(href, cname, name, lnum, col))
+                        else:
+                            a(BadLink(_('The linked resource %s does not exist') % fl(href), name, lnum, col))
                else:
                    purl = urlparse(href)
                    if purl.scheme == 'file':
--- a/src/calibre/ebooks/oeb/polish/tests/container.py
+++ b/src/calibre/ebooks/oeb/polish/tests/container.py
@ -185,6 +185,23 @@ class ContainerTests(BaseTest):

        self.check_links(c)

+    def test_actual_case(self):
+        ' Test getting the actual case for files from names on case insensitive filesystems '
+        from calibre.ebooks.oeb.polish.utils import actual_case_for_name, corrected_case_for_name
+        book = get_simple_book()
+        c = get_container(book)
+        name = 'f1/f2/added file.html'
+        c.add_file(name, b'xxx')
+        self.assertTrue(c.exists(name))
+        variations = (name, name.upper(), name.replace('f1', 'F1'), name.replace('f2', 'F2'))
+        if c.exists(name.upper()):
+            for n in variations:
+                self.assertEqual(name, actual_case_for_name(c, n))
+        else:
+            for n in variations:
+                self.assertEqual(name, corrected_case_for_name(c, n))
+            self.assertIsNone(corrected_case_for_name(c, name+'/xx'))
+
    def test_split_file(self):
        ' Test splitting of files '
        book = get_split_book()
--- a/src/calibre/ebooks/oeb/polish/utils.py
+++ b/src/calibre/ebooks/oeb/polish/utils.py
@ -6,7 +6,7 @@ from __future__ import (unicode_literals, division, absolute_import,
 __license__ = 'GPL v3'
 __copyright__ = '2013, Kovid Goyal <kovid at kovidgoyal.net>'

-import re
+import re, os
 from bisect import bisect

 from calibre import guess_type as _guess_type
@ -21,6 +21,52 @@ def setup_cssutils_serialization(tab_width=2):
    prefs.indentClosingBrace = False
    prefs.omitLastSemicolon = False

+def actual_case_for_name(container, name):
+    from calibre.utils.filenames import samefile
+    if not container.exists(name):
+        raise ValueError('Cannot get actual case for %s as it does not exist' % name)
+    parts = name.split('/')
+    base = ''
+    ans = []
+    for i, x in enumerate(parts):
+        base = '/'.join(ans + [x])
+        path = container.name_to_abspath(base)
+        pdir = os.path.dirname(path)
+        candidates = {os.path.join(pdir, q) for q in os.listdir(pdir)}
+        if x in candidates:
+            correctx = x
+        else:
+            for q in candidates:
+                if samefile(q, path):
+                    correctx = os.path.basename(q)
+                    break
+            else:
+                raise RuntimeError('Something bad happened')
+        ans.append(correctx)
+    return '/'.join(ans)
+
+def corrected_case_for_name(container, name):
+    parts = name.split('/')
+    ans = []
+    base = ''
+    for i, x in enumerate(parts):
+        base = '/'.join(ans + [x])
+        if container.exists(base):
+            correctx = x
+        else:
+            try:
+                candidates = {q for q in os.listdir(os.path.dirname(container.name_to_abspath(base)))}
+            except EnvironmentError:
+                return None  # one of the non-terminal components of name is a file instead of a directory
+            for q in candidates:
+                if q.lower() == x.lower():
+                    correctx = q
+                    break
+            else:
+                return None
+        ans.append(correctx)
+    return '/'.join(ans)
+
 class PositionFinder(object):

    def __init__(self, raw):