When dealing with ZIP/RAR archives, use the file header rather than the file extension to determine the file type, when possible. This fixes the common case of CBZ files actually being CBR files and vice versa.

2025-12-02 11:15:04 -05:00 · 2011-04-06 10:03:46 -06:00 · 2011-04-06 10:03:46 -06:00 · 504ef95056
commit 504ef95056
parent 593f3aaf0a
2 changed files with 24 additions and 5 deletions
--- a/src/calibre/__init__.py
+++ b/src/calibre/__init__.py
@@ -217,8 +217,19 @@ def filename_to_utf8(name):
    return name.decode(codec, 'replace').encode('utf8')

 def extract(path, dir):
-    ext = os.path.splitext(path)[1][1:].lower()
    extractor = None
+    # First use the file header to identify its type
+    with open(path, 'rb') as f:
+        id_ = f.read(3)
+    if id_ == b'Rar':
+        from calibre.libunrar import extract as rarextract
+        extractor = rarextract
+    elif id_.startswith(b'PK'):
+        from calibre.libunzip import extract as zipextract
+        extractor = zipextract
+    if extractor is None:
+        # Fallback to file extension
+        ext = os.path.splitext(path)[1][1:].lower()
        if ext in ['zip', 'cbz', 'epub', 'oebzip']:
            from calibre.libunzip import extract as zipextract
            extractor = zipextract
--- a/src/calibre/customize/builtins.py
+++ b/src/calibre/customize/builtins.py
@@ -166,6 +166,14 @@ class ComicMetadataReader(MetadataReaderPlugin):
    description = _('Extract cover from comic files')

    def get_metadata(self, stream, ftype):
+        if hasattr(stream, 'seek') and hasattr(stream, 'tell'):
+            pos = stream.tell()
+            id_ = stream.read(3)
+            stream.seek(pos)
+            if id_ == b'Rar':
+                ftype = 'cbr'
+            elif id_.startswith(b'PK'):
+                ftype = 'cbz'
        if ftype == 'cbr':
            from calibre.libunrar import extract_first_alphabetically as extract_first
            extract_first