Conversion pipeline: Respect the UTF-8 and UTF-32 BOMs when decoding files, in addition to the UTF-16 BOM. Fixes #4025 (ebook convert chokes on a complex CSS file)

2026-01-07 20:50:20 -05:00 · 2009-11-15 19:37:14 -07:00 · 2009-11-15 19:37:14 -07:00 · e7620fb173
commit e7620fb173
parent cb5107463f
1 changed files with 10 additions and 2 deletions
--- a/src/calibre/ebooks/oeb/base.py
+++ b/src/calibre/ebooks/oeb/base.py
@@ -1746,9 +1746,17 @@ class OEBBook(object):
            return d.replace('\r\n', '\n').replace('\r', '\n')
        if isinstance(data, unicode):
            return fix_data(data)
-        if data[:2] in ('\xff\xfe', '\xfe\xff'):
+        bom_enc = None
+        if data[:4] in ('\0\0\xfe\xff', '\xff\xfe\0\0'):
+            bom_enc = {'\0\0\xfe\xff':'utf-32-be',
+                    '\xff\xfe\0\0':'utf-32-le'}[data[:4]]
+        elif data[:2] in ('\xff\xfe', '\xfe\xff'):
+            bom_enc = {'\xff\xfe':'utf-16-le', '\xfe\xff':'utf-16-be'}[data[:2]]
+        elif data[:3] == '\xef\xbb\xbf':
+            bom_enc = 'utf-8'
+        if bom_enc is not None:
            try:
-                return fix_data(data.decode('utf-16'))
+                return fix_data(data.decode(bom_enc))
            except UnicodeDecodeError:
                pass
        if self.input_encoding is not None: