From 3f903cbdd165e0d1c5c25eecb6eef2a998342230 Mon Sep 17 00:00:00 2001 From: Kovid Goyal Date: Wed, 5 Jun 2019 09:08:17 +0530 Subject: [PATCH] CHM Input: Fix a regression that broke processing of some CHM files Fixes #1831511 [Can't open chm file](https://bugs.launchpad.net/calibre/+bug/1831511) Bug was introduced due to adding of unicode_literals. Also fix CHM input on python3 --- setup/extensions.json | 2 +- src/calibre/ebooks/chm/reader.py | 26 ++++++++++++++++---------- src/calibre/utils/chm/chm.py | 16 ++++++++-------- src/calibre/utils/chm/swig_chm.c | 12 ++++++------ src/calibre/utils/chm/swig_chm.i | 26 +++++++++++++------------- 5 files changed, 44 insertions(+), 38 deletions(-) diff --git a/setup/extensions.json b/setup/extensions.json index a43568541d..b0698d4cec 100644 --- a/setup/extensions.json +++ b/setup/extensions.json @@ -84,7 +84,7 @@ "windows_libraries": "ChmLib", "inc_dirs": "!chmlib_inc_dirs", "lib_dirs": "!chmlib_lib_dirs", - "defines": "SWIG_COBJECT_TYPES" + "defines": "SWIG_COBJECT_TYPES SWIG_PYTHON_STRICT_BYTE_CHAR" }, { "name": "lzx", diff --git a/src/calibre/ebooks/chm/reader.py b/src/calibre/ebooks/chm/reader.py index 3759310fd7..983526232a 100644 --- a/src/calibre/ebooks/chm/reader.py +++ b/src/calibre/ebooks/chm/reader.py @@ -5,7 +5,7 @@ __license__ = 'GPL v3' __copyright__ = '2008, Kovid Goyal ,' \ ' and Alex Bramley .' -import os, re, codecs +import os, re from calibre import guess_type as guess_mimetype from calibre.ebooks.BeautifulSoup import BeautifulSoup, NavigableString @@ -15,7 +15,7 @@ from calibre.utils.chm.chm import CHMFile from calibre.constants import plugins from calibre.ebooks.metadata.toc import TOC from calibre.ebooks.chardet import xml_to_unicode -from polyglot.builtins import unicode_type, getcwd +from polyglot.builtins import unicode_type, getcwd, as_unicode chmlib, chmlib_err = plugins['chmlib'] @@ -56,12 +56,17 @@ class CHMReader(CHMFile): raise CHMError("Unable to open CHM file '%s'"%(input,)) self.log = log self.input_encoding = input_encoding + self.chm_encoding = self.get_encoding() or 'cp1252' self._sourcechm = input self._contents = None self._playorder = 0 self._metadata = False self._extracted = False self.re_encoded_files = set() + if self.home: + self.home = as_unicode(self.home, self.chm_encoding) + if self.topics: + self.topics = as_unicode(self.topics, self.chm_encoding) # location of '.hhc' file, which is the CHM TOC. if self.topics is None: @@ -91,6 +96,11 @@ class CHMReader(CHMFile): # print toc return toc + def ResolveObject(self, path): + if not isinstance(path, bytes): + path = path.encode(self.chm_encoding) + return CHMFile.ResolveObject(self, path) + def GetFile(self, path): # have to have abs paths for ResolveObject, but Contents() deliberately # makes them relative. So we don't have to worry, re-add the leading /. @@ -107,12 +117,7 @@ class CHMReader(CHMFile): def ExtractFiles(self, output_dir=getcwd(), debug_dump=False): html_files = set() - try: - x = self.get_encoding() - codecs.lookup(x) - enc = x - except: - enc = 'cp1252' + enc = self.chm_encoding for path in self.Contents(): fpath = path if not isinstance(path, unicode_type): @@ -275,11 +280,12 @@ class CHMReader(CHMFile): paths = [] def get_paths(chm, ui, ctx): + path = as_unicode(ui.path, self.chm_encoding) # skip directories # note this path refers to the internal CHM structure - if ui.path[-1] != '/': + if path[-1] != '/': # and make paths relative - paths.append(ui.path.lstrip('/')) + paths.append(path.lstrip('/')) chmlib.chm_enumerate(self.file, chmlib.CHM_ENUMERATE_NORMAL, get_paths, None) self._contents = paths return self._contents diff --git a/src/calibre/utils/chm/chm.py b/src/calibre/utils/chm/chm.py index eb0594879c..7be92acaa5 100644 --- a/src/calibre/utils/chm/chm.py +++ b/src/calibre/utils/chm/chm.py @@ -30,7 +30,7 @@ import codecs import struct import sys -from calibre.constants import plugins +from calibre.constants import plugins, filesystem_encoding from polyglot.builtins import long_type chmlib, chmlib_err = plugins['chmlib'] @@ -226,7 +226,10 @@ class CHMFile: if self.filename is not None: self.CloseCHM() - self.file = chmlib.chm_open(archiveName) + path = archiveName + if not isinstance(path, bytes): + path = path.encode(filesystem_encoding) + self.file = chmlib.chm_open(path) if self.file is None: return 0 @@ -382,11 +385,8 @@ class CHMFile: The UnitInfo is used to retrieve the document contents ''' if self.file: - # path = os.path.abspath(document) - path = document - return chmlib.chm_resolve_object(self.file, path) - else: - return (1, None) + return chmlib.chm_resolve_object(self.file, document) + return 1, None def RetrieveObject(self, ui, start=-1, length=-1): '''Retrieves the contents of a document. @@ -442,7 +442,7 @@ class CHMFile: if ans: try: codecs.lookup(ans) - except: + except Exception: ans = None return ans diff --git a/src/calibre/utils/chm/swig_chm.c b/src/calibre/utils/chm/swig_chm.c index 2c80025863..1642fe7754 100644 --- a/src/calibre/utils/chm/swig_chm.c +++ b/src/calibre/utils/chm/swig_chm.c @@ -3827,18 +3827,18 @@ int dummy_enumerator (struct chmFile *h, if (arglist) { result = PyEval_CallObject(my_callback, arglist); Py_DECREF(arglist); - Py_DECREF(result); Py_DECREF(py_h); Py_DECREF(py_ui); if (result == NULL) { - return 0; /* Pass error back */ - } else { - return 1; + PyErr_Print(); + return 0; /* Pass error back */ } - } else - return 0; + Py_DECREF(result); + return 1; + } + return 0; } diff --git a/src/calibre/utils/chm/swig_chm.i b/src/calibre/utils/chm/swig_chm.i index f9b4191cf4..bbcbdbe698 100644 --- a/src/calibre/utils/chm/swig_chm.i +++ b/src/calibre/utils/chm/swig_chm.i @@ -60,20 +60,20 @@ int dummy_enumerator (struct chmFile *h, /* Time to call the callback */ arglist = Py_BuildValue("(OOO)", py_h, py_ui, py_c); if (arglist) { - result = PyEval_CallObject(my_callback, arglist); - Py_DECREF(arglist); - Py_DECREF(result); - - Py_DECREF(py_h); - Py_DECREF(py_ui); - - if (result == NULL) { - return 0; /* Pass error back */ - } else { + result = PyEval_CallObject(my_callback, arglist); + Py_DECREF(arglist); + + Py_DECREF(py_h); + Py_DECREF(py_ui); + + if (result == NULL) { + PyErr_Print(); + return 0; /* Pass error back */ + } + Py_DECREF(result); return 1; - } - } else - return 0; + } + return 0; } %}