CHM Input: Fix handling of relative file paths in <img> tags. Fixes #7159 (Large CHM file fails conversion)

2025-07-09 03:04:10 -04:00 · 2010-10-17 09:05:40 -06:00 · 2010-10-17 09:05:40 -06:00 · 30b3d2a564
commit 30b3d2a564
parent ca4953f028
1 changed file with 30 additions and 12 deletions
--- a/src/calibre/ebooks/chm/reader.py
+++ b/src/calibre/ebooks/chm/reader.py
@@ -93,6 +93,7 @@ class CHMReader(CHMFile):
        return data
    def ExtractFiles(self, output_dir=os.getcwdu()):
        html_files = set([])
        for path in self.Contents():
            lpath = os.path.join(output_dir, path)
            self._ensure_dir(lpath)
@@ -106,14 +107,27 @@ class CHMReader(CHMFile):
                lpath = lpath.split(';')[0]
            try:
                with open(lpath, 'wb') as f:
                    if guess_mimetype(path)[0] == ('text/html'):
                        data = self._reformat(data)
                    f.write(data)
                try:
                    if 'html' in guess_mimetype(path)[0]:
                        html_files.add(lpath)
                except:
                    pass
            except:
                if iswindows and len(lpath) > 250:
                    self.log.warn('%r filename too long, skipping'%path)
                    continue
                raise
        for lpath in html_files:
            with open(lpath, 'r+b') as f:
                data = f.read()
                data = self._reformat(data, lpath)
                if isinstance(data, unicode):
                    data = data.encode('utf-8')
                f.seek(0)
                f.truncate()
                f.write(data)
        self._extracted = True
        files = [x for x in os.listdir(output_dir) if
                os.path.isfile(os.path.join(output_dir, x))]
@@ -125,7 +139,7 @@ class CHMReader(CHMFile):
        if self.hhc_path not in files and files:
            self.hhc_path = files[0]
-    def _reformat(self, data):
+    def _reformat(self, data, htmlpath):
        try:
            data = xml_to_unicode(data, strip_encoding_pats=True)[0]
            soup = BeautifulSoup(data)
@ -169,15 +183,19 @@ class CHMReader(CHMFile):
                br[0].extract()
        # some images seem to be broken in some chm's :/
-        for img in soup('img'):
+        base = os.path.dirname(htmlpath)
-            try:
+        for img in soup('img', src=True):
-                # some are supposedly "relative"... lies.
+            src = img['src']
-                while img['src'].startswith('../'): img['src'] = img['src'][3:]
+            ipath = os.path.join(base, *src.split('/'))
-                # some have ";<junk>" at the end.
+            if os.path.exists(ipath):
-                img['src'] = img['src'].split(';')[0]
+                continue
-            except KeyError:
+            src = src.split(';')[0]
-                # and some don't even have a src= ?!
+            if not src: continue
-                pass
+            ipath = os.path.join(base, *src.split('/'))
            if not os.path.exists(ipath):
                while src.startswith('../'):
                    src = src[3:]
            img['src'] = src
        try:
            # if there is only a single table with a single element
            # in the body, replace it by the contents of this single element