CHM Input: Fix handling of relative file paths in <img> tags. Fixes #7159 (Large CHM file fails conversion)

2025-07-09 03:04:10 -04:00 · 2010-10-17 09:05:40 -06:00 · 2010-10-17 09:05:40 -06:00 · 30b3d2a564
commit 30b3d2a564
parent ca4953f028
1 changed files with 30 additions and 12 deletions
--- a/src/calibre/ebooks/chm/reader.py
+++ b/src/calibre/ebooks/chm/reader.py
@ -93,6 +93,7 @@ class CHMReader(CHMFile):
        return data

    def ExtractFiles(self, output_dir=os.getcwdu()):
+        html_files = set([])
        for path in self.Contents():
            lpath = os.path.join(output_dir, path)
            self._ensure_dir(lpath)
@ -106,14 +107,27 @@ class CHMReader(CHMFile):
                lpath = lpath.split(';')[0]
            try:
                with open(lpath, 'wb') as f:
-                    if guess_mimetype(path)[0] == ('text/html'):
-                        data = self._reformat(data)
                    f.write(data)
+                try:
+                    if 'html' in guess_mimetype(path)[0]:
+                        html_files.add(lpath)
+                except:
+                    pass
            except:
                if iswindows and len(lpath) > 250:
                    self.log.warn('%r filename too long, skipping'%path)
                    continue
                raise
+        for lpath in html_files:
+            with open(lpath, 'r+b') as f:
+                data = f.read()
+                data = self._reformat(data, lpath)
+                if isinstance(data, unicode):
+                    data = data.encode('utf-8')
+                f.seek(0)
+                f.truncate()
+                f.write(data)
+
        self._extracted = True
        files = [x for x in os.listdir(output_dir) if
                os.path.isfile(os.path.join(output_dir, x))]
@ -125,7 +139,7 @@ class CHMReader(CHMFile):
        if self.hhc_path not in files and files:
            self.hhc_path = files[0]

-    def _reformat(self, data):
+    def _reformat(self, data, htmlpath):
        try:
            data = xml_to_unicode(data, strip_encoding_pats=True)[0]
            soup = BeautifulSoup(data)
@ -169,15 +183,19 @@ class CHMReader(CHMFile):
                br[0].extract()

        # some images seem to be broken in some chm's :/
-        for img in soup('img'):
-            try:
-                # some are supposedly "relative"... lies.
-                while img['src'].startswith('../'): img['src'] = img['src'][3:]
-                # some have ";<junk>" at the end.
-                img['src'] = img['src'].split(';')[0]
-            except KeyError:
-                # and some don't even have a src= ?!
-                pass
+        base = os.path.dirname(htmlpath)
+        for img in soup('img', src=True):
+            src = img['src']
+            ipath = os.path.join(base, *src.split('/'))
+            if os.path.exists(ipath):
+                continue
+            src = src.split(';')[0]
+            if not src: continue
+            ipath = os.path.join(base, *src.split('/'))
+            if not os.path.exists(ipath):
+                while src.startswith('../'):
+                    src = src[3:]
+            img['src'] = src
        try:
            # if there is only a single table with a single element
            # in the body, replace it by the contents of this single element