From 30b3d2a56485b01d68e18ad0a5a0dd94909e0fb7 Mon Sep 17 00:00:00 2001
From: Kovid Goyal <kovid@kovidgoyal.net>
Date: Sun, 17 Oct 2010 09:05:40 -0600
Subject: [PATCH] CHM Input: Fix handling of relative file paths in <img> tags.
 Fixes #7159 (Large CHM file fails conversion)

---
 src/calibre/ebooks/chm/reader.py | 42 +++++++++++++++++++++++---------
 1 file changed, 30 insertions(+), 12 deletions(-)
diff --git a/src/calibre/ebooks/chm/reader.py b/src/calibre/ebooks/chm/reader.py
index 73587edfa4..025e252005 100644
--- a/src/calibre/ebooks/chm/reader.py
+++ b/src/calibre/ebooks/chm/reader.py
@@ -93,6 +93,7 @@ class CHMReader(CHMFile):
         return data
 
     def ExtractFiles(self, output_dir=os.getcwdu()):
+        html_files = set([])
         for path in self.Contents():
             lpath = os.path.join(output_dir, path)
             self._ensure_dir(lpath)
@@ -106,14 +107,27 @@ class CHMReader(CHMFile):
                 lpath = lpath.split(';')[0]
             try:
                 with open(lpath, 'wb') as f:
-                    if guess_mimetype(path)[0] == ('text/html'):
-                        data = self._reformat(data)
                     f.write(data)
+                try:
+                    if 'html' in guess_mimetype(path)[0]:
+                        html_files.add(lpath)
+                except:
+                    pass
             except:
                 if iswindows and len(lpath) > 250:
                     self.log.warn('%r filename too long, skipping'%path)
                     continue
                 raise
+        for lpath in html_files:
+            with open(lpath, 'r+b') as f:
+                data = f.read()
+                data = self._reformat(data, lpath)
+                if isinstance(data, unicode):
+                    data = data.encode('utf-8')
+                f.seek(0)
+                f.truncate()
+                f.write(data)
+
         self._extracted = True
         files = [x for x in os.listdir(output_dir) if
                 os.path.isfile(os.path.join(output_dir, x))]
@@ -125,7 +139,7 @@ class CHMReader(CHMFile):
         if self.hhc_path not in files and files:
             self.hhc_path = files[0]
 
-    def _reformat(self, data):
+    def _reformat(self, data, htmlpath):
         try:
             data = xml_to_unicode(data, strip_encoding_pats=True)[0]
             soup = BeautifulSoup(data)
@@ -169,15 +183,19 @@ class CHMReader(CHMFile):
                 br[0].extract()
 
         # some images seem to be broken in some chm's :/
-        for img in soup('img'):
-            try:
-                # some are supposedly "relative"... lies.
-                while img['src'].startswith('../'): img['src'] = img['src'][3:]
-                # some have ";<junk>" at the end.
-                img['src'] = img['src'].split(';')[0]
-            except KeyError:
-                # and some don't even have a src= ?!
-                pass
+        base = os.path.dirname(htmlpath)
+        for img in soup('img', src=True):
+            src = img['src']
+            ipath = os.path.join(base, *src.split('/'))
+            if os.path.exists(ipath):
+                continue
+            src = src.split(';')[0]
+            if not src: continue
+            ipath = os.path.join(base, *src.split('/'))
+            if not os.path.exists(ipath):
+                while src.startswith('../'):
+                    src = src[3:]
+            img['src'] = src
         try:
             # if there is only a single table with a single element
             # in the body, replace it by the contents of this single element