From 657fa4d6c2f3e9d666c8484a259dd10ff46649a4 Mon Sep 17 00:00:00 2001
From: Kovid Goyal <kovid@kovidgoyal.net>
Date: Wed, 3 Dec 2008 23:56:56 -0800
Subject: [PATCH] Remove bad links from HTML files before creating EPUB

---
 src/calibre/ebooks/epub/from_html.py | 46 ++++++++++++++++++++++++++--
 1 file changed, 43 insertions(+), 3 deletions(-)

diff --git a/src/calibre/ebooks/epub/from_html.py b/src/calibre/ebooks/epub/from_html.py
index a5e5f51004..ae2fa5eae1 100644
--- a/src/calibre/ebooks/epub/from_html.py
+++ b/src/calibre/ebooks/epub/from_html.py
@@ -32,14 +32,15 @@ Conversion of HTML/OPF files follows several stages:
     * The EPUB container is created.
 '''
 
-import os, sys, cStringIO, logging, re
+import os, sys, cStringIO, logging, re, functools
 
 from lxml.etree import XPath
+from lxml import html
 from PyQt4.Qt import QApplication, QPixmap
 
 from calibre.ebooks.html import Processor, merge_metadata, get_filelist,\
-    opf_traverse, create_metadata, rebase_toc
-from calibre.ebooks.epub import config as common_config
+    opf_traverse, create_metadata, rebase_toc, Link
+from calibre.ebooks.epub import config as common_config, tostring
 from calibre.ptempfile import TemporaryDirectory
 from calibre.ebooks.metadata.toc import TOC
 from calibre.ebooks.metadata.opf2 import OPF
@@ -48,6 +49,44 @@ from calibre.ebooks.epub.split import split
 from calibre.ebooks.epub.fonts import Rationalizer
 from calibre.constants import preferred_encoding
 from calibre import walk
+from calibre import CurrentDir
+
+content = functools.partial(os.path.join, u'content') # content(x) == os.path.join(u'content', x)
+
+def remove_bad_link(element, attribute, link, pos):
+    '''Drop a bad <link>/<img> element; elsewhere strip only the bad attribute.'''
+    if attribute is not None and element.tag in ['link', 'img']:
+        element.drop_tree() # unlike getparent().remove(), keeps the tail text
+    elif attribute is not None:
+        element.set(attribute, '') # set() first so the del cannot raise KeyError
+        del element.attrib[attribute]
+
+def check(opf_path, pretty_print):
+    '''
+    Find and remove all invalid links in the HTML files, rewriting them in place
+    '''
+    print '\tChecking files for bad links...'
+    pathtoopf = os.path.abspath(opf_path)
+    with CurrentDir(os.path.dirname(pathtoopf)):
+        opf = OPF(open(pathtoopf, 'rb'), os.path.dirname(pathtoopf))
+        html_files = []
+        for item in opf.itermanifest():
+            if 'html' in item.get('media-type', '').lower():
+                f = item.get('href').split('/')[-1].decode('utf-8')
+                html_files.append(os.path.abspath(content(f)))
+
+        for path in html_files: # every path is already absolute here
+            base = os.path.dirname(path)
+            with open(path, 'rb') as src:
+                root = html.fromstring(src.read())
+            for element, attribute, link, pos in list(root.iterlinks()):
+                if isinstance(link, str): # lxml yields unicode for non-ASCII
+                    link = link.decode('utf-8')
+                plink = Link(link, base)
+                if plink.path is not None and not os.path.exists(plink.path):
+                    remove_bad_link(element, attribute, link, pos)
+            with open(path, 'wb') as dest:
+                dest.write(tostring(root, pretty_print))
 
 def find_html_index(files):
     '''
@@ -316,6 +355,7 @@ def convert(htmlfile, opts, notification=None):
             if opts.show_ncx:
                 print toc
         split(opf_path, opts, stylesheet_map)
+        check(opf_path, opts.pretty_print)
         opf = OPF(opf_path, tdir)
         opf.remove_guide()
         if has_title_page: