Remove bad links from HTML files before creating EPUB

2025-07-09 03:04:10 -04:00 · 2008-12-03 23:56:56 -08:00 · 2008-12-03 23:56:56 -08:00 · 657fa4d6c2
commit 657fa4d6c2
parent a7d54c1d77
1 changed files with 43 additions and 3 deletions
--- a/src/calibre/ebooks/epub/from_html.py
+++ b/src/calibre/ebooks/epub/from_html.py
@ -32,14 +32,15 @@ Conversion of HTML/OPF files follows several stages:
    * The EPUB container is created.
 '''

-import os, sys, cStringIO, logging, re
+import os, sys, cStringIO, logging, re, functools

 from lxml.etree import XPath
+from lxml import html
 from PyQt4.Qt import QApplication, QPixmap

 from calibre.ebooks.html import Processor, merge_metadata, get_filelist,\
-    opf_traverse, create_metadata, rebase_toc
-from calibre.ebooks.epub import config as common_config
+    opf_traverse, create_metadata, rebase_toc, Link
+from calibre.ebooks.epub import config as common_config, tostring
 from calibre.ptempfile import TemporaryDirectory
 from calibre.ebooks.metadata.toc import TOC
 from calibre.ebooks.metadata.opf2 import OPF
@ -48,6 +49,44 @@ from calibre.ebooks.epub.split import split
 from calibre.ebooks.epub.fonts import Rationalizer
 from calibre.constants import preferred_encoding
 from calibre import walk
+from calibre import CurrentDir
+
+content = functools.partial(os.path.join, u'content')
+
+def remove_bad_link(element, attribute, link, pos):
+    if attribute is not None:
+        if element.tag in ['link', 'img']:
+            element.getparent().remove(element)
+        else:
+            element.set(attribute, '')
+            del element.attrib[attribute]
+
+def check(opf_path, pretty_print):
+    '''
+    Find a remove all invalid links in the HTML files 
+    '''
+    print '\tChecking files for bad links...'
+    pathtoopf = os.path.abspath(opf_path)
+    with CurrentDir(os.path.dirname(pathtoopf)):
+        opf = OPF(open(pathtoopf, 'rb'), os.path.dirname(pathtoopf))
+        html_files = []
+        for item in opf.itermanifest():
+            if 'html' in item.get('media-type', '').lower():
+                f = item.get('href').split('/')[-1].decode('utf-8')
+                html_files.append(os.path.abspath(content(f)))
+        
+        for path in html_files:
+            base = os.path.dirname(path)
+            root = html.fromstring(open(content(path), 'rb').read())
+            for element, attribute, link, pos in list(root.iterlinks()):
+                link = link.decode('utf-8')
+                plink = Link(link, base)
+                bad = False
+                if plink.path is not None and not os.path.exists(plink.path):
+                    bad = True
+                if bad:
+                    remove_bad_link(element, attribute, link, pos)
+            open(content(path), 'wb').write(tostring(root, pretty_print))

 def find_html_index(files):
    '''
@ -316,6 +355,7 @@ def convert(htmlfile, opts, notification=None):
            if opts.show_ncx:
                print toc
        split(opf_path, opts, stylesheet_map)
+        check(opf_path, opts.pretty_print)
        opf = OPF(opf_path, tdir)
        opf.remove_guide()
        if has_title_page: