Check Book: External link checker: Also check HTML anchors (the part after the # in the link). The anchor check can be turned off via a checkbox at the bottom of the link checker window.

This commit is contained in:
Kovid Goyal 2018-06-06 08:27:12 +05:30
parent 04ba64d0d7
commit c03377adc4
No known key found for this signature in database
GPG Key ID: 06BC317B515ACE7C
3 changed files with 48 additions and 14 deletions

View File

@ -14,8 +14,9 @@ from threading import Thread
from Queue import Queue, Empty from Queue import Queue, Empty
from calibre import browser from calibre import browser
from calibre.ebooks.oeb.base import OEB_DOCS, OEB_STYLES, urlunquote from calibre.ebooks.oeb.base import OEB_DOCS, OEB_STYLES, urlunquote, XHTML_MIME
from calibre.ebooks.oeb.polish.container import OEB_FONTS from calibre.ebooks.oeb.polish.container import OEB_FONTS
from calibre.ebooks.oeb.polish.parsing import parse_html5
from calibre.ebooks.oeb.polish.replace import remove_links_to from calibre.ebooks.oeb.polish.replace import remove_links_to
from calibre.ebooks.oeb.polish.cover import get_raster_cover_name from calibre.ebooks.oeb.polish.cover import get_raster_cover_name
from calibre.ebooks.oeb.polish.utils import guess_type, actual_case_for_name, corrected_case_for_name from calibre.ebooks.oeb.polish.utils import guess_type, actual_case_for_name, corrected_case_for_name
@ -388,7 +389,16 @@ def check_links(container):
return errors return errors
def check_external_links(container, progress_callback=lambda num, total:None): def get_html_ids(raw_data):
ans = set()
root = parse_html5(raw_data, discard_namespaces=True, line_numbers=False, fix_newlines=False)
for body in root.xpath('//body'):
ans.update(set(body.xpath('descendant-or-self::*/@id')))
ans.update(set(body.xpath('descendant::a/@name')))
return ans
def check_external_links(container, progress_callback=(lambda num, total:None), check_anchors=True):
progress_callback(0, 0) progress_callback(0, 0)
external_links = defaultdict(list) external_links = defaultdict(list)
for name, mt in container.mime_map.iteritems(): for name, mt in container.mime_map.iteritems():
@ -396,8 +406,7 @@ def check_external_links(container, progress_callback=lambda num, total:None):
for href, lnum, col in container.iterlinks(name): for href, lnum, col in container.iterlinks(name):
purl = urlparse(href) purl = urlparse(href)
if purl.scheme in ('http', 'https'): if purl.scheme in ('http', 'https'):
key = href.partition('#')[0] external_links[href].append((name, href, lnum, col))
external_links[key].append((name, href, lnum, col))
if not external_links: if not external_links:
return [] return []
items = Queue() items = Queue()
@ -405,18 +414,33 @@ def check_external_links(container, progress_callback=lambda num, total:None):
tuple(map(items.put, external_links.iteritems())) tuple(map(items.put, external_links.iteritems()))
progress_callback(0, len(external_links)) progress_callback(0, len(external_links))
done = [] done = []
downloaded_html_ids = {}
def check_links(): def check_links():
br = browser(honor_time=False, verify_ssl_certificates=False) br = browser(honor_time=False, verify_ssl_certificates=False)
while True: while True:
try: try:
href, locations = items.get_nowait() full_href, locations = items.get_nowait()
except Empty: except Empty:
return return
href, frag = full_href.partition('#')[::2]
try: try:
br.open(href, timeout=10).close() res = br.open(href, timeout=10)
except Exception as e: except Exception as e:
ans.append((locations, e, href)) ans.append((locations, e, full_href))
else:
if frag and check_anchors:
ct = res.info().get('Content-Type')
if ct and ct.split(';')[0].lower() in {'text/html', XHTML_MIME}:
ids = downloaded_html_ids.get(href)
if ids is None:
try:
ids = downloaded_html_ids[href] = get_html_ids(res.read())
except Exception:
ids = downloaded_html_ids[href] = frozenset()
if frag not in ids:
ans.append((locations, ValueError('HTML anchor {} not found on the page'.format(frag)), full_href))
res.close()
finally: finally:
done.append(None) done.append(None)
progress_callback(len(done), len(external_links)) progress_callback(len(done), len(external_links))

View File

@ -77,6 +77,7 @@ d['insert_full_screen_image'] = False
d['preserve_aspect_ratio_when_inserting_image'] = False d['preserve_aspect_ratio_when_inserting_image'] = False
d['file_list_shows_full_pathname'] = False d['file_list_shows_full_pathname'] = False
d['auto_link_stylesheets'] = True d['auto_link_stylesheets'] = True
d['check_external_link_anchors'] = True
del d del d
ucase_map = {l:string.ascii_uppercase[i] for i, l in enumerate(string.ascii_lowercase)} ucase_map = {l:string.ascii_uppercase[i] for i, l in enumerate(string.ascii_lowercase)}

View File

@ -2,19 +2,18 @@
# vim:fileencoding=utf-8 # vim:fileencoding=utf-8
# License: GPLv3 Copyright: 2015, Kovid Goyal <kovid at kovidgoyal.net> # License: GPLv3 Copyright: 2015, Kovid Goyal <kovid at kovidgoyal.net>
from __future__ import (unicode_literals, division, absolute_import, from __future__ import absolute_import, division, print_function, unicode_literals
print_function)
from collections import defaultdict from collections import defaultdict
from threading import Thread from threading import Thread
from PyQt5.Qt import ( from PyQt5.Qt import (
QVBoxLayout, QTextBrowser, QProgressBar, Qt, QWidget, QStackedWidget, QCheckBox, QHBoxLayout, QIcon, QInputDialog, QLabel, QProgressBar, QSizePolicy,
QLabel, QSizePolicy, pyqtSignal, QIcon, QInputDialog QStackedWidget, Qt, QTextBrowser, QVBoxLayout, QWidget, pyqtSignal
) )
from calibre.gui2 import error_dialog from calibre.gui2 import error_dialog
from calibre.gui2.tweak_book import current_container, set_current_container, editors from calibre.gui2.tweak_book import current_container, editors, set_current_container, tprefs
from calibre.gui2.tweak_book.boss import get_boss from calibre.gui2.tweak_book.boss import get_boss
from calibre.gui2.tweak_book.widgets import Dialog from calibre.gui2.tweak_book.widgets import Dialog
@ -74,12 +73,22 @@ class CheckExternalLinks(Dialog):
self.stack = s = QStackedWidget(self) self.stack = s = QStackedWidget(self)
s.addWidget(w), s.addWidget(self.results) s.addWidget(w), s.addWidget(self.results)
l.addWidget(s) l.addWidget(s)
l.addWidget(self.bb) self.bh = h = QHBoxLayout()
self.check_anchors = ca = QCheckBox(_('Check &anchors'))
ca.setToolTip(_('Check HTML anchors in links (the part after the #).\n'
' This can be a little slow, since it requires downloading and parsing all the HTML pages.'))
ca.setChecked(tprefs.get('check_external_link_anchors', True))
ca.stateChanged.connect(self.anchors_changed)
h.addWidget(ca), h.addStretch(100), h.addWidget(self.bb)
l.addLayout(h)
self.bb.setStandardButtons(self.bb.Close) self.bb.setStandardButtons(self.bb.Close)
self.rb = b = self.bb.addButton(_('&Refresh'), self.bb.ActionRole) self.rb = b = self.bb.addButton(_('&Refresh'), self.bb.ActionRole)
b.setIcon(QIcon(I('view-refresh.png'))) b.setIcon(QIcon(I('view-refresh.png')))
b.clicked.connect(self.refresh) b.clicked.connect(self.refresh)
def anchors_changed(self):
    # Slot connected to the "Check anchors" checkbox: read the checkbox's
    # current checked state and store it in tprefs under the
    # 'check_external_link_anchors' key.
    enabled = self.check_anchors.isChecked()
    tprefs.set('check_external_link_anchors', enabled)
def sizeHint(self): def sizeHint(self):
ans = Dialog.sizeHint(self) ans = Dialog.sizeHint(self)
ans.setHeight(600) ans.setHeight(600)
@ -91,7 +100,7 @@ class CheckExternalLinks(Dialog):
self.tb = None self.tb = None
self.errors = [] self.errors = []
try: try:
self.errors = check_external_links(current_container(), self.progress_made.emit) self.errors = check_external_links(current_container(), self.progress_made.emit, check_anchors=self.check_anchors.isChecked())
except Exception: except Exception:
import traceback import traceback
self.tb = traceback.format_exc() self.tb = traceback.format_exc()