diff --git a/src/calibre/ebooks/oeb/polish/check/links.py b/src/calibre/ebooks/oeb/polish/check/links.py
index ffc523b804..b3ed4842ea 100644
--- a/src/calibre/ebooks/oeb/polish/check/links.py
+++ b/src/calibre/ebooks/oeb/polish/check/links.py
@@ -14,8 +14,9 @@ from threading import Thread
 from Queue import Queue, Empty
 
 from calibre import browser
-from calibre.ebooks.oeb.base import OEB_DOCS, OEB_STYLES, urlunquote
+from calibre.ebooks.oeb.base import OEB_DOCS, OEB_STYLES, urlunquote, XHTML_MIME
 from calibre.ebooks.oeb.polish.container import OEB_FONTS
+from calibre.ebooks.oeb.polish.parsing import parse_html5
 from calibre.ebooks.oeb.polish.replace import remove_links_to
 from calibre.ebooks.oeb.polish.cover import get_raster_cover_name
 from calibre.ebooks.oeb.polish.utils import guess_type, actual_case_for_name, corrected_case_for_name
@@ -388,7 +389,18 @@ def check_links(container):
     return errors
 
 
-def check_external_links(container, progress_callback=lambda num, total:None):
+def get_html_ids(raw_data):
+    ''' Return the set of anchor names in the HTML document raw_data: every id
+    attribute in <body>, plus the name attribute of every <a> inside it. '''
+    ans = set()
+    root = parse_html5(raw_data, discard_namespaces=True, line_numbers=False, fix_newlines=False)
+    for body in root.xpath('//body'):
+        ans.update(body.xpath('descendant-or-self::*/@id'))
+        ans.update(body.xpath('descendant::a/@name'))
+    return ans
+
+
+def check_external_links(container, progress_callback=(lambda num, total:None), check_anchors=True):
     progress_callback(0, 0)
     external_links = defaultdict(list)
     for name, mt in container.mime_map.iteritems():
@@ -396,8 +408,7 @@ def check_external_links(container, progress_callback=lambda num, total:None):
             for href, lnum, col in container.iterlinks(name):
                 purl = urlparse(href)
                 if purl.scheme in ('http', 'https'):
-                    key = href.partition('#')[0]
-                    external_links[key].append((name, href, lnum, col))
+                    external_links[href].append((name, href, lnum, col))
     if not external_links:
         return []
     items = Queue()
@@ -405,18 +416,38 @@ def check_external_links(container, progress_callback=lambda num, total:None):
     tuple(map(items.put, external_links.iteritems()))
     progress_callback(0, len(external_links))
     done = []
+    downloaded_html_ids = {}
 
     def check_links():
         br = browser(honor_time=False, verify_ssl_certificates=False)
         while True:
             try:
-                href, locations = items.get_nowait()
+                full_href, locations = items.get_nowait()
             except Empty:
                 return
+            href, frag = full_href.partition('#')[::2]
             try:
-                br.open(href, timeout=10).close()
+                res = br.open(href, timeout=10)
             except Exception as e:
-                ans.append((locations, e, href))
+                ans.append((locations, e, full_href))
+            else:
+                try:
+                    if frag and check_anchors:
+                        ct = res.info().get('Content-Type')
+                        # Tolerate whitespace around the media type, e.g. "text/html ; charset=utf-8"
+                        if ct and ct.split(';')[0].strip().lower() in {'text/html', XHTML_MIME}:
+                            ids = downloaded_html_ids.get(href)
+                            if ids is None:
+                                try:
+                                    ids = downloaded_html_ids[href] = get_html_ids(res.read())
+                                except Exception:
+                                    # Reading or parsing failed: cache an empty set so this page is not parsed again
+                                    ids = downloaded_html_ids[href] = frozenset()
+                            if frag not in ids:
+                                ans.append((locations, ValueError('HTML anchor {} not found on the page'.format(frag)), full_href))
+                finally:
+                    # Close the response even if the anchor check above raises
+                    res.close()
             finally:
                 done.append(None)
                 progress_callback(len(done), len(external_links))
diff --git a/src/calibre/gui2/tweak_book/__init__.py b/src/calibre/gui2/tweak_book/__init__.py
index d9dbe87476..b1b5499d92 100644
--- a/src/calibre/gui2/tweak_book/__init__.py
+++ b/src/calibre/gui2/tweak_book/__init__.py
@@ -77,6 +77,7 @@ d['insert_full_screen_image'] = False
 d['preserve_aspect_ratio_when_inserting_image'] = False
 d['file_list_shows_full_pathname'] = False
 d['auto_link_stylesheets'] = True
+d['check_external_link_anchors'] = True
 del d
 
 ucase_map = {l:string.ascii_uppercase[i] for i, l in enumerate(string.ascii_lowercase)}
diff --git a/src/calibre/gui2/tweak_book/check_links.py b/src/calibre/gui2/tweak_book/check_links.py
index 94cd1bff67..481f2244e3 100644
--- a/src/calibre/gui2/tweak_book/check_links.py
+++ b/src/calibre/gui2/tweak_book/check_links.py
@@ -2,19 +2,18 @@
 # vim:fileencoding=utf-8
 # License: GPLv3 Copyright: 2015, Kovid Goyal
 
-from __future__ import (unicode_literals, division, absolute_import,
-                        print_function)
+from __future__ import absolute_import, division, print_function, unicode_literals
 
 from collections import defaultdict
 from threading import Thread
 
 from PyQt5.Qt import (
-    QVBoxLayout, QTextBrowser, QProgressBar, Qt, QWidget, QStackedWidget,
-    QLabel, QSizePolicy, pyqtSignal, QIcon, QInputDialog
+    QCheckBox, QHBoxLayout, QIcon, QInputDialog, QLabel, QProgressBar, QSizePolicy,
+    QStackedWidget, Qt, QTextBrowser, QVBoxLayout, QWidget, pyqtSignal
 )
 
 from calibre.gui2 import error_dialog
-from calibre.gui2.tweak_book import current_container, set_current_container, editors
+from calibre.gui2.tweak_book import current_container, editors, set_current_container, tprefs
 from calibre.gui2.tweak_book.boss import get_boss
 from calibre.gui2.tweak_book.widgets import Dialog
 
@@ -74,12 +73,23 @@ class CheckExternalLinks(Dialog):
         self.stack = s = QStackedWidget(self)
         s.addWidget(w), s.addWidget(self.results)
         l.addWidget(s)
-        l.addWidget(self.bb)
+        self.bh = h = QHBoxLayout()
+        self.check_anchors = ca = QCheckBox(_('Check &anchors'))
+        ca.setToolTip(_('Check HTML anchors in links (the part after the #).\n'
+            ' This can be a little slow, since it requires downloading and parsing all the HTML pages.'))
+        ca.setChecked(tprefs.get('check_external_link_anchors', True))
+        ca.stateChanged.connect(self.anchors_changed)
+        h.addWidget(ca), h.addStretch(100), h.addWidget(self.bb)
+        l.addLayout(h)
         self.bb.setStandardButtons(self.bb.Close)
         self.rb = b = self.bb.addButton(_('&Refresh'), self.bb.ActionRole)
         b.setIcon(QIcon(I('view-refresh.png')))
         b.clicked.connect(self.refresh)
 
+    def anchors_changed(self):
+        ''' Store the current state of the "Check anchors" checkbox in tprefs. '''
+        tprefs.set('check_external_link_anchors', self.check_anchors.isChecked())
+
     def sizeHint(self):
         ans = Dialog.sizeHint(self)
         ans.setHeight(600)
@@ -91,7 +101,7 @@ class CheckExternalLinks(Dialog):
         self.tb = None
         self.errors = []
         try:
-            self.errors = check_external_links(current_container(), self.progress_made.emit)
+            self.errors = check_external_links(current_container(), self.progress_made.emit, check_anchors=self.check_anchors.isChecked())
         except Exception:
             import traceback
             self.tb = traceback.format_exc()