mirror of
https://github.com/kovidgoyal/calibre.git
synced 2025-07-09 03:04:10 -04:00
Check Book: External link checker: Also check HTML anchors (the part after the # in the link). Can be turned off via a checkbox at the bottom of the link checker window.
This commit is contained in:
parent
04ba64d0d7
commit
c03377adc4
@ -14,8 +14,9 @@ from threading import Thread
|
|||||||
from Queue import Queue, Empty
|
from Queue import Queue, Empty
|
||||||
|
|
||||||
from calibre import browser
|
from calibre import browser
|
||||||
from calibre.ebooks.oeb.base import OEB_DOCS, OEB_STYLES, urlunquote
|
from calibre.ebooks.oeb.base import OEB_DOCS, OEB_STYLES, urlunquote, XHTML_MIME
|
||||||
from calibre.ebooks.oeb.polish.container import OEB_FONTS
|
from calibre.ebooks.oeb.polish.container import OEB_FONTS
|
||||||
|
from calibre.ebooks.oeb.polish.parsing import parse_html5
|
||||||
from calibre.ebooks.oeb.polish.replace import remove_links_to
|
from calibre.ebooks.oeb.polish.replace import remove_links_to
|
||||||
from calibre.ebooks.oeb.polish.cover import get_raster_cover_name
|
from calibre.ebooks.oeb.polish.cover import get_raster_cover_name
|
||||||
from calibre.ebooks.oeb.polish.utils import guess_type, actual_case_for_name, corrected_case_for_name
|
from calibre.ebooks.oeb.polish.utils import guess_type, actual_case_for_name, corrected_case_for_name
|
||||||
@ -388,7 +389,16 @@ def check_links(container):
|
|||||||
return errors
|
return errors
|
||||||
|
|
||||||
|
|
||||||
def check_external_links(container, progress_callback=lambda num, total:None):
|
def get_html_ids(raw_data):
    """Return the set of anchor targets found in an HTML document.

    ``raw_data`` is handed to ``parse_html5`` (namespaces discarded, no line
    numbers, newlines left untouched). For every ``<body>`` element in the
    resulting tree, the following values are collected:

    * every ``id`` attribute on the body itself or any of its descendants
    * every ``name`` attribute on descendant ``<a>`` elements

    Any exception raised by ``parse_html5`` or the XPath queries propagates
    to the caller.
    """
    tree = parse_html5(raw_data, discard_namespaces=True, line_numbers=False, fix_newlines=False)
    anchors = set()
    for body in tree.xpath('//body'):
        # set.update() accepts several iterables, so both query results are
        # merged in one call; duplicates collapse automatically.
        anchors.update(
            body.xpath('descendant-or-self::*/@id'),
            body.xpath('descendant::a/@name'),
        )
    return anchors
|
||||||
|
|
||||||
|
|
||||||
|
def check_external_links(container, progress_callback=(lambda num, total:None), check_anchors=True):
|
||||||
progress_callback(0, 0)
|
progress_callback(0, 0)
|
||||||
external_links = defaultdict(list)
|
external_links = defaultdict(list)
|
||||||
for name, mt in container.mime_map.iteritems():
|
for name, mt in container.mime_map.iteritems():
|
||||||
@ -396,8 +406,7 @@ def check_external_links(container, progress_callback=lambda num, total:None):
|
|||||||
for href, lnum, col in container.iterlinks(name):
|
for href, lnum, col in container.iterlinks(name):
|
||||||
purl = urlparse(href)
|
purl = urlparse(href)
|
||||||
if purl.scheme in ('http', 'https'):
|
if purl.scheme in ('http', 'https'):
|
||||||
key = href.partition('#')[0]
|
external_links[href].append((name, href, lnum, col))
|
||||||
external_links[key].append((name, href, lnum, col))
|
|
||||||
if not external_links:
|
if not external_links:
|
||||||
return []
|
return []
|
||||||
items = Queue()
|
items = Queue()
|
||||||
@ -405,18 +414,33 @@ def check_external_links(container, progress_callback=lambda num, total:None):
|
|||||||
tuple(map(items.put, external_links.iteritems()))
|
tuple(map(items.put, external_links.iteritems()))
|
||||||
progress_callback(0, len(external_links))
|
progress_callback(0, len(external_links))
|
||||||
done = []
|
done = []
|
||||||
|
downloaded_html_ids = {}
|
||||||
|
|
||||||
def check_links():
|
def check_links():
|
||||||
br = browser(honor_time=False, verify_ssl_certificates=False)
|
br = browser(honor_time=False, verify_ssl_certificates=False)
|
||||||
while True:
|
while True:
|
||||||
try:
|
try:
|
||||||
href, locations = items.get_nowait()
|
full_href, locations = items.get_nowait()
|
||||||
except Empty:
|
except Empty:
|
||||||
return
|
return
|
||||||
|
href, frag = full_href.partition('#')[::2]
|
||||||
try:
|
try:
|
||||||
br.open(href, timeout=10).close()
|
res = br.open(href, timeout=10)
|
||||||
except Exception as e:
|
except Exception as e:
|
||||||
ans.append((locations, e, href))
|
ans.append((locations, e, full_href))
|
||||||
|
else:
|
||||||
|
if frag and check_anchors:
|
||||||
|
ct = res.info().get('Content-Type')
|
||||||
|
if ct and ct.split(';')[0].lower() in {'text/html', XHTML_MIME}:
|
||||||
|
ids = downloaded_html_ids.get(href)
|
||||||
|
if ids is None:
|
||||||
|
try:
|
||||||
|
ids = downloaded_html_ids[href] = get_html_ids(res.read())
|
||||||
|
except Exception:
|
||||||
|
ids = downloaded_html_ids[href] = frozenset()
|
||||||
|
if frag not in ids:
|
||||||
|
ans.append((locations, ValueError('HTML anchor {} not found on the page'.format(frag)), full_href))
|
||||||
|
res.close()
|
||||||
finally:
|
finally:
|
||||||
done.append(None)
|
done.append(None)
|
||||||
progress_callback(len(done), len(external_links))
|
progress_callback(len(done), len(external_links))
|
||||||
|
@ -77,6 +77,7 @@ d['insert_full_screen_image'] = False
|
|||||||
d['preserve_aspect_ratio_when_inserting_image'] = False
|
d['preserve_aspect_ratio_when_inserting_image'] = False
|
||||||
d['file_list_shows_full_pathname'] = False
|
d['file_list_shows_full_pathname'] = False
|
||||||
d['auto_link_stylesheets'] = True
|
d['auto_link_stylesheets'] = True
|
||||||
|
d['check_external_link_anchors'] = True
|
||||||
del d
|
del d
|
||||||
|
|
||||||
ucase_map = {l:string.ascii_uppercase[i] for i, l in enumerate(string.ascii_lowercase)}
|
ucase_map = {l:string.ascii_uppercase[i] for i, l in enumerate(string.ascii_lowercase)}
|
||||||
|
@ -2,19 +2,18 @@
|
|||||||
# vim:fileencoding=utf-8
|
# vim:fileencoding=utf-8
|
||||||
# License: GPLv3 Copyright: 2015, Kovid Goyal <kovid at kovidgoyal.net>
|
# License: GPLv3 Copyright: 2015, Kovid Goyal <kovid at kovidgoyal.net>
|
||||||
|
|
||||||
from __future__ import (unicode_literals, division, absolute_import,
|
from __future__ import absolute_import, division, print_function, unicode_literals
|
||||||
print_function)
|
|
||||||
|
|
||||||
from collections import defaultdict
|
from collections import defaultdict
|
||||||
from threading import Thread
|
from threading import Thread
|
||||||
|
|
||||||
from PyQt5.Qt import (
|
from PyQt5.Qt import (
|
||||||
QVBoxLayout, QTextBrowser, QProgressBar, Qt, QWidget, QStackedWidget,
|
QCheckBox, QHBoxLayout, QIcon, QInputDialog, QLabel, QProgressBar, QSizePolicy,
|
||||||
QLabel, QSizePolicy, pyqtSignal, QIcon, QInputDialog
|
QStackedWidget, Qt, QTextBrowser, QVBoxLayout, QWidget, pyqtSignal
|
||||||
)
|
)
|
||||||
|
|
||||||
from calibre.gui2 import error_dialog
|
from calibre.gui2 import error_dialog
|
||||||
from calibre.gui2.tweak_book import current_container, set_current_container, editors
|
from calibre.gui2.tweak_book import current_container, editors, set_current_container, tprefs
|
||||||
from calibre.gui2.tweak_book.boss import get_boss
|
from calibre.gui2.tweak_book.boss import get_boss
|
||||||
from calibre.gui2.tweak_book.widgets import Dialog
|
from calibre.gui2.tweak_book.widgets import Dialog
|
||||||
|
|
||||||
@ -74,12 +73,22 @@ class CheckExternalLinks(Dialog):
|
|||||||
self.stack = s = QStackedWidget(self)
|
self.stack = s = QStackedWidget(self)
|
||||||
s.addWidget(w), s.addWidget(self.results)
|
s.addWidget(w), s.addWidget(self.results)
|
||||||
l.addWidget(s)
|
l.addWidget(s)
|
||||||
l.addWidget(self.bb)
|
self.bh = h = QHBoxLayout()
|
||||||
|
self.check_anchors = ca = QCheckBox(_('Check &anchors'))
|
||||||
|
ca.setToolTip(_('Check HTML anchors in links (the part after the #).\n'
|
||||||
|
' This can be a little slow, since it requires downloading and parsing all the HTML pages.'))
|
||||||
|
ca.setChecked(tprefs.get('check_external_link_anchors', True))
|
||||||
|
ca.stateChanged.connect(self.anchors_changed)
|
||||||
|
h.addWidget(ca), h.addStretch(100), h.addWidget(self.bb)
|
||||||
|
l.addLayout(h)
|
||||||
self.bb.setStandardButtons(self.bb.Close)
|
self.bb.setStandardButtons(self.bb.Close)
|
||||||
self.rb = b = self.bb.addButton(_('&Refresh'), self.bb.ActionRole)
|
self.rb = b = self.bb.addButton(_('&Refresh'), self.bb.ActionRole)
|
||||||
b.setIcon(QIcon(I('view-refresh.png')))
|
b.setIcon(QIcon(I('view-refresh.png')))
|
||||||
b.clicked.connect(self.refresh)
|
b.clicked.connect(self.refresh)
|
||||||
|
|
||||||
|
def anchors_changed(self):
    """Store the current state of the anchor-checking checkbox in tprefs.

    Connected to the checkbox's ``stateChanged`` signal; the value is saved
    under the ``'check_external_link_anchors'`` key.
    """
    enabled = self.check_anchors.isChecked()
    tprefs.set('check_external_link_anchors', enabled)
|
||||||
|
|
||||||
def sizeHint(self):
|
def sizeHint(self):
|
||||||
ans = Dialog.sizeHint(self)
|
ans = Dialog.sizeHint(self)
|
||||||
ans.setHeight(600)
|
ans.setHeight(600)
|
||||||
@ -91,7 +100,7 @@ class CheckExternalLinks(Dialog):
|
|||||||
self.tb = None
|
self.tb = None
|
||||||
self.errors = []
|
self.errors = []
|
||||||
try:
|
try:
|
||||||
self.errors = check_external_links(current_container(), self.progress_made.emit)
|
self.errors = check_external_links(current_container(), self.progress_made.emit, check_anchors=self.check_anchors.isChecked())
|
||||||
except Exception:
|
except Exception:
|
||||||
import traceback
|
import traceback
|
||||||
self.tb = traceback.format_exc()
|
self.tb = traceback.format_exc()
|
||||||
|
Loading…
x
Reference in New Issue
Block a user