Fetch News: Add API to allow recipe writers to easily resolve internal links to point to the downloaded versions of articles

This commit is contained in:
Kovid Goyal 2015-04-11 12:40:29 +05:30
parent 6d58813c65
commit 16efc963e3
3 changed files with 52 additions and 4 deletions

View File

@ -130,6 +130,7 @@ class RecipeInput(InputFormatPlugin):
def postprocess_book(self, oeb, opts, log):
    '''
    Hand the converted book to the recipe for post-processing.

    The recipe's ``internal_postprocess_book`` hook is invoked first,
    followed by its ``postprocess_book`` hook. Nothing happens when no
    recipe object is set.
    '''
    recipe = self.recipe_object
    if recipe is None:
        return
    recipe.internal_postprocess_book(oeb, opts, log)
    recipe.postprocess_book(oeb, opts, log)
def specialize(self, oeb, opts, log, output_fmt):

View File

@ -26,6 +26,7 @@ from calibre.web.fetch.simple import option_parser as web2disk_option_parser, Re
from calibre.utils.threadpool import WorkRequest, ThreadPool, NoResultsPending
from calibre.ptempfile import PersistentTemporaryFile
from calibre.utils.date import now as nowf
from calibre.utils.icu import numeric_sort_key
from calibre.utils.magick.draw import save_cover_data_to, add_borders_to_image
from calibre.utils.localization import canonicalize_lang
from calibre.utils.logging import ThreadSafeWrapper
@ -382,7 +383,13 @@ class BasicNewsRecipe(Recipe):
#: assigned (default None).
scale_news_images = None
# See the built-in profiles for examples of these settings.
#: If set to True then links in downloaded articles that point to other downloaded articles are
#: changed to point to the downloaded copy of the article rather than its original web URL. If you
#: set this to True, you might also need to implement :meth:`canonicalize_internal_url` to work
#: with the URL scheme of your particular website.
resolve_internal_links = False
# See the built-in recipes for examples of these settings.
def short_title(self):
    '''
    Return the short form of the title. This implementation simply
    returns ``self.title`` unchanged.
    '''
    return self.title
@ -645,6 +652,25 @@ class BasicNewsRecipe(Recipe):
'''
pass
def canonicalize_internal_url(self, url, is_link=True):
    '''
    Return a set of canonical representations of ``url``.

    This implementation produces a single representation: the
    ``(netloc, path)`` pair of the parsed URL, so query parameters and
    fragments are ignored. Canonical representations must be unique across
    all URLs of the news source, otherwise internal links may be resolved
    to the wrong article.

    :param url: The URL to canonicalize.
    :param is_link: True if the URL comes from an internal link in an HTML
                    file, False if it is the URL used to download an
                    article. Not used by this implementation.
    :return: A frozenset of canonical representations; empty if the URL
             could not be parsed (an error is logged in that case).
    '''
    try:
        parsed = urlparse.urlparse(url)
    except Exception:
        self.log.error('Failed to parse url: %r, ignoring' % url)
        return frozenset()
    else:
        canonical = (parsed.netloc, parsed.path)
        return frozenset((canonical,))
def index_to_soup(self, url_or_raw, raw=False, as_tree=False):
'''
Convenience method that takes an URL to the index page and returns
@ -1479,6 +1505,8 @@ class BasicNewsRecipe(Recipe):
self.play_order_counter = 0
self.play_order_map = {}
self.article_url_map = aumap = defaultdict(set)
def feed_index(num, parent):
f = feeds[num]
for j, a in enumerate(f):
@ -1498,7 +1526,10 @@ class BasicNewsRecipe(Recipe):
if po is None:
self.play_order_counter += 1
po = self.play_order_counter
parent.add_item('%sindex.html'%adir, None,
arelpath = '%sindex.html'%adir
for curl in self.canonicalize_internal_url(a.orig_url, is_link=False):
aumap[curl].add(arelpath)
parent.add_item(arelpath, None,
a.title if a.title else _('Untitled Article'),
play_order=po, author=auth,
description=desc, toc_thumbnail=tt)
@ -1702,6 +1733,22 @@ class BasicNewsRecipe(Recipe):
divtag.append(brtag)
return soup
def internal_postprocess_book(self, oeb, opts, log):
    '''
    Rewrite links that point at other downloaded articles so that they
    reference the downloaded copy instead of the original web URL.

    Does nothing unless ``resolve_internal_links`` is true and a non-empty
    ``article_url_map`` (canonical URL -> set of article paths) exists.

    :param oeb: The book being post-processed; every item in ``oeb.spine``
                is scanned for ``<a href>`` elements.
    :param opts: Conversion options (unused here).
    :param log: Logger; each resolved URL is reported once via ``log.debug``.
    '''
    # article_url_map is assigned as an instance attribute elsewhere in this
    # class, so it may not exist on every code path; treat a missing map
    # the same as an empty one instead of raising AttributeError.
    url_map = getattr(self, 'article_url_map', None)
    if not (self.resolve_internal_links and url_map):
        return
    logged = set()
    for item in oeb.spine:
        for a in item.data.xpath('//*[local-name()="a" and @href]'):
            if a.get('rel') == 'calibre-downloaded-from':
                # Links carrying this rel value are left pointing at the
                # original URL.
                continue
            url = a.get('href')
            # Gather the candidates for every canonical form first, so the
            # chosen target does not depend on set iteration order and the
            # href is rewritten at most once.
            candidates = set()
            for curl in self.canonicalize_internal_url(url):
                candidates.update(url_map.get(curl) or ())
            if not candidates:
                continue
            arelpath = min(candidates, key=numeric_sort_key)
            a.set('href', item.relhref(arelpath))
            if url not in logged:
                log.debug('Resolved internal URL: %s -> %s' % (url, arelpath))
                logged.add(url)
class CustomIndexRecipe(BasicNewsRecipe):

View File

@ -204,7 +204,7 @@ class NavBarTemplate(Template):
if not url.startswith('file://'):
navbar.append(HR())
text = 'This article was downloaded by '
p = PT(text, STRONG(__appname__), A(url, href=url),
p = PT(text, STRONG(__appname__), A(url, href=url, rel='calibre-downloaded-from'),
style='text-align:left; max-width: 100%; overflow: hidden;')
p[0].tail = ' from '
navbar.append(p)
@ -390,7 +390,7 @@ class TouchscreenNavBarTemplate(Template):
if bottom and not url.startswith('file://'):
navbar.append(HR())
text = 'This article was downloaded by '
p = PT(text, STRONG(__appname__), A(url, href=url),
p = PT(text, STRONG(__appname__), A(url, href=url, rel='calibre-downloaded-from'),
style='text-align:left; max-width: 100%; overflow: hidden;')
p[0].tail = ' from '
navbar.append(p)