Fetch News: Add API to allow recipe writers to easily resolve internal links to point to the downloaded versions of articles

This commit is contained in:
Kovid Goyal 2015-04-11 12:40:29 +05:30
parent 6d58813c65
commit 16efc963e3
3 changed files with 52 additions and 4 deletions

View File

@ -130,6 +130,7 @@ class RecipeInput(InputFormatPlugin):
def postprocess_book(self, oeb, opts, log): def postprocess_book(self, oeb, opts, log):
if self.recipe_object is not None: if self.recipe_object is not None:
self.recipe_object.internal_postprocess_book(oeb, opts, log)
self.recipe_object.postprocess_book(oeb, opts, log) self.recipe_object.postprocess_book(oeb, opts, log)
def specialize(self, oeb, opts, log, output_fmt): def specialize(self, oeb, opts, log, output_fmt):

View File

@ -26,6 +26,7 @@ from calibre.web.fetch.simple import option_parser as web2disk_option_parser, Re
from calibre.utils.threadpool import WorkRequest, ThreadPool, NoResultsPending from calibre.utils.threadpool import WorkRequest, ThreadPool, NoResultsPending
from calibre.ptempfile import PersistentTemporaryFile from calibre.ptempfile import PersistentTemporaryFile
from calibre.utils.date import now as nowf from calibre.utils.date import now as nowf
from calibre.utils.icu import numeric_sort_key
from calibre.utils.magick.draw import save_cover_data_to, add_borders_to_image from calibre.utils.magick.draw import save_cover_data_to, add_borders_to_image
from calibre.utils.localization import canonicalize_lang from calibre.utils.localization import canonicalize_lang
from calibre.utils.logging import ThreadSafeWrapper from calibre.utils.logging import ThreadSafeWrapper
@ -382,7 +383,13 @@ class BasicNewsRecipe(Recipe):
#: assigned (default None). #: assigned (default None).
scale_news_images = None scale_news_images = None
# See the built-in profiles for examples of these settings. #: If set to True then links in downloaded articles that point to other downloaded articles are
#: changed to point to the downloaded copy of the article rather than its original web URL. If you
#: set this to True, you might also need to implement :meth:`canonicalize_internal_url` to work
#: with the URL scheme of your particular website.
resolve_internal_links = False
# See the built-in recipes for examples of these settings.
def short_title(self): def short_title(self):
return self.title return self.title
@ -645,6 +652,25 @@ class BasicNewsRecipe(Recipe):
''' '''
pass pass
def canonicalize_internal_url(self, url, is_link=True):
'''
Return a set of canonical representations of ``url``. The default
implementation uses just the server hostname and path of the URL,
ignoring any query parameters, fragments, etc. The canonical
representations must be unique across all URLs for this news source. If
they are not, then internal links may be resolved incorrectly.
:param is_link: Is True if the URL is coming from an internal link in
an HTML file. False if the URL is the URL used to
download an article.
'''
try:
parts = urlparse.urlparse(url)
except Exception:
self.log.error('Failed to parse url: %r, ignoring' % url)
return frozenset()
return frozenset([(parts.netloc, parts.path)])
def index_to_soup(self, url_or_raw, raw=False, as_tree=False): def index_to_soup(self, url_or_raw, raw=False, as_tree=False):
''' '''
Convenience method that takes an URL to the index page and returns Convenience method that takes an URL to the index page and returns
@ -1479,6 +1505,8 @@ class BasicNewsRecipe(Recipe):
self.play_order_counter = 0 self.play_order_counter = 0
self.play_order_map = {} self.play_order_map = {}
self.article_url_map = aumap = defaultdict(set)
def feed_index(num, parent): def feed_index(num, parent):
f = feeds[num] f = feeds[num]
for j, a in enumerate(f): for j, a in enumerate(f):
@ -1498,7 +1526,10 @@ class BasicNewsRecipe(Recipe):
if po is None: if po is None:
self.play_order_counter += 1 self.play_order_counter += 1
po = self.play_order_counter po = self.play_order_counter
parent.add_item('%sindex.html'%adir, None, arelpath = '%sindex.html'%adir
for curl in self.canonicalize_internal_url(a.orig_url, is_link=False):
aumap[curl].add(arelpath)
parent.add_item(arelpath, None,
a.title if a.title else _('Untitled Article'), a.title if a.title else _('Untitled Article'),
play_order=po, author=auth, play_order=po, author=auth,
description=desc, toc_thumbnail=tt) description=desc, toc_thumbnail=tt)
@ -1702,6 +1733,22 @@ class BasicNewsRecipe(Recipe):
divtag.append(brtag) divtag.append(brtag)
return soup return soup
def internal_postprocess_book(self, oeb, opts, log):
if self.resolve_internal_links and self.article_url_map:
seen = set()
for item in oeb.spine:
for a in item.data.xpath('//*[local-name()="a" and @href]'):
if a.get('rel') == 'calibre-downloaded-from':
continue
url = a.get('href')
for curl in self.canonicalize_internal_url(url):
articles = self.article_url_map.get(curl)
if articles:
arelpath = sorted(articles, key=numeric_sort_key)[0]
a.set('href', item.relhref(arelpath))
if url not in seen:
log.debug('Resolved internal URL: %s -> %s' % (url, arelpath))
seen.add(url)
class CustomIndexRecipe(BasicNewsRecipe): class CustomIndexRecipe(BasicNewsRecipe):

View File

@ -204,7 +204,7 @@ class NavBarTemplate(Template):
if not url.startswith('file://'): if not url.startswith('file://'):
navbar.append(HR()) navbar.append(HR())
text = 'This article was downloaded by ' text = 'This article was downloaded by '
p = PT(text, STRONG(__appname__), A(url, href=url), p = PT(text, STRONG(__appname__), A(url, href=url, rel='calibre-downloaded-from'),
style='text-align:left; max-width: 100%; overflow: hidden;') style='text-align:left; max-width: 100%; overflow: hidden;')
p[0].tail = ' from ' p[0].tail = ' from '
navbar.append(p) navbar.append(p)
@ -390,7 +390,7 @@ class TouchscreenNavBarTemplate(Template):
if bottom and not url.startswith('file://'): if bottom and not url.startswith('file://'):
navbar.append(HR()) navbar.append(HR())
text = 'This article was downloaded by ' text = 'This article was downloaded by '
p = PT(text, STRONG(__appname__), A(url, href=url), p = PT(text, STRONG(__appname__), A(url, href=url, rel='calibre-downloaded-from'),
style='text-align:left; max-width: 100%; overflow: hidden;') style='text-align:left; max-width: 100%; overflow: hidden;')
p[0].tail = ' from ' p[0].tail = ' from '
navbar.append(p) navbar.append(p)