mirror of
https://github.com/kovidgoyal/calibre.git
synced 2025-07-09 03:04:10 -04:00
Fetch News: Add API to allow recipe writers to easily resolve internal links to point to the downloaded versions of articles
This commit is contained in:
parent
6d58813c65
commit
16efc963e3
@ -130,6 +130,7 @@ class RecipeInput(InputFormatPlugin):
|
||||
|
||||
def postprocess_book(self, oeb, opts, log):
    '''
    Run the recipe's post-processing hooks on the converted book.

    Does nothing when no recipe object is set. Otherwise the recipe's
    ``internal_postprocess_book`` hook is called first, followed by its
    ``postprocess_book`` hook, both with the same ``oeb``, ``opts`` and
    ``log`` arguments.
    '''
    recipe = self.recipe_object
    if recipe is None:
        return
    # Internal hook runs before the recipe's own post-processing hook.
    recipe.internal_postprocess_book(oeb, opts, log)
    recipe.postprocess_book(oeb, opts, log)
|
||||
|
||||
def specialize(self, oeb, opts, log, output_fmt):
|
||||
|
@ -26,6 +26,7 @@ from calibre.web.fetch.simple import option_parser as web2disk_option_parser, Re
|
||||
from calibre.utils.threadpool import WorkRequest, ThreadPool, NoResultsPending
|
||||
from calibre.ptempfile import PersistentTemporaryFile
|
||||
from calibre.utils.date import now as nowf
|
||||
from calibre.utils.icu import numeric_sort_key
|
||||
from calibre.utils.magick.draw import save_cover_data_to, add_borders_to_image
|
||||
from calibre.utils.localization import canonicalize_lang
|
||||
from calibre.utils.logging import ThreadSafeWrapper
|
||||
@ -382,7 +383,13 @@ class BasicNewsRecipe(Recipe):
|
||||
#: assigned (default None).
|
||||
scale_news_images = None
|
||||
|
||||
# See the built-in profiles for examples of these settings.
|
||||
#: If set to True then links in downloaded articles that point to other downloaded articles are
|
||||
#: changed to point to the downloaded copy of the article rather than its original web URL. If you
|
||||
#: set this to True, you might also need to implement :meth:`canonicalize_internal_url` to work
|
||||
#: with the URL scheme of your particular website.
|
||||
resolve_internal_links = False
|
||||
|
||||
# See the built-in recipes for examples of these settings.
|
||||
|
||||
def short_title(self):
    '''
    Return the short title for this recipe. This implementation simply
    returns ``self.title`` unchanged.
    '''
    title = self.title
    return title
|
||||
@ -645,6 +652,25 @@ class BasicNewsRecipe(Recipe):
|
||||
'''
|
||||
pass
|
||||
|
||||
def canonicalize_internal_url(self, url, is_link=True):
    '''
    Return a set of canonical representations of ``url``. The default
    implementation uses just the server hostname and path of the URL,
    ignoring any query parameters, fragments, etc. The canonical
    representations must be unique across all URLs for this news source. If
    they are not, then internal links may be resolved incorrectly.

    :param url: The URL to canonicalize.
    :param is_link: Is True if the URL is coming from an internal link in
                    an HTML file. False if the URL is the URL used to
                    download an article. The default implementation
                    ignores this parameter.
    :return: A frozenset holding a single ``(netloc, path)`` tuple, or an
             empty frozenset if ``url`` could not be parsed (the failure
             is logged via ``self.log.error``).
    '''
    try:
        parts = urlparse.urlparse(url)
    except Exception:
        # Wrap url in a 1-tuple: with a bare ``% url`` a tuple-valued url
        # would be unpacked as the format arguments and raise TypeError
        # from inside this handler instead of being logged and ignored.
        self.log.error('Failed to parse url: %r, ignoring' % (url,))
        return frozenset()
    return frozenset([(parts.netloc, parts.path)])
|
||||
|
||||
def index_to_soup(self, url_or_raw, raw=False, as_tree=False):
|
||||
'''
|
||||
Convenience method that takes an URL to the index page and returns
|
||||
@ -1479,6 +1505,8 @@ class BasicNewsRecipe(Recipe):
|
||||
self.play_order_counter = 0
|
||||
self.play_order_map = {}
|
||||
|
||||
self.article_url_map = aumap = defaultdict(set)
|
||||
|
||||
def feed_index(num, parent):
|
||||
f = feeds[num]
|
||||
for j, a in enumerate(f):
|
||||
@ -1498,7 +1526,10 @@ class BasicNewsRecipe(Recipe):
|
||||
if po is None:
|
||||
self.play_order_counter += 1
|
||||
po = self.play_order_counter
|
||||
parent.add_item('%sindex.html'%adir, None,
|
||||
arelpath = '%sindex.html'%adir
|
||||
for curl in self.canonicalize_internal_url(a.orig_url, is_link=False):
|
||||
aumap[curl].add(arelpath)
|
||||
parent.add_item(arelpath, None,
|
||||
a.title if a.title else _('Untitled Article'),
|
||||
play_order=po, author=auth,
|
||||
description=desc, toc_thumbnail=tt)
|
||||
@ -1702,6 +1733,22 @@ class BasicNewsRecipe(Recipe):
|
||||
divtag.append(brtag)
|
||||
return soup
|
||||
|
||||
def internal_postprocess_book(self, oeb, opts, log):
    '''
    Rewrite links between downloaded articles so that they point at the
    downloaded copies instead of the original web URLs.

    Only active when ``resolve_internal_links`` is true and a non-empty
    ``article_url_map`` (canonical URL -> set of article paths) exists.
    Every ``<a href>`` in every spine item is canonicalized with
    :meth:`canonicalize_internal_url`; when a canonical form is found in
    the map, the link's ``href`` is replaced by the path of the matching
    article that sorts first under ``numeric_sort_key``, made relative to
    the current item.

    :param oeb: The book whose ``spine`` items are processed in place.
    :param opts: Conversion options (unused here).
    :param log: Logger; each distinct resolved URL is reported once via
                ``log.debug``.
    '''
    # The map is assigned while the article index is generated, so it may
    # be missing entirely if that step never ran for this recipe; treat a
    # missing map the same as an empty one instead of raising.
    url_map = getattr(self, 'article_url_map', None)
    if not (self.resolve_internal_links and url_map):
        return
    seen = set()  # original hrefs already reported in the debug log
    for item in oeb.spine:
        for a in item.data.xpath('//*[local-name()="a" and @href]'):
            # Links marked with this rel value must keep pointing at the
            # original web URL (the navbar "downloaded from" link uses it).
            if a.get('rel') == 'calibre-downloaded-from':
                continue
            url = a.get('href')
            for curl in self.canonicalize_internal_url(url):
                articles = url_map.get(curl)
                if articles:
                    # Several articles can share a canonical URL; pick the
                    # one whose path sorts first so the choice is stable.
                    arelpath = min(articles, key=numeric_sort_key)
                    a.set('href', item.relhref(arelpath))
                    if url not in seen:
                        log.debug('Resolved internal URL: %s -> %s' % (url, arelpath))
                        seen.add(url)
|
||||
|
||||
class CustomIndexRecipe(BasicNewsRecipe):
|
||||
|
||||
|
@ -204,7 +204,7 @@ class NavBarTemplate(Template):
|
||||
if not url.startswith('file://'):
|
||||
navbar.append(HR())
|
||||
text = 'This article was downloaded by '
|
||||
p = PT(text, STRONG(__appname__), A(url, href=url),
|
||||
p = PT(text, STRONG(__appname__), A(url, href=url, rel='calibre-downloaded-from'),
|
||||
style='text-align:left; max-width: 100%; overflow: hidden;')
|
||||
p[0].tail = ' from '
|
||||
navbar.append(p)
|
||||
@ -390,7 +390,7 @@ class TouchscreenNavBarTemplate(Template):
|
||||
if bottom and not url.startswith('file://'):
|
||||
navbar.append(HR())
|
||||
text = 'This article was downloaded by '
|
||||
p = PT(text, STRONG(__appname__), A(url, href=url),
|
||||
p = PT(text, STRONG(__appname__), A(url, href=url, rel='calibre-downloaded-from'),
|
||||
style='text-align:left; max-width: 100%; overflow: hidden;')
|
||||
p[0].tail = ' from '
|
||||
navbar.append(p)
|
||||
|
Loading…
x
Reference in New Issue
Block a user