mirror of
https://github.com/kovidgoyal/calibre.git
synced 2025-07-09 03:04:10 -04:00
Fetch News: Add API to allow recipe writers to easily resolve internal links to point to the downloaded versions of articles
This commit is contained in:
parent
6d58813c65
commit
16efc963e3
@ -130,6 +130,7 @@ class RecipeInput(InputFormatPlugin):
|
|||||||
|
|
||||||
def postprocess_book(self, oeb, opts, log):
|
def postprocess_book(self, oeb, opts, log):
|
||||||
if self.recipe_object is not None:
|
if self.recipe_object is not None:
|
||||||
|
self.recipe_object.internal_postprocess_book(oeb, opts, log)
|
||||||
self.recipe_object.postprocess_book(oeb, opts, log)
|
self.recipe_object.postprocess_book(oeb, opts, log)
|
||||||
|
|
||||||
def specialize(self, oeb, opts, log, output_fmt):
|
def specialize(self, oeb, opts, log, output_fmt):
|
||||||
|
@ -26,6 +26,7 @@ from calibre.web.fetch.simple import option_parser as web2disk_option_parser, Re
|
|||||||
from calibre.utils.threadpool import WorkRequest, ThreadPool, NoResultsPending
|
from calibre.utils.threadpool import WorkRequest, ThreadPool, NoResultsPending
|
||||||
from calibre.ptempfile import PersistentTemporaryFile
|
from calibre.ptempfile import PersistentTemporaryFile
|
||||||
from calibre.utils.date import now as nowf
|
from calibre.utils.date import now as nowf
|
||||||
|
from calibre.utils.icu import numeric_sort_key
|
||||||
from calibre.utils.magick.draw import save_cover_data_to, add_borders_to_image
|
from calibre.utils.magick.draw import save_cover_data_to, add_borders_to_image
|
||||||
from calibre.utils.localization import canonicalize_lang
|
from calibre.utils.localization import canonicalize_lang
|
||||||
from calibre.utils.logging import ThreadSafeWrapper
|
from calibre.utils.logging import ThreadSafeWrapper
|
||||||
@ -382,7 +383,13 @@ class BasicNewsRecipe(Recipe):
|
|||||||
#: assigned (default None).
|
#: assigned (default None).
|
||||||
scale_news_images = None
|
scale_news_images = None
|
||||||
|
|
||||||
# See the built-in profiles for examples of these settings.
|
#: If set to True then links in downloaded articles that point to other downloaded articles are
|
||||||
|
#: changed to point to the downloaded copy of the article rather than its original web URL. If you
|
||||||
|
#: set this to True, you might also need to implement :meth:`canonicalize_internal_url` to work
|
||||||
|
#: with the URL scheme of your particular website.
|
||||||
|
resolve_internal_links = False
|
||||||
|
|
||||||
|
# See the built-in recipes for examples of these settings.
|
||||||
|
|
||||||
def short_title(self):
|
def short_title(self):
|
||||||
return self.title
|
return self.title
|
||||||
@ -645,6 +652,25 @@ class BasicNewsRecipe(Recipe):
|
|||||||
'''
|
'''
|
||||||
pass
|
pass
|
||||||
|
|
||||||
|
def canonicalize_internal_url(self, url, is_link=True):
    '''
    Return a set of canonical representations of ``url``. The default
    implementation uses just the server hostname and path of the URL,
    ignoring any query parameters, fragments, etc. The canonical
    representations must be unique across all URLs for this news source. If
    they are not, then internal links may be resolved incorrectly.

    :param url: The URL to canonicalize.
    :param is_link: Is True if the URL is coming from an internal link in
                    an HTML file. False if the URL is the URL used to
                    download an article. The default implementation
                    ignores this flag.
    :return: A frozenset containing a single ``(netloc, path)`` tuple, or
             an empty frozenset if ``url`` could not be parsed (the
             failure is logged via ``self.log.error``).
    '''
    try:
        parts = urlparse.urlparse(url)
    except Exception:
        # Wrap url in a 1-tuple: if url is itself a tuple, a bare
        # "'%r' % url" would raise TypeError from inside this error handler
        # instead of reporting the bad URL.
        self.log.error('Failed to parse url: %r, ignoring' % (url,))
        return frozenset()
    return frozenset([(parts.netloc, parts.path)])
|
||||||
|
|
||||||
def index_to_soup(self, url_or_raw, raw=False, as_tree=False):
|
def index_to_soup(self, url_or_raw, raw=False, as_tree=False):
|
||||||
'''
|
'''
|
||||||
Convenience method that takes an URL to the index page and returns
|
Convenience method that takes an URL to the index page and returns
|
||||||
@ -1479,6 +1505,8 @@ class BasicNewsRecipe(Recipe):
|
|||||||
self.play_order_counter = 0
|
self.play_order_counter = 0
|
||||||
self.play_order_map = {}
|
self.play_order_map = {}
|
||||||
|
|
||||||
|
self.article_url_map = aumap = defaultdict(set)
|
||||||
|
|
||||||
def feed_index(num, parent):
|
def feed_index(num, parent):
|
||||||
f = feeds[num]
|
f = feeds[num]
|
||||||
for j, a in enumerate(f):
|
for j, a in enumerate(f):
|
||||||
@ -1498,7 +1526,10 @@ class BasicNewsRecipe(Recipe):
|
|||||||
if po is None:
|
if po is None:
|
||||||
self.play_order_counter += 1
|
self.play_order_counter += 1
|
||||||
po = self.play_order_counter
|
po = self.play_order_counter
|
||||||
parent.add_item('%sindex.html'%adir, None,
|
arelpath = '%sindex.html'%adir
|
||||||
|
for curl in self.canonicalize_internal_url(a.orig_url, is_link=False):
|
||||||
|
aumap[curl].add(arelpath)
|
||||||
|
parent.add_item(arelpath, None,
|
||||||
a.title if a.title else _('Untitled Article'),
|
a.title if a.title else _('Untitled Article'),
|
||||||
play_order=po, author=auth,
|
play_order=po, author=auth,
|
||||||
description=desc, toc_thumbnail=tt)
|
description=desc, toc_thumbnail=tt)
|
||||||
@ -1702,6 +1733,22 @@ class BasicNewsRecipe(Recipe):
|
|||||||
divtag.append(brtag)
|
divtag.append(brtag)
|
||||||
return soup
|
return soup
|
||||||
|
|
||||||
|
def internal_postprocess_book(self, oeb, opts, log):
    '''
    Rewrite links between downloaded articles so that they point at the
    local copies instead of the original web URLs.

    Does nothing unless ``resolve_internal_links`` is set and at least one
    article URL has been recorded in ``article_url_map``.

    :param oeb: The parsed book; every item in ``oeb.spine`` is scanned for
                ``<a href>`` elements.
    :param opts: Conversion options (unused here).
    :param log: Logger; each distinct resolved URL is reported once via
                ``log.debug``.
    '''
    # article_url_map is only assigned while the index is being built, so a
    # recipe that builds its index some other way may not have it at all.
    url_map = getattr(self, 'article_url_map', None)
    if not (self.resolve_internal_links and url_map):
        return

    seen = set()
    for item in oeb.spine:
        for a in item.data.xpath('//*[local-name()="a" and @href]'):
            # The navbar's "downloaded from" link must keep pointing at the
            # original web page.
            if a.get('rel') == 'calibre-downloaded-from':
                continue
            url = a.get('href')
            # Pool the articles matched by every canonical form of the URL
            # so the chosen target does not depend on set iteration order.
            candidates = set()
            for curl in self.canonicalize_internal_url(url):
                candidates.update(url_map.get(curl) or ())
            if not candidates:
                continue
            arelpath = min(candidates, key=numeric_sort_key)
            a.set('href', item.relhref(arelpath))
            if url not in seen:
                log.debug('Resolved internal URL: %s -> %s' % (url, arelpath))
                seen.add(url)
|
||||||
|
|
||||||
class CustomIndexRecipe(BasicNewsRecipe):
|
class CustomIndexRecipe(BasicNewsRecipe):
|
||||||
|
|
||||||
|
@ -204,7 +204,7 @@ class NavBarTemplate(Template):
|
|||||||
if not url.startswith('file://'):
|
if not url.startswith('file://'):
|
||||||
navbar.append(HR())
|
navbar.append(HR())
|
||||||
text = 'This article was downloaded by '
|
text = 'This article was downloaded by '
|
||||||
p = PT(text, STRONG(__appname__), A(url, href=url),
|
p = PT(text, STRONG(__appname__), A(url, href=url, rel='calibre-downloaded-from'),
|
||||||
style='text-align:left; max-width: 100%; overflow: hidden;')
|
style='text-align:left; max-width: 100%; overflow: hidden;')
|
||||||
p[0].tail = ' from '
|
p[0].tail = ' from '
|
||||||
navbar.append(p)
|
navbar.append(p)
|
||||||
@ -390,7 +390,7 @@ class TouchscreenNavBarTemplate(Template):
|
|||||||
if bottom and not url.startswith('file://'):
|
if bottom and not url.startswith('file://'):
|
||||||
navbar.append(HR())
|
navbar.append(HR())
|
||||||
text = 'This article was downloaded by '
|
text = 'This article was downloaded by '
|
||||||
p = PT(text, STRONG(__appname__), A(url, href=url),
|
p = PT(text, STRONG(__appname__), A(url, href=url, rel='calibre-downloaded-from'),
|
||||||
style='text-align:left; max-width: 100%; overflow: hidden;')
|
style='text-align:left; max-width: 100%; overflow: hidden;')
|
||||||
p[0].tail = ' from '
|
p[0].tail = ' from '
|
||||||
navbar.append(p)
|
navbar.append(p)
|
||||||
|
Loading…
x
Reference in New Issue
Block a user