diff --git a/src/calibre/ebooks/conversion/plugins/recipe_input.py b/src/calibre/ebooks/conversion/plugins/recipe_input.py
index feeafd6c28..27f4286cc2 100644
--- a/src/calibre/ebooks/conversion/plugins/recipe_input.py
+++ b/src/calibre/ebooks/conversion/plugins/recipe_input.py
@@ -130,6 +130,7 @@ class RecipeInput(InputFormatPlugin):
 
     def postprocess_book(self, oeb, opts, log):
         if self.recipe_object is not None:
+            self.recipe_object.internal_postprocess_book(oeb, opts, log)
             self.recipe_object.postprocess_book(oeb, opts, log)
 
     def specialize(self, oeb, opts, log, output_fmt):
diff --git a/src/calibre/web/feeds/news.py b/src/calibre/web/feeds/news.py
index 1b5aaeb19c..97229c7561 100644
--- a/src/calibre/web/feeds/news.py
+++ b/src/calibre/web/feeds/news.py
@@ -26,6 +26,7 @@ from calibre.web.fetch.simple import option_parser as web2disk_option_parser, Re
 from calibre.utils.threadpool import WorkRequest, ThreadPool, NoResultsPending
 from calibre.ptempfile import PersistentTemporaryFile
 from calibre.utils.date import now as nowf
+from calibre.utils.icu import numeric_sort_key
 from calibre.utils.magick.draw import save_cover_data_to, add_borders_to_image
 from calibre.utils.localization import canonicalize_lang
 from calibre.utils.logging import ThreadSafeWrapper
@@ -382,7 +383,13 @@ class BasicNewsRecipe(Recipe):
     #: assigned (default None).
     scale_news_images = None
 
-    # See the built-in profiles for examples of these settings.
+    #: If set to True then links in downloaded articles that point to other downloaded articles are
+    #: changed to point to the downloaded copy of the article rather than its original web URL. If you
+    #: set this to True, you might also need to implement :meth:`canonicalize_internal_url` to work
+    #: with the URL scheme of your particular website.
+    resolve_internal_links = False
+
+    # See the built-in recipes for examples of these settings.
 
     def short_title(self):
         return self.title
@@ -645,6 +652,25 @@ class BasicNewsRecipe(Recipe):
         '''
         pass
 
+    def canonicalize_internal_url(self, url, is_link=True):
+        '''
+        Return a set of canonical representations of ``url``.  The default
+        implementation uses just the server hostname and path of the URL,
+        ignoring any query parameters, fragments, etc. The canonical
+        representations must be unique across all URLs for this news source. If
+        they are not, then internal links may be resolved incorrectly.
+
+        :param is_link: Is True if the URL is coming from an internal link in
+                        an HTML file. False if the URL is the URL used to
+                        download an article.
+        '''
+        try:
+            parts = urlparse.urlparse(url)
+        except Exception:
+            self.log.error('Failed to parse url: %r, ignoring' % url)
+            return frozenset()
+        return frozenset([(parts.netloc, parts.path)])
+
     def index_to_soup(self, url_or_raw, raw=False, as_tree=False):
         '''
         Convenience method that takes an URL to the index page and returns
@@ -1479,6 +1505,8 @@ class BasicNewsRecipe(Recipe):
         self.play_order_counter = 0
         self.play_order_map = {}
 
+        self.article_url_map = aumap = defaultdict(set)
+
         def feed_index(num, parent):
             f = feeds[num]
             for j, a in enumerate(f):
@@ -1498,7 +1526,10 @@ class BasicNewsRecipe(Recipe):
                     if po is None:
                         self.play_order_counter += 1
                         po = self.play_order_counter
-                    parent.add_item('%sindex.html'%adir, None,
+                    arelpath = '%sindex.html'%adir
+                    for curl in self.canonicalize_internal_url(a.orig_url, is_link=False):
+                        aumap[curl].add(arelpath)
+                    parent.add_item(arelpath, None,
                             a.title if a.title else _('Untitled Article'),
                             play_order=po, author=auth,
                             description=desc, toc_thumbnail=tt)
@@ -1702,6 +1733,36 @@ class BasicNewsRecipe(Recipe):
             divtag.append(brtag)
         return soup
 
+    def internal_postprocess_book(self, oeb, opts, log):
+        '''
+        Rewrite links between downloaded articles so that they point to the
+        downloaded copy instead of the original web URL.
+
+        Does nothing unless :attr:`resolve_internal_links` is True and at
+        least one article URL has been recorded in ``article_url_map``.
+        '''
+        # article_url_map is assigned inside another method rather than on
+        # the class, so do not assume the attribute exists.
+        url_map = getattr(self, 'article_url_map', None)
+        if self.resolve_internal_links and url_map:
+            seen = set()
+            for item in oeb.spine:
+                for a in item.data.xpath('//*[local-name()="a" and @href]'):
+                    # Leave the navbar's "downloaded from" link untouched
+                    if a.get('rel') == 'calibre-downloaded-from':
+                        continue
+                    url = a.get('href')
+                    for curl in self.canonicalize_internal_url(url):
+                        articles = url_map.get(curl)
+                        if articles:
+                            arelpath = min(articles, key=numeric_sort_key)
+                            a.set('href', item.relhref(arelpath))
+                            if url not in seen:
+                                log.debug('Resolved internal URL: %s -> %s' % (url, arelpath))
+                                seen.add(url)
+                            # First match wins: a later canonical form must
+                            # not overwrite the href that was just set.
+                            break
 
 class CustomIndexRecipe(BasicNewsRecipe):
 
diff --git a/src/calibre/web/feeds/templates.py b/src/calibre/web/feeds/templates.py
index 840baa1d4e..ebe9ead97a 100644
--- a/src/calibre/web/feeds/templates.py
+++ b/src/calibre/web/feeds/templates.py
@@ -204,7 +204,7 @@ class NavBarTemplate(Template):
         if not url.startswith('file://'):
             navbar.append(HR())
             text = 'This article was downloaded by '
-            p = PT(text, STRONG(__appname__), A(url, href=url),
+            p = PT(text, STRONG(__appname__), A(url, href=url, rel='calibre-downloaded-from'),
                     style='text-align:left; max-width: 100%; overflow: hidden;')
             p[0].tail = ' from '
             navbar.append(p)
@@ -390,7 +390,7 @@ class TouchscreenNavBarTemplate(Template):
         if bottom and not url.startswith('file://'):
             navbar.append(HR())
             text = 'This article was downloaded by '
-            p = PT(text, STRONG(__appname__), A(url, href=url),
+            p = PT(text, STRONG(__appname__), A(url, href=url, rel='calibre-downloaded-from'),
                     style='text-align:left; max-width: 100%; overflow: hidden;')
             p[0].tail = ' from '
             navbar.append(p)