diff --git a/src/calibre/web/feeds/news.py b/src/calibre/web/feeds/news.py
index 2377feb3a2..6fd1d40bbe 100644
--- a/src/calibre/web/feeds/news.py
+++ b/src/calibre/web/feeds/news.py
@@ -852,6 +852,11 @@ class BasicNewsRecipe(Recipe):
         every article URL. It should return the path to a file on the filesystem
         that contains the article HTML. That file is processed by the recursive
         HTML fetching engine, so it can contain links to pages/images on the web.
+        Alternately, you can return a dictionary of the form:
+        {'data': <HTML data>, 'url': <the resolved URL of the article>}. This avoids
+        needing to create temporary files. The `url` key in the dictionary is useful if
+        the effective URL of the article is different from the URL passed into this method,
+        for example, because of redirects. It can be omitted if the URL is unchanged.
 
         This method is typically useful for sites that try to make it difficult to
         access article content automatically.
@@ -1163,7 +1168,7 @@ class BasicNewsRecipe(Recipe):
         return templ.generate(f, feeds, self.description_limiter,
                               extra_css=css).render(doctype='xhtml')
 
-    def _fetch_article(self, url, dir_, f, a, num_of_feeds):
+    def _fetch_article(self, url, dir_, f, a, num_of_feeds, preloaded=None):
         br = self.browser
         if hasattr(self.get_browser, 'is_base_class_implementation'):
             # We are using the default get_browser, which means no need to
@@ -1180,6 +1185,8 @@ class BasicNewsRecipe(Recipe):
         fetcher.current_dir = dir_
         fetcher.show_progress = False
         fetcher.image_url_processor = self.image_url_processor
+        if preloaded is not None:
+            fetcher.preloaded_urls[url] = preloaded
         res, path, failures = fetcher.start_fetch(url), fetcher.downloaded_paths, fetcher.failed_links
         if not res or not os.path.exists(res):
             msg = _('Could not fetch article.') + ' '
@@ -1195,9 +1202,18 @@ class BasicNewsRecipe(Recipe):
         return self._fetch_article(url, dir, f, a, num_of_feeds)
 
     def fetch_obfuscated_article(self, url, dir, f, a, num_of_feeds):
-        path = os.path.abspath(self.get_obfuscated_article(url))
-        url = ('file:'+path) if iswindows else ('file://'+path)
-        return self._fetch_article(url, dir, f, a, num_of_feeds)
+        x = self.get_obfuscated_article(url)
+        if isinstance(x, dict):
+            data = x['data']
+            if isinstance(data, str):
+                data = data.encode(self.encoding or 'utf-8')
+            # The optional effective URL is a key of the returned dict, not of the payload
+            url = x.get('url', url)
+        else:
+            with open(x, 'rb') as of:
+                data = of.read()
+            os.remove(x)
+        return self._fetch_article(url, dir, f, a, num_of_feeds, preloaded=data)
 
     def fetch_embedded_article(self, article, dir, f, a, num_of_feeds):
         templ = templates.EmbeddedContent()
diff --git a/src/calibre/web/fetch/simple.py b/src/calibre/web/fetch/simple.py
index e325f0e90c..050e6dd36c 100644
--- a/src/calibre/web/fetch/simple.py
+++ b/src/calibre/web/fetch/simple.py
@@ -185,6 +185,7 @@ class RecursiveFetcher:
         self.show_progress = True
         self.failed_links = []
         self.job_info = job_info
+        self.preloaded_urls = {}
 
     def get_soup(self, src, url=None):
         nmassage = []
@@ -245,6 +246,11 @@ class RecursiveFetcher:
 
     def fetch_url(self, url):
         data = None
+        q = self.preloaded_urls.pop(url, None)
+        if q is not None:
+            ans = response(q)
+            ans.newurl = url
+            return ans
         self.log.debug('Fetching', url)
         st = time.monotonic()
 