Mirror of https://github.com/kovidgoyal/calibre.git, synced 2025-07-09 03:04:10 -04:00
Allow using get_obfuscated_article() without using temp files
parent 41e9ac0840
commit 347b911b8d
@@ -852,6 +852,11 @@ class BasicNewsRecipe(Recipe):
         every article URL. It should return the path to a file on the filesystem
         that contains the article HTML. That file is processed by the recursive
         HTML fetching engine, so it can contain links to pages/images on the web.
+        Alternately, you can return a dictionary of the form:
+        {'data': <HTML data>, 'url': <the resolved URL of the article>}. This avoids
+        needing to create temporary files. The `url` key in the dictionary is useful if
+        the effective URL of the article is different from the URL passed into this method,
+        for example, because of redirects. It can be omitted if the URL is unchanged.

         This method is typically useful for sites that try to make it difficult to
         access article content automatically.
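For illustration, here is a minimal recipe sketch (not part of this commit) that uses the new dictionary return form documented above. It assumes the article HTML can be downloaded with the recipe's own browser object (self.browser, br.open(), br.geturl()); the class name and title are placeholders.

# Hypothetical recipe using the dictionary form of get_obfuscated_article().
from calibre.web.feeds.news import BasicNewsRecipe

class ObfuscatedExample(BasicNewsRecipe):
    title = 'Obfuscated example'

    def get_obfuscated_article(self, url):
        br = self.browser
        raw = br.open(url).read()  # download the article HTML directly
        # 'url' records the post-redirect address; it may be omitted if the
        # URL did not change.
        return {'data': raw, 'url': br.geturl()}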
@@ -1163,7 +1168,7 @@ class BasicNewsRecipe(Recipe):
         return templ.generate(f, feeds, self.description_limiter,
                               extra_css=css).render(doctype='xhtml')

-    def _fetch_article(self, url, dir_, f, a, num_of_feeds):
+    def _fetch_article(self, url, dir_, f, a, num_of_feeds, preloaded=None):
         br = self.browser
         if hasattr(self.get_browser, 'is_base_class_implementation'):
             # We are using the default get_browser, which means no need to
@@ -1180,6 +1185,8 @@ class BasicNewsRecipe(Recipe):
         fetcher.current_dir = dir_
         fetcher.show_progress = False
         fetcher.image_url_processor = self.image_url_processor
+        if preloaded is not None:
+            fetcher.preloaded_urls[url] = preloaded
         res, path, failures = fetcher.start_fetch(url), fetcher.downloaded_paths, fetcher.failed_links
         if not res or not os.path.exists(res):
             msg = _('Could not fetch article.') + ' '
@@ -1195,9 +1202,17 @@ class BasicNewsRecipe(Recipe):
             return self._fetch_article(url, dir, f, a, num_of_feeds)

     def fetch_obfuscated_article(self, url, dir, f, a, num_of_feeds):
-        path = os.path.abspath(self.get_obfuscated_article(url))
-        url = ('file:'+path) if iswindows else ('file://'+path)
-        return self._fetch_article(url, dir, f, a, num_of_feeds)
+        x = self.get_obfuscated_article(url)
+        if isinstance(x, dict):
+            data = x['data']
+            if isinstance(data, str):
+                data = data.encode(self.encoding or 'utf-8')
+            url = x.get('url', url)
+        else:
+            with open(x, 'rb') as of:
+                data = of.read()
+            os.remove(x)
+        return self._fetch_article(url, dir, f, a, num_of_feeds, preloaded=data)

     def fetch_embedded_article(self, article, dir, f, a, num_of_feeds):
         templ = templates.EmbeddedContent()
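For comparison, the pre-existing file-based return form, which the else branch above continues to support, looks roughly like this in a recipe. The sketch is illustrative, not part of the commit; it assumes PersistentTemporaryFile from calibre.ptempfile, the helper recipes typically use for this.

# Sketch of the older temp-file form that the dictionary form can replace.
from calibre.ptempfile import PersistentTemporaryFile
from calibre.web.feeds.news import BasicNewsRecipe

class TempFileExample(BasicNewsRecipe):
    title = 'Temp file example'

    def get_obfuscated_article(self, url):
        raw = self.browser.open(url).read()
        pt = PersistentTemporaryFile('.html')  # persists until removed by the caller
        pt.write(raw)
        pt.close()
        return pt.name  # fetch_obfuscated_article() reads and then deletes this file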
@@ -185,6 +185,7 @@ class RecursiveFetcher:
         self.show_progress = True
         self.failed_links = []
         self.job_info = job_info
+        self.preloaded_urls = {}

     def get_soup(self, src, url=None):
         nmassage = []
@@ -245,6 +246,11 @@ class RecursiveFetcher:

     def fetch_url(self, url):
         data = None
+        q = self.preloaded_urls.pop(url, None)
+        if q is not None:
+            ans = response(q)
+            ans.newurl = url
+            return ans
         self.log.debug('Fetching', url)
         st = time.monotonic()
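Taken together, the flow is: fetch_obfuscated_article() normalises whatever the recipe returned into bytes and passes it as preloaded to _fetch_article(), which stores it in fetcher.preloaded_urls[url]; fetch_url() then serves that entry instead of hitting the network. A minimal standalone sketch of the same lookup pattern (names here are illustrative, not calibre API):

# Illustrative miniature of the preloaded-URL short circuit added above.
class MiniFetcher:
    def __init__(self):
        self.preloaded_urls = {}  # url -> bytes supplied by the caller

    def fetch_url(self, url):
        data = self.preloaded_urls.pop(url, None)
        if data is not None:
            return data            # serve the preloaded body, skip the network
        raise NotImplementedError  # a real fetcher would download url here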