Allow using get_obfuscated_article without using temp files

This commit is contained in:
Kovid Goyal 2023-10-16 11:25:40 +05:30
parent 41e9ac0840
commit 347b911b8d
No known key found for this signature in database
GPG Key ID: 06BC317B515ACE7C
2 changed files with 25 additions and 4 deletions

View File

@ -852,6 +852,11 @@ class BasicNewsRecipe(Recipe):
every article URL. It should return the path to a file on the filesystem every article URL. It should return the path to a file on the filesystem
that contains the article HTML. That file is processed by the recursive that contains the article HTML. That file is processed by the recursive
HTML fetching engine, so it can contain links to pages/images on the web. HTML fetching engine, so it can contain links to pages/images on the web.
Alternatively, you can return a dictionary of the form:
{'data': <HTML data>, 'url': <the resolved URL of the article>}. This avoids
needing to create temporary files. The `url` key in the dictionary is useful if
the effective URL of the article is different from the URL passed into this method,
for example, because of redirects. It can be omitted if the URL is unchanged.
This method is typically useful for sites that try to make it difficult to This method is typically useful for sites that try to make it difficult to
access article content automatically. access article content automatically.
@ -1163,7 +1168,7 @@ class BasicNewsRecipe(Recipe):
return templ.generate(f, feeds, self.description_limiter, return templ.generate(f, feeds, self.description_limiter,
extra_css=css).render(doctype='xhtml') extra_css=css).render(doctype='xhtml')
def _fetch_article(self, url, dir_, f, a, num_of_feeds): def _fetch_article(self, url, dir_, f, a, num_of_feeds, preloaded=None):
br = self.browser br = self.browser
if hasattr(self.get_browser, 'is_base_class_implementation'): if hasattr(self.get_browser, 'is_base_class_implementation'):
# We are using the default get_browser, which means no need to # We are using the default get_browser, which means no need to
@ -1180,6 +1185,8 @@ class BasicNewsRecipe(Recipe):
fetcher.current_dir = dir_ fetcher.current_dir = dir_
fetcher.show_progress = False fetcher.show_progress = False
fetcher.image_url_processor = self.image_url_processor fetcher.image_url_processor = self.image_url_processor
if preloaded is not None:
fetcher.preloaded_urls[url] = preloaded
res, path, failures = fetcher.start_fetch(url), fetcher.downloaded_paths, fetcher.failed_links res, path, failures = fetcher.start_fetch(url), fetcher.downloaded_paths, fetcher.failed_links
if not res or not os.path.exists(res): if not res or not os.path.exists(res):
msg = _('Could not fetch article.') + ' ' msg = _('Could not fetch article.') + ' '
@ -1195,9 +1202,17 @@ class BasicNewsRecipe(Recipe):
return self._fetch_article(url, dir, f, a, num_of_feeds) return self._fetch_article(url, dir, f, a, num_of_feeds)
def fetch_obfuscated_article(self, url, dir, f, a, num_of_feeds): def fetch_obfuscated_article(self, url, dir, f, a, num_of_feeds):
path = os.path.abspath(self.get_obfuscated_article(url)) x = self.get_obfuscated_article(url)
url = ('file:'+path) if iswindows else ('file://'+path) if isinstance(x, dict):
return self._fetch_article(url, dir, f, a, num_of_feeds) data = x['data']
if isinstance(data, str):
data = data.encode(self.encoding or 'utf-8')
url = x.get('url', url)
else:
with open(x, 'rb') as of:
data = of.read()
os.remove(x)
return self._fetch_article(url, dir, f, a, num_of_feeds, preloaded=data)
def fetch_embedded_article(self, article, dir, f, a, num_of_feeds): def fetch_embedded_article(self, article, dir, f, a, num_of_feeds):
templ = templates.EmbeddedContent() templ = templates.EmbeddedContent()

View File

@ -185,6 +185,7 @@ class RecursiveFetcher:
self.show_progress = True self.show_progress = True
self.failed_links = [] self.failed_links = []
self.job_info = job_info self.job_info = job_info
self.preloaded_urls = {}
def get_soup(self, src, url=None): def get_soup(self, src, url=None):
nmassage = [] nmassage = []
@ -245,6 +246,11 @@ class RecursiveFetcher:
def fetch_url(self, url): def fetch_url(self, url):
data = None data = None
q = self.preloaded_urls.pop(url, None)
if q is not None:
ans = response(q)
ans.newurl = url
return ans
self.log.debug('Fetching', url) self.log.debug('Fetching', url)
st = time.monotonic() st = time.monotonic()