Allow using get_obfuscated_article without using temp files

Kovid Goyal 2023-10-16 11:25:40 +05:30
parent 41e9ac0840
commit 347b911b8d
No known key found for this signature in database
GPG Key ID: 06BC317B515ACE7C
2 changed files with 25 additions and 4 deletions

View File

@@ -852,6 +852,11 @@ class BasicNewsRecipe(Recipe):
every article URL. It should return the path to a file on the filesystem
that contains the article HTML. That file is processed by the recursive
HTML fetching engine, so it can contain links to pages/images on the web.
Alternately, you can return a dictionary of the form:
{'data': <HTML data>, 'url': <the resolved URL of the article>}. This avoids
needing to create temporary files. The `url` key in the dictionary is useful if
the effective URL of the article is different from the URL passed into this method,
for example, because of redirects. It can be omitted if the URL is unchanged.
This method is typically useful for sites that try to make it difficult to
access article content automatically.
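
For illustration, a recipe could use the new dictionary return form roughly as below. This is a minimal sketch, not part of the commit: the recipe class name is hypothetical, and the direct use of self.browser with res.geturl() to pick up the post-redirect URL is an assumption about how a recipe author might obtain the data.

    from calibre.web.feeds.news import BasicNewsRecipe

    class MySiteRecipe(BasicNewsRecipe):  # hypothetical recipe, for illustration only
        articles_are_obfuscated = True

        def get_obfuscated_article(self, url):
            # Fetch the article HTML in memory and hand it back directly,
            # avoiding the temporary file entirely.
            res = self.browser.open(url)
            raw = res.read()
            # The 'url' key is optional: include it only when redirects changed
            # the effective article URL.
            return {'data': raw, 'url': res.geturl()}
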
@@ -1163,7 +1168,7 @@ class BasicNewsRecipe(Recipe):
        return templ.generate(f, feeds, self.description_limiter,
                              extra_css=css).render(doctype='xhtml')

    def _fetch_article(self, url, dir_, f, a, num_of_feeds):
    def _fetch_article(self, url, dir_, f, a, num_of_feeds, preloaded=None):
        br = self.browser
        if hasattr(self.get_browser, 'is_base_class_implementation'):
            # We are using the default get_browser, which means no need to
@@ -1180,6 +1185,8 @@ class BasicNewsRecipe(Recipe):
        fetcher.current_dir = dir_
        fetcher.show_progress = False
        fetcher.image_url_processor = self.image_url_processor
        if preloaded is not None:
            fetcher.preloaded_urls[url] = preloaded
        res, path, failures = fetcher.start_fetch(url), fetcher.downloaded_paths, fetcher.failed_links
        if not res or not os.path.exists(res):
            msg = _('Could not fetch article.') + ' '
@@ -1195,9 +1202,17 @@ class BasicNewsRecipe(Recipe):
        return self._fetch_article(url, dir, f, a, num_of_feeds)

    def fetch_obfuscated_article(self, url, dir, f, a, num_of_feeds):
        path = os.path.abspath(self.get_obfuscated_article(url))
        url = ('file:'+path) if iswindows else ('file://'+path)
        return self._fetch_article(url, dir, f, a, num_of_feeds)
        x = self.get_obfuscated_article(url)
        if isinstance(x, dict):
            data = x['data']
            if isinstance(data, str):
                data = data.encode(self.encoding or 'utf-8')
            url = x.get('url', url)
        else:
            with open(x, 'rb') as of:
                data = of.read()
            os.remove(x)
        return self._fetch_article(url, dir, f, a, num_of_feeds, preloaded=data)
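
The file-based contract remains supported: a recipe can still return a path, which is read and deleted above before the bytes are handed to _fetch_article(). For contrast, a rough sketch of that older style follows; the recipe class name is hypothetical and the PersistentTemporaryFile pattern is shown as a typical way recipes have produced such a file, not as code from this commit.

    from calibre.ptempfile import PersistentTemporaryFile
    from calibre.web.feeds.news import BasicNewsRecipe

    class MyOldStyleRecipe(BasicNewsRecipe):  # hypothetical recipe, for illustration only
        articles_are_obfuscated = True

        def get_obfuscated_article(self, url):
            # Older style: write the article HTML to a temporary file and return
            # its path; fetch_obfuscated_article() reads the file and removes it.
            raw = self.browser.open(url).read()
            pt = PersistentTemporaryFile('.html')
            pt.write(raw)
            pt.close()
            return pt.name
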

    def fetch_embedded_article(self, article, dir, f, a, num_of_feeds):
        templ = templates.EmbeddedContent()

View File

@@ -185,6 +185,7 @@ class RecursiveFetcher:
        self.show_progress = True
        self.failed_links = []
        self.job_info = job_info
        self.preloaded_urls = {}

    def get_soup(self, src, url=None):
        nmassage = []
@@ -245,6 +246,11 @@
    def fetch_url(self, url):
        data = None
        q = self.preloaded_urls.pop(url, None)
        if q is not None:
            ans = response(q)
            ans.newurl = url
            return ans
        self.log.debug('Fetching', url)
        st = time.monotonic()
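
The response() wrapper used above is not defined in this diff; conceptually it is a small bytes container that can also carry the resolved URL, so preloaded content flows through the rest of the fetcher like a normal download. A minimal sketch of that idea, as an assumption about its shape rather than the actual definition:

    class response(bytes):
        # Assumed shape: a bytes subclass with a 'newurl' attribute, so callers
        # can treat preloaded data like a fetched response that knows its final URL.
        def __new__(cls, *args):
            obj = super().__new__(cls, *args)
            obj.newurl = None
            return obj
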