Allow saving the index HTML easily

This commit is contained in:
Kovid Goyal 2018-11-05 12:43:20 +05:30
parent 661c47501a
commit 6ce808c499
No known key found for this signature in database
GPG Key ID: 06BC317B515ACE7C

View File

@ -657,7 +657,7 @@ class BasicNewsRecipe(Recipe):
return frozenset() return frozenset()
return frozenset([(parts.netloc, (parts.path or '').rstrip('/'))]) return frozenset([(parts.netloc, (parts.path or '').rstrip('/'))])
def index_to_soup(self, url_or_raw, raw=False, as_tree=False): def index_to_soup(self, url_or_raw, raw=False, as_tree=False, save_raw=None):
''' '''
Convenience method that takes an URL to the index page and returns Convenience method that takes an URL to the index page and returns
a `BeautifulSoup <https://www.crummy.com/software/BeautifulSoup/bs3/documentation.html>`_ a `BeautifulSoup <https://www.crummy.com/software/BeautifulSoup/bs3/documentation.html>`_
@ -692,6 +692,9 @@ class BasicNewsRecipe(Recipe):
else: else:
_raw = xml_to_unicode(_raw, strip_encoding_pats=True, resolve_entities=True)[0] _raw = xml_to_unicode(_raw, strip_encoding_pats=True, resolve_entities=True)[0]
_raw = clean_xml_chars(_raw) _raw = clean_xml_chars(_raw)
if save_raw:
with lopen(save_raw, 'wb') as f:
f.write(_raw.encode('utf-8'))
if as_tree: if as_tree:
from html5_parser import parse from html5_parser import parse
return parse(_raw) return parse(_raw)