Update The Guardian

This commit is contained in:
Kovid Goyal 2020-12-02 07:24:15 +05:30
parent e38ae0e58e
commit 84ed6ac3af
No known key found for this signature in database
GPG Key ID: 06BC317B515ACE7C

View File

@ -43,9 +43,6 @@ class Guardian(BasicNewsRecipe):
timefmt = ' [%a, %d %b %Y]'
keep_only_tags = [
dict(attrs={'class': lambda x: x and 'content__main-column' in x.split()}),
]
remove_tags = [
    dict(attrs={'class': lambda x: x and '--twitter' in x}),
    dict(attrs={'class': lambda x: x and 'submeta' in x.split()}),
@ -71,12 +68,22 @@ class Guardian(BasicNewsRecipe):
br = BasicNewsRecipe.get_browser(self, *a, **kw)
return br
def preprocess_raw_html(self, raw, url):
    # Run the fetched page through html5lib so that sloppy real-world
    # markup is normalised into a well-formed lxml document before any
    # further processing, then serialise it back to a unicode string.
    import html5lib
    from lxml import html
    tree = html5lib.parse(raw, treebuilder='lxml', namespaceHTMLElements=False)
    return html.tostring(tree, encoding='unicode')
def preprocess_html(self, soup): def preprocess_html(self, soup):
# with open('/t/raw.html', 'w') as f:
# f.write(str(soup))
old_body = soup.find('body')
if old_body is not None:
main_column = soup.find(**classes('content__main-column'))
if main_column is None:
for section in soup.findAll('section'):
if section.find('h1') is not None:
main_column = section
break
if main_column is not None:
body = soup.new_tag('body')
body.append(main_column)
old_body.replaceWith(body)
for img in soup.findAll('img', srcset=True):
    img['src'] = img['srcset'].partition(' ')[0]
    img['srcset'] = ''
@ -100,6 +107,9 @@ class Guardian(BasicNewsRecipe):
return feeds return feeds
def parse_index(self):
# return [('All articles', [
# {'title': 'XXXXX', 'url': 'https://www.theguardian.com/politics/2020/dec/01/uk-likely-to-axe-finance-bill-clauses-if-brexit-trade-deal-made'},
# ])]
feeds = self.parse_section(self.base_url)
feeds += self.parse_section(
    'https://www.theguardian.com/uk/sport', 'Sport - ')