mirror of
https://github.com/kovidgoyal/calibre.git
synced 2025-07-09 03:04:10 -04:00
Update The Guardian
This commit is contained in:
parent
e38ae0e58e
commit
84ed6ac3af
@ -43,9 +43,6 @@ class Guardian(BasicNewsRecipe):
|
|||||||
|
|
||||||
timefmt = ' [%a, %d %b %Y]'
|
timefmt = ' [%a, %d %b %Y]'
|
||||||
|
|
||||||
keep_only_tags = [
|
|
||||||
dict(attrs={'class': lambda x: x and 'content__main-column' in x.split()}),
|
|
||||||
]
|
|
||||||
remove_tags = [
|
remove_tags = [
|
||||||
dict(attrs={'class': lambda x: x and '--twitter' in x}),
|
dict(attrs={'class': lambda x: x and '--twitter' in x}),
|
||||||
dict(attrs={'class': lambda x: x and 'submeta' in x.split()}),
|
dict(attrs={'class': lambda x: x and 'submeta' in x.split()}),
|
||||||
@ -71,12 +68,22 @@ class Guardian(BasicNewsRecipe):
|
|||||||
br = BasicNewsRecipe.get_browser(self, *a, **kw)
|
br = BasicNewsRecipe.get_browser(self, *a, **kw)
|
||||||
return br
|
return br
|
||||||
|
|
||||||
def preprocess_raw_html(self, raw, url):
|
|
||||||
import html5lib
|
|
||||||
from lxml import html
|
|
||||||
return html.tostring(html5lib.parse(raw, namespaceHTMLElements=False, treebuilder='lxml'), encoding='unicode')
|
|
||||||
|
|
||||||
def preprocess_html(self, soup):
|
def preprocess_html(self, soup):
|
||||||
|
# with open('/t/raw.html', 'w') as f:
|
||||||
|
# f.write(str(soup))
|
||||||
|
old_body = soup.find('body')
|
||||||
|
if old_body is not None:
|
||||||
|
main_column = soup.find(**classes('content__main-column'))
|
||||||
|
if main_column is None:
|
||||||
|
for section in soup.findAll('section'):
|
||||||
|
if section.find('h1') is not None:
|
||||||
|
main_column = section
|
||||||
|
break
|
||||||
|
if main_column is not None:
|
||||||
|
body = soup.new_tag('body')
|
||||||
|
body.append(main_column)
|
||||||
|
old_body.replaceWith(body)
|
||||||
|
|
||||||
for img in soup.findAll('img', srcset=True):
|
for img in soup.findAll('img', srcset=True):
|
||||||
img['src'] = img['srcset'].partition(' ')[0]
|
img['src'] = img['srcset'].partition(' ')[0]
|
||||||
img['srcset'] = ''
|
img['srcset'] = ''
|
||||||
@ -100,6 +107,9 @@ class Guardian(BasicNewsRecipe):
|
|||||||
return feeds
|
return feeds
|
||||||
|
|
||||||
def parse_index(self):
|
def parse_index(self):
|
||||||
|
# return [('All articles', [
|
||||||
|
# {'title': 'XXXXX', 'url': 'https://www.theguardian.com/politics/2020/dec/01/uk-likely-to-axe-finance-bill-clauses-if-brexit-trade-deal-made'},
|
||||||
|
# ])]
|
||||||
feeds = self.parse_section(self.base_url)
|
feeds = self.parse_section(self.base_url)
|
||||||
feeds += self.parse_section(
|
feeds += self.parse_section(
|
||||||
'https://www.theguardian.com/uk/sport', 'Sport - ')
|
'https://www.theguardian.com/uk/sport', 'Sport - ')
|
||||||
|
Loading…
x
Reference in New Issue
Block a user