mirror of
https://github.com/kovidgoyal/calibre.git
synced 2025-07-07 10:14:46 -04:00
Update The Guardian
This commit is contained in:
parent
2db8cac86a
commit
c82f04c3b1
@ -40,15 +40,22 @@ class Guardian(BasicNewsRecipe):
|
||||
]
|
||||
# Matchers for elements to strip from each downloaded article. Callable
# 'class' values receive the element's class attribute (possibly None).
remove_tags = [
    # Substring match anywhere in the class attribute.
    {'attrs': {'class': lambda cls: cls and '--twitter' in cls}},
    # Whole-token match: 'submeta' must be one of the space-separated classes.
    {'attrs': {'class': lambda cls: cls and 'submeta' in cls.split()}},
    {'attrs': {'data-component': ['share', 'social']}},
    {'attrs': {'data-link-name': 'block share'}},
    # Substring match anywhere in the class attribute.
    {'attrs': {'class': lambda cls: cls and 'inline-expand-image' in cls}},
    # Whole-token match against the space-separated class list.
    {'attrs': {'class': lambda cls: cls and 'modern-visible' in cls.split()}},
    # Matched by tag name rather than by attribute.
    {'name': ['link', 'meta', 'style']},
]
|
||||
# Single matcher: an element whose space-separated class list contains
# 'content__article-body'. The callable returns the (falsy) attribute value
# unchanged when the class attribute is missing or empty.
remove_tags_after = [
    {'attrs': {'class': lambda cls: cls and 'content__article-body' in cls.split()}},
]
|
||||
|
||||
def preprocess_raw_html(self, raw, url):
    """Re-parse the raw page with html5lib and return it re-serialized.

    ``raw`` is the page markup to parse; ``url`` is accepted but not used
    by this method. The markup is parsed by html5lib into an lxml tree
    (with ``namespaceHTMLElements=False``) and serialized back to a text
    string, which is returned.
    """
    import html5lib
    from lxml import html

    tree = html5lib.parse(raw, namespaceHTMLElements=False, treebuilder='lxml')
    # The string name 'unicode' requests a text (not bytes) result from lxml
    # and, unlike the bare ``unicode`` builtin, is also valid on Python 3.
    return html.tostring(tree, encoding='unicode')
|
||||
|
||||
def preprocess_html(self, soup):
|
||||
for img in soup.findAll('img', srcset=True):
|
||||
img['src'] = img['srcset'].partition(' ')[0]
|
||||
|
Loading…
x
Reference in New Issue
Block a user