Update Ars Technica

This commit is contained in:
Kovid Goyal 2020-05-09 21:55:39 +05:30
parent 8c1de2a921
commit 555af8ab0e
No known key found for this signature in database
GPG Key ID: 06BC317B515ACE7C

View File

@ -6,7 +6,12 @@ arstechnica.com
import re import re
from calibre.web.feeds.news import BasicNewsRecipe from calibre.web.feeds.news import BasicNewsRecipe
from calibre.ebooks.BeautifulSoup import BeautifulSoup
def classes(classes):
    """Build a BeautifulSoup ``attrs`` matcher dict.

    Given a space-separated string of CSS class names, return a dict of the
    form ``{'attrs': {'class': <callable>}}`` whose callable is truthy for any
    tag whose class list shares at least one name with the given set.
    """
    wanted = frozenset(classes.split(' '))

    def matches(value):
        # value is the tag's class attribute; None/empty short-circuits falsy,
        # otherwise truthy iff the class lists intersect.
        return value and frozenset(value.split()).intersection(wanted)

    return {'attrs': {'class': matches}}
class ArsTechnica(BasicNewsRecipe): class ArsTechnica(BasicNewsRecipe):
@ -33,12 +38,13 @@ class ArsTechnica(BasicNewsRecipe):
''' '''
keep_only_tags = [ keep_only_tags = [
dict(itemprop=['headline', 'description']), dict(attrs={'class': ['post-meta', 'article-guts', 'standalone']}) dict(itemprop=['headline', 'description']),
classes('post-meta article-guts standalone'),
] ]
remove_tags = [ remove_tags = [
classes('site-header video corner-info article-expander left-column related-stories'),
dict(name=['object', 'link', 'embed', 'iframe', 'meta']), dict(name=['object', 'link', 'embed', 'iframe', 'meta']),
dict(attrs={'class': ['video', 'corner-info', 'article-expander']}),
dict(id=['social-left', 'article-footer-wrap']), dict(id=['social-left', 'article-footer-wrap']),
dict(name='nav', attrs={'class': 'subheading'}), dict(name='nav', attrs={'class': 'subheading'}),
] ]
@ -66,45 +72,15 @@ class ArsTechnica(BasicNewsRecipe):
('Internet', 'http://feeds.arstechnica.com/arstechnica/web'), ('Internet', 'http://feeds.arstechnica.com/arstechnica/web'),
] ]
def append_page(self, soup, appendtag, position): recursions = 1
pager = soup.find(attrs={'class': 'numbers'})
if pager:
nexttag = pager.find(attrs={'class': 'next'})
if nexttag:
nurl = nexttag.parent['href']
rawc = self.index_to_soup(nurl, True)
soup2 = BeautifulSoup(rawc, fromEncoding=self.encoding)
texttag = soup2.find(attrs={'class': 'article-guts'})
if texttag is not None:
newpos = len(texttag.contents)
soup = self.append_page(soup2, texttag, newpos)
texttag.extract()
pager.extract()
appendtag.insert(position, texttag)
soup = BeautifulSoup(soup.renderContents().decode('utf-8'))
return soup
def preprocess_html(self, soup): def is_link_wanted(self, url, tag):
soup = self.append_page(soup, soup.body, 3) return re.search(r'/[0-9]/$', url) is not None
for item in soup.findAll('a'):
limg = item.find('img')
if item.string is not None:
str = item.string
item.replaceWith(str)
else:
if limg:
item.name = 'div'
item.attrs.clear()
else:
str = self.tag_to_string(item)
item.replaceWith(str)
for div in soup.findAll('div', attrs={'class':'image', 'style':lambda x: x and 'background-image' in x}):
url = re.search(r'''url\(['"]?([^'")]+)''', div['style'])
if url is not None:
div.name = 'img'
div['src'] = url.group(1)
div['style'] = ''
return soup
def preprocess_raw_html(self, raw, url): def postprocess_html(self, soup, first_fetch):
return '<html><head>' + raw[raw.find('</head>'):] if not first_fetch:
for x in soup.findAll(itemprop=['headline', 'description']):
x.extract()
for x in soup.findAll(**classes('post-meta')):
x.extract()
return soup