Update Ars Technica

This commit is contained in:
Kovid Goyal 2020-05-09 21:55:39 +05:30
parent 8c1de2a921
commit 555af8ab0e
No known key found for this signature in database
GPG Key ID: 06BC317B515ACE7C

View File

@ -6,7 +6,12 @@ arstechnica.com
import re import re
from calibre.web.feeds.news import BasicNewsRecipe from calibre.web.feeds.news import BasicNewsRecipe
from calibre.ebooks.BeautifulSoup import BeautifulSoup
def classes(classes):
    """Build a BeautifulSoup ``attrs`` matcher dict.

    Given a space-separated string of CSS class names, return a dict of the
    form ``{'attrs': {'class': <callable>}}`` whose callable is truthy for any
    tag whose class list shares at least one name with the given set.
    """
    wanted = frozenset(classes.split(' '))

    def matches(value):
        # value is the tag's class attribute; None/empty short-circuits falsy,
        # otherwise truthy iff the class lists intersect.
        return value and frozenset(value.split()).intersection(wanted)

    return {'attrs': {'class': matches}}
class ArsTechnica(BasicNewsRecipe): class ArsTechnica(BasicNewsRecipe):
@ -33,12 +38,13 @@ class ArsTechnica(BasicNewsRecipe):
''' '''
keep_only_tags = [ keep_only_tags = [
dict(itemprop=['headline', 'description']), dict(attrs={'class': ['post-meta', 'article-guts', 'standalone']}) dict(itemprop=['headline', 'description']),
classes('post-meta article-guts standalone'),
] ]
remove_tags = [ remove_tags = [
classes('site-header video corner-info article-expander left-column related-stories'),
dict(name=['object', 'link', 'embed', 'iframe', 'meta']), dict(name=['object', 'link', 'embed', 'iframe', 'meta']),
dict(attrs={'class': ['video', 'corner-info', 'article-expander']}),
dict(id=['social-left', 'article-footer-wrap']), dict(id=['social-left', 'article-footer-wrap']),
dict(name='nav', attrs={'class': 'subheading'}), dict(name='nav', attrs={'class': 'subheading'}),
] ]
@ -66,45 +72,15 @@ class ArsTechnica(BasicNewsRecipe):
('Internet', 'http://feeds.arstechnica.com/arstechnica/web'), ('Internet', 'http://feeds.arstechnica.com/arstechnica/web'),
] ]
def append_page(self, soup, appendtag, position): recursions = 1
pager = soup.find(attrs={'class': 'numbers'})
if pager:
nexttag = pager.find(attrs={'class': 'next'})
if nexttag:
nurl = nexttag.parent['href']
rawc = self.index_to_soup(nurl, True)
soup2 = BeautifulSoup(rawc, fromEncoding=self.encoding)
texttag = soup2.find(attrs={'class': 'article-guts'})
if texttag is not None:
newpos = len(texttag.contents)
soup = self.append_page(soup2, texttag, newpos)
texttag.extract()
pager.extract()
appendtag.insert(position, texttag)
soup = BeautifulSoup(soup.renderContents().decode('utf-8'))
return soup
def preprocess_html(self, soup): def is_link_wanted(self, url, tag):
soup = self.append_page(soup, soup.body, 3) return re.search(r'/[0-9]/$', url) is not None
for item in soup.findAll('a'):
limg = item.find('img')
if item.string is not None:
str = item.string
item.replaceWith(str)
else:
if limg:
item.name = 'div'
item.attrs.clear()
else:
str = self.tag_to_string(item)
item.replaceWith(str)
for div in soup.findAll('div', attrs={'class':'image', 'style':lambda x: x and 'background-image' in x}):
url = re.search(r'''url\(['"]?([^'")]+)''', div['style'])
if url is not None:
div.name = 'img'
div['src'] = url.group(1)
div['style'] = ''
return soup
def preprocess_raw_html(self, raw, url): def postprocess_html(self, soup, first_fetch):
return '<html><head>' + raw[raw.find('</head>'):] if not first_fetch:
for x in soup.findAll(itemprop=['headline', 'description']):
x.extract()
for x in soup.findAll(**classes('post-meta')):
x.extract()
return soup