From 555af8ab0ea924adf1aaab36e7326fce4f593fde Mon Sep 17 00:00:00 2001
From: Kovid Goyal
Date: Sat, 9 May 2020 21:55:39 +0530
Subject: [PATCH] Update Ars Technica

---
 recipes/ars_technica.recipe | 62 ++++++++++++-------------------------
 1 file changed, 19 insertions(+), 43 deletions(-)

diff --git a/recipes/ars_technica.recipe b/recipes/ars_technica.recipe
index 1b4d04ccd1..802e0e8519 100644
--- a/recipes/ars_technica.recipe
+++ b/recipes/ars_technica.recipe
@@ -6,7 +6,12 @@ arstechnica.com
 
 import re
 from calibre.web.feeds.news import BasicNewsRecipe
-from calibre.ebooks.BeautifulSoup import BeautifulSoup
+
+
+def classes(classes):
+    q = frozenset(classes.split(' '))
+    return dict(attrs={
+        'class': lambda x: x and frozenset(x.split()).intersection(q)})
 
 
 class ArsTechnica(BasicNewsRecipe):
@@ -33,12 +38,13 @@ class ArsTechnica(BasicNewsRecipe):
     '''
 
     keep_only_tags = [
-        dict(itemprop=['headline', 'description']), dict(attrs={'class': ['post-meta', 'article-guts', 'standalone']})
+        dict(itemprop=['headline', 'description']),
+        classes('post-meta article-guts standalone'),
     ]
     remove_tags = [
+        classes('site-header video corner-info article-expander left-column related-stories'),
         dict(name=['object', 'link', 'embed', 'iframe', 'meta']),
-        dict(attrs={'class': ['video', 'corner-info', 'article-expander']}),
         dict(id=['social-left', 'article-footer-wrap']),
         dict(name='nav', attrs={'class': 'subheading'}),
     ]
 
@@ -66,45 +72,15 @@ class ArsTechnica(BasicNewsRecipe):
         ('Internet', 'http://feeds.arstechnica.com/arstechnica/web'),
     ]
 
-    def append_page(self, soup, appendtag, position):
-        pager = soup.find(attrs={'class': 'numbers'})
-        if pager:
-            nexttag = pager.find(attrs={'class': 'next'})
-            if nexttag:
-                nurl = nexttag.parent['href']
-                rawc = self.index_to_soup(nurl, True)
-                soup2 = BeautifulSoup(rawc, fromEncoding=self.encoding)
-                texttag = soup2.find(attrs={'class': 'article-guts'})
-                if texttag is not None:
-                    newpos = len(texttag.contents)
-                    soup = self.append_page(soup2, texttag, newpos)
-                    texttag.extract()
-                    pager.extract()
-                    appendtag.insert(position, texttag)
-                    soup = BeautifulSoup(soup.renderContents().decode('utf-8'))
-        return soup
+    recursions = 1
 
-    def preprocess_html(self, soup):
-        soup = self.append_page(soup, soup.body, 3)
-        for item in soup.findAll('a'):
-            limg = item.find('img')
-            if item.string is not None:
-                str = item.string
-                item.replaceWith(str)
-            else:
-                if limg:
-                    item.name = 'div'
-                    item.attrs.clear()
-                else:
-                    str = self.tag_to_string(item)
-                    item.replaceWith(str)
-        for div in soup.findAll('div', attrs={'class':'image', 'style':lambda x: x and 'background-image' in x}):
-            url = re.search(r'''url\(['"]?([^'")]+)''', div['style'])
-            if url is not None:
-                div.name = 'img'
-                div['src'] = url.group(1)
-                div['style'] = ''
-        return soup
+    def is_link_wanted(self, url, tag):
+        return re.search(r'/[0-9]/$', url) is not None
 
-    def preprocess_raw_html(self, raw, url):
-        return '' + raw[raw.find(''):]
+    def postprocess_html(self, soup, first_fetch):
+        if not first_fetch:
+            for x in soup.findAll(itemprop=['headline', 'description']):
+                x.extract()
+            for x in soup.findAll(**classes('post-meta')):
+                x.extract()
+        return soup
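
Not part of the patch itself: a minimal, standalone sketch of how the classes() helper introduced above behaves. It builds a keyword dict for BeautifulSoup-style tag matching whose 'class' predicate accepts any tag sharing at least one class name with the query string; the class strings used below are made up purely to exercise the predicate.

# Helper copied verbatim from the patch above.
def classes(classes):
    q = frozenset(classes.split(' '))
    return dict(attrs={
        'class': lambda x: x and frozenset(x.split()).intersection(q)})

# Hypothetical class strings, only to exercise the predicate.
rule = classes('post-meta article-guts standalone')
match = rule['attrs']['class']
print(bool(match('post-meta byline')))  # True: shares "post-meta"
print(bool(match('sidebar ad')))        # False: no overlap
print(bool(match(None)))                # False: tag has no class attribute

In the updated recipe this predicate feeds keep_only_tags and remove_tags, while recursions = 1 together with is_link_wanted() lets BasicNewsRecipe's stock link-following machinery fetch the numbered next-page URLs, replacing the hand-rolled append_page()/preprocess_html() pagination that the patch removes; postprocess_html() then strips the repeated headline, description, and post-meta blocks from the follow-on pages.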