From 943f6eef2eb649a171e13bce077a42342b4747f3 Mon Sep 17 00:00:00 2001 From: Kovid Goyal Date: Sat, 3 Sep 2016 09:00:47 +0530 Subject: [PATCH] Update Ars Technica --- recipes/ars_technica.recipe | 53 ++++++++++++++++++++----------------- 1 file changed, 28 insertions(+), 25 deletions(-) diff --git a/recipes/ars_technica.recipe b/recipes/ars_technica.recipe index dd385484f8..fd96002386 100644 --- a/recipes/ars_technica.recipe +++ b/recipes/ars_technica.recipe @@ -4,6 +4,7 @@ __copyright__ = '2008-2012, Darko Miletic ' arstechnica.com ''' +import re from calibre.web.feeds.news import BasicNewsRecipe from calibre.ebooks.BeautifulSoup import BeautifulSoup @@ -14,36 +15,34 @@ class ArsTechnica(BasicNewsRecipe): __author__ = 'Darko Miletic, Sujata Raman, Alexis Rohou, Tom Sparks' description = 'Ars Technica: Serving the technologist for 1.2 decades' publisher = 'Conde Nast Publications' - category = 'news, IT, technology' oldest_article = 5 max_articles_per_feed = 100 no_stylesheets = True encoding = 'utf-8' use_embedded_content = False remove_empty_feeds = True - publication_type = 'newsportal' extra_css = ''' - body {font-family: Arial,sans-serif} - .heading{font-family: "Times New Roman",serif} - .byline{font-weight: bold; line-height: 1em; font-size: 0.625em; text-decoration: none} - img{display: block} - .caption-text{font-size:small; font-style:italic} - .caption-byline{font-size:small; font-style:italic; font-weight:bold} + body {font-family: Arial,sans-serif} + .heading{font-family: "Times New Roman",serif} + .byline{font-weight: bold; line-height: 1em; font-size: 0.625em; text-decoration: none} + img{display: block} + .caption-text{font-size:small; font-style:italic} + .caption-byline{font-size:small; font-style:italic; font-weight:bold} + .video, .page-numbers, .story-sidebar { display: none } + .image { display: block } ''' - conversion_options = { - 'comments': description, 'tags': category, 'language': language, 'publisher': publisher - } - keep_only_tags = [ - dict(attrs={'class': 'standalone'}), dict(attrs={'id': 'article-guts'}) + dict(itemprop=['headline', 'description']), dict(attrs={'class': ['post-meta', 'article-guts', 'standalone']}) ] remove_tags = [ - dict(name=['object', 'link', 'embed', 'iframe', 'meta']), dict(attrs={'class': 'corner-info'}), dict(attrs={ - 'id': 'article-footer-wrap'}), dict(attrs={'class': 'article-expander'}), dict(name='nav', attrs={'class': 'subheading'}) + dict(name=['object', 'link', 'embed', 'iframe', 'meta']), + dict(attrs={'class': ['video', 'corner-info', 'article-expander']}), + dict(id=['social-left', 'article-footer-wrap']), + dict(name='nav', attrs={'class': 'subheading'}), ] - remove_attributes = ['lang'] + remove_attributes = ['lang', 'style'] feeds = [ (u'Ars Features (All our long-form feature articles)', u'http://feeds.arstechnica.com/arstechnica/features'), @@ -68,12 +67,13 @@ class ArsTechnica(BasicNewsRecipe): nurl = nexttag.parent['href'] rawc = self.index_to_soup(nurl, True) soup2 = BeautifulSoup(rawc, fromEncoding=self.encoding) - texttag = soup2.find(attrs={'id': 'article-guts'}) - newpos = len(texttag.contents) - self.append_page(soup2, texttag, newpos) - texttag.extract() - pager.extract() - appendtag.insert(position, texttag) + texttag = soup2.find(attrs={'class': 'article-guts'}) + if texttag is not None: + newpos = len(texttag.contents) + self.append_page(soup2, texttag, newpos) + texttag.extract() + pager.extract() + appendtag.insert(position, texttag) def preprocess_html(self, soup): self.append_page(soup, soup.body, 3) @@ -89,9 +89,12 @@ class ArsTechnica(BasicNewsRecipe): else: str = self.tag_to_string(item) item.replaceWith(str) - for item in soup.findAll('img'): - if 'alt' not in item: - item['alt'] = 'image' + for div in soup.findAll('div', attrs={'class':'image', 'style':lambda x: x and 'background-image' in x}): + url = re.search(r'''url\(['"]?([^'")]+)''', div['style']) + if url is not None: + div.name = 'img' + div['src'] = url.group(1) + div['style'] = '' return soup def preprocess_raw_html(self, raw, url):