From c410bb9ea9803d811f892c01352b7732411f8fb4 Mon Sep 17 00:00:00 2001 From: Kovid Goyal Date: Mon, 25 May 2009 13:08:55 -0700 Subject: [PATCH] Fix #2497 (Scientific American recipe not working) --- src/calibre/web/feeds/recipes/__init__.py | 2 +- .../web/feeds/recipes/recipe_le_temps.py | 88 ++++++++++ .../recipes/recipe_scientific_american.py | 154 +++++++----------- .../web/feeds/recipes/recipe_woz_die.py | 46 ++++++ 4 files changed, 196 insertions(+), 94 deletions(-) create mode 100644 src/calibre/web/feeds/recipes/recipe_le_temps.py create mode 100644 src/calibre/web/feeds/recipes/recipe_woz_die.py diff --git a/src/calibre/web/feeds/recipes/__init__.py b/src/calibre/web/feeds/recipes/__init__.py index 5f78801493..bd1b5098d9 100644 --- a/src/calibre/web/feeds/recipes/__init__.py +++ b/src/calibre/web/feeds/recipes/__init__.py @@ -43,7 +43,7 @@ recipe_modules = ['recipe_' + r for r in ( 'seattle_times', 'scott_hanselman', 'coding_horror', 'twitchfilms', 'stackoverflow', 'telepolis_artikel', 'zaobao', 'usnews', 'straitstimes', 'index_hu', 'pcworld_hu', 'hrt', 'rts', - 'h1', 'h2', 'h3', 'phd_comics', + 'h1', 'h2', 'h3', 'phd_comics', 'woz_die', )] import re, imp, inspect, time, os diff --git a/src/calibre/web/feeds/recipes/recipe_le_temps.py b/src/calibre/web/feeds/recipes/recipe_le_temps.py new file mode 100644 index 0000000000..eb0b6c5a64 --- /dev/null +++ b/src/calibre/web/feeds/recipes/recipe_le_temps.py @@ -0,0 +1,88 @@ +#!/usr/bin/env python +# vim:fileencoding=UTF-8:ts=4:sw=4:sta:et:sts=4:ai +from __future__ import with_statement + +__license__ = 'GPL v3' +__copyright__ = '2009, Kovid Goyal ' +__docformat__ = 'restructuredtext en' + + + +from calibre.web.feeds.news import BasicNewsRecipe + +class LeTemps(BasicNewsRecipe): + title = u'Le Temps' + oldest_article = 7 + max_articles_per_feed = 100 + no_stylesheets = True + remove_tags = [dict(name='div', attrs={'id':'footer'})] + remove_tags = [dict(name='div', attrs={'class':'box links'})] + remove_tags = [dict(name='script')] + extra_css = '''.heading {font-size: 13px; line-height: 15px; + margin: 20px 0;} \n h2 {font-size: 24px; line-height: 25px; margin-bottom: + 14px;} \n .author {font-size: 11px; margin: 0 0 5px 0;} \n .lead {font- + weight: 700; margin: 10px 0;} \n p {margin: 0 0 10px 0;}''' + + feeds = [ + ('Actualité', + 'http://www.letemps.ch/rss/site/'), + ('Monde', + 'http://www.letemps.ch/rss/site/actualite/monde'), + ('Suisse & Régions', + 'http://www.letemps.ch/rss/site/actualite/suisse_regions'), + ('Sciences & Environnement', + 'http://www.letemps.ch/rss/site/actualite/sciences_environnement'), + ('Société', + 'http://www.letemps.ch/rss/site/actualite/societe'), + ('Economie & Finance', + 'http://www.letemps.ch/rss/site/economie_finance'), + ('Economie & Finance - Finance', + 'http://www.letemps.ch/rss/site/economie_finance/finance'), + ('Economie & Finance - Fonds de placement', + 'http://www.letemps.ch/rss/site/economie_finance/fonds_placement'), + ('Economie & Finance - Carrières', + 'http://www.letemps.ch/rss/site/economie_finance/carrieres'), + ('Culture', + 'http://www.letemps.ch/rss/site/culture'), + ('Culture - Cinéma', + 'http://www.letemps.ch/rss/site/culture/cinema'), + ('Culture - Musiques', + 'http://www.letemps.ch/rss/site/culture/musiques'), + ('Culture - Scènes', + 'http://www.letemps.ch/rss/site/culture/scenes'), + ('Culture - Arts plastiques', + 'http://www.letemps.ch/rss/site/culture/arts_plastiques'), + ('Livres', + 'http://www.letemps.ch/rss/site/culture/livres'), + ('Opinions', + 'http://www.letemps.ch/rss/site/opinions'), + ('Opinions - Editoriaux', + 'http://www.letemps.ch/rss/site/opinions/editoriaux'), + ('Opinions - Invités', + 'http://www.letemps.ch/rss/site/opinions/invites'), + ('Opinions - Chroniques', + 'http://www.letemps.ch/rss/site/opinions/chroniques'), + ('LifeStyle', + 'http://www.letemps.ch/rss/site/lifestyle'), + ('LifeStyle - Luxe', + 'http://www.letemps.ch/rss/site/lifestyle/luxe'), + ('LifeStyle - Horlogerie & Joaillerie', + 'http://www.letemps.ch/rss/site/lifestyle/horlogerie_joaillerie'), + ('LifeStyle - Design', + 'http://www.letemps.ch/rss/site/lifestyle/design'), + ('LifeStyle - Voyages', + 'http://www.letemps.ch/rss/site/lifestyle/voyages'), + ('LifeStyle - Gastronomie', + 'http://www.letemps.ch/rss/site/lifestyle/gastronomie'), + ('LifeStyle - Architecture & Immobilier', + 'http://www.letemps.ch/rss/site/lifestyle/architecture_immobilier'), + ('LifeStyle - Automobile', + 'http://www.letemps.ch/rss/site/lifestyle/automobile'), + ('Sports', + 'http://www.letemps.ch/rss/site/actualite/sports'), + ] + + def print_version(self, url): + return url.replace('Page', 'Facet/print') + + diff --git a/src/calibre/web/feeds/recipes/recipe_scientific_american.py b/src/calibre/web/feeds/recipes/recipe_scientific_american.py index e2bce46f2f..8d706e8416 100644 --- a/src/calibre/web/feeds/recipes/recipe_scientific_american.py +++ b/src/calibre/web/feeds/recipes/recipe_scientific_american.py @@ -7,124 +7,92 @@ __docformat__ = 'restructuredtext en' sciam.com ''' import re -from lxml import html from calibre.web.feeds.news import BasicNewsRecipe class ScientificAmerican(BasicNewsRecipe): title = u'Scientific American' - description = u'Popular science. Monthly magazine.' + description = u'Popular science. Monthly magazine.' __author__ = 'Kovid Goyal' language = _('English') - oldest_article = 30 + oldest_article = 30 max_articles_per_feed = 100 no_stylesheets = True use_embedded_content = False remove_tags_before = dict(name='div', attrs={'class':'headline'}) - remove_tags_after = dict(id='article') + remove_tags_after = dict(id=['article']) remove_tags = [ - dict(id=['sharetools', 'reddit']), + dict(id=['sharetools', 'reddit']), dict(name='script'), + {'class':['float_left', 'atools']}, {"class": re.compile(r'also-in-this')} ] html2lrf_options = ['--base-font-size', '8'] recursions = 1 - match_regexps = [r'article.cfm.id=\S+page=(2|3|4|5|6|7|8|9|10|11|12|13|14)'] -# feeds = [ -# (u'Latest News', u'http://rss.sciam.com/ScientificAmerican-News'), -# (u'Global', u'http://rss.sciam.com/ScientificAmerican-Global'), -# (u'Health', u'http://rss.sciam.com/sciam/health'), -# (u'Space', u'http://rss.sciam.com/sciam/space'), -# (u'Technology', u'http://rss.sciam.com/sciam/technology'), -# (u'Biology', u'http://rss.sciam.com/sciam/biology'), -# (u'Mind & Brain', u'http://rss.sciam.com/sciam/mind-and-brain'), -# (u"What's Next", u'http://rss.sciam.com/sciam/whats-next'), -# (u'Archeology and Paleontology', u'http://www.sciam.com/page.cfm?section=rsscategory&alias=archaeology-and-paleontology'), -# (u'Physics', u'http://www.sciam.com/page.cfm?section=rsscategory&alias=physics'), -# (u'Math', u'http://rss.sciam.com/sciam/math'), -# (u'History of Science', u'http://www.sciam.com/page.cfm?section=rsscategory&alias=history-of-science'), -# (u'Chemistry', u'http://rss.sciam.com/sciam/chemistry'), -# (u'Mind Matters', u'http://rss.sciam.com/ScientificAmerican-MindBlog') -# ] -# + match_regexps = [r'article.cfm.id=\S+page=(2|3|4|5|6|7|8|9|10|11|12|13|14|15)'] + def parse_index(self): - src = self.browser.open('http://www.sciam.com/sciammag/').read() - root = html.fromstring(src) - self.cover_url = root.xpath('//img[re:match(@src, "cover_")]', - namespaces={'re':'http://exslt.org/regular-expressions'} - )[0].get('src') - self.timefmt = ' [%s]'%(root.xpath('//div[@id = "magazine-month"]')[0].text) - feeds = [] - features = [] - for a in root.xpath('//a[@href and @title = "Feature"]'): - if not a.text.strip(): - continue + soup = self.index_to_soup('http://www.scientificamerican.com/sciammag/') + month = soup.find(id='magazine-month') + self.timefmt = ' [%s]'%(self.tag_to_string(month)) + img = soup.find('img', alt='Scientific American Magazine', src=True) + if img is not None: + self.cover_url = img['src'] + features, feeds = [], [] + for p in soup.find(id='magazine-info').findAll('p') + \ + soup.find(id='magazine-info-more').findAll('p'): + all_as = p.findAll('a', href=True) + a = all_as[0] + if a is None: continue + desc = '' + for s in p.find('span', attrs={'class':'sub'}): + desc += self.tag_to_string(s) + article = { - 'url' : a.get('href'), - 'title' : u''.join(a.xpath('./text()')), - 'date' : '', - 'description' : '', - } - for s in a.itersiblings('span'): - if s.get('class', '') == 'sub': - article['description'] += u''.join(s.xpath('./text()')) + ' ' + 'url' : a.get('href'), + 'title' : self.tag_to_string(all_as[-1]), + 'date' : '', + 'description' : desc, + } features.append(article) - if features: - feeds.append(('Features', features)) - - departments = [] - for a in root.xpath('//a[@href and @class="title"]'): - txt = u''.join(a.xpath('./text()')).strip() - if not txt: - continue - article = { - 'url' : a.get('href'), - 'title' : txt, - 'date' : '', - 'description' : '', - } - p = a.getparent() - p.remove(a) - article['description'] = u''.join(p.xpath('./text()')) - departments.append(article) - - feeds.append(('Departments', departments)) - opinion = [] - for a in root.xpath('//div[@id = "opinion"]//a[@href]'): - txt = u''.join(a.xpath('./text()')).strip() - if not txt: - continue - article = { - 'url' : a.get('href'), - 'title' : txt, - 'date' : '', - 'description' : '', - } - opinion.append(article) - feeds.append(('Opinion', opinion)) - - ontheweb = [] - for a in root.xpath('//div[@id = "ontheweb"]//a[@href]'): - txt = u''.join(a.xpath('./text()')).strip() - if not txt: - continue - article = { - 'url' : a.get('href'), - 'title' : txt, - 'date' : '', - 'description' : '', - } - ontheweb.append(article) - feeds.append(('On the web', ontheweb)) - + feeds.append(('Features', features)) + + section = [] + found = [] + title = None + for x in soup.find(id='magazine-main_col1').findAll(['div', 'a']): + if x.name == 'div': + if section: + feeds.append((title, section)) + title = self.tag_to_string(x) + section = [] + else: + if title is None or not a.get('href', False) or a.get('href', None) in found: + continue + article = { + 'url' : x['href'], + 'title' : self.tag_to_string(x), + 'date': '', + 'description': '', + } + section.append(article) + if section: + feeds.append((title, section)) + + articles = [] + for a in soup.find(id='opinion').findAll('a', href=True): + articles.append({'url':a['href'], 'title':self.tag_to_string(a), + 'description':'', 'date':''}) + feeds.append(('Opinion', articles)) + return feeds - - + + def postprocess_html(self, soup, first_fetch): if soup is not None: for span in soup.findAll('span', attrs={'class':'pagination'}): span.extract() if not first_fetch: div = soup.find('div', attrs={'class':'headline'}) - if div: + if div: div.extract() return soup diff --git a/src/calibre/web/feeds/recipes/recipe_woz_die.py b/src/calibre/web/feeds/recipes/recipe_woz_die.py new file mode 100644 index 0000000000..730425eb9f --- /dev/null +++ b/src/calibre/web/feeds/recipes/recipe_woz_die.py @@ -0,0 +1,46 @@ +from calibre.web.feeds.news import BasicNewsRecipe + +class WozDie(BasicNewsRecipe): + title = u'WOZ Die Wochenzeitung' + oldest_article = 7 + max_articles_per_feed = 100 + language = _('German') + no_stylesheets = True + remove_tags = [dict(name='p', attrs={'class':'arrow_top'})] + remove_tags = [dict(name='p', attrs={'class':'bottom_right'})] + remove_tags = [dict(name='script')] + extra_css = '''#print_titel{vertical-align: bottom; text-align: + left; color: #666666; background-color: white; padding-top: 30px; padding- + bottom: 10px; border-bottom: 1px solid #999999;} #title{text-align: + left; font-size: large; font-weight: 600; padding-top: 0px; padding- + bottom: 6px;} h3 {text-align: left; font-size: large; font-weight: 600; + padding-top: 0px; padding-bottom: 6px;} #lead{font-weight: 600; + padding-bottom: 6px;} h2{font-weight: 600; padding-bottom: 6px;} + #author{color: #666666; padding-top: 0px; padding-bottom: 0px;} + h4{color: #666666; padding-top: 0px; padding-bottom: 0px;} #author2 + {color: #666666; padding-top: 0px; padding-bottom: 0px;} .dotted_line + {padding-top: 0px; margin-bottom: 18px; border-bottom: 1px dotted + #666666;} .intro{margin: 0 auto; font-weight: 600; padding-bottom: + 18px;} h5{margin: 0 auto; font-weight: 600; padding-bottom: 18px;} + .intro2{margin: 0 auto; font-weight: 600;} .text{padding-bottom: + 18px;} .subtitle{margin: 0 auto; font-weight: 600; padding-bottom: + 10px;} .articletitle{margin: 0 auto; font-weight: 600; padding-bottom: + 10px;} #content_infobox{margin-top: 20px; margin-left: 0px; margin- + right: 0px; margin-bottom: 10px; text-align: left; border-bottom: 1px + solid #999999;} .content_infobox_titel{padding-top: 6px; padding- + bottom: 8px; padding-left: 8px; padding-right: 8px; font-weight: 600; + border-top: 1px solid #999999; border-bottom: 1px dotted #999999;} + .content_infobox_text{padding-top: 6px; padding-bottom: 12px; padding- + left: 8px; padding-right: 8px;} .box_gray{padding-top: 4px; padding- + left: 7px; padding-right: 7px; padding-bottom: 4px;} .box_white { + padding-top: 4px; padding-left: 7px; padding-right: 7px; padding-bottom: + 4px;} .content_infobox_mehr{margin-top: 20px; margin-left: 0px; margin- + right: 0px; margin-bottom: 10px; text-align: left; width: 600px; border- + bottom: 1px solid #999999;}''' + + feeds = [('WOZ Die Wochenzeitung - Headlines', + 'http://www.woz.ch/inhalt/headlinesRSS.php'),] + + def print_version(self, url): + return url.replace('rss/', 'print_') +