Fix #2497 (Scientific American recipe not working)

2025-11-25 07:45:01 -05:00 · 2009-05-25 13:08:55 -07:00 · 2009-05-25 13:08:55 -07:00 · c410bb9ea9
commit c410bb9ea9
parent 3f40befc6d
4 changed files with 196 additions and 94 deletions
--- a/src/calibre/web/feeds/recipes/__init__.py
+++ b/src/calibre/web/feeds/recipes/__init__.py
@@ -43,7 +43,7 @@ recipe_modules = ['recipe_' + r for r in (
           'seattle_times', 'scott_hanselman', 'coding_horror', 'twitchfilms',
           'stackoverflow', 'telepolis_artikel', 'zaobao', 'usnews',
           'straitstimes', 'index_hu', 'pcworld_hu', 'hrt', 'rts',
-           'h1', 'h2', 'h3', 'phd_comics',
+           'h1', 'h2', 'h3', 'phd_comics', 'woz_die',
          )]

 import re, imp, inspect, time, os
--- a/src/calibre/web/feeds/recipes/recipe_le_temps.py
+++ b/src/calibre/web/feeds/recipes/recipe_le_temps.py
@@ -0,0 +1,70 @@
+#!/usr/bin/env python
+# vim:fileencoding=UTF-8:ts=4:sw=4:sta:et:sts=4:ai
+from __future__ import with_statement
+
+__license__   = 'GPL v3'
+__copyright__ = '2009, Kovid Goyal <kovid@kovidgoyal.net>'
+__docformat__ = 'restructuredtext en'
+
+from calibre.web.feeds.news import BasicNewsRecipe
+
+class LeTemps(BasicNewsRecipe):
+    '''
+    Recipe for Le Temps (letemps.ch), built from the site's RSS feeds.
+    '''
+    title          = u'Le Temps'
+    oldest_article = 7
+    max_articles_per_feed = 100
+    language       = _('French')
+    no_stylesheets = True
+    # One list: repeated assignments would leave only the last filter active
+    remove_tags    = [
+                      dict(name='div', attrs={'id':'footer'}),
+                      dict(name='div', attrs={'class':'box links'}),
+                      dict(name='script'),
+                     ]
+    # Every declaration stays on one line: a property name broken after its
+    # hyphen ("font-" / "weight") is not a valid CSS property
+    extra_css      = '''
+        .heading {font-size: 13px; line-height: 15px; margin: 20px 0;}
+        h2 {font-size: 24px; line-height: 25px; margin-bottom: 14px;}
+        .author {font-size: 11px; margin: 0 0 5px 0;}
+        .lead {font-weight: 700; margin: 10px 0;}
+        p {margin: 0 0 10px 0;}
+    '''
+
+    # Titles are unicode literals, like `title`, so accented characters are
+    # decoded with the encoding declared at the top of the file
+    feeds          = [
+        (u'Actualité', 'http://www.letemps.ch/rss/site/'),
+        (u'Monde', 'http://www.letemps.ch/rss/site/actualite/monde'),
+        (u'Suisse & Régions', 'http://www.letemps.ch/rss/site/actualite/suisse_regions'),
+        (u'Sciences & Environnement', 'http://www.letemps.ch/rss/site/actualite/sciences_environnement'),
+        (u'Société', 'http://www.letemps.ch/rss/site/actualite/societe'),
+        (u'Economie & Finance', 'http://www.letemps.ch/rss/site/economie_finance'),
+        (u'Economie & Finance - Finance', 'http://www.letemps.ch/rss/site/economie_finance/finance'),
+        (u'Economie & Finance - Fonds de placement', 'http://www.letemps.ch/rss/site/economie_finance/fonds_placement'),
+        (u'Economie & Finance - Carrières', 'http://www.letemps.ch/rss/site/economie_finance/carrieres'),
+        (u'Culture', 'http://www.letemps.ch/rss/site/culture'),
+        (u'Culture - Cinéma', 'http://www.letemps.ch/rss/site/culture/cinema'),
+        (u'Culture - Musiques', 'http://www.letemps.ch/rss/site/culture/musiques'),
+        (u'Culture - Scènes', 'http://www.letemps.ch/rss/site/culture/scenes'),
+        (u'Culture - Arts plastiques', 'http://www.letemps.ch/rss/site/culture/arts_plastiques'),
+        (u'Livres', 'http://www.letemps.ch/rss/site/culture/livres'),
+        (u'Opinions', 'http://www.letemps.ch/rss/site/opinions'),
+        (u'Opinions - Editoriaux', 'http://www.letemps.ch/rss/site/opinions/editoriaux'),
+        (u'Opinions - Invités', 'http://www.letemps.ch/rss/site/opinions/invites'),
+        (u'Opinions - Chroniques', 'http://www.letemps.ch/rss/site/opinions/chroniques'),
+        (u'LifeStyle', 'http://www.letemps.ch/rss/site/lifestyle'),
+        (u'LifeStyle - Luxe', 'http://www.letemps.ch/rss/site/lifestyle/luxe'),
+        (u'LifeStyle - Horlogerie & Joaillerie', 'http://www.letemps.ch/rss/site/lifestyle/horlogerie_joaillerie'),
+        (u'LifeStyle - Design', 'http://www.letemps.ch/rss/site/lifestyle/design'),
+        (u'LifeStyle - Voyages', 'http://www.letemps.ch/rss/site/lifestyle/voyages'),
+        (u'LifeStyle - Gastronomie', 'http://www.letemps.ch/rss/site/lifestyle/gastronomie'),
+        (u'LifeStyle - Architecture & Immobilier', 'http://www.letemps.ch/rss/site/lifestyle/architecture_immobilier'),
+        (u'LifeStyle - Automobile', 'http://www.letemps.ch/rss/site/lifestyle/automobile'),
+        (u'Sports', 'http://www.letemps.ch/rss/site/actualite/sports'),
+    ]
+
+    def print_version(self, url):
+        '''Return `url` with 'Page' replaced by 'Facet/print' (the print view).'''
+        return url.replace('Page', 'Facet/print')
--- a/src/calibre/web/feeds/recipes/recipe_scientific_american.py
+++ b/src/calibre/web/feeds/recipes/recipe_scientific_american.py
@@ -7,7 +7,6 @@ __docformat__ = 'restructuredtext en'
 sciam.com
 '''
 import re
-from lxml import html
 from calibre.web.feeds.news import BasicNewsRecipe

 class ScientificAmerican(BasicNewsRecipe):
@@ -20,101 +19,82 @@ class ScientificAmerican(BasicNewsRecipe):
    no_stylesheets = True
    use_embedded_content   = False
    remove_tags_before = dict(name='div', attrs={'class':'headline'})
-    remove_tags_after  = dict(id='article')
+    remove_tags_after  = dict(id=['article'])
    remove_tags        = [
                          dict(id=['sharetools', 'reddit']),
                          dict(name='script'),
+                          {'class':['float_left', 'atools']},
                          {"class": re.compile(r'also-in-this')}
                         ]
    html2lrf_options = ['--base-font-size', '8']
    recursions = 1
-    match_regexps = [r'article.cfm.id=\S+page=(2|3|4|5|6|7|8|9|10|11|12|13|14)']
-#    feeds = [
-#             (u'Latest News', u'http://rss.sciam.com/ScientificAmerican-News'), 
-#             (u'Global', u'http://rss.sciam.com/ScientificAmerican-Global'), 
-#             (u'Health', u'http://rss.sciam.com/sciam/health'), 
-#             (u'Space', u'http://rss.sciam.com/sciam/space'), 
-#             (u'Technology', u'http://rss.sciam.com/sciam/technology'), 
-#             (u'Biology', u'http://rss.sciam.com/sciam/biology'), 
-#             (u'Mind & Brain', u'http://rss.sciam.com/sciam/mind-and-brain'), 
-#             (u"What's Next", u'http://rss.sciam.com/sciam/whats-next'), 
-#             (u'Archeology and Paleontology', u'http://www.sciam.com/page.cfm?section=rsscategory&alias=archaeology-and-paleontology'), 
-#             (u'Physics', u'http://www.sciam.com/page.cfm?section=rsscategory&alias=physics'), 
-#             (u'Math', u'http://rss.sciam.com/sciam/math'), 
-#             (u'History of Science', u'http://www.sciam.com/page.cfm?section=rsscategory&alias=history-of-science'), 
-#             (u'Chemistry', u'http://rss.sciam.com/sciam/chemistry'), 
-#             (u'Mind Matters', u'http://rss.sciam.com/ScientificAmerican-MindBlog')
-#            ]
-#    
+    match_regexps = [r'article.cfm.id=\S+page=(2|3|4|5|6|7|8|9|10|11|12|13|14|15)']
+
    def parse_index(self):
-        src = self.browser.open('http://www.sciam.com/sciammag/').read()
-        root = html.fromstring(src)
-        self.cover_url = root.xpath('//img[re:match(@src, "cover_")]', 
-                                    namespaces={'re':'http://exslt.org/regular-expressions'}
-                                    )[0].get('src')
-        self.timefmt = ' [%s]'%(root.xpath('//div[@id = "magazine-month"]')[0].text)
-        feeds = []
-        features = []
-        for a in root.xpath('//a[@href and @title = "Feature"]'):
-            if not a.text.strip():
-                continue
+        '''
+        Build the feed list from the magazine's table-of-contents page.
+
+        Returns a list of ``(section title, [article, ...])`` tuples; every
+        article is a dict with the keys ``url``, ``title``, ``date`` and
+        ``description``. Also sets ``self.timefmt`` and, when the cover
+        image is found, ``self.cover_url``.
+        '''
+        soup = self.index_to_soup('http://www.scientificamerican.com/sciammag/')
+        month = soup.find(id='magazine-month')
+        self.timefmt = ' [%s]'%(self.tag_to_string(month))
+        img = soup.find('img', alt='Scientific American Magazine', src=True)
+        if img is not None:
+            self.cover_url = img['src']
+        features, feeds = [], []
+        for p in soup.find(id='magazine-info').findAll('p') + \
+                soup.find(id='magazine-info-more').findAll('p'):
+            all_as = p.findAll('a', href=True)
+            # Skip paragraphs without links; indexing an empty list would raise
+            if not all_as: continue
+            a = all_as[0]
+            sub = p.find('span', attrs={'class':'sub'})
+            # find() returns None when the paragraph has no such span
+            desc = '' if sub is None else \
+                    ''.join(self.tag_to_string(s) for s in sub)
+
            article = {
                    'url' : a.get('href'),
-                       'title'  : u''.join(a.xpath('./text()')),
+                    'title' : self.tag_to_string(all_as[-1]),
                    'date' : '',
-                       'description' : '',   
+                    'description' : desc,
                    }
-            for s in a.itersiblings('span'):
-                if s.get('class', '') == 'sub':
-                    article['description'] += u''.join(s.xpath('./text()')) + ' '
            features.append(article)
-        if features:
        feeds.append(('Features', features))

-        departments = []
-        for a in root.xpath('//a[@href and @class="title"]'):
-            txt = u''.join(a.xpath('./text()')).strip()
-            if not txt:
+        section = []
+        # hrefs already emitted, so a link that appears twice is listed once
+        found = set()
+        title = None
+        for x in soup.find(id='magazine-main_col1').findAll(['div', 'a']):
+            if x.name == 'div':
+                if section:
+                    feeds.append((title, section))
+                title = self.tag_to_string(x)
+                section = []
+            else:
+                if title is None or not x.get('href', False) or x.get('href', None) in found:
                    continue
                article = {
-                       'url'    : a.get('href'),
-                       'title'  : txt,
+                        'url' : x['href'],
+                        'title' : self.tag_to_string(x),
                        'date': '',
                        'description': '',
                        }
-            p = a.getparent()
-            p.remove(a)
-            article['description'] = u''.join(p.xpath('./text()'))
-            departments.append(article)
+                found.add(x['href'])
+                section.append(article)
+        if section:
+            feeds.append((title, section))

-        feeds.append(('Departments', departments))
-        opinion = []
-        for a in root.xpath('//div[@id = "opinion"]//a[@href]'):
-            txt = u''.join(a.xpath('./text()')).strip()
-            if not txt:
-                continue
-            article = {
-                       'url'    : a.get('href'),
-                       'title'  : txt,
-                       'date'   : '',
-                       'description' : '',   
-                       }
-            opinion.append(article)
-        feeds.append(('Opinion', opinion))
-        
-        ontheweb = []
-        for a in root.xpath('//div[@id = "ontheweb"]//a[@href]'):
-            txt = u''.join(a.xpath('./text()')).strip()
-            if not txt:
-                continue
-            article = {
-                       'url'    : a.get('href'),
-                       'title'  : txt,
-                       'date'   : '',
-                       'description' : '',   
-                       }
-            ontheweb.append(article)
-        feeds.append(('On the web', ontheweb))
+        articles = []
+        for a in soup.find(id='opinion').findAll('a', href=True):
+            articles.append({'url':a['href'], 'title':self.tag_to_string(a),
+                'description':'', 'date':''})
+        feeds.append(('Opinion', articles))

        return feeds

--- a/src/calibre/web/feeds/recipes/recipe_woz_die.py
+++ b/src/calibre/web/feeds/recipes/recipe_woz_die.py
@@ -0,0 +1,50 @@
+from calibre.web.feeds.news import BasicNewsRecipe
+
+class WozDie(BasicNewsRecipe):
+    '''
+    Recipe for WOZ Die Wochenzeitung (woz.ch), built from the site's
+    headlines RSS feed.
+    '''
+    title          = u'WOZ Die Wochenzeitung'
+    oldest_article = 7
+    max_articles_per_feed = 100
+    language       = _('German')
+    no_stylesheets = True
+    # One list: repeated assignments would leave only the last filter active
+    remove_tags    = [
+                      dict(name='p', attrs={'class':'arrow_top'}),
+                      dict(name='p', attrs={'class':'bottom_right'}),
+                      dict(name='script'),
+                     ]
+    # One rule per line: a property name broken after its hyphen
+    # ("padding-" / "bottom") is not a valid CSS property
+    extra_css      = '''
+        #print_titel{vertical-align: bottom; text-align: left; color: #666666; background-color: white; padding-top: 30px; padding-bottom: 10px; border-bottom: 1px solid #999999;}
+        #title{text-align: left; font-size: large; font-weight: 600; padding-top: 0px; padding-bottom: 6px;}
+        h3 {text-align: left; font-size: large; font-weight: 600; padding-top: 0px; padding-bottom: 6px;}
+        #lead{font-weight: 600; padding-bottom: 6px;}
+        h2{font-weight: 600; padding-bottom: 6px;}
+        #author{color: #666666; padding-top: 0px; padding-bottom: 0px;}
+        h4{color: #666666; padding-top: 0px; padding-bottom: 0px;}
+        #author2 {color: #666666; padding-top: 0px; padding-bottom: 0px;}
+        .dotted_line {padding-top: 0px; margin-bottom: 18px; border-bottom: 1px dotted #666666;}
+        .intro{margin: 0 auto; font-weight: 600; padding-bottom: 18px;}
+        h5{margin: 0 auto; font-weight: 600; padding-bottom: 18px;}
+        .intro2{margin: 0 auto;  font-weight: 600;}
+        .text{padding-bottom: 18px;}
+        .subtitle{margin: 0 auto; font-weight: 600; padding-bottom: 10px;}
+        .articletitle{margin: 0 auto; font-weight: 600; padding-bottom: 10px;}
+        #content_infobox{margin-top: 20px; margin-left: 0px; margin-right: 0px; margin-bottom: 10px; text-align: left; border-bottom: 1px solid #999999;}
+        .content_infobox_titel{padding-top: 6px; padding-bottom: 8px; padding-left: 8px; padding-right: 8px; font-weight: 600; border-top: 1px solid #999999; border-bottom: 1px dotted #999999;}
+        .content_infobox_text{padding-top: 6px; padding-bottom: 12px; padding-left: 8px; padding-right: 8px;}
+        .box_gray{padding-top: 4px; padding-left:  7px; padding-right:  7px; padding-bottom:  4px;}
+        .box_white {padding-top: 4px; padding-left:  7px; padding-right:  7px; padding-bottom: 4px;}
+        .content_infobox_mehr{margin-top: 20px; margin-left: 0px; margin-right: 0px; margin-bottom: 10px; text-align: left; width: 600px; border-bottom: 1px solid #999999;}
+    '''
+
+    feeds          = [('WOZ Die Wochenzeitung - Headlines',
+                       'http://www.woz.ch/inhalt/headlinesRSS.php'),]
+
+    def print_version(self, url):
+        '''Return `url` with 'rss/' replaced by 'print_' (the print view).'''
+        return url.replace('rss/', 'print_')