From c410bb9ea9803d811f892c01352b7732411f8fb4 Mon Sep 17 00:00:00 2001
From: Kovid Goyal <kovid@kovidgoyal.net>
Date: Mon, 25 May 2009 13:08:55 -0700
Subject: [PATCH] Fix #2497 (Scientific American recipe not working)

---
 src/calibre/web/feeds/recipes/__init__.py     |   2 +-
 .../web/feeds/recipes/recipe_le_temps.py      |  88 ++++++++++
 .../recipes/recipe_scientific_american.py     | 154 +++++++-----------
 .../web/feeds/recipes/recipe_woz_die.py       |  46 ++++++
 4 files changed, 196 insertions(+), 94 deletions(-)
 create mode 100644 src/calibre/web/feeds/recipes/recipe_le_temps.py
 create mode 100644 src/calibre/web/feeds/recipes/recipe_woz_die.py

diff --git a/src/calibre/web/feeds/recipes/__init__.py b/src/calibre/web/feeds/recipes/__init__.py
index 5f78801493..bd1b5098d9 100644
--- a/src/calibre/web/feeds/recipes/__init__.py
+++ b/src/calibre/web/feeds/recipes/__init__.py
@@ -43,7 +43,7 @@ recipe_modules = ['recipe_' + r for r in (
            'seattle_times', 'scott_hanselman', 'coding_horror', 'twitchfilms',
            'stackoverflow', 'telepolis_artikel', 'zaobao', 'usnews',
            'straitstimes', 'index_hu', 'pcworld_hu', 'hrt', 'rts',
-           'h1', 'h2', 'h3', 'phd_comics',
+           'h1', 'h2', 'h3', 'phd_comics', 'woz_die',
           )]
 
 import re, imp, inspect, time, os
diff --git a/src/calibre/web/feeds/recipes/recipe_le_temps.py b/src/calibre/web/feeds/recipes/recipe_le_temps.py
new file mode 100644
index 0000000000..eb0b6c5a64
--- /dev/null
+++ b/src/calibre/web/feeds/recipes/recipe_le_temps.py
@@ -0,0 +1,88 @@
+#!/usr/bin/env python
+# vim:fileencoding=UTF-8:ts=4:sw=4:sta:et:sts=4:ai
+from __future__ import with_statement
+
+__license__   = 'GPL v3'
+__copyright__ = '2009, Kovid Goyal <kovid@kovidgoyal.net>'
+__docformat__ = 'restructuredtext en'
+
+
+
+from calibre.web.feeds.news import BasicNewsRecipe
+
+class LeTemps(BasicNewsRecipe):
+     """Recipe for the RSS feeds of Le Temps (letemps.ch)."""
+     title          = u'Le Temps'
+     oldest_article = 7
+     max_articles_per_feed = 100
+     no_stylesheets = True
+     # Keep every filter in ONE list: repeated assignments keep only the last.
+     remove_tags    = [dict(name='div', attrs={'id':'footer'}),
+                       dict(name='div', attrs={'class':'box links'}),
+                       dict(name='script')]
+     extra_css      = '''.heading {font-size: 13px; line-height: 15px;
+ margin: 20px 0;} \n h2 {font-size: 24px; line-height: 25px; margin-bottom:
+ 14px;} \n .author {font-size: 11px; margin: 0 0 5px 0;} \n .lead {
+ font-weight: 700; margin: 10px 0;} \n p {margin: 0 0 10px 0;}'''
+
+     feeds          = [
+                              ('Actualité',
+ 'http://www.letemps.ch/rss/site/'),
+                              ('Monde',
+ 'http://www.letemps.ch/rss/site/actualite/monde'),
+                              ('Suisse & Régions',
+ 'http://www.letemps.ch/rss/site/actualite/suisse_regions'),
+                              ('Sciences & Environnement',
+ 'http://www.letemps.ch/rss/site/actualite/sciences_environnement'),
+                              ('Société',
+ 'http://www.letemps.ch/rss/site/actualite/societe'),
+                              ('Economie & Finance',
+ 'http://www.letemps.ch/rss/site/economie_finance'),
+                              ('Economie & Finance - Finance',
+ 'http://www.letemps.ch/rss/site/economie_finance/finance'),
+                              ('Economie & Finance - Fonds de placement',
+ 'http://www.letemps.ch/rss/site/economie_finance/fonds_placement'),
+                              ('Economie & Finance - Carrières',
+ 'http://www.letemps.ch/rss/site/economie_finance/carrieres'),
+                             ('Culture',
+ 'http://www.letemps.ch/rss/site/culture'),
+                              ('Culture - Cinéma',
+ 'http://www.letemps.ch/rss/site/culture/cinema'),
+                              ('Culture - Musiques',
+ 'http://www.letemps.ch/rss/site/culture/musiques'),
+                              ('Culture - Scènes',
+ 'http://www.letemps.ch/rss/site/culture/scenes'),
+                              ('Culture - Arts plastiques',
+ 'http://www.letemps.ch/rss/site/culture/arts_plastiques'),
+                              ('Livres',
+ 'http://www.letemps.ch/rss/site/culture/livres'),
+                              ('Opinions',
+ 'http://www.letemps.ch/rss/site/opinions'),
+                              ('Opinions - Editoriaux',
+ 'http://www.letemps.ch/rss/site/opinions/editoriaux'),
+                              ('Opinions - Invités',
+ 'http://www.letemps.ch/rss/site/opinions/invites'),
+                              ('Opinions - Chroniques',
+ 'http://www.letemps.ch/rss/site/opinions/chroniques'),
+                              ('LifeStyle',
+ 'http://www.letemps.ch/rss/site/lifestyle'),
+                              ('LifeStyle - Luxe',
+ 'http://www.letemps.ch/rss/site/lifestyle/luxe'),
+                              ('LifeStyle - Horlogerie & Joaillerie',
+ 'http://www.letemps.ch/rss/site/lifestyle/horlogerie_joaillerie'),
+                              ('LifeStyle - Design',
+ 'http://www.letemps.ch/rss/site/lifestyle/design'),
+                              ('LifeStyle - Voyages',
+ 'http://www.letemps.ch/rss/site/lifestyle/voyages'),
+                              ('LifeStyle - Gastronomie',
+ 'http://www.letemps.ch/rss/site/lifestyle/gastronomie'),
+                              ('LifeStyle - Architecture & Immobilier',
+ 'http://www.letemps.ch/rss/site/lifestyle/architecture_immobilier'),
+                              ('LifeStyle - Automobile',
+ 'http://www.letemps.ch/rss/site/lifestyle/automobile'),
+                              ('Sports',
+ 'http://www.letemps.ch/rss/site/actualite/sports'),
+                             ]
+
+     def print_version(self, url):
+            return url.replace('Page', 'Facet/print')  # article URL -> print URL
diff --git a/src/calibre/web/feeds/recipes/recipe_scientific_american.py b/src/calibre/web/feeds/recipes/recipe_scientific_american.py
index e2bce46f2f..8d706e8416 100644
--- a/src/calibre/web/feeds/recipes/recipe_scientific_american.py
+++ b/src/calibre/web/feeds/recipes/recipe_scientific_american.py
@@ -7,124 +7,92 @@ __docformat__ = 'restructuredtext en'
 sciam.com
 '''
 import re
-from lxml import html
 from calibre.web.feeds.news import BasicNewsRecipe
 
 class ScientificAmerican(BasicNewsRecipe):
     title = u'Scientific American'
-    description = u'Popular science. Monthly magazine.' 
+    description = u'Popular science. Monthly magazine.'
     __author__ = 'Kovid Goyal'
     language = _('English')
-    oldest_article = 30 
+    oldest_article = 30
     max_articles_per_feed = 100
     no_stylesheets = True
     use_embedded_content   = False
     remove_tags_before = dict(name='div', attrs={'class':'headline'})
-    remove_tags_after  = dict(id='article')
+    remove_tags_after  = dict(id=['article'])
     remove_tags        = [
-                          dict(id=['sharetools', 'reddit']), 
+                          dict(id=['sharetools', 'reddit']),
                           dict(name='script'),
+                          {'class':['float_left', 'atools']},
                           {"class": re.compile(r'also-in-this')}
                          ]
     html2lrf_options = ['--base-font-size', '8']
     recursions = 1
-    match_regexps = [r'article.cfm.id=\S+page=(2|3|4|5|6|7|8|9|10|11|12|13|14)']
-#    feeds = [
-#             (u'Latest News', u'http://rss.sciam.com/ScientificAmerican-News'), 
-#             (u'Global', u'http://rss.sciam.com/ScientificAmerican-Global'), 
-#             (u'Health', u'http://rss.sciam.com/sciam/health'), 
-#             (u'Space', u'http://rss.sciam.com/sciam/space'), 
-#             (u'Technology', u'http://rss.sciam.com/sciam/technology'), 
-#             (u'Biology', u'http://rss.sciam.com/sciam/biology'), 
-#             (u'Mind & Brain', u'http://rss.sciam.com/sciam/mind-and-brain'), 
-#             (u"What's Next", u'http://rss.sciam.com/sciam/whats-next'), 
-#             (u'Archeology and Paleontology', u'http://www.sciam.com/page.cfm?section=rsscategory&alias=archaeology-and-paleontology'), 
-#             (u'Physics', u'http://www.sciam.com/page.cfm?section=rsscategory&alias=physics'), 
-#             (u'Math', u'http://rss.sciam.com/sciam/math'), 
-#             (u'History of Science', u'http://www.sciam.com/page.cfm?section=rsscategory&alias=history-of-science'), 
-#             (u'Chemistry', u'http://rss.sciam.com/sciam/chemistry'), 
-#             (u'Mind Matters', u'http://rss.sciam.com/ScientificAmerican-MindBlog')
-#            ]
-#    
+    match_regexps = [r'article.cfm.id=\S+page=(2|3|4|5|6|7|8|9|10|11|12|13|14|15)']
+
     def parse_index(self):
-        src = self.browser.open('http://www.sciam.com/sciammag/').read()
-        root = html.fromstring(src)
-        self.cover_url = root.xpath('//img[re:match(@src, "cover_")]', 
-                                    namespaces={'re':'http://exslt.org/regular-expressions'}
-                                    )[0].get('src')
-        self.timefmt = ' [%s]'%(root.xpath('//div[@id = "magazine-month"]')[0].text)
-        feeds = []
-        features = []
-        for a in root.xpath('//a[@href and @title = "Feature"]'):
-            if not a.text.strip():
-                continue
+        soup = self.index_to_soup('http://www.scientificamerican.com/sciammag/')
+        month = soup.find(id='magazine-month')
+        self.timefmt = ' [%s]'%(self.tag_to_string(month))
+        img = soup.find('img', alt='Scientific American Magazine', src=True)
+        if img is not None:
+            self.cover_url = img['src']
+        features, feeds = [], []
+        for p in soup.find(id='magazine-info').findAll('p') + \
+                soup.find(id='magazine-info-more').findAll('p'):
+            all_as = p.findAll('a', href=True)
+            if not all_as: continue  # paragraph without a link: no article
+            a = all_as[0]
+            desc = ''
+            for s in p.findAll('span', attrs={'class':'sub'}):  # [] if absent
+                desc += self.tag_to_string(s)
+
             article = {
-                       'url'    : a.get('href'),
-                       'title'  : u''.join(a.xpath('./text()')),
-                       'date'   : '',
-                       'description' : '',   
-                       }
-            for s in a.itersiblings('span'):
-                if s.get('class', '') == 'sub':
-                    article['description'] += u''.join(s.xpath('./text()')) + ' '
+                    'url' : a.get('href'),
+                    'title' : self.tag_to_string(all_as[-1]),
+                    'date' : '',
+                    'description' : desc,
+                    }
             features.append(article)
-        if features:
-            feeds.append(('Features', features))
-            
-        departments = []
-        for a in root.xpath('//a[@href and @class="title"]'):
-            txt = u''.join(a.xpath('./text()')).strip()
-            if not txt:
-                continue
-            article = {
-                       'url'    : a.get('href'),
-                       'title'  : txt,
-                       'date'   : '',
-                       'description' : '',   
-                       }
-            p = a.getparent()
-            p.remove(a)
-            article['description'] = u''.join(p.xpath('./text()'))
-            departments.append(article)
-            
-        feeds.append(('Departments', departments))
-        opinion = []
-        for a in root.xpath('//div[@id = "opinion"]//a[@href]'):
-            txt = u''.join(a.xpath('./text()')).strip()
-            if not txt:
-                continue
-            article = {
-                       'url'    : a.get('href'),
-                       'title'  : txt,
-                       'date'   : '',
-                       'description' : '',   
-                       }
-            opinion.append(article)
-        feeds.append(('Opinion', opinion))
-        
-        ontheweb = []
-        for a in root.xpath('//div[@id = "ontheweb"]//a[@href]'):
-            txt = u''.join(a.xpath('./text()')).strip()
-            if not txt:
-                continue
-            article = {
-                       'url'    : a.get('href'),
-                       'title'  : txt,
-                       'date'   : '',
-                       'description' : '',   
-                       }
-            ontheweb.append(article)
-        feeds.append(('On the web', ontheweb))
-        
+        feeds.append(('Features', features))
+
+        # A <div> opens a new section; each <a> is an article of the current one.
+        section, found, title = [], set(), None
+        for x in soup.find(id='magazine-main_col1').findAll(['div', 'a']):
+            if x.name == 'div':
+                if section:
+                    feeds.append((title, section))
+                title = self.tag_to_string(x)
+                section = []
+            else:
+                href = x.get('href', None)  # inspect x, not the earlier loop's `a`
+                if title is None or not href or href in found:
+                    continue
+                found.add(href)  # remember the URL so repeated links are skipped
+                article = {
+                        'url' : href,
+                        'title' : self.tag_to_string(x),
+                        'date': '', 'description': '',
+                        }
+                section.append(article)
+        if section:
+            feeds.append((title, section))
+
+        articles = []
+        for a in soup.find(id='opinion').findAll('a', href=True):
+            articles.append({'url':a['href'], 'title':self.tag_to_string(a),
+                'description':'', 'date':''})
+        feeds.append(('Opinion', articles))
+
         return feeds
-        
-    
+
+
     def postprocess_html(self, soup, first_fetch):
         if soup is not None:
             for span in soup.findAll('span', attrs={'class':'pagination'}):
                 span.extract()
             if not first_fetch:
                 div = soup.find('div', attrs={'class':'headline'})
-                if div: 
+                if div:
                     div.extract()
         return soup
diff --git a/src/calibre/web/feeds/recipes/recipe_woz_die.py b/src/calibre/web/feeds/recipes/recipe_woz_die.py
new file mode 100644
index 0000000000..730425eb9f
--- /dev/null
+++ b/src/calibre/web/feeds/recipes/recipe_woz_die.py
@@ -0,0 +1,46 @@
+from calibre.web.feeds.news import BasicNewsRecipe
+
+class WozDie(BasicNewsRecipe):
+     """Recipe for the headlines RSS feed of WOZ Die Wochenzeitung (woz.ch)."""
+     title          = u'WOZ Die Wochenzeitung'
+     oldest_article = 7
+     max_articles_per_feed = 100
+     language       = _('German')
+     no_stylesheets = True
+     remove_tags    = [dict(name='p', attrs={'class':'arrow_top'}),
+                       dict(name='p', attrs={'class':'bottom_right'}),
+                       dict(name='script')]  # one list so all filters apply
+     extra_css      = '''#print_titel{vertical-align: bottom; text-align:
+ left; color: #666666; background-color: white; padding-top: 30px;
+ padding-bottom: 10px; border-bottom: 1px solid #999999;} #title{text-align:
+ left; font-size: large; font-weight: 600; padding-top: 0px;
+ padding-bottom: 6px;}  h3 {text-align: left; font-size: large; font-weight: 600;
+ padding-top: 0px; padding-bottom: 6px;}  #lead{font-weight: 600;
+ padding-bottom: 6px;}  h2{font-weight: 600; padding-bottom: 6px;}
+ #author{color: #666666; padding-top: 0px; padding-bottom: 0px;}
+ h4{color: #666666; padding-top: 0px; padding-bottom: 0px;}  #author2
+ {color: #666666; padding-top: 0px; padding-bottom: 0px;}  .dotted_line
+ {padding-top: 0px; margin-bottom: 18px; border-bottom: 1px dotted
+ #666666;}  .intro{margin: 0 auto; font-weight: 600; padding-bottom:
+ 18px;}  h5{margin: 0 auto; font-weight: 600; padding-bottom: 18px;}
+ .intro2{margin: 0 auto;  font-weight: 600;}  .text{padding-bottom:
+ 18px;}  .subtitle{margin: 0 auto; font-weight: 600; padding-bottom:
+ 10px;}  .articletitle{margin: 0 auto; font-weight: 600; padding-bottom:
+ 10px;}  #content_infobox{margin-top: 20px; margin-left: 0px;
+ margin-right: 0px; margin-bottom: 10px; text-align: left; border-bottom: 1px
+ solid #999999;}  .content_infobox_titel{padding-top: 6px;
+ padding-bottom: 8px; padding-left: 8px; padding-right: 8px; font-weight: 600;
+ border-top: 1px solid #999999; border-bottom: 1px dotted #999999;}
+ .content_infobox_text{padding-top: 6px; padding-bottom: 12px;
+ padding-left: 8px; padding-right: 8px;}  .box_gray{padding-top: 4px;
+ padding-left:  7px; padding-right:  7px; padding-bottom:  4px;}  .box_white {
+ padding-top: 4px; padding-left:  7px; padding-right:  7px; padding-bottom:
+ 4px;}  .content_infobox_mehr{margin-top: 20px; margin-left: 0px;
+ margin-right: 0px; margin-bottom: 10px; text-align: left; width: 600px;
+ border-bottom: 1px solid #999999;}'''
+
+     feeds          = [('WOZ Die Wochenzeitung - Headlines',
+ 'http://www.woz.ch/inhalt/headlinesRSS.php'),]
+
+     def print_version(self, url):
+            return url.replace('rss/', 'print_')  # feed URL -> print-view URL