Fix #2497 (Scientific American recipe not working)

This commit is contained in:
Kovid Goyal 2009-05-25 13:08:55 -07:00
parent 3f40befc6d
commit c410bb9ea9
4 changed files with 196 additions and 94 deletions

View File

@ -43,7 +43,7 @@ recipe_modules = ['recipe_' + r for r in (
'seattle_times', 'scott_hanselman', 'coding_horror', 'twitchfilms',
'stackoverflow', 'telepolis_artikel', 'zaobao', 'usnews',
'straitstimes', 'index_hu', 'pcworld_hu', 'hrt', 'rts',
'h1', 'h2', 'h3', 'phd_comics',
'h1', 'h2', 'h3', 'phd_comics', 'woz_die',
)]
import re, imp, inspect, time, os

View File

@ -0,0 +1,88 @@
#!/usr/bin/env python
# vim:fileencoding=UTF-8:ts=4:sw=4:sta:et:sts=4:ai
from __future__ import with_statement
__license__ = 'GPL v3'
__copyright__ = '2009, Kovid Goyal <kovid@kovidgoyal.net>'
__docformat__ = 'restructuredtext en'
from calibre.web.feeds.news import BasicNewsRecipe
class LeTemps(BasicNewsRecipe):
    """Recipe that downloads the RSS feeds of letemps.ch (Le Temps)."""

    title = u'Le Temps'
    oldest_article = 7
    max_articles_per_feed = 100
    no_stylesheets = True

    # All filters live in ONE list. Assigning remove_tags several times in a
    # row keeps only the last assignment, which silently dropped the footer
    # and "box links" filters.
    remove_tags = [
        dict(name='div', attrs={'id':'footer'}),
        dict(name='div', attrs={'class':'box links'}),
        dict(name='script'),
    ]

    # One rule per line. A CSS property name must not be split across lines
    # ("font-" / "weight" is not a valid declaration), so every declaration
    # is kept intact here.
    extra_css = '''
        .heading {font-size: 13px; line-height: 15px; margin: 20px 0;}
        h2 {font-size: 24px; line-height: 25px; margin-bottom: 14px;}
        .author {font-size: 11px; margin: 0 0 5px 0;}
        .lead {font-weight: 700; margin: 10px 0;}
        p {margin: 0 0 10px 0;}
        '''

    # (feed title, feed URL) pairs.
    feeds = [
        ('Actualité',
            'http://www.letemps.ch/rss/site/'),
        ('Monde',
            'http://www.letemps.ch/rss/site/actualite/monde'),
        ('Suisse & Régions',
            'http://www.letemps.ch/rss/site/actualite/suisse_regions'),
        ('Sciences & Environnement',
            'http://www.letemps.ch/rss/site/actualite/sciences_environnement'),
        ('Société',
            'http://www.letemps.ch/rss/site/actualite/societe'),
        ('Economie & Finance',
            'http://www.letemps.ch/rss/site/economie_finance'),
        ('Economie & Finance - Finance',
            'http://www.letemps.ch/rss/site/economie_finance/finance'),
        ('Economie & Finance - Fonds de placement',
            'http://www.letemps.ch/rss/site/economie_finance/fonds_placement'),
        ('Economie & Finance - Carrières',
            'http://www.letemps.ch/rss/site/economie_finance/carrieres'),
        ('Culture',
            'http://www.letemps.ch/rss/site/culture'),
        ('Culture - Cinéma',
            'http://www.letemps.ch/rss/site/culture/cinema'),
        ('Culture - Musiques',
            'http://www.letemps.ch/rss/site/culture/musiques'),
        ('Culture - Scènes',
            'http://www.letemps.ch/rss/site/culture/scenes'),
        ('Culture - Arts plastiques',
            'http://www.letemps.ch/rss/site/culture/arts_plastiques'),
        ('Livres',
            'http://www.letemps.ch/rss/site/culture/livres'),
        ('Opinions',
            'http://www.letemps.ch/rss/site/opinions'),
        ('Opinions - Editoriaux',
            'http://www.letemps.ch/rss/site/opinions/editoriaux'),
        ('Opinions - Invités',
            'http://www.letemps.ch/rss/site/opinions/invites'),
        ('Opinions - Chroniques',
            'http://www.letemps.ch/rss/site/opinions/chroniques'),
        ('LifeStyle',
            'http://www.letemps.ch/rss/site/lifestyle'),
        ('LifeStyle - Luxe',
            'http://www.letemps.ch/rss/site/lifestyle/luxe'),
        ('LifeStyle - Horlogerie & Joaillerie',
            'http://www.letemps.ch/rss/site/lifestyle/horlogerie_joaillerie'),
        ('LifeStyle - Design',
            'http://www.letemps.ch/rss/site/lifestyle/design'),
        ('LifeStyle - Voyages',
            'http://www.letemps.ch/rss/site/lifestyle/voyages'),
        ('LifeStyle - Gastronomie',
            'http://www.letemps.ch/rss/site/lifestyle/gastronomie'),
        ('LifeStyle - Architecture & Immobilier',
            'http://www.letemps.ch/rss/site/lifestyle/architecture_immobilier'),
        ('LifeStyle - Automobile',
            'http://www.letemps.ch/rss/site/lifestyle/automobile'),
        ('Sports',
            'http://www.letemps.ch/rss/site/actualite/sports'),
    ]

    def print_version(self, url):
        """Return the print-view URL for an article URL.

        Every occurrence of 'Page' in *url* is replaced with 'Facet/print'.
        """
        return url.replace('Page', 'Facet/print')

View File

@ -7,7 +7,6 @@ __docformat__ = 'restructuredtext en'
sciam.com
'''
import re
from lxml import html
from calibre.web.feeds.news import BasicNewsRecipe
class ScientificAmerican(BasicNewsRecipe):
@ -20,101 +19,70 @@ class ScientificAmerican(BasicNewsRecipe):
no_stylesheets = True
use_embedded_content = False
remove_tags_before = dict(name='div', attrs={'class':'headline'})
remove_tags_after = dict(id='article')
remove_tags_after = dict(id=['article'])
remove_tags = [
dict(id=['sharetools', 'reddit']),
dict(name='script'),
{'class':['float_left', 'atools']},
{"class": re.compile(r'also-in-this')}
]
html2lrf_options = ['--base-font-size', '8']
recursions = 1
match_regexps = [r'article.cfm.id=\S+page=(2|3|4|5|6|7|8|9|10|11|12|13|14)']
# feeds = [
# (u'Latest News', u'http://rss.sciam.com/ScientificAmerican-News'),
# (u'Global', u'http://rss.sciam.com/ScientificAmerican-Global'),
# (u'Health', u'http://rss.sciam.com/sciam/health'),
# (u'Space', u'http://rss.sciam.com/sciam/space'),
# (u'Technology', u'http://rss.sciam.com/sciam/technology'),
# (u'Biology', u'http://rss.sciam.com/sciam/biology'),
# (u'Mind & Brain', u'http://rss.sciam.com/sciam/mind-and-brain'),
# (u"What's Next", u'http://rss.sciam.com/sciam/whats-next'),
# (u'Archeology and Paleontology', u'http://www.sciam.com/page.cfm?section=rsscategory&alias=archaeology-and-paleontology'),
# (u'Physics', u'http://www.sciam.com/page.cfm?section=rsscategory&alias=physics'),
# (u'Math', u'http://rss.sciam.com/sciam/math'),
# (u'History of Science', u'http://www.sciam.com/page.cfm?section=rsscategory&alias=history-of-science'),
# (u'Chemistry', u'http://rss.sciam.com/sciam/chemistry'),
# (u'Mind Matters', u'http://rss.sciam.com/ScientificAmerican-MindBlog')
# ]
#
match_regexps = [r'article.cfm.id=\S+page=(2|3|4|5|6|7|8|9|10|11|12|13|14|15)']
def parse_index(self):
# Builds the list of feeds for the magazine issue and returns it as a list
# of (section title, [article dict, ...]) tuples; each article dict carries
# the keys 'url', 'title', 'date' and 'description'.
# NOTE(review): this span is a diff rendered without +/- markers, so removed
# and added lines are interleaved. The lines working on `root` / `.xpath()`
# look like the removed lxml-based version and the lines working on `soup`
# look like the added version -- confirm against the commit before editing.
src = self.browser.open('http://www.sciam.com/sciammag/').read()
root = html.fromstring(src)
self.cover_url = root.xpath('//img[re:match(@src, "cover_")]',
namespaces={'re':'http://exslt.org/regular-expressions'}
)[0].get('src')
self.timefmt = ' [%s]'%(root.xpath('//div[@id = "magazine-month"]')[0].text)
feeds = []
features = []
for a in root.xpath('//a[@href and @title = "Feature"]'):
if not a.text.strip():
continue
soup = self.index_to_soup('http://www.scientificamerican.com/sciammag/')
month = soup.find(id='magazine-month')
self.timefmt = ' [%s]'%(self.tag_to_string(month))
# The cover is only set when a matching <img> is found.
img = soup.find('img', alt='Scientific American Magazine', src=True)
if img is not None:
self.cover_url = img['src']
features, feeds = [], []
# Feature articles: one <p> per article in the two magazine-info containers.
for p in soup.find(id='magazine-info').findAll('p') + \
soup.find(id='magazine-info-more').findAll('p'):
all_as = p.findAll('a', href=True)
# NOTE(review): all_as[0] raises IndexError if findAll() returned an empty
# list; the `a is None` check below does not cover that case -- confirm.
a = all_as[0]
if a is None: continue
desc = ''
# NOTE(review): iterates directly over the result of p.find(); presumably
# that is None when the paragraph has no span with class "sub" -- confirm.
for s in p.find('span', attrs={'class':'sub'}):
desc += self.tag_to_string(s)
article = {
'url' : a.get('href'),
'title' : u''.join(a.xpath('./text()')),
'title' : self.tag_to_string(all_as[-1]),
'date' : '',
'description' : '',
'description' : desc,
}
for s in a.itersiblings('span'):
if s.get('class', '') == 'sub':
article['description'] += u''.join(s.xpath('./text()')) + ' '
features.append(article)
if features:
feeds.append(('Features', features))
departments = []
for a in root.xpath('//a[@href and @class="title"]'):
txt = u''.join(a.xpath('./text()')).strip()
if not txt:
section = []
found = []
title = None
# Walk the <div> and <a> tags of the main column: a <div> starts a new
# section (flushing the previous one if it has articles), an <a> adds an
# article to the current section.
for x in soup.find(id='magazine-main_col1').findAll(['div', 'a']):
if x.name == 'div':
if section:
feeds.append((title, section))
title = self.tag_to_string(x)
section = []
else:
# NOTE(review): this test reads `a` (left over from the loop above) rather
# than the loop variable `x`, and nothing visible here ever adds to `found`
# -- confirm the intent.
if title is None or not a.get('href', False) or a.get('href', None) in found:
continue
article = {
'url' : a.get('href'),
'title' : txt,
'url' : x['href'],
'title' : self.tag_to_string(x),
'date': '',
'description': '',
}
p = a.getparent()
p.remove(a)
article['description'] = u''.join(p.xpath('./text()'))
departments.append(article)
section.append(article)
# Flush the last section collected by the loop.
if section:
feeds.append((title, section))
feeds.append(('Departments', departments))
opinion = []
for a in root.xpath('//div[@id = "opinion"]//a[@href]'):
txt = u''.join(a.xpath('./text()')).strip()
if not txt:
continue
article = {
'url' : a.get('href'),
'title' : txt,
'date' : '',
'description' : '',
}
opinion.append(article)
feeds.append(('Opinion', opinion))
ontheweb = []
for a in root.xpath('//div[@id = "ontheweb"]//a[@href]'):
txt = u''.join(a.xpath('./text()')).strip()
if not txt:
continue
article = {
'url' : a.get('href'),
'title' : txt,
'date' : '',
'description' : '',
}
ontheweb.append(article)
feeds.append(('On the web', ontheweb))
# Opinion section: every link with an href inside the element with id "opinion".
articles = []
for a in soup.find(id='opinion').findAll('a', href=True):
articles.append({'url':a['href'], 'title':self.tag_to_string(a),
'description':'', 'date':''})
feeds.append(('Opinion', articles))
return feeds

View File

@ -0,0 +1,46 @@
from calibre.web.feeds.news import BasicNewsRecipe
class WozDie(BasicNewsRecipe):
    """Recipe that downloads the headlines RSS feed of woz.ch."""

    title = u'WOZ Die Wochenzeitung'
    oldest_article = 7
    max_articles_per_feed = 100
    language = _('German')
    no_stylesheets = True

    # All filters live in ONE list. Assigning remove_tags several times in a
    # row keeps only the last assignment, which silently dropped the two
    # <p> filters.
    remove_tags = [
        dict(name='p', attrs={'class':'arrow_top'}),
        dict(name='p', attrs={'class':'bottom_right'}),
        dict(name='script'),
    ]

    # Line breaks fall only between declarations. The previous text was
    # wrapped in the middle of property names (e.g. "padding-" / "bottom"),
    # which is not valid CSS.
    extra_css = '''
        #print_titel{vertical-align: bottom; text-align: left; color: #666666;
            background-color: white; padding-top: 30px; padding-bottom: 10px;
            border-bottom: 1px solid #999999;}
        #title{text-align: left; font-size: large; font-weight: 600;
            padding-top: 0px; padding-bottom: 6px;}
        h3 {text-align: left; font-size: large; font-weight: 600;
            padding-top: 0px; padding-bottom: 6px;}
        #lead{font-weight: 600; padding-bottom: 6px;}
        h2{font-weight: 600; padding-bottom: 6px;}
        #author{color: #666666; padding-top: 0px; padding-bottom: 0px;}
        h4{color: #666666; padding-top: 0px; padding-bottom: 0px;}
        #author2 {color: #666666; padding-top: 0px; padding-bottom: 0px;}
        .dotted_line {padding-top: 0px; margin-bottom: 18px;
            border-bottom: 1px dotted #666666;}
        .intro{margin: 0 auto; font-weight: 600; padding-bottom: 18px;}
        h5{margin: 0 auto; font-weight: 600; padding-bottom: 18px;}
        .intro2{margin: 0 auto; font-weight: 600;}
        .text{padding-bottom: 18px;}
        .subtitle{margin: 0 auto; font-weight: 600; padding-bottom: 10px;}
        .articletitle{margin: 0 auto; font-weight: 600; padding-bottom: 10px;}
        #content_infobox{margin-top: 20px; margin-left: 0px;
            margin-right: 0px; margin-bottom: 10px; text-align: left;
            border-bottom: 1px solid #999999;}
        .content_infobox_titel{padding-top: 6px; padding-bottom: 8px;
            padding-left: 8px; padding-right: 8px; font-weight: 600;
            border-top: 1px solid #999999; border-bottom: 1px dotted #999999;}
        .content_infobox_text{padding-top: 6px; padding-bottom: 12px;
            padding-left: 8px; padding-right: 8px;}
        .box_gray{padding-top: 4px; padding-left: 7px; padding-right: 7px;
            padding-bottom: 4px;}
        .box_white {padding-top: 4px; padding-left: 7px; padding-right: 7px;
            padding-bottom: 4px;}
        .content_infobox_mehr{margin-top: 20px; margin-left: 0px;
            margin-right: 0px; margin-bottom: 10px; text-align: left;
            width: 600px; border-bottom: 1px solid #999999;}
        '''

    # (feed title, feed URL) pairs.
    feeds = [
        ('WOZ Die Wochenzeitung - Headlines',
            'http://www.woz.ch/inhalt/headlinesRSS.php'),
    ]

    def print_version(self, url):
        """Return the print-view URL for an article URL.

        Every occurrence of 'rss/' in *url* is replaced with 'print_'.
        """
        return url.replace('rss/', 'print_')