mirror of
https://github.com/kovidgoyal/calibre.git
synced 2025-07-08 10:44:09 -04:00
Fix #2497 (Scientific American recipe not working)
This commit is contained in:
parent
3f40befc6d
commit
c410bb9ea9
@ -43,7 +43,7 @@ recipe_modules = ['recipe_' + r for r in (
|
|||||||
'seattle_times', 'scott_hanselman', 'coding_horror', 'twitchfilms',
|
'seattle_times', 'scott_hanselman', 'coding_horror', 'twitchfilms',
|
||||||
'stackoverflow', 'telepolis_artikel', 'zaobao', 'usnews',
|
'stackoverflow', 'telepolis_artikel', 'zaobao', 'usnews',
|
||||||
'straitstimes', 'index_hu', 'pcworld_hu', 'hrt', 'rts',
|
'straitstimes', 'index_hu', 'pcworld_hu', 'hrt', 'rts',
|
||||||
'h1', 'h2', 'h3', 'phd_comics',
|
'h1', 'h2', 'h3', 'phd_comics', 'woz_die',
|
||||||
)]
|
)]
|
||||||
|
|
||||||
import re, imp, inspect, time, os
|
import re, imp, inspect, time, os
|
||||||
|
88
src/calibre/web/feeds/recipes/recipe_le_temps.py
Normal file
88
src/calibre/web/feeds/recipes/recipe_le_temps.py
Normal file
@ -0,0 +1,88 @@
|
|||||||
|
#!/usr/bin/env python
|
||||||
|
# vim:fileencoding=UTF-8:ts=4:sw=4:sta:et:sts=4:ai
|
||||||
|
from __future__ import with_statement
|
||||||
|
|
||||||
|
__license__ = 'GPL v3'
|
||||||
|
__copyright__ = '2009, Kovid Goyal <kovid@kovidgoyal.net>'
|
||||||
|
__docformat__ = 'restructuredtext en'
|
||||||
|
|
||||||
|
|
||||||
|
|
||||||
|
from calibre.web.feeds.news import BasicNewsRecipe
|
||||||
|
|
||||||
|
class LeTemps(BasicNewsRecipe):
    '''
    Recipe that downloads the RSS feeds published under
    http://www.letemps.ch/rss/site/ and fetches the print rendition of
    every article (see :meth:`print_version`).
    '''

    title = u'Le Temps'
    oldest_article = 7
    max_articles_per_feed = 100
    no_stylesheets = True

    # All removal rules live in ONE list. Assigning ``remove_tags`` several
    # times in a row rebinds the class attribute each time, so only the last
    # rule (the <script> one) would actually be applied.
    remove_tags = [
        dict(name='div', attrs={'id':'footer'}),
        dict(name='div', attrs={'class':'box links'}),
        dict(name='script'),
    ]

    # One rule per line. Property names must not be broken across lines
    # ("font-" / "weight" is not a valid CSS declaration), so the string is
    # assembled from whole rules instead of a hard-wrapped literal.
    extra_css = (
        '.heading {font-size: 13px; line-height: 15px; margin: 20px 0;}\n'
        'h2 {font-size: 24px; line-height: 25px; margin-bottom: 14px;}\n'
        '.author {font-size: 11px; margin: 0 0 5px 0;}\n'
        '.lead {font-weight: 700; margin: 10px 0;}\n'
        'p {margin: 0 0 10px 0;}'
    )

    # (feed title, feed URL) pairs.
    feeds = [
        ('Actualité',
         'http://www.letemps.ch/rss/site/'),
        ('Monde',
         'http://www.letemps.ch/rss/site/actualite/monde'),
        ('Suisse & Régions',
         'http://www.letemps.ch/rss/site/actualite/suisse_regions'),
        ('Sciences & Environnement',
         'http://www.letemps.ch/rss/site/actualite/sciences_environnement'),
        ('Société',
         'http://www.letemps.ch/rss/site/actualite/societe'),
        ('Economie & Finance',
         'http://www.letemps.ch/rss/site/economie_finance'),
        ('Economie & Finance - Finance',
         'http://www.letemps.ch/rss/site/economie_finance/finance'),
        ('Economie & Finance - Fonds de placement',
         'http://www.letemps.ch/rss/site/economie_finance/fonds_placement'),
        ('Economie & Finance - Carrières',
         'http://www.letemps.ch/rss/site/economie_finance/carrieres'),
        ('Culture',
         'http://www.letemps.ch/rss/site/culture'),
        ('Culture - Cinéma',
         'http://www.letemps.ch/rss/site/culture/cinema'),
        ('Culture - Musiques',
         'http://www.letemps.ch/rss/site/culture/musiques'),
        ('Culture - Scènes',
         'http://www.letemps.ch/rss/site/culture/scenes'),
        ('Culture - Arts plastiques',
         'http://www.letemps.ch/rss/site/culture/arts_plastiques'),
        ('Livres',
         'http://www.letemps.ch/rss/site/culture/livres'),
        ('Opinions',
         'http://www.letemps.ch/rss/site/opinions'),
        ('Opinions - Editoriaux',
         'http://www.letemps.ch/rss/site/opinions/editoriaux'),
        ('Opinions - Invités',
         'http://www.letemps.ch/rss/site/opinions/invites'),
        ('Opinions - Chroniques',
         'http://www.letemps.ch/rss/site/opinions/chroniques'),
        ('LifeStyle',
         'http://www.letemps.ch/rss/site/lifestyle'),
        ('LifeStyle - Luxe',
         'http://www.letemps.ch/rss/site/lifestyle/luxe'),
        ('LifeStyle - Horlogerie & Joaillerie',
         'http://www.letemps.ch/rss/site/lifestyle/horlogerie_joaillerie'),
        ('LifeStyle - Design',
         'http://www.letemps.ch/rss/site/lifestyle/design'),
        ('LifeStyle - Voyages',
         'http://www.letemps.ch/rss/site/lifestyle/voyages'),
        ('LifeStyle - Gastronomie',
         'http://www.letemps.ch/rss/site/lifestyle/gastronomie'),
        ('LifeStyle - Architecture & Immobilier',
         'http://www.letemps.ch/rss/site/lifestyle/architecture_immobilier'),
        ('LifeStyle - Automobile',
         'http://www.letemps.ch/rss/site/lifestyle/automobile'),
        ('Sports',
         'http://www.letemps.ch/rss/site/actualite/sports'),
    ]

    def print_version(self, url):
        '''
        Return `url` with every occurrence of ``Page`` replaced by
        ``Facet/print``; URLs that do not contain ``Page`` are returned
        unchanged.
        '''
        return url.replace('Page', 'Facet/print')
|
||||||
|
|
||||||
|
|
@ -7,7 +7,6 @@ __docformat__ = 'restructuredtext en'
|
|||||||
sciam.com
|
sciam.com
|
||||||
'''
|
'''
|
||||||
import re
|
import re
|
||||||
from lxml import html
|
|
||||||
from calibre.web.feeds.news import BasicNewsRecipe
|
from calibre.web.feeds.news import BasicNewsRecipe
|
||||||
|
|
||||||
class ScientificAmerican(BasicNewsRecipe):
|
class ScientificAmerican(BasicNewsRecipe):
|
||||||
@ -20,101 +19,70 @@ class ScientificAmerican(BasicNewsRecipe):
|
|||||||
no_stylesheets = True
|
no_stylesheets = True
|
||||||
use_embedded_content = False
|
use_embedded_content = False
|
||||||
remove_tags_before = dict(name='div', attrs={'class':'headline'})
|
remove_tags_before = dict(name='div', attrs={'class':'headline'})
|
||||||
remove_tags_after = dict(id='article')
|
remove_tags_after = dict(id=['article'])
|
||||||
remove_tags = [
|
remove_tags = [
|
||||||
dict(id=['sharetools', 'reddit']),
|
dict(id=['sharetools', 'reddit']),
|
||||||
dict(name='script'),
|
dict(name='script'),
|
||||||
|
{'class':['float_left', 'atools']},
|
||||||
{"class": re.compile(r'also-in-this')}
|
{"class": re.compile(r'also-in-this')}
|
||||||
]
|
]
|
||||||
html2lrf_options = ['--base-font-size', '8']
|
html2lrf_options = ['--base-font-size', '8']
|
||||||
recursions = 1
|
recursions = 1
|
||||||
match_regexps = [r'article.cfm.id=\S+page=(2|3|4|5|6|7|8|9|10|11|12|13|14)']
|
match_regexps = [r'article.cfm.id=\S+page=(2|3|4|5|6|7|8|9|10|11|12|13|14|15)']
|
||||||
# feeds = [
|
|
||||||
# (u'Latest News', u'http://rss.sciam.com/ScientificAmerican-News'),
|
|
||||||
# (u'Global', u'http://rss.sciam.com/ScientificAmerican-Global'),
|
|
||||||
# (u'Health', u'http://rss.sciam.com/sciam/health'),
|
|
||||||
# (u'Space', u'http://rss.sciam.com/sciam/space'),
|
|
||||||
# (u'Technology', u'http://rss.sciam.com/sciam/technology'),
|
|
||||||
# (u'Biology', u'http://rss.sciam.com/sciam/biology'),
|
|
||||||
# (u'Mind & Brain', u'http://rss.sciam.com/sciam/mind-and-brain'),
|
|
||||||
# (u"What's Next", u'http://rss.sciam.com/sciam/whats-next'),
|
|
||||||
# (u'Archeology and Paleontology', u'http://www.sciam.com/page.cfm?section=rsscategory&alias=archaeology-and-paleontology'),
|
|
||||||
# (u'Physics', u'http://www.sciam.com/page.cfm?section=rsscategory&alias=physics'),
|
|
||||||
# (u'Math', u'http://rss.sciam.com/sciam/math'),
|
|
||||||
# (u'History of Science', u'http://www.sciam.com/page.cfm?section=rsscategory&alias=history-of-science'),
|
|
||||||
# (u'Chemistry', u'http://rss.sciam.com/sciam/chemistry'),
|
|
||||||
# (u'Mind Matters', u'http://rss.sciam.com/ScientificAmerican-MindBlog')
|
|
||||||
# ]
|
|
||||||
#
|
|
||||||
def parse_index(self):
    '''
    Build the list of feeds from the magazine index page.

    The page is fetched with ``self.index_to_soup``. As side effects,
    ``self.timefmt`` is set from the element with id ``magazine-month``
    and, when the cover image is present, ``self.cover_url`` is set to its
    ``src``.

    Returns a list of ``(section title, articles)`` tuples: first
    ``'Features'``, then one entry per non-empty section found in
    ``magazine-main_col1``, and finally ``'Opinion'``. Every article is a
    dict with the keys ``url``, ``title``, ``date`` and ``description``.

    The containers looked up by id (``magazine-info``,
    ``magazine-info-more``, ``magazine-main_col1``, ``opinion``) are
    deliberately not guarded: if the page layout changes, the lookup is
    left to fail loudly rather than silently produce an empty issue.
    '''
    soup = self.index_to_soup('http://www.scientificamerican.com/sciammag/')
    month = soup.find(id='magazine-month')
    self.timefmt = ' [%s]'%(self.tag_to_string(month))
    img = soup.find('img', alt='Scientific American Magazine', src=True)
    if img is not None:
        self.cover_url = img['src']

    features, feeds = [], []
    paragraphs = soup.find(id='magazine-info').findAll('p') + \
            soup.find(id='magazine-info-more').findAll('p')
    for p in paragraphs:
        all_as = p.findAll('a', href=True)
        if not all_as:
            # Paragraph without a link: there is nothing to download.
            # (Indexing all_as[0] first would raise IndexError here.)
            continue
        # The description is optional; only walk the span when it exists.
        sub = p.find('span', attrs={'class':'sub'})
        desc = ''
        if sub is not None:
            desc = ''.join(self.tag_to_string(s) for s in sub)
        features.append({
            # URL comes from the first link, the title from the last one.
            'url' : all_as[0].get('href'),
            'title' : self.tag_to_string(all_as[-1]),
            'date' : '',
            'description' : desc,
        })
    feeds.append(('Features', features))

    # Walk headings (<div>) and links (<a>) in document order: a <div>
    # starts a new section, the links that follow belong to it.
    section = []
    found = set()   # hrefs already emitted, used to skip repeated links
    title = None
    for x in soup.find(id='magazine-main_col1').findAll(['div', 'a']):
        if x.name == 'div':
            if section:
                feeds.append((title, section))
            title = self.tag_to_string(x)
            section = []
            continue
        # Inspect the current anchor `x` (not a leftover variable from the
        # features loop) and remember its href so duplicates are skipped.
        href = x.get('href', None)
        if title is None or not href or href in found:
            continue
        found.add(href)
        section.append({
            'url' : href,
            'title' : self.tag_to_string(x),
            'date': '',
            'description': '',
        })
    if section:
        feeds.append((title, section))

    articles = []
    for a in soup.find(id='opinion').findAll('a', href=True):
        articles.append({'url':a['href'], 'title':self.tag_to_string(a),
            'description':'', 'date':''})
    feeds.append(('Opinion', articles))

    return feeds
|
||||||
|
|
||||||
|
46
src/calibre/web/feeds/recipes/recipe_woz_die.py
Normal file
46
src/calibre/web/feeds/recipes/recipe_woz_die.py
Normal file
@ -0,0 +1,46 @@
|
|||||||
|
from calibre.web.feeds.news import BasicNewsRecipe
|
||||||
|
|
||||||
|
class WozDie(BasicNewsRecipe):
    '''
    Recipe that downloads the "WOZ Die Wochenzeitung" headlines feed from
    www.woz.ch and fetches the print rendition of every article (see
    :meth:`print_version`).
    '''

    title = u'WOZ Die Wochenzeitung'
    oldest_article = 7
    max_articles_per_feed = 100
    language = _('German')
    no_stylesheets = True

    # All removal rules live in ONE list. Assigning ``remove_tags`` several
    # times in a row rebinds the class attribute each time, so only the last
    # rule (the <script> one) would actually be applied.
    remove_tags = [
        dict(name='p', attrs={'class':'arrow_top'}),
        dict(name='p', attrs={'class':'bottom_right'}),
        dict(name='script'),
    ]

    # One rule per line. The stylesheet must not be hard-wrapped inside a
    # property name ("padding-" / "bottom", "margin-" / "right", ...): such
    # a split makes the declaration invalid CSS.
    extra_css = (
        '#print_titel{vertical-align: bottom; text-align: left; color: #666666; background-color: white; padding-top: 30px; padding-bottom: 10px; border-bottom: 1px solid #999999;}\n'
        '#title{text-align: left; font-size: large; font-weight: 600; padding-top: 0px; padding-bottom: 6px;}\n'
        'h3 {text-align: left; font-size: large; font-weight: 600; padding-top: 0px; padding-bottom: 6px;}\n'
        '#lead{font-weight: 600; padding-bottom: 6px;}\n'
        'h2{font-weight: 600; padding-bottom: 6px;}\n'
        '#author{color: #666666; padding-top: 0px; padding-bottom: 0px;}\n'
        'h4{color: #666666; padding-top: 0px; padding-bottom: 0px;}\n'
        '#author2 {color: #666666; padding-top: 0px; padding-bottom: 0px;}\n'
        '.dotted_line {padding-top: 0px; margin-bottom: 18px; border-bottom: 1px dotted #666666;}\n'
        '.intro{margin: 0 auto; font-weight: 600; padding-bottom: 18px;}\n'
        'h5{margin: 0 auto; font-weight: 600; padding-bottom: 18px;}\n'
        '.intro2{margin: 0 auto; font-weight: 600;}\n'
        '.text{padding-bottom: 18px;}\n'
        '.subtitle{margin: 0 auto; font-weight: 600; padding-bottom: 10px;}\n'
        '.articletitle{margin: 0 auto; font-weight: 600; padding-bottom: 10px;}\n'
        '#content_infobox{margin-top: 20px; margin-left: 0px; margin-right: 0px; margin-bottom: 10px; text-align: left; border-bottom: 1px solid #999999;}\n'
        '.content_infobox_titel{padding-top: 6px; padding-bottom: 8px; padding-left: 8px; padding-right: 8px; font-weight: 600; border-top: 1px solid #999999; border-bottom: 1px dotted #999999;}\n'
        '.content_infobox_text{padding-top: 6px; padding-bottom: 12px; padding-left: 8px; padding-right: 8px;}\n'
        '.box_gray{padding-top: 4px; padding-left: 7px; padding-right: 7px; padding-bottom: 4px;}\n'
        '.box_white {padding-top: 4px; padding-left: 7px; padding-right: 7px; padding-bottom: 4px;}\n'
        '.content_infobox_mehr{margin-top: 20px; margin-left: 0px; margin-right: 0px; margin-bottom: 10px; text-align: left; width: 600px; border-bottom: 1px solid #999999;}'
    )

    # (feed title, feed URL) pairs.
    feeds = [('WOZ Die Wochenzeitung - Headlines',
              'http://www.woz.ch/inhalt/headlinesRSS.php'),]

    def print_version(self, url):
        '''
        Return `url` with every occurrence of ``rss/`` replaced by
        ``print_``; URLs that do not contain ``rss/`` are returned
        unchanged.
        '''
        return url.replace('rss/', 'print_')
|
||||||
|
|
Loading…
x
Reference in New Issue
Block a user