Fix recipe for The New Yorker and improve recipe for Scientific American. Fixes #1090 (New Yorker feed)

This commit is contained in:
Kovid Goyal 2008-09-29 11:36:33 -07:00
parent 5d91dfe0f7
commit 4170d9125f
2 changed files with 16 additions and 4 deletions

View File

@@ -3,7 +3,7 @@ __license__ = 'GPL v3'
__copyright__ = '2008, Kovid Goyal kovid@kovidgoyal.net' __copyright__ = '2008, Kovid Goyal kovid@kovidgoyal.net'
__docformat__ = 'restructuredtext en' __docformat__ = 'restructuredtext en'
import re, time import re
from calibre import strftime from calibre import strftime
from calibre.web.feeds.news import BasicNewsRecipe from calibre.web.feeds.news import BasicNewsRecipe
from calibre.ebooks.BeautifulSoup import NavigableString from calibre.ebooks.BeautifulSoup import NavigableString
@@ -21,7 +21,7 @@ class NewYorker(BasicNewsRecipe):
def parse_index(self): def parse_index(self):
toc_pat = re.compile(time.strftime(r'.+magazine/toc/%Y/%m/.+toc_%Y\d+')) toc_pat = re.compile(r'/magazine/toc/\d+/\d+/\d+/toc_\d+')
soup = self.soup(self.browser.open('http://www.newyorker.com/').read()) soup = self.soup(self.browser.open('http://www.newyorker.com/').read())
a = soup.find('a', href=toc_pat) a = soup.find('a', href=toc_pat)
if a is None: if a is None:

View File

@@ -6,7 +6,7 @@ __docformat__ = 'restructuredtext en'
''' '''
sciam.com sciam.com
''' '''
import re
from calibre.web.feeds.news import BasicNewsRecipe from calibre.web.feeds.news import BasicNewsRecipe
class ScientificAmerican(BasicNewsRecipe): class ScientificAmerican(BasicNewsRecipe):
@@ -18,8 +18,14 @@ class ScientificAmerican(BasicNewsRecipe):
use_embedded_content = False use_embedded_content = False
remove_tags_before = dict(name='div', attrs={'class':'headline'}) remove_tags_before = dict(name='div', attrs={'class':'headline'})
remove_tags_after = dict(id='article') remove_tags_after = dict(id='article')
remove_tags = [dict(id='sharetools'), dict(id='reddit')] remove_tags = [
dict(id=['sharetools', 'reddit']),
dict(name='script'),
{"class": re.compile(r'also-in-this')}
]
html2lrf_options = ['--base-font-size', '8'] html2lrf_options = ['--base-font-size', '8']
recursions = 1
match_regexps = [r'article.cfm.id=\S+page=(2|3|4|5|6|7|8|9|10|11|12|13|14)']
feeds = [ feeds = [
(u'Latest News', u'http://rss.sciam.com/ScientificAmerican-News'), (u'Latest News', u'http://rss.sciam.com/ScientificAmerican-News'),
(u'Global', u'http://rss.sciam.com/ScientificAmerican-Global'), (u'Global', u'http://rss.sciam.com/ScientificAmerican-Global'),
@@ -36,3 +42,9 @@ class ScientificAmerican(BasicNewsRecipe):
(u'Chemistry', u'http://rss.sciam.com/sciam/chemistry'), (u'Chemistry', u'http://rss.sciam.com/sciam/chemistry'),
(u'Mind Matters', u'http://rss.sciam.com/ScientificAmerican-MindBlog') (u'Mind Matters', u'http://rss.sciam.com/ScientificAmerican-MindBlog')
] ]
def postprocess_html(self, soup):
    """Remove pagination spans from a parsed page.

    Looks up every ``<span>`` whose ``class`` attribute is ``pagination``
    and calls ``extract()`` on it.

    :param soup: parsed document exposing ``findAll``, or ``None``.
    :return: the same ``soup`` object that was passed in; ``None`` is
        handed back unchanged.
    """
    # Nothing to clean when no document was supplied.
    if soup is None:
        return soup
    for span in soup.findAll('span', attrs={'class': 'pagination'}):
        span.extract()
    return soup