From 4170d9125fac954cbd7316e8c8440066d37c3274 Mon Sep 17 00:00:00 2001 From: Kovid Goyal Date: Mon, 29 Sep 2008 11:36:33 -0700 Subject: [PATCH] Fix recipe for The New Yorker and improve recipe for Scientific American. Fixes #1090 (New Yorker feed) --- src/calibre/web/feeds/recipes/new_yorker.py | 4 ++-- .../web/feeds/recipes/scientific_american.py | 16 ++++++++++++++-- 2 files changed, 16 insertions(+), 4 deletions(-) diff --git a/src/calibre/web/feeds/recipes/new_yorker.py b/src/calibre/web/feeds/recipes/new_yorker.py index 3e8d324f6a..ed9811c43b 100644 --- a/src/calibre/web/feeds/recipes/new_yorker.py +++ b/src/calibre/web/feeds/recipes/new_yorker.py @@ -3,7 +3,7 @@ __license__ = 'GPL v3' __copyright__ = '2008, Kovid Goyal kovid@kovidgoyal.net' __docformat__ = 'restructuredtext en' -import re, time +import re from calibre import strftime from calibre.web.feeds.news import BasicNewsRecipe from calibre.ebooks.BeautifulSoup import NavigableString @@ -21,7 +21,7 @@ class NewYorker(BasicNewsRecipe): def parse_index(self): - toc_pat = re.compile(time.strftime(r'.+magazine/toc/%Y/%m/.+toc_%Y\d+')) + toc_pat = re.compile(r'/magazine/toc/\d+/\d+/\d+/toc_\d+') soup = self.soup(self.browser.open('http://www.newyorker.com/').read()) a = soup.find('a', href=toc_pat) if a is None: diff --git a/src/calibre/web/feeds/recipes/scientific_american.py b/src/calibre/web/feeds/recipes/scientific_american.py index 6c6c679bc5..b9ca0f131f 100644 --- a/src/calibre/web/feeds/recipes/scientific_american.py +++ b/src/calibre/web/feeds/recipes/scientific_american.py @@ -6,7 +6,7 @@ __docformat__ = 'restructuredtext en' ''' sciam.com ''' - +import re from calibre.web.feeds.news import BasicNewsRecipe class ScientificAmerican(BasicNewsRecipe): @@ -18,8 +18,14 @@ class ScientificAmerican(BasicNewsRecipe): use_embedded_content = False remove_tags_before = dict(name='div', attrs={'class':'headline'}) remove_tags_after = dict(id='article') - remove_tags = [dict(id='sharetools'), dict(id='reddit')] + remove_tags = [ + dict(id=['sharetools', 'reddit']), + dict(name='script'), + {"class": re.compile(r'also-in-this')} + ] html2lrf_options = ['--base-font-size', '8'] + recursions = 1 + match_regexps = [r'article.cfm.id=\S+page=(2|3|4|5|6|7|8|9|10|11|12|13|14)'] feeds = [ (u'Latest News', u'http://rss.sciam.com/ScientificAmerican-News'), (u'Global', u'http://rss.sciam.com/ScientificAmerican-Global'), @@ -36,3 +42,9 @@ class ScientificAmerican(BasicNewsRecipe): (u'Chemistry', u'http://rss.sciam.com/sciam/chemistry'), (u'Mind Matters', u'http://rss.sciam.com/ScientificAmerican-MindBlog') ] + + def postprocess_html(self, soup): + if soup is not None: + for span in soup.findAll('span', attrs={'class':'pagination'}): + span.extract() + return soup