From a2745fed4c35837816fd051c5c16c9757fdebed4 Mon Sep 17 00:00:00 2001
From: Kovid Goyal
Date: Tue, 17 Feb 2009 20:04:48 -0800
Subject: [PATCH] New recipe for Physicsworld and updated Ars Technica recipe for multi-page support

---
NOTE(review): this copy of the patch was rebuilt from text in which the line
breaks and runs of whitespace had been collapsed. Before applying:
  * Indentation, column alignment, trailing whitespace and the wrapping of
    context lines were re-created by hand, not recovered. Apply with
    `git apply --ignore-whitespace`, or regenerate from commit a2745fed4c.
  * Three string literals were cut by stray line breaks in the damaged copy.
    They are best-guess restorations, to be confirmed against the commit:
    '<br /><br />' in preprocess_html, and r'<div id="shareThis">.*</body>'
    with its '</body>' replacement in PhysicsWorld.preprocess_regexps.
  * The From: header carries no e-mail address; `git am` may reject it.
NOTE(review): follow-ups for the code this patch adds (not changed here):
  * append_page binds a local named `str`, which shadows the builtin.
  * Unlike `pager` and `ftag`, `texttag` is dereferenced without checking
    that soup2.find() actually found a 'news-item-text' div.
  * PhysicsWorld sets needs_subscription = True, yet get_browser only signs
    in when both username and password are set; confirm that is intended.

 src/calibre/web/feeds/recipes/__init__.py     |  2 +-
 .../web/feeds/recipes/recipe_ars_technica.py  | 21 +++++++++--
 .../web/feeds/recipes/recipe_physics_world.py | 35 +++++++++++++++++++
 3 files changed, 55 insertions(+), 3 deletions(-)
 create mode 100644 src/calibre/web/feeds/recipes/recipe_physics_world.py

diff --git a/src/calibre/web/feeds/recipes/__init__.py b/src/calibre/web/feeds/recipes/__init__.py
index 76bb8112ec..98f2332bba 100644
--- a/src/calibre/web/feeds/recipes/__init__.py
+++ b/src/calibre/web/feeds/recipes/__init__.py
@@ -29,7 +29,7 @@ recipe_modules = ['recipe_' + r for r in (
            'jb_online', 'estadao', 'o_globo', 'vijesti', 'elmundo', 'the_oz',
            'honoluluadvertiser', 'starbulletin', 'exiled', 'indy_star', 'dna',
            'pobjeda', 'chicago_breaking_news', 'glasgow_herald', 'linuxdevices',
-           'hindu', 'cincinnati_enquirer',
+           'hindu', 'cincinnati_enquirer', 'physics_world',
           )]
 
 import re, imp, inspect, time, os
diff --git a/src/calibre/web/feeds/recipes/recipe_ars_technica.py b/src/calibre/web/feeds/recipes/recipe_ars_technica.py
index eb4e3f54f4..2561a463d8 100644
--- a/src/calibre/web/feeds/recipes/recipe_ars_technica.py
+++ b/src/calibre/web/feeds/recipes/recipe_ars_technica.py
@@ -15,7 +15,6 @@ class ArsTechnica2(BasicNewsRecipe):
     description = 'The art of technology'
     publisher = 'Ars Technica'
     category = 'news, IT, technology'
-    language = _('English')
     oldest_article = 2
     max_articles_per_feed = 100
     no_stylesheets = True
@@ -50,11 +49,29 @@ class ArsTechnica2(BasicNewsRecipe):
               ,(u'Nobel Intent (Science content)' , u'http://feeds.arstechnica.com/arstechnica/science/' )
               ,(u'Law & Disorder (Tech policy content)' , u'http://feeds.arstechnica.com/arstechnica/tech-policy/')
             ]
-
+
+    def append_page(self, soup, appendtag, position):
+        pager = soup.find('div',attrs={'id':'pager'})
+        if pager:
+            for atag in pager.findAll('a',href=True):
+                str = self.tag_to_string(atag)
+                if str.startswith('Next'):
+                    soup2 = self.index_to_soup(atag['href'])
+                    texttag = soup2.find('div', attrs={'class':'news-item-text'})
+                    for it in texttag.findAll(style=True):
+                        del it['style']
+                    newpos = len(texttag.contents)
+                    self.append_page(soup2,texttag,newpos)
+                    texttag.extract()
+                    pager.extract()
+                    appendtag.insert(position,texttag)
+
+
     def preprocess_html(self, soup):
         ftag = soup.find('div', attrs={'class':'news-item-byline'})
         if ftag:
             ftag.insert(4,'<br /><br />')
         for item in soup.findAll(style=True):
             del item['style']
+        self.append_page(soup, soup.body, 3)
         return soup
diff --git a/src/calibre/web/feeds/recipes/recipe_physics_world.py b/src/calibre/web/feeds/recipes/recipe_physics_world.py
new file mode 100644
index 0000000000..56ed2460c1
--- /dev/null
+++ b/src/calibre/web/feeds/recipes/recipe_physics_world.py
@@ -0,0 +1,35 @@
+import re
+from calibre.web.feeds.news import BasicNewsRecipe
+
+class PhysicsWorld(BasicNewsRecipe):
+    title = u'Physicsworld'
+    description = 'News from the world of physics'
+    __author__ = 'Hypernova'
+    language = _('English')
+    oldest_article = 7
+    max_articles_per_feed = 100
+    no_stylesheets = True
+    use_embedded_content = False
+    remove_javascript = True
+    needs_subscription = True
+    remove_tags_before = dict(name='h1')
+    remove_tags_after = [dict(name='div', attrs={'id':'shareThis'})]
+    preprocess_regexps = [
+   (re.compile(r'<div id="shareThis">.*</body>', re.DOTALL|re.IGNORECASE),
+    lambda match: '</body>'),
+]
+    feeds = [
+        (u'Headlines News', u'http://feeds.feedburner.com/PhysicsWorldNews')
+    ]
+
+    def get_browser(self):
+        br = BasicNewsRecipe.get_browser(self)
+        if self.username is not None and self.password is not None:
+            br.open('http://physicsworld.com/cws/sign-in')
+            br.select_form(nr=1)
+            br['username'] = self.username
+            br['password'] = self.password
+            br.submit()
+        return br
+
+    
\ No newline at end of file