diff --git a/resources/recipes/the_h.recipe b/resources/recipes/the_h.recipe index dbfad7e32a..28a1571dc5 100644 --- a/resources/recipes/the_h.recipe +++ b/resources/recipes/the_h.recipe @@ -14,7 +14,7 @@ class TheHeiseOnline(BasicNewsRecipe): oldest_article = 3 description = 'In association with Heise Online' publisher = 'Heise Media UK Ltd.' - category = 'news, technology, security' + category = 'news, technology, security, OSS, internet' max_articles_per_feed = 100 language = 'en' encoding = 'utf-8' @@ -27,6 +27,12 @@ class TheHeiseOnline(BasicNewsRecipe): feeds = [ (u'The H News Feed', u'http://www.h-online.com/news/atom.xml') ] + cover_url = 'http://www.h-online.com/icons/logo_theH.gif' + + remove_tags = [ + dict(id="logo"), + dict(id="footer") + ] def print_version(self, url): return url + '?view=print' diff --git a/resources/recipes/toyokeizai.recipe b/resources/recipes/toyokeizai.recipe index 3aed2b2202..395a8bb9b7 100644 --- a/resources/recipes/toyokeizai.recipe +++ b/resources/recipes/toyokeizai.recipe @@ -17,35 +17,44 @@ class Toyokeizai(BasicNewsRecipe): category = 'economy, magazine, japan' language = 'ja' encoding = 'euc-jp' - index = 'http://www.toyokeizai.net/news/' + index = 'http://member.toyokeizai.net/news/' remove_javascript = True - no_stylesheet = True + no_stylesheets = True masthead_title = u'TOYOKEIZAI' needs_subscription = True timefmt = '[%y/%m/%d]' + recursions = 5 + match_regexps =[ r'page/\d+'] - keep_only_tags = [dict(name='div', attrs={'class':['news']}), - dict(name='div', attrs={'class':["news_con"]}) + keep_only_tags = [ + dict(name='div', attrs={'class':['news']}), + dict(name='div', attrs={'class':["news_cont"]}), + dict(name='div', attrs={'class':["news_con"]}), +# dict(name='div', attrs={'class':["norightsMessage"]}) ] - remove_tags = [{'class':"mt35 mgz"}] + remove_tags = [{'class':"mt35 mgz"}, + {'class':"mt20 newzia"}, + {'class':"mt20 fontS"}, + {'class':"bk_btn_m"}, + dict(id='newzia_connect_member') + ] def parse_index(self): feeds = [] soup = self.index_to_soup(self.index) topstories = soup.find('ul',attrs={'class':'list6'}) if topstories: - newsarticles = [] - for itt in topstories.findAll('li'): + newsarticles = [] + for itt in topstories.findAll('li'): itema = itt.find('a',href=True) itemd = itt.find('span') newsarticles.append({ 'title' :itema.string ,'date' :re.compile(r"\- ").sub("",itemd.string) - ,'url' :'http://www.toyokeizai.net' + itema['href'] - # ,'description':itema['title'] - ,'description':'' + ,'url' :'http://member.toyokeizai.net' + itema['href'] + ,'description':itema['title'] }) - feeds.append(('news', newsarticles)) + feeds.append(('news', newsarticles)) return feeds def get_browser(self): @@ -57,10 +66,3 @@ class Toyokeizai(BasicNewsRecipe): br['password'] = self.password res = br.submit() return br - - def is_link_wanted(url,tag): - if re.compile(r'page//[0-9]+//$').search(url): - return True - return False - -