From 215007e1600662f049c08a90e16a52fab1d27418 Mon Sep 17 00:00:00 2001 From: Hiroshi Miura Date: Sat, 4 Dec 2010 01:33:57 +0900 Subject: [PATCH 1/5] recipe: mainichi_it_news: fix typo --- resources/recipes/mainichi_it_news.recipe | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/resources/recipes/mainichi_it_news.recipe b/resources/recipes/mainichi_it_news.recipe index 4c285a2c01..732e47bf8e 100644 --- a/resources/recipes/mainichi_it_news.recipe +++ b/resources/recipes/mainichi_it_news.recipe @@ -14,6 +14,7 @@ class MainichiDailyITNews(BasicNewsRecipe): remove_tags_before = {'class':"NewsTitle"} remove_tags = [{'class':"RelatedArticle"}] + remove_tags_after = {'class':"Credit"} def parse_feeds(self): @@ -29,4 +30,4 @@ class MainichiDailyITNews(BasicNewsRecipe): index = curfeed.articles.index(d) curfeed.articles[index:index+1] = [] - return feeds remove_tags_after = {'class':"Credit"} + return feeds From 78f9920c3a9dd4c9fc7bbb077d0cc86a374e01a2 Mon Sep 17 00:00:00 2001 From: Hiroshi Miura Date: Tue, 7 Dec 2010 06:34:52 +0900 Subject: [PATCH 2/5] recipe: add toyokeizai news. --- resources/recipes/toyokeizai.recipe | 61 +++++++++++++++++++++++++++++ 1 file changed, 61 insertions(+) create mode 100644 resources/recipes/toyokeizai.recipe diff --git a/resources/recipes/toyokeizai.recipe b/resources/recipes/toyokeizai.recipe new file mode 100644 index 0000000000..7145ece707 --- /dev/null +++ b/resources/recipes/toyokeizai.recipe @@ -0,0 +1,61 @@ +__license__ = 'GPL v3' +__copyright__ = '2010, Hiroshi Miura ' +''' +www.toyokeizai.net +''' + +from calibre.web.feeds.news import BasicNewsRecipe +import re + +class Toyokeizai(BasicNewsRecipe): + title = u'ToyoKeizai' + __author__ = 'Hiroshi Miura' + oldest_article = 1 + max_articles_per_feed = 50 + description = 'Japanese traditional financial and business magazine' + publisher = 'Toyokeizai Shinbun Sha' + category = 'news, japan' + language = 'ja' + encoding = 'euc-jp' + index = 'http://www.toyokeizai.net/news/' + remove_javascript = True + no_stylesheet = True + masthead_title = u'TOYOKEIZAI' + needs_subscription = True + timefmt = '[%y/%m/%d]' + + keep_only_tags = [dict(name='div', attrs={'class':['news']}), + dict(name='div', attrs={'class':["news_con"]}) + ] + remove_tags = [{'class':"mt35 mgz"}] + + def parse_index(self): + feeds = [] + soup = self.index_to_soup(self.index) + topstories = soup.find('ul',attrs={'class':'list6'}) + if topstories: + newsarticles = [] + for itt in topstories.findAll('li'): + itema = itt.find('a',href=True) + itemd = itt.find('span') + newsarticles.append({ + 'title' :itema.string + ,'date' :re.compile(r"\- ").sub(" ",itemd.string) + ,'url' :'http://www.toyokeizai.net' + itema['href'] + # ,'description':itema['title'] + ,'description':'' + }) + feeds.append(('news', newsarticles)) + return feeds + + def get_browser(self): + br = BasicNewsRecipe.get_browser() + if self.username is not None and self.password is not None: + br.open('http://member.toyokeizai.net/norights/form/') + br.select_form(nr=0) + br['kaiin_id'] = self.username + br['password'] = self.password + res = br.submit() + return br + + From 4d7bab28b6879c569d9ffb3e6cbde673dc688738 Mon Sep 17 00:00:00 2001 From: Hiroshi Miura Date: Tue, 7 Dec 2010 07:37:08 +0900 Subject: [PATCH 3/5] recipe: toyokeizai: add description about limit --- resources/recipes/toyokeizai.recipe | 13 +++++++++---- 1 file changed, 9 insertions(+), 4 deletions(-) diff --git a/resources/recipes/toyokeizai.recipe b/resources/recipes/toyokeizai.recipe index 7145ece707..3aed2b2202 100644 --- a/resources/recipes/toyokeizai.recipe +++ b/resources/recipes/toyokeizai.recipe @@ -8,13 +8,13 @@ from calibre.web.feeds.news import BasicNewsRecipe import re class Toyokeizai(BasicNewsRecipe): - title = u'ToyoKeizai' + title = u'ToyoKeizai News' __author__ = 'Hiroshi Miura' oldest_article = 1 max_articles_per_feed = 50 - description = 'Japanese traditional financial and business magazine' + description = 'Japanese traditional economy and business magazine, only for advanced subscribers supported' publisher = 'Toyokeizai Shinbun Sha' - category = 'news, japan' + category = 'economy, magazine, japan' language = 'ja' encoding = 'euc-jp' index = 'http://www.toyokeizai.net/news/' @@ -40,7 +40,7 @@ class Toyokeizai(BasicNewsRecipe): itemd = itt.find('span') newsarticles.append({ 'title' :itema.string - ,'date' :re.compile(r"\- ").sub(" ",itemd.string) + ,'date' :re.compile(r"\- ").sub("",itemd.string) ,'url' :'http://www.toyokeizai.net' + itema['href'] # ,'description':itema['title'] ,'description':'' @@ -58,4 +58,9 @@ class Toyokeizai(BasicNewsRecipe): res = br.submit() return br + def is_link_wanted(url,tag): + if re.compile(r'page//[0-9]+//$').search(url): + return True + return False + From 403fd4d9c5b4b28c313fd269711c291c997f454a Mon Sep 17 00:00:00 2001 From: Hiroshi Miura Date: Wed, 8 Dec 2010 07:52:16 +0900 Subject: [PATCH 4/5] recipe: mainichi: fix missing import --- resources/recipes/mainichi_it_news.recipe | 1 + 1 file changed, 1 insertion(+) diff --git a/resources/recipes/mainichi_it_news.recipe b/resources/recipes/mainichi_it_news.recipe index 732e47bf8e..eddab149cd 100644 --- a/resources/recipes/mainichi_it_news.recipe +++ b/resources/recipes/mainichi_it_news.recipe @@ -1,4 +1,5 @@ from calibre.web.feeds.news import BasicNewsRecipe +import re class MainichiDailyITNews(BasicNewsRecipe): title = u'\u6bce\u65e5\u65b0\u805e(IT&\u5bb6\u96fb)' From 4e4c2b7e68d38cdc0c4cbb4303c74d88530cf2b2 Mon Sep 17 00:00:00 2001 From: Hiroshi Miura Date: Wed, 8 Dec 2010 23:43:54 +0900 Subject: [PATCH 5/5] recipes: fix errors and styles --- resources/recipes/the_h.recipe | 8 +++++- resources/recipes/toyokeizai.recipe | 38 +++++++++++++++-------------- 2 files changed, 27 insertions(+), 19 deletions(-) diff --git a/resources/recipes/the_h.recipe b/resources/recipes/the_h.recipe index dbfad7e32a..28a1571dc5 100644 --- a/resources/recipes/the_h.recipe +++ b/resources/recipes/the_h.recipe @@ -14,7 +14,7 @@ class TheHeiseOnline(BasicNewsRecipe): oldest_article = 3 description = 'In association with Heise Online' publisher = 'Heise Media UK Ltd.' - category = 'news, technology, security' + category = 'news, technology, security, OSS, internet' max_articles_per_feed = 100 language = 'en' encoding = 'utf-8' @@ -27,6 +27,12 @@ class TheHeiseOnline(BasicNewsRecipe): feeds = [ (u'The H News Feed', u'http://www.h-online.com/news/atom.xml') ] + cover_url = 'http://www.h-online.com/icons/logo_theH.gif' + + remove_tags = [ + dict(id="logo"), + dict(id="footer") + ] def print_version(self, url): return url + '?view=print' diff --git a/resources/recipes/toyokeizai.recipe b/resources/recipes/toyokeizai.recipe index 3aed2b2202..395a8bb9b7 100644 --- a/resources/recipes/toyokeizai.recipe +++ b/resources/recipes/toyokeizai.recipe @@ -17,35 +17,44 @@ class Toyokeizai(BasicNewsRecipe): category = 'economy, magazine, japan' language = 'ja' encoding = 'euc-jp' - index = 'http://www.toyokeizai.net/news/' + index = 'http://member.toyokeizai.net/news/' remove_javascript = True - no_stylesheet = True + no_stylesheets = True masthead_title = u'TOYOKEIZAI' needs_subscription = True timefmt = '[%y/%m/%d]' + recursions = 5 + match_regexps =[ r'page/\d+'] - keep_only_tags = [dict(name='div', attrs={'class':['news']}), - dict(name='div', attrs={'class':["news_con"]}) + keep_only_tags = [ + dict(name='div', attrs={'class':['news']}), + dict(name='div', attrs={'class':["news_cont"]}), + dict(name='div', attrs={'class':["news_con"]}), +# dict(name='div', attrs={'class':["norightsMessage"]}) ] - remove_tags = [{'class':"mt35 mgz"}] + remove_tags = [{'class':"mt35 mgz"}, + {'class':"mt20 newzia"}, + {'class':"mt20 fontS"}, + {'class':"bk_btn_m"}, + dict(id='newzia_connect_member') + ] def parse_index(self): feeds = [] soup = self.index_to_soup(self.index) topstories = soup.find('ul',attrs={'class':'list6'}) if topstories: - newsarticles = [] - for itt in topstories.findAll('li'): + newsarticles = [] + for itt in topstories.findAll('li'): itema = itt.find('a',href=True) itemd = itt.find('span') newsarticles.append({ 'title' :itema.string ,'date' :re.compile(r"\- ").sub("",itemd.string) - ,'url' :'http://www.toyokeizai.net' + itema['href'] - # ,'description':itema['title'] - ,'description':'' + ,'url' :'http://member.toyokeizai.net' + itema['href'] + ,'description':itema['title'] }) - feeds.append(('news', newsarticles)) + feeds.append(('news', newsarticles)) return feeds def get_browser(self): @@ -57,10 +66,3 @@ class Toyokeizai(BasicNewsRecipe): br['password'] = self.password res = br.submit() return br - - def is_link_wanted(url,tag): - if re.compile(r'page//[0-9]+//$').search(url): - return True - return False - -