From 521a8f93c1746725b36ca4f3ee6a4183e0b90742 Mon Sep 17 00:00:00 2001 From: Hiroshi Miura Date: Tue, 6 May 2014 16:39:58 +0900 Subject: [PATCH 1/2] update japanese recipes - update: endgaget jp Kahoku Shinpo MSN Sankei News Uni no Himitsu - remove problematic recipes: nikkei_sub* chou chou blog Signed-off-by: Hiroshi Miura --- recipes/chouchoublog.recipe | 37 ---------- recipes/endgadget_ja.recipe | 13 ++-- recipes/kahokushinpo.recipe | 10 +-- recipes/msnsankei.recipe | 5 +- recipes/nikkei_sub_economy.recipe | 110 ----------------------------- recipes/nikkei_sub_industry.recipe | 107 ---------------------------- recipes/nikkei_sub_life.recipe | 104 --------------------------- recipes/nikkei_sub_main.recipe | 103 --------------------------- recipes/nikkei_sub_shakai.recipe | 102 -------------------------- recipes/nikkei_sub_sports.recipe | 108 ---------------------------- recipes/uninohimitu.recipe | 4 +- 11 files changed, 13 insertions(+), 690 deletions(-) delete mode 100644 recipes/chouchoublog.recipe delete mode 100644 recipes/nikkei_sub_economy.recipe delete mode 100644 recipes/nikkei_sub_industry.recipe delete mode 100644 recipes/nikkei_sub_life.recipe delete mode 100644 recipes/nikkei_sub_main.recipe delete mode 100644 recipes/nikkei_sub_shakai.recipe delete mode 100644 recipes/nikkei_sub_sports.recipe diff --git a/recipes/chouchoublog.recipe b/recipes/chouchoublog.recipe deleted file mode 100644 index 8c953deef0..0000000000 --- a/recipes/chouchoublog.recipe +++ /dev/null @@ -1,37 +0,0 @@ -__license__ = 'GPL v3' -__copyright__ = '2010, Hiroshi Miura ' -''' -http://ameblo.jp/ -''' - -import re -from calibre.web.feeds.news import BasicNewsRecipe - -class SakuraBlog(BasicNewsRecipe): - title = u'chou chou blog' - __author__ = 'Hiroshi Miura' - oldest_article = 4 - publication_type = 'blog' - max_articles_per_feed = 20 - description = 'Japanese popular dog blog' - publisher = '' - category = 'dog, pet, japan' - language = 'ja' - encoding = 'utf-8' - use_embedded_content = True - - feeds = [(u'blog', u'http://feedblog.ameba.jp/rss/ameblo/chouchou1218/rss20.xml')] - - def parse_feeds(self): - feeds = BasicNewsRecipe.parse_feeds(self) - for curfeed in feeds: - delList = [] - for a,curarticle in enumerate(curfeed.articles): - if re.search(r'rssad.jp', curarticle.url): - delList.append(curarticle) - if len(delList)>0: - for d in delList: - index = curfeed.articles.index(d) - curfeed.articles[index:index+1] = [] - return feeds - diff --git a/recipes/endgadget_ja.recipe b/recipes/endgadget_ja.recipe index 7eca0a6966..5c030caef1 100644 --- a/recipes/endgadget_ja.recipe +++ b/recipes/endgadget_ja.recipe @@ -1,5 +1,5 @@ __license__ = 'GPL v3' -__copyright__ = '2010, Hiroshi Miura ' +__copyright__ = '2010,2014, Hiroshi Miura ' ''' japan.engadget.com ''' @@ -20,19 +20,20 @@ class EndgadgetJapan(BasicNewsRecipe): index = 'http://japanese.engadget.com/' remove_javascript = True - remove_tags_before = dict(name="h1", attrs={'class':"post_title"}) - remove_tags_after = dict(name='div', attrs={'class':'post_body'}) + remove_tags_before = dict(name="header", attrs={'class':"header"}) + remove_tags_after = dict(name='div', attrs={'class':'post-meta'}) def parse_index(self): feeds = [] newsarticles = [] soup = self.index_to_soup(self.index) - for topstories in soup.findAll('div',attrs={'class':'post_content'}): - itt = topstories.find('h4') + for topstories in soup.findAll('header',attrs={'class':'post-header'}): + itt = topstories.find('h2') itema = itt.find('a',href=True) + itemtime = topstories.find('span',attrs={'class':'time'}) newsarticles.append({ 'title' :itema.string - ,'date' :'' + ,'date' :itemtime.string ,'url' :itema['href'] ,'description':'' }) diff --git a/recipes/kahokushinpo.recipe b/recipes/kahokushinpo.recipe index 06879a1375..88ac978c6a 100644 --- a/recipes/kahokushinpo.recipe +++ b/recipes/kahokushinpo.recipe @@ -1,5 +1,5 @@ __license__ = 'GPL v3' -__copyright__ = '2010, Hiroshi Miura ' +__copyright__ = '2010,2014, Hiroshi Miura ' ''' www.kahoku.co.jp ''' @@ -21,11 +21,5 @@ class KahokuShinpoNews(BasicNewsRecipe): feeds = [(u'news', u'http://www.kahoku.co.jp/rss/index_thk.xml')] - keep_only_tags = [ dict(id="page_title"), - dict(id="news_detail"), - dict(id="bt_title"), - {'class':"photoLeft"}, - dict(id="bt_body") - ] - remove_tags = [ {'class':"button"}] + keep_only_tags = [ {'class':"category"},{'class':"ttl"},{'class':'photoimg'},{'class':"txt"},{'class':"data"}] diff --git a/recipes/msnsankei.recipe b/recipes/msnsankei.recipe index 59664d055f..4bd61d295c 100644 --- a/recipes/msnsankei.recipe +++ b/recipes/msnsankei.recipe @@ -1,6 +1,6 @@ __license__ = 'GPL v3' -__copyright__ = '2010, Hiroshi Miura ' +__copyright__ = '2010,2014, Hiroshi Miura ' ''' sankei.jp.msn.com ''' @@ -20,5 +20,4 @@ class MSNSankeiNewsProduct(BasicNewsRecipe): feeds = [(u'\u65b0\u5546\u54c1', u'http://sankei.jp.msn.com/rss/news/release.xml')] - remove_tags_before = dict(id="NewsTitle") - remove_tags_after = dict(id="RelatedTitle") + keep_only_tags = [dict(id=['MainContent'])] diff --git a/recipes/nikkei_sub_economy.recipe b/recipes/nikkei_sub_economy.recipe deleted file mode 100644 index 7a256f7553..0000000000 --- a/recipes/nikkei_sub_economy.recipe +++ /dev/null @@ -1,110 +0,0 @@ -__license__ = 'GPL v3' -__copyright__ = '2010, Hiroshi Miura ' -''' -www.nikkei.com -''' - -import re -from calibre.web.feeds.recipes import BasicNewsRecipe -import mechanize -from calibre.ptempfile import PersistentTemporaryFile - -class NikkeiNet_sub_economy(BasicNewsRecipe): - title = u'\u65e5\u7d4c\u65b0\u805e\u96fb\u5b50\u7248(\u7d4c\u6e08)' - __author__ = 'Hiroshi Miura' - description = 'News and current market affairs from Japan' - cover_url = 'http://parts.nikkei.com/parts/ds/images/common/logo_r1.svg' - masthead_url = 'http://parts.nikkei.com/parts/ds/images/common/logo_r1.svg' - needs_subscription = True - oldest_article = 2 - max_articles_per_feed = 20 - language = 'ja' - remove_javascript = False - temp_files = [] - - remove_tags_before = {'class':"cmn-section cmn-indent"} - remove_tags = [ - {'class':"JSID_basePageMove JSID_baseAsyncSubmit cmn-form_area JSID_optForm_utoken"}, - {'class':"cmn-article_keyword cmn-clearfix"}, - {'class':"cmn-print_headline cmn-clearfix"}, - {'class':"cmn-article_list"}, - dict(id="ABOUT-NIKKEI"), - {'class':"cmn-sub_market"}, - ] - remove_tags_after = {'class':"cmn-pr_list"} - - feeds = [ (u'\u653f\u6cbb', u'http://www.zou3.net/php/rss/nikkei2rss.php?head=seiji'), - (u'\u8ca1\u52d9', u'http://www.zou3.net/php/rss/nikkei2rss.php?head=zaimu'), - (u'\u7d4c\u6e08', u'http://www.zou3.net/php/rss/nikkei2rss.php?head=keizai'), - (u'\u30de\u30fc\u30b1\u30c3\u30c8', u'http://www.zou3.net/php/rss/nikkei2rss.php?head=market'), - (u'\u96c7\u7528', u'http://www.zou3.net/php/rss/nikkei2rss.php?head=koyou'), - (u'\u6559\u80b2', u'http://www.zou3.net/php/rss/nikkei2rss.php?head=kyouiku'), - (u'\u304a\u304f\u3084\u307f', u'http://www.zou3.net/php/rss/nikkei2rss.php?head=okuyami'), - (u'\u4eba\u4e8b', u'http://www.zou3.net/php/rss/nikkei2rss.php?head=zinzi'), - ] - - def get_browser(self): - br = BasicNewsRecipe.get_browser(self) - - cj = mechanize.LWPCookieJar() - br.set_cookiejar(cj) - - #br.set_debug_http(True) - #br.set_debug_redirects(True) - #br.set_debug_responses(True) - - if self.username is not None and self.password is not None: - #print "----------------------------get login form--------------------------------------------" - # open login form - br.open('https://id.nikkei.com/lounge/nl/base/LA0010.seam') - response = br.response() - #print "----------------------------get login form---------------------------------------------" - #print "----------------------------set login form---------------------------------------------" - # remove disabled input which brings error on mechanize - response.set_data(response.get_data().replace("", " -->")) - br.set_response(response) - br.select_form(name='LA0010Form01') - br['LA0010Form01:LA0010Email'] = self.username - br['LA0010Form01:LA0010Password'] = self.password - br.form.find_control(id='LA0010Form01:LA0010AutoLoginOn',type="checkbox").get(nr=0).selected = True - br.submit() - br.response() - #print "----------------------------send login form---------------------------------------------" - #print "----------------------------open news main page-----------------------------------------" - # open news site - br.open('http://www.nikkei.com/') - br.response() - #print "----------------------------www.nikkei.com BODY --------------------------------------" - #print response2.get_data() - #print "-------------------------^^-got auto redirect form----^^--------------------------------" - # forced redirect in default - br.select_form(nr=0) - br.submit() - response3 = br.response() - # return some cookie which should be set by Javascript - #print response3.geturl() - raw = response3.get_data() - #print "---------------------------response to form --------------------------------------------" - # grab cookie from JS and set it - redirectflag = re.search(r"var checkValue = '(\d+)';", raw, re.M).group(1) - br.select_form(nr=0) - - self.temp_files.append(PersistentTemporaryFile('_fa.html')) - self.temp_files[-1].write("#LWP-Cookies-2.0\n") - - self.temp_files[-1].write("Set-Cookie3: Cookie-dummy=Cookie-value; domain=\".nikkei.com\"; path=\"/\"; path_spec; secure; expires=\"2029-12-21 05:07:59Z\"; version=0\n") - self.temp_files[-1].write("Set-Cookie3: redirectFlag="+redirectflag+"; domain=\".nikkei.com\"; path=\"/\"; path_spec; secure; expires=\"2029-12-21 05:07:59Z\"; version=0\n") - self.temp_files[-1].close() - cj.load(self.temp_files[-1].name) - - br.submit() - - #br.set_debug_http(False) - #br.set_debug_redirects(False) - #br.set_debug_responses(False) - return br - - - - diff --git a/recipes/nikkei_sub_industry.recipe b/recipes/nikkei_sub_industry.recipe deleted file mode 100644 index 11a17b2415..0000000000 --- a/recipes/nikkei_sub_industry.recipe +++ /dev/null @@ -1,107 +0,0 @@ - -__license__ = 'GPL v3' -__copyright__ = '2010, Hiroshi Miura ' -''' -www.nikkei.com -''' - -import re -from calibre.web.feeds.recipes import BasicNewsRecipe -import mechanize -from calibre.ptempfile import PersistentTemporaryFile - - -class NikkeiNet_sub_industory(BasicNewsRecipe): - title = u'\u65e5\u7d4c\u65b0\u805e\u96fb\u5b50\u7248(\u7523\u696d)' - __author__ = 'Hiroshi Miura' - description = 'News and current market affairs from Japan' - cover_url = 'http://parts.nikkei.com/parts/ds/images/common/logo_r1.svg' - masthead_url = 'http://parts.nikkei.com/parts/ds/images/common/logo_r1.svg' - needs_subscription = True - oldest_article = 2 - max_articles_per_feed = 20 - language = 'ja' - remove_javascript = False - temp_files = [] - - remove_tags_before = {'class':"cmn-section cmn-indent"} - remove_tags = [ - {'class':"JSID_basePageMove JSID_baseAsyncSubmit cmn-form_area JSID_optForm_utoken"}, - {'class':"cmn-article_keyword cmn-clearfix"}, - {'class':"cmn-print_headline cmn-clearfix"}, - ] - remove_tags_after = {'class':"cmn-pr_list"} - - feeds = [ (u'\u65e5\u7d4c\u4f01\u696d', u'http://www.zou3.net/php/rss/nikkei2rss.php?head=sangyo'), - (u'\u65e5\u7d4c\u88fd\u54c1', u'http://www.zou3.net/php/rss/nikkei2rss.php?head=newpro'), - (u'internet', u'http://www.zou3.net/php/rss/nikkei2rss.php?head=internet'), - (u'\u56fd\u969b', u'http://www.zou3.net/php/rss/nikkei2rss.php?head=kaigai'), - (u'\u79d1\u5b66', u'http://www.zou3.net/php/rss/nikkei2rss.php?head=kagaku'), - - ] - - def get_browser(self): - br = BasicNewsRecipe.get_browser(self) - - cj = mechanize.LWPCookieJar() - br.set_cookiejar(cj) - - #br.set_debug_http(True) - #br.set_debug_redirects(True) - #br.set_debug_responses(True) - - if self.username is not None and self.password is not None: - #print "----------------------------get login form--------------------------------------------" - # open login form - br.open('https://id.nikkei.com/lounge/nl/base/LA0010.seam') - response = br.response() - #print "----------------------------get login form---------------------------------------------" - #print "----------------------------set login form---------------------------------------------" - # remove disabled input which brings error on mechanize - response.set_data(response.get_data().replace("", " -->")) - br.set_response(response) - br.select_form(name='LA0010Form01') - br['LA0010Form01:LA0010Email'] = self.username - br['LA0010Form01:LA0010Password'] = self.password - br.form.find_control(id='LA0010Form01:LA0010AutoLoginOn',type="checkbox").get(nr=0).selected = True - br.submit() - br.response() - #print "----------------------------send login form---------------------------------------------" - #print "----------------------------open news main page-----------------------------------------" - # open news site - br.open('http://www.nikkei.com/') - br.response() - #print "----------------------------www.nikkei.com BODY --------------------------------------" - #print response2.get_data() - #print "-------------------------^^-got auto redirect form----^^--------------------------------" - # forced redirect in default - br.select_form(nr=0) - br.submit() - response3 = br.response() - # return some cookie which should be set by Javascript - #print response3.geturl() - raw = response3.get_data() - #print "---------------------------response to form --------------------------------------------" - # grab cookie from JS and set it - redirectflag = re.search(r"var checkValue = '(\d+)';", raw, re.M).group(1) - br.select_form(nr=0) - - self.temp_files.append(PersistentTemporaryFile('_fa.html')) - self.temp_files[-1].write("#LWP-Cookies-2.0\n") - - self.temp_files[-1].write("Set-Cookie3: Cookie-dummy=Cookie-value; domain=\".nikkei.com\"; path=\"/\"; path_spec; secure; expires=\"2029-12-21 05:07:59Z\"; version=0\n") - self.temp_files[-1].write("Set-Cookie3: redirectFlag="+redirectflag+"; domain=\".nikkei.com\"; path=\"/\"; path_spec; secure; expires=\"2029-12-21 05:07:59Z\"; version=0\n") - self.temp_files[-1].close() - cj.load(self.temp_files[-1].name) - - br.submit() - - #br.set_debug_http(False) - #br.set_debug_redirects(False) - #br.set_debug_responses(False) - return br - - - - diff --git a/recipes/nikkei_sub_life.recipe b/recipes/nikkei_sub_life.recipe deleted file mode 100644 index c2b908ca98..0000000000 --- a/recipes/nikkei_sub_life.recipe +++ /dev/null @@ -1,104 +0,0 @@ -__license__ = 'GPL v3' -__copyright__ = '2010, Hiroshi Miura ' -''' -www.nikkei.com -''' - -import re -from calibre.web.feeds.recipes import BasicNewsRecipe -import mechanize -from calibre.ptempfile import PersistentTemporaryFile - - -class NikkeiNet_sub_life(BasicNewsRecipe): - title = u'\u65e5\u7d4c\u65b0\u805e\u96fb\u5b50\u7248(\u751f\u6d3b)' - __author__ = 'Hiroshi Miura' - description = 'News and current market affairs from Japan' - cover_url = 'http://parts.nikkei.com/parts/ds/images/common/logo_r1.svg' - masthead_url = 'http://parts.nikkei.com/parts/ds/images/common/logo_r1.svg' - needs_subscription = True - oldest_article = 2 - max_articles_per_feed = 20 - language = 'ja' - remove_javascript = False - temp_files = [] - - remove_tags_before = {'class':"cmn-section cmn-indent"} - remove_tags = [ - {'class':"JSID_basePageMove JSID_baseAsyncSubmit cmn-form_area JSID_optForm_utoken"}, - {'class':"cmn-article_keyword cmn-clearfix"}, - {'class':"cmn-print_headline cmn-clearfix"}, - ] - remove_tags_after = {'class':"cmn-pr_list"} - - feeds = [ (u'\u304f\u3089\u3057', u'http://www.zou3.net/php/rss/nikkei2rss.php?head=kurashi'), - (u'\u30a8\u30b3', u'http://www.zou3.net/php/rss/nikkei2rss.php?head=eco'), - (u'\u5065\u5eb7', u'http://www.zou3.net/php/rss/nikkei2rss.php?head=kenkou'), - (u'\u7279\u96c6', u'http://www.zou3.net/php/rss/nikkei2rss.php?head=special') - ] - - def get_browser(self): - br = BasicNewsRecipe.get_browser(self) - - cj = mechanize.LWPCookieJar() - br.set_cookiejar(cj) - - #br.set_debug_http(True) - #br.set_debug_redirects(True) - #br.set_debug_responses(True) - - if self.username is not None and self.password is not None: - #print "----------------------------get login form--------------------------------------------" - # open login form - br.open('https://id.nikkei.com/lounge/nl/base/LA0010.seam') - response = br.response() - #print "----------------------------get login form---------------------------------------------" - #print "----------------------------set login form---------------------------------------------" - # remove disabled input which brings error on mechanize - response.set_data(response.get_data().replace("", " -->")) - br.set_response(response) - br.select_form(name='LA0010Form01') - br['LA0010Form01:LA0010Email'] = self.username - br['LA0010Form01:LA0010Password'] = self.password - br.form.find_control(id='LA0010Form01:LA0010AutoLoginOn',type="checkbox").get(nr=0).selected = True - br.submit() - br.response() - #print "----------------------------send login form---------------------------------------------" - #print "----------------------------open news main page-----------------------------------------" - # open news site - br.open('http://www.nikkei.com/') - br.response() - #print "----------------------------www.nikkei.com BODY --------------------------------------" - #print response2.get_data() - #print "-------------------------^^-got auto redirect form----^^--------------------------------" - # forced redirect in default - br.select_form(nr=0) - br.submit() - response3 = br.response() - # return some cookie which should be set by Javascript - #print response3.geturl() - raw = response3.get_data() - #print "---------------------------response to form --------------------------------------------" - # grab cookie from JS and set it - redirectflag = re.search(r"var checkValue = '(\d+)';", raw, re.M).group(1) - br.select_form(nr=0) - - self.temp_files.append(PersistentTemporaryFile('_fa.html')) - self.temp_files[-1].write("#LWP-Cookies-2.0\n") - - self.temp_files[-1].write("Set-Cookie3: Cookie-dummy=Cookie-value; domain=\".nikkei.com\"; path=\"/\"; path_spec; secure; expires=\"2029-12-21 05:07:59Z\"; version=0\n") - self.temp_files[-1].write("Set-Cookie3: redirectFlag="+redirectflag+"; domain=\".nikkei.com\"; path=\"/\"; path_spec; secure; expires=\"2029-12-21 05:07:59Z\"; version=0\n") - self.temp_files[-1].close() - cj.load(self.temp_files[-1].name) - - br.submit() - - #br.set_debug_http(False) - #br.set_debug_redirects(False) - #br.set_debug_responses(False) - return br - - - - diff --git a/recipes/nikkei_sub_main.recipe b/recipes/nikkei_sub_main.recipe deleted file mode 100644 index 84503cccf3..0000000000 --- a/recipes/nikkei_sub_main.recipe +++ /dev/null @@ -1,103 +0,0 @@ -__license__ = 'GPL v3' -__copyright__ = '2010, Hiroshi Miura ' -''' -www.nikkei.com -''' - -import re -from calibre.web.feeds.recipes import BasicNewsRecipe -import mechanize -from calibre.ptempfile import PersistentTemporaryFile - - -class NikkeiNet_sub_main(BasicNewsRecipe): - title = u'\u65e5\u7d4c\u65b0\u805e\u96fb\u5b50\u7248(\u7dcf\u5408)' - __author__ = 'Hiroshi Miura' - description = 'News and current market affairs from Japan' - cover_url = 'http://parts.nikkei.com/parts/ds/images/common/logo_r1.svg' - masthead_url = 'http://parts.nikkei.com/parts/ds/images/common/logo_r1.svg' - needs_subscription = True - oldest_article = 2 - max_articles_per_feed = 20 - language = 'ja' - remove_javascript = False - temp_files = [] - - remove_tags_before = {'class':"cmn-section cmn-indent"} - remove_tags = [ - {'class':"JSID_basePageMove JSID_baseAsyncSubmit cmn-form_area JSID_optForm_utoken"}, - {'class':"cmn-article_keyword cmn-clearfix"}, - {'class':"cmn-print_headline cmn-clearfix"}, - {'class':"cmn-article_list"}, - {'class':"cmn-dashedline"}, - {'class':"cmn-hide"}, - ] - remove_tags_after = {'class':"cmn-pr_list"} - - feeds = [ (u'NIKKEI', u'http://www.zou3.net/php/rss/nikkei2rss.php?head=main')] - - def get_browser(self): - br = BasicNewsRecipe.get_browser(self) - - cj = mechanize.LWPCookieJar() - br.set_cookiejar(cj) - - #br.set_debug_http(True) - #br.set_debug_redirects(True) - #br.set_debug_responses(True) - - if self.username is not None and self.password is not None: - #print "----------------------------get login form--------------------------------------------" - # open login form - br.open('https://id.nikkei.com/lounge/nl/base/LA0010.seam') - response = br.response() - #print "----------------------------get login form---------------------------------------------" - #print "----------------------------set login form---------------------------------------------" - # remove disabled input which brings error on mechanize - response.set_data(response.get_data().replace("", " -->")) - br.set_response(response) - br.select_form(name='LA0010Form01') - br['LA0010Form01:LA0010Email'] = self.username - br['LA0010Form01:LA0010Password'] = self.password - br.form.find_control(id='LA0010Form01:LA0010AutoLoginOn',type="checkbox").get(nr=0).selected = True - br.submit() - br.response() - #print "----------------------------send login form---------------------------------------------" - #print "----------------------------open news main page-----------------------------------------" - # open news site - br.open('http://www.nikkei.com/') - br.response() - #print "----------------------------www.nikkei.com BODY --------------------------------------" - #print response2.get_data() - #print "-------------------------^^-got auto redirect form----^^--------------------------------" - # forced redirect in default - br.select_form(nr=0) - br.submit() - response3 = br.response() - # return some cookie which should be set by Javascript - #print response3.geturl() - raw = response3.get_data() - #print "---------------------------response to form --------------------------------------------" - # grab cookie from JS and set it - redirectflag = re.search(r"var checkValue = '(\d+)';", raw, re.M).group(1) - br.select_form(nr=0) - - self.temp_files.append(PersistentTemporaryFile('_fa.html')) - self.temp_files[-1].write("#LWP-Cookies-2.0\n") - - self.temp_files[-1].write("Set-Cookie3: Cookie-dummy=Cookie-value; domain=\".nikkei.com\"; path=\"/\"; path_spec; secure; expires=\"2029-12-21 05:07:59Z\"; version=0\n") - self.temp_files[-1].write("Set-Cookie3: redirectFlag="+redirectflag+"; domain=\".nikkei.com\"; path=\"/\"; path_spec; secure; expires=\"2029-12-21 05:07:59Z\"; version=0\n") - self.temp_files[-1].close() - cj.load(self.temp_files[-1].name) - - br.submit() - - #br.set_debug_http(False) - #br.set_debug_redirects(False) - #br.set_debug_responses(False) - return br - - - - diff --git a/recipes/nikkei_sub_shakai.recipe b/recipes/nikkei_sub_shakai.recipe deleted file mode 100644 index be21b3c43f..0000000000 --- a/recipes/nikkei_sub_shakai.recipe +++ /dev/null @@ -1,102 +0,0 @@ -__license__ = 'GPL v3' -__copyright__ = '2010, Hiroshi Miura ' -''' -www.nikkei.com -''' - -import re -from calibre.web.feeds.recipes import BasicNewsRecipe -import mechanize -from calibre.ptempfile import PersistentTemporaryFile - - -class NikkeiNet_sub_shakai(BasicNewsRecipe): - title = u'\u65e5\u7d4c\u65b0\u805e\u96fb\u5b50\u7248(Social)' - __author__ = 'Hiroshi Miura' - description = 'News and current market affairs from Japan' - cover_url = 'http://parts.nikkei.com/parts/ds/images/common/logo_r1.svg' - masthead_url = 'http://parts.nikkei.com/parts/ds/images/common/logo_r1.svg' - needs_subscription = True - oldest_article = 2 - max_articles_per_feed = 20 - language = 'ja' - remove_javascript = False - temp_files = [] - - remove_tags_before = {'class':"cmn-section cmn-indent"} - remove_tags = [ - {'class':"JSID_basePageMove JSID_baseAsyncSubmit cmn-form_area JSID_optForm_utoken"}, - {'class':"cmn-article_keyword cmn-clearfix"}, - {'class':"cmn-print_headline cmn-clearfix"}, - ] - remove_tags_after = {'class':"cmn-pr_list"} - - feeds = [ - (u'\u793e\u4f1a', u'http://www.zou3.net/php/rss/nikkei2rss.php?head=shakai') - ] - - def get_browser(self): - br = BasicNewsRecipe.get_browser(self) - - cj = mechanize.LWPCookieJar() - br.set_cookiejar(cj) - - #br.set_debug_http(True) - #br.set_debug_redirects(True) - #br.set_debug_responses(True) - - if self.username is not None and self.password is not None: - #print "----------------------------get login form--------------------------------------------" - # open login form - br.open('https://id.nikkei.com/lounge/nl/base/LA0010.seam') - response = br.response() - #print "----------------------------get login form---------------------------------------------" - #print "----------------------------set login form---------------------------------------------" - # remove disabled input which brings error on mechanize - response.set_data(response.get_data().replace("", " -->")) - br.set_response(response) - br.select_form(name='LA0010Form01') - br['LA0010Form01:LA0010Email'] = self.username - br['LA0010Form01:LA0010Password'] = self.password - br.form.find_control(id='LA0010Form01:LA0010AutoLoginOn',type="checkbox").get(nr=0).selected = True - br.submit() - br.response() - #print "----------------------------send login form---------------------------------------------" - #print "----------------------------open news main page-----------------------------------------" - # open news site - br.open('http://www.nikkei.com/') - br.response() - #print "----------------------------www.nikkei.com BODY --------------------------------------" - #print response2.get_data() - #print "-------------------------^^-got auto redirect form----^^--------------------------------" - # forced redirect in default - br.select_form(nr=0) - br.submit() - response3 = br.response() - # return some cookie which should be set by Javascript - #print response3.geturl() - raw = response3.get_data() - #print "---------------------------response to form --------------------------------------------" - # grab cookie from JS and set it - redirectflag = re.search(r"var checkValue = '(\d+)';", raw, re.M).group(1) - br.select_form(nr=0) - - self.temp_files.append(PersistentTemporaryFile('_fa.html')) - self.temp_files[-1].write("#LWP-Cookies-2.0\n") - - self.temp_files[-1].write("Set-Cookie3: Cookie-dummy=Cookie-value; domain=\".nikkei.com\"; path=\"/\"; path_spec; secure; expires=\"2029-12-21 05:07:59Z\"; version=0\n") - self.temp_files[-1].write("Set-Cookie3: redirectFlag="+redirectflag+"; domain=\".nikkei.com\"; path=\"/\"; path_spec; secure; expires=\"2029-12-21 05:07:59Z\"; version=0\n") - self.temp_files[-1].close() - cj.load(self.temp_files[-1].name) - - br.submit() - - #br.set_debug_http(False) - #br.set_debug_redirects(False) - #br.set_debug_responses(False) - return br - - - - diff --git a/recipes/nikkei_sub_sports.recipe b/recipes/nikkei_sub_sports.recipe deleted file mode 100644 index 47e335a4c3..0000000000 --- a/recipes/nikkei_sub_sports.recipe +++ /dev/null @@ -1,108 +0,0 @@ - -__license__ = 'GPL v3' -__copyright__ = '2010, Hiroshi Miura ' -''' -www.nikkei.com -''' - -import re -from calibre.web.feeds.recipes import BasicNewsRecipe -import mechanize -from calibre.ptempfile import PersistentTemporaryFile - - -class NikkeiNet_sub_sports(BasicNewsRecipe): - title = u'\u65e5\u7d4c\u65b0\u805e\u96fb\u5b50\u7248(\u30b9\u30dd\u30fc\u30c4)' - __author__ = 'Hiroshi Miura' - description = 'News and current market affairs from Japan' - cover_url = 'http://parts.nikkei.com/parts/ds/images/common/logo_r1.svg' - masthead_url = 'http://parts.nikkei.com/parts/ds/images/common/logo_r1.svg' - needs_subscription = True - oldest_article = 2 - max_articles_per_feed = 20 - language = 'ja' - remove_javascript = False - temp_files = [] - - remove_tags_before = {'class':"cmn-section cmn-indent"} - remove_tags = [ - {'class':"JSID_basePageMove JSID_baseAsyncSubmit cmn-form_area JSID_optForm_utoken"}, - {'class':"cmn-article_keyword cmn-clearfix"}, - {'class':"cmn-print_headline cmn-clearfix"}, - ] - remove_tags_after = {'class':"cmn-pr_list"} - - feeds = [ - (u'\u30b9\u30dd\u30fc\u30c4\uff1a\u30d7\u30ed\u91ce\u7403', u'http://www.zou3.net/php/rss/nikkei2rss.php?head=baseball'), - (u'\u30b9\u30dd\u30fc\u30c4\uff1a\u5927\u30ea\u30fc\u30b0', u'http://www.zou3.net/php/rss/nikkei2rss.php?head=mlb'), - (u'\u30b9\u30dd\u30fc\u30c4\uff1a\u30b5\u30c3\u30ab\u30fc', u'http://www.zou3.net/php/rss/nikkei2rss.php?head=soccer'), - (u'\u30b9\u30dd\u30fc\u30c4\uff1a\u30b4\u30eb\u30d5', u'http://www.zou3.net/php/rss/nikkei2rss.php?head=golf'), - (u'\u30b9\u30dd\u30fc\u30c4\uff1a\u76f8\u64b2', u'http://www.zou3.net/php/rss/nikkei2rss.php?head=sumou'), - (u'\u30b9\u30dd\u30fc\u30c4\uff1a\u7af6\u99ac', u'http://www.zou3.net/php/rss/nikkei2rss.php?head=keiba') - ] - - def get_browser(self): - br = BasicNewsRecipe.get_browser(self) - - cj = mechanize.LWPCookieJar() - br.set_cookiejar(cj) - - #br.set_debug_http(True) - #br.set_debug_redirects(True) - #br.set_debug_responses(True) - - if self.username is not None and self.password is not None: - #print "----------------------------get login form--------------------------------------------" - # open login form - br.open('https://id.nikkei.com/lounge/nl/base/LA0010.seam') - response = br.response() - #print "----------------------------get login form---------------------------------------------" - #print "----------------------------set login form---------------------------------------------" - # remove disabled input which brings error on mechanize - response.set_data(response.get_data().replace("", " -->")) - br.set_response(response) - br.select_form(name='LA0010Form01') - br['LA0010Form01:LA0010Email'] = self.username - br['LA0010Form01:LA0010Password'] = self.password - br.form.find_control(id='LA0010Form01:LA0010AutoLoginOn',type="checkbox").get(nr=0).selected = True - br.submit() - br.response() - #print "----------------------------send login form---------------------------------------------" - #print "----------------------------open news main page-----------------------------------------" - # open news site - br.open('http://www.nikkei.com/') - br.response() - #print "----------------------------www.nikkei.com BODY --------------------------------------" - #print response2.get_data() - #print "-------------------------^^-got auto redirect form----^^--------------------------------" - # forced redirect in default - br.select_form(nr=0) - br.submit() - response3 = br.response() - # return some cookie which should be set by Javascript - #print response3.geturl() - raw = response3.get_data() - #print "---------------------------response to form --------------------------------------------" - # grab cookie from JS and set it - redirectflag = re.search(r"var checkValue = '(\d+)';", raw, re.M).group(1) - br.select_form(nr=0) - - self.temp_files.append(PersistentTemporaryFile('_fa.html')) - self.temp_files[-1].write("#LWP-Cookies-2.0\n") - - self.temp_files[-1].write("Set-Cookie3: Cookie-dummy=Cookie-value; domain=\".nikkei.com\"; path=\"/\"; path_spec; secure; expires=\"2029-12-21 05:07:59Z\"; version=0\n") - self.temp_files[-1].write("Set-Cookie3: redirectFlag="+redirectflag+"; domain=\".nikkei.com\"; path=\"/\"; path_spec; secure; expires=\"2029-12-21 05:07:59Z\"; version=0\n") - self.temp_files[-1].close() - cj.load(self.temp_files[-1].name) - - br.submit() - - #br.set_debug_http(False) - #br.set_debug_redirects(False) - #br.set_debug_responses(False) - return br - - - - diff --git a/recipes/uninohimitu.recipe b/recipes/uninohimitu.recipe index aac412744c..7740307142 100644 --- a/recipes/uninohimitu.recipe +++ b/recipes/uninohimitu.recipe @@ -1,5 +1,5 @@ __license__ = 'GPL v3' -__copyright__ = '2010, Hiroshi Miura ' +__copyright__ = '2010,2014, Hiroshi Miura ' ''' http://ameblo.jp/sauta19/ ''' @@ -18,7 +18,7 @@ class UniNoHimituKichiBlog(BasicNewsRecipe): category = 'cat, pet, japan' language = 'ja' encoding = 'utf-8' - + keep_only_tags = [{'class':'entry_head'},{'class':'subContentsInner'}] feeds = [(u'blog', u'http://feedblog.ameba.jp/rss/ameblo/sauta19/rss20.xml')] def parse_feeds(self): From 4bb290657d7144586fb47f3070f7854b6bacd1c3 Mon Sep 17 00:00:00 2001 From: Hiroshi Miura Date: Tue, 6 May 2014 23:16:13 +0900 Subject: [PATCH 2/2] recipe: update yomiuri onlie world Signed-off-by: Hiroshi Miura --- recipes/yomiuri.recipe | 43 +++++++++++++--------------- recipes/yomiuri_world.recipe | 54 ++++++++++++++++++++++-------------- 2 files changed, 53 insertions(+), 44 deletions(-) diff --git a/recipes/yomiuri.recipe b/recipes/yomiuri.recipe index fb17bb1210..ef9d86e18a 100644 --- a/recipes/yomiuri.recipe +++ b/recipes/yomiuri.recipe @@ -1,5 +1,5 @@ __license__ = 'GPL v3' -__copyright__ = '2010, Hiroshi Miura ' +__copyright__ = '2010,2014, Hiroshi Miura ' ''' www.yomiuri.co.jp ''' @@ -16,16 +16,13 @@ class YOLNews(BasicNewsRecipe): publisher = 'Yomiuri Online News' category = 'news, japan' language = 'ja' - encoding = 'Shift_JIS' + encoding = 'UTF-8' index = 'http://www.yomiuri.co.jp/latestnews/' remove_javascript = True masthead_title = u'YOMIURI ONLINE' - keep_only_tags = [{'class':"article-def"}] - remove_tags = [{'class':"RelatedArticle"}, - {'class':"sbtns"} - ] - remove_tags_after = {'class':"date-def"} + + keep_only_tags = [{'class':"article text-resizeable"}] def parse_feeds(self): feeds = BasicNewsRecipe.parse_feeds(self) @@ -42,22 +39,22 @@ class YOLNews(BasicNewsRecipe): def parse_index(self): feeds = [] + newsarticles = [] soup = self.index_to_soup(self.index) - topstories = soup.find('ul',attrs={'class':'list-def'}) - if topstories: - newsarticles = [] - for itt in topstories.findAll('li'): - itema = itt.find('a',href=True) - if itema: - itd1 = itema.findNextSibling(text = True) - itd2 = itd1.findNextSibling(text = True) - itd3 = itd2.findNextSibling(text = True) - newsarticles.append({ - 'title' :itema.string - ,'date' :''.join([itd1, itd2, itd3]) - ,'url' :'http://www.yomiuri.co.jp' + itema['href'] - ,'description':'' - }) - feeds.append(('latest', newsarticles)) + listlatest = soup.find('ul', attrs={'class':'list-common list-common-latest'}) + if listlatest: + for itt in listlatest.findAll('li'): + itema = itt.find('a',href=True) + if itema: + item_headline = itema.find('span',attrs={'class':'headline'}) + item_date = item_headline.find('span',attrs={'class':'update'}) + newsarticles.append({ + 'title' :item_headline.contents[0] + ,'date' :item_date + ,'url' :itema['href'] + ,'description':'' + }) + feeds.append(('latest', newsarticles)) return feeds + diff --git a/recipes/yomiuri_world.recipe b/recipes/yomiuri_world.recipe index 41ee4fd23d..d7570d4753 100644 --- a/recipes/yomiuri_world.recipe +++ b/recipes/yomiuri_world.recipe @@ -1,5 +1,5 @@ __license__ = 'GPL v3' -__copyright__ = '2010, Hiroshi Miura ' +__copyright__ = '2010,2014, Hiroshi Miura ' ''' www.yomiuri.co.jp ''' @@ -16,16 +16,12 @@ class YOLNews(BasicNewsRecipe): publisher = 'Yomiuri Online News' category = 'news, japan' language = 'ja' - encoding = 'Shift_JIS' + encoding = 'UTF-8' index = 'http://www.yomiuri.co.jp/world/' remove_javascript = True masthead_title = u"YOMIURI ONLINE" - keep_only_tags = [{'class':"article-def"}] - remove_tags = [{'class':"RelatedArticle"}, - {'class':"sbtns"} - ] - remove_tags_after = {'class':"date-def"} + keep_only_tags = [{'class':"article text-resizeable"}] def parse_feeds(self): feeds = BasicNewsRecipe.parse_feeds(self) @@ -42,20 +38,36 @@ class YOLNews(BasicNewsRecipe): def parse_index(self): feeds = [] + newsarticles = [] soup = self.index_to_soup(self.index) - topstories = soup.find('ul',attrs={'class':'list-def'}) - if topstories: - newsarticles = [] - for itt in topstories.findAll('li'): - itema = itt.find('a',href=True) - if itema: - itd1 = itema.findNextSibling(text = True) - newsarticles.append({ - 'title' :itema.string - ,'date' :''.join([itd1]) - ,'url' :'http://www.yomiuri.co.jp' + itema['href'] - ,'description':'' - }) - feeds.append(('World', newsarticles)) + mainspan = soup.find('div', attrs={'class':'pbNested span-main-inr'}) + if mainspan: + topstories = mainspan.find('ul',attrs={'class':'list-top'}) + if topstories: + for itt in topstories.findAll('li'): + itema = itt.find('a',href=True) + if itema: + item_headline = itema.find('span',attrs={'class':'headline'}) + item_date = item_headline.find('span',attrs={'class':'update'}) + newsarticles.append({ + 'title' :item_headline.contents[0] + ,'date' :item_date + ,'url' :itema['href'] + ,'description':'' + }) + secondstories = mainspan.find('ul', attrs={'class':'list-common'}) + if secondstories: + for itt in secondstories.findAll('li'): + itema = itt.find('a',href=True) + if itema: + item_headline = itema.find('span',attrs={'class':'headline'}) + item_date = item_headline.find('span',attrs={'class':'update'}) + newsarticles.append({ + 'title' :item_headline.contents[0] + ,'date' :item_date + ,'url' :itema['href'] + ,'description':'' + }) + feeds.append(('World', newsarticles)) return feeds