Nikkei Social news by Hiroshi Miura

This commit is contained in:
Kovid Goyal 2010-12-06 12:47:54 -07:00
commit 91cdd30620
6 changed files with 136 additions and 7 deletions

View File

@ -4,6 +4,7 @@ __copyright__ = '2010, Hiroshi Miura <miurahr@linux.com>'
www.mainichi.jp www.mainichi.jp
''' '''
import re
from calibre.web.feeds.news import BasicNewsRecipe from calibre.web.feeds.news import BasicNewsRecipe
class MainichiDailyNews(BasicNewsRecipe): class MainichiDailyNews(BasicNewsRecipe):
@ -22,3 +23,18 @@ class MainichiDailyNews(BasicNewsRecipe):
remove_tags = [{'class':"RelatedArticle"}] remove_tags = [{'class':"RelatedArticle"}]
remove_tags_after = {'class':"Credit"} remove_tags_after = {'class':"Credit"}
def parse_feeds(self):
    '''
    Parse the feeds, then drop every article whose URL contains pheedo.jp.

    Delegates to BasicNewsRecipe.parse_feeds and filters each feed's
    ``articles`` list in place.

    Returns the list of feeds produced by the base implementation.
    '''
    feeds = BasicNewsRecipe.parse_feeds(self)
    for curfeed in feeds:
        # Slice assignment keeps the same list object on the feed, so any
        # other reference to curfeed.articles sees the filtered result.
        # The dot is escaped so only a literal "pheedo.jp" matches.
        curfeed.articles[:] = [
            article for article in curfeed.articles
            if not re.search(r'pheedo\.jp', article.url)
        ]
    return feeds

View File

@ -14,5 +14,19 @@ class MainichiDailyITNews(BasicNewsRecipe):
remove_tags_before = {'class':"NewsTitle"} remove_tags_before = {'class':"NewsTitle"}
remove_tags = [{'class':"RelatedArticle"}] remove_tags = [{'class':"RelatedArticle"}]
remove_tags_after = {'class':"Credit"}
def parse_feeds(self):
    '''
    Parse the feeds, then drop every article whose URL contains pheedo.jp.

    Delegates to BasicNewsRecipe.parse_feeds and filters each feed's
    ``articles`` list in place.

    Returns the list of feeds produced by the base implementation.
    '''
    feeds = BasicNewsRecipe.parse_feeds(self)
    for curfeed in feeds:
        # Slice assignment keeps the same list object on the feed, so any
        # other reference to curfeed.articles sees the filtered result.
        # The dot is escaped so only a literal "pheedo.jp" matches.
        curfeed.articles[:] = [
            article for article in curfeed.articles
            if not re.search(r'pheedo\.jp', article.url)
        ]
    return feeds

View File

@ -32,12 +32,9 @@ class NikkeiNet_sub_life(BasicNewsRecipe):
remove_tags_after = {'class':"cmn-pr_list"} remove_tags_after = {'class':"cmn-pr_list"}
feeds = [ (u'\u304f\u3089\u3057', u'http://www.zou3.net/php/rss/nikkei2rss.php?head=kurashi'), feeds = [ (u'\u304f\u3089\u3057', u'http://www.zou3.net/php/rss/nikkei2rss.php?head=kurashi'),
(u'\u30b9\u30dd\u30fc\u30c4', u'http://www.zou3.net/php/rss/nikkei2rss.php?head=sports'),
(u'\u793e\u4f1a', u'http://www.zou3.net/php/rss/nikkei2rss.php?head=shakai'),
(u'\u30a8\u30b3', u'http://www.zou3.net/php/rss/nikkei2rss.php?head=eco'), (u'\u30a8\u30b3', u'http://www.zou3.net/php/rss/nikkei2rss.php?head=eco'),
(u'\u5065\u5eb7', u'http://www.zou3.net/php/rss/nikkei2rss.php?head=kenkou'), (u'\u5065\u5eb7', u'http://www.zou3.net/php/rss/nikkei2rss.php?head=kenkou'),
(u'\u7279\u96c6', u'http://www.zou3.net/php/rss/nikkei2rss.php?head=special'), (u'\u7279\u96c6', u'http://www.zou3.net/php/rss/nikkei2rss.php?head=special')
(u'\u30e9\u30f3\u30ad\u30f3\u30b0', u'http://www.zou3.net/php/rss/nikkei2rss.php?head=ranking')
] ]
def get_browser(self): def get_browser(self):

View File

@ -0,0 +1,102 @@
__license__ = 'GPL v3'
__copyright__ = '2010, Hiroshi Miura <miurahr@linux.com>'
'''
www.nikkei.com
'''
import re
from calibre.web.feeds.recipes import BasicNewsRecipe
import mechanize
from calibre.ptempfile import PersistentTemporaryFile
class NikkeiNet_sub_life(BasicNewsRecipe):
    '''
    Recipe for the subscription edition of www.nikkei.com, fetching the
    single feed named u'\\u793e\\u4f1a' (head=shakai) listed in ``feeds``.

    get_browser() logs in with self.username / self.password before any
    article is downloaded.
    '''
    # The title previously ended in (\u751f\u6d3b), which did not match the
    # only feed this recipe defines; it now carries the feed's own name.
    title = u'\u65e5\u7d4c\u65b0\u805e\u96fb\u5b50\u7248(\u793e\u4f1a)'
    __author__ = 'Hiroshi Miura'
    description = 'News and current market affairs from Japan'
    cover_url = 'http://parts.nikkei.com/parts/ds/images/common/logo_r1.svg'
    masthead_url = 'http://parts.nikkei.com/parts/ds/images/common/logo_r1.svg'
    needs_subscription = True
    oldest_article = 2
    max_articles_per_feed = 20
    language = 'ja'
    remove_javascript = False
    # Temporary files created by get_browser(); kept referenced here.
    temp_files = []

    remove_tags_before = {'class':"cmn-section cmn-indent"}
    remove_tags = [
        {'class':"JSID_basePageMove JSID_baseAsyncSubmit cmn-form_area JSID_optForm_utoken"},
        {'class':"cmn-article_keyword cmn-clearfix"},
        {'class':"cmn-print_headline cmn-clearfix"},
    ]
    remove_tags_after = {'class':"cmn-pr_list"}

    feeds = [
        (u'\u793e\u4f1a', u'http://www.zou3.net/php/rss/nikkei2rss.php?head=shakai')
    ]

    def get_browser(self):
        '''
        Return a browser with its own LWP cookie jar; when both username
        and password are set, log in to id.nikkei.com first.

        Raises ValueError if the redirect response does not contain the
        ``checkValue`` JavaScript variable the cookie is built from.
        '''
        # NOTE(review): called without `self`; this only works if
        # BasicNewsRecipe.get_browser is a classmethod/staticmethod in the
        # targeted calibre version -- confirm.
        br = BasicNewsRecipe.get_browser()

        cj = mechanize.LWPCookieJar()
        br.set_cookiejar(cj)

        if self.username is not None and self.password is not None:
            # open login form
            br.open('https://id.nikkei.com/lounge/nl/base/LA0010.seam')
            response = br.response()

            # remove disabled input which brings error on mechanize:
            # the two replacements wrap it in an HTML comment
            response.set_data(response.get_data().replace("<input id=\"j_id48\"", "<!-- "))
            response.set_data(response.get_data().replace("gm_home_on.gif\" />", " -->"))
            br.set_response(response)

            # fill in and send the login form
            br.select_form(name='LA0010Form01')
            br['LA0010Form01:LA0010Email'] = self.username
            br['LA0010Form01:LA0010Password'] = self.password
            br.form.find_control(id='LA0010Form01:LA0010AutoLoginOn',type="checkbox").get(nr=0).selected = True
            br.submit()

            # open news site; per the original author's notes it answers
            # with an auto-redirect form, which is submitted as-is
            br.open('http://www.nikkei.com/')
            br.select_form(nr=0)
            br.submit()
            response3 = br.response()
            raw = response3.get_data()

            # grab the value the page's JavaScript would store in a cookie
            found = re.search(r"var checkValue = '(\d+)';", raw, re.M)
            if found is None:
                raise ValueError(
                    'checkValue not found in the www.nikkei.com redirect response')
            redirectflag = found.group(1)
            br.select_form(nr=0)

            # write the cookie in LWP cookie-file format and load it into
            # the jar before submitting the selected form
            cookie_file = PersistentTemporaryFile('_fa.html')
            self.temp_files.append(cookie_file)
            cookie_file.write("#LWP-Cookies-2.0\n")
            cookie_file.write("Set-Cookie3: Cookie-dummy=Cookie-value; domain=\".nikkei.com\"; path=\"/\"; path_spec; secure; expires=\"2029-12-21 05:07:59Z\"; version=0\n")
            cookie_file.write("Set-Cookie3: redirectFlag="+redirectflag+"; domain=\".nikkei.com\"; path=\"/\"; path_spec; secure; expires=\"2029-12-21 05:07:59Z\"; version=0\n")
            cookie_file.close()
            cj.load(cookie_file.name)
            br.submit()

        return br

View File

@ -21,7 +21,7 @@ class YOLNews(BasicNewsRecipe):
remove_javascript = True remove_javascript = True
masthead_title = u'YOMIURI ONLINE' masthead_title = u'YOMIURI ONLINE'
remove_tags_before = {'class':"article-def"} keep_only_tags = [{'class':"article-def"}]
remove_tags = [{'class':"RelatedArticle"}, remove_tags = [{'class':"RelatedArticle"},
{'class':"sbtns"} {'class':"sbtns"}
] ]

View File

@ -21,7 +21,7 @@ class YOLNews(BasicNewsRecipe):
remove_javascript = True remove_javascript = True
masthead_title = u"YOMIURI ONLINE" masthead_title = u"YOMIURI ONLINE"
remove_tags_before = {'class':"article-def"} keep_only_tags = [{'class':"article-def"}]
remove_tags = [{'class':"RelatedArticle"}, remove_tags = [{'class':"RelatedArticle"},
{'class':"sbtns"} {'class':"sbtns"}
] ]