From 940d3fd6a6a3fd9dcb957753541606c6da08aa20 Mon Sep 17 00:00:00 2001 From: Hiroshi Miura Date: Sun, 7 Nov 2010 08:43:56 +0900 Subject: [PATCH 01/18] Introduce several news recipes - nikkei * free subscribe version works * subscriber version need more work - CNet/Japan - The H --- resources/recipes/cnet.recipe | 30 +++++++++ resources/recipes/nikkei_free.recipe | 48 +++++++++++++++ resources/recipes/nikkei_sports_sub.recipe | 48 +++++++++++++++ resources/recipes/nikkei_sub.recipe | 71 ++++++++++++++++++++++ resources/recipes/the_h.recipe | 23 +++++++ 5 files changed, 220 insertions(+) create mode 100644 resources/recipes/cnet.recipe create mode 100644 resources/recipes/nikkei_free.recipe create mode 100644 resources/recipes/nikkei_sports_sub.recipe create mode 100644 resources/recipes/nikkei_sub.recipe create mode 100644 resources/recipes/the_h.recipe diff --git a/resources/recipes/cnet.recipe b/resources/recipes/cnet.recipe new file mode 100644 index 0000000000..8f81630762 --- /dev/null +++ b/resources/recipes/cnet.recipe @@ -0,0 +1,30 @@ +import re; + +class AdvancedUserRecipe1287588358(BasicNewsRecipe): + title = u'CNET Japan' + oldest_article = 3 + max_articles_per_feed = 30 + + feeds = [(u'cnet rss', u'http://feeds.japan.cnet.com/cnet/rss')] + language = 'ja' + encoding = 'Shift_JIS' + remove_javascript = True + + preprocess_regexps = [ + (re.compile(ur'.*', re.DOTALL|re.IGNORECASE|re.UNICODE), + lambda match: ''), + (re.compile(r'.*', re.DOTALL|re.IGNORECASE), + lambda match: ''), + (re.compile(ur'.*', re.UNICODE), + lambda match: ''), + ] + + remove_tags_before = dict(name="h2") + remove_tags = [ + {'class':"social_bkm_share"}, + {'class':"social_bkm_print"}, + {'class':"block20 clearfix"}, + dict(name="div",attrs={'id':'bookreview'}), + ] + remove_tags_after = {'class':"block20"} + diff --git a/resources/recipes/nikkei_free.recipe b/resources/recipes/nikkei_free.recipe new file mode 100644 index 0000000000..f3a51e2fd6 --- /dev/null +++ b/resources/recipes/nikkei_free.recipe @@ -0,0 +1,48 @@ +class AdvancedUserRecipe1287958571(BasicNewsRecipe): + title = u'\u65e5\u7d4c\u65b0\u805e\u96fb\u5b50\u7248(Free)' + __author__ = 'Hiroshi Miura' + description = 'News and current market affairs from Japan' + oldest_article = 2 + max_articles_per_feed = 20 + language = 'ja' + + feeds = [ (u'\u65e5\u7d4c\u4f01\u696d', u'http://www.zou3.net/php/rss/nikkei2rss.php?head=sangyo'), + (u'\u65e5\u7d4c\u88fd\u54c1', u'http://www.zou3.net/php/rss/nikkei2rss.php?head=newpro'), + (u'internet', u'http://www.zou3.net/php/rss/nikkei2rss.php?head=internet'), + (u'\u653f\u6cbb', u'http://www.zou3.net/php/rss/nikkei2rss.php?head=seiji'), + (u'\u8ca1\u52d9', u'http://www.zou3.net/php/rss/nikkei2rss.php?head=zaimu'), + (u'\u7d4c\u6e08', u'http://www.zou3.net/php/rss/nikkei2rss.php?head=keizai'), + (u'\u56fd\u969b', u'http://www.zou3.net/php/rss/nikkei2rss.php?head=kaigai'), + (u'\u79d1\u5b66', u'http://www.zou3.net/php/rss/nikkei2rss.php?head=kagaku'), + (u'\u30de\u30fc\u30b1\u30c3\u30c8', u'http://www.zou3.net/php/rss/nikkei2rss.php?head=market'), + (u'\u304f\u3089\u3057', u'http://www.zou3.net/php/rss/nikkei2rss.php?head=kurashi'), + (u'\u30b9\u30dd\u30fc\u30c4', u'http://www.zou3.net/php/rss/nikkei2rss.php?head=sports'), + (u'\u793e\u4f1a', u'http://www.zou3.net/php/rss/nikkei2rss.php?head=shakai'), + (u'\u30a8\u30b3', u'http://www.zou3.net/php/rss/nikkei2rss.php?head=eco'), + (u'\u5065\u5eb7', u'http://www.zou3.net/php/rss/nikkei2rss.php?head=kenkou'), + (u'\u96c7\u7528', u'http://www.zou3.net/php/rss/nikkei2rss.php?head=koyou'), + (u'\u6559\u80b2', u'http://www.zou3.net/php/rss/nikkei2rss.php?head=kyouiku'), + (u'\u304a\u304f\u3084\u307f', u'http://www.zou3.net/php/rss/nikkei2rss.php?head=okuyami'), + (u'\u4eba\u4e8b', u'http://www.zou3.net/php/rss/nikkei2rss.php?head=zinzi'), + (u'\u7279\u96c6', u'http://www.zou3.net/php/rss/nikkei2rss.php?head=special'), + (u'\u5730\u57df\u30cb\u30e5\u30fc\u30b9', u'http://www.zou3.net/php/rss/nikkei2rss.php?head=local'), + (u'\u7d71\u8a08\u30fb\u767d\u66f8', u'http://www.zou3.net/php/rss/nikkei2rss.php?head=report'), + (u'\u30e9\u30f3\u30ad\u30f3\u30b0', u'http://www.zou3.net/php/rss/nikkei2rss.php?head=ranking'), + (u'\u4f1a\u898b', u'http://www.zou3.net/php/rss/nikkei2rss.php?head=interview'), + (u'\u793e\u8aac\u30fb\u6625\u79cb', u'http://www.zou3.net/php/rss/nikkei2rss.php?head=shasetsu'), + (u'\u30b9\u30dd\u30fc\u30c4\uff1a\u30d7\u30ed\u91ce\u7403', u'http://www.zou3.net/php/rss/nikkei2rss.php?head=baseball'), + (u'\u30b9\u30dd\u30fc\u30c4\uff1a\u5927\u30ea\u30fc\u30b0', u'http://www.zou3.net/php/rss/nikkei2rss.php?head=mlb'), + (u'\u30b9\u30dd\u30fc\u30c4\uff1a\u30b5\u30c3\u30ab\u30fc', u'http://www.zou3.net/php/rss/nikkei2rss.php?head=soccer'), + (u'\u30b9\u30dd\u30fc\u30c4\uff1a\u30b4\u30eb\u30d5', u'http://www.zou3.net/php/rss/nikkei2rss.php?head=golf'), + (u'\u30b9\u30dd\u30fc\u30c4\uff1a\u76f8\u64b2', u'http://www.zou3.net/php/rss/nikkei2rss.php?head=sumou'), + (u'\u30b9\u30dd\u30fc\u30c4\uff1a\u7af6\u99ac', u'http://www.zou3.net/php/rss/nikkei2rss.php?head=keiba'), + (u'\u8abf\u67fb\u30fb\u30a2\u30f3\u30b1\u30fc\u30c8', u'http://www.zou3.net/php/rss/nikkei2rss.php?head=research') + ] + + remove_tags_before = dict(id="CONTENTS") + remove_tags = [ + dict(name="form"), + {'class':"cmn-hide"}, + ] + remove_tags_after = {'class':"cmn-pr_list"} + diff --git a/resources/recipes/nikkei_sports_sub.recipe b/resources/recipes/nikkei_sports_sub.recipe new file mode 100644 index 0000000000..350a749dc6 --- /dev/null +++ b/resources/recipes/nikkei_sports_sub.recipe @@ -0,0 +1,48 @@ +import string, re, sys +from calibre import strftime +from calibre.web.feeds.recipes import BasicNewsRecipe + +class NikkeiNet_subscription(BasicNewsRecipe): + title = u'\u65e5\u7d4c\u65b0\u805e\u96fb\u5b50\u7248' + __author__ = 'Hiroshi Miura' + description = 'News and current market affairs from Japan' + needs_subscription = True + oldest_article = 3 + max_articles_per_feed = 20 + language = 'ja' + + feeds = [ + (u'\u30b9\u30dd\u30fc\u30c4\uff1a\u30d7\u30ed\u91ce\u7403', u'http://www.zou3.net/php/rss/nikkei2rss.php?head=baseball'), + (u'\u30b9\u30dd\u30fc\u30c4\uff1a\u5927\u30ea\u30fc\u30b0', u'http://www.zou3.net/php/rss/nikkei2rss.php?head=mlb'), + (u'\u30b9\u30dd\u30fc\u30c4\uff1a\u30b5\u30c3\u30ab\u30fc', u'http://www.zou3.net/php/rss/nikkei2rss.php?head=soccer'), + (u'\u30b9\u30dd\u30fc\u30c4\uff1a\u30b4\u30eb\u30d5', u'http://www.zou3.net/php/rss/nikkei2rss.php?head=golf'), + (u'\u30b9\u30dd\u30fc\u30c4\uff1a\u76f8\u64b2', u'http://www.zou3.net/php/rss/nikkei2rss.php?head=sumou'), + (u'\u30b9\u30dd\u30fc\u30c4\uff1a\u7af6\u99ac', u'http://www.zou3.net/php/rss/nikkei2rss.php?head=keiba') + ] + + remove_tags_before = dict(id="CONTENTS") + remove_tags = [ + dict(name="form"), + {'class':"cmn-hide"}, + ] + remove_tags_after = {'class':"cmn-pr_list"} + recursions = 4 + + def get_browser(self): + br = BasicNewsRecipe.get_browser() + if self.username is not None and self.password is not None: + br.open('https://id.nikkei.com/lounge/nl/base/LA0010.seam?cid=123456&flashId=654321') + assert br.viewing_html() + print br.title() + response = br.response() + response.set_data(response.get_data().replace("", " -->")) + br.set_response(response) + br.select_form(name='LA0010Form01') + br['LA0010Form01:LA0010Email'] = self.username + br['LA0010Form01:LA0010Password'] = self.password + res = br.submit() + raw = res.read() + if '日経IDのサービス一覧へ' not in raw: + raise ValueError('Failed to log in to nikkei.net, check your username(email address) and password') + return br diff --git a/resources/recipes/nikkei_sub.recipe b/resources/recipes/nikkei_sub.recipe new file mode 100644 index 0000000000..8eab6ec328 --- /dev/null +++ b/resources/recipes/nikkei_sub.recipe @@ -0,0 +1,71 @@ +import string, re, sys +from calibre import strftime +from calibre.web.feeds.recipes import BasicNewsRecipe + +class NikkeiNet_subscription(BasicNewsRecipe): + title = u'\u65e5\u7d4c\u65b0\u805e\u96fb\u5b50\u7248' + __author__ = 'Hiroshi Miura' + description = 'News and current market affairs from Japan' + needs_subscription = True + oldest_article = 2 + max_articles_per_feed = 20 + language = 'ja' + recursions = 3 + remove_javascript = False + + feeds = [ (u'\u65e5\u7d4c\u4f01\u696d', u'http://www.zou3.net/php/rss/nikkei2rss.php?head=sangyo'), + (u'\u65e5\u7d4c\u88fd\u54c1', u'http://www.zou3.net/php/rss/nikkei2rss.php?head=newpro'), + (u'internet', u'http://www.zou3.net/php/rss/nikkei2rss.php?head=internet'), + (u'\u653f\u6cbb', u'http://www.zou3.net/php/rss/nikkei2rss.php?head=seiji'), + (u'\u8ca1\u52d9', u'http://www.zou3.net/php/rss/nikkei2rss.php?head=zaimu'), + (u'\u7d4c\u6e08', u'http://www.zou3.net/php/rss/nikkei2rss.php?head=keizai'), + (u'\u56fd\u969b', u'http://www.zou3.net/php/rss/nikkei2rss.php?head=kaigai'), + (u'\u79d1\u5b66', u'http://www.zou3.net/php/rss/nikkei2rss.php?head=kagaku'), + (u'\u30de\u30fc\u30b1\u30c3\u30c8', u'http://www.zou3.net/php/rss/nikkei2rss.php?head=market'), + (u'\u304f\u3089\u3057', u'http://www.zou3.net/php/rss/nikkei2rss.php?head=kurashi'), + (u'\u30b9\u30dd\u30fc\u30c4', u'http://www.zou3.net/php/rss/nikkei2rss.php?head=sports'), + (u'\u793e\u4f1a', u'http://www.zou3.net/php/rss/nikkei2rss.php?head=shakai'), + (u'\u30a8\u30b3', u'http://www.zou3.net/php/rss/nikkei2rss.php?head=eco'), + (u'\u5065\u5eb7', u'http://www.zou3.net/php/rss/nikkei2rss.php?head=kenkou'), + (u'\u96c7\u7528', u'http://www.zou3.net/php/rss/nikkei2rss.php?head=koyou'), + (u'\u6559\u80b2', u'http://www.zou3.net/php/rss/nikkei2rss.php?head=kyouiku'), + (u'\u304a\u304f\u3084\u307f', u'http://www.zou3.net/php/rss/nikkei2rss.php?head=okuyami'), + (u'\u4eba\u4e8b', u'http://www.zou3.net/php/rss/nikkei2rss.php?head=zinzi'), + (u'\u7279\u96c6', u'http://www.zou3.net/php/rss/nikkei2rss.php?head=special'), + (u'\u5730\u57df\u30cb\u30e5\u30fc\u30b9', u'http://www.zou3.net/php/rss/nikkei2rss.php?head=local'), + (u'\u7d71\u8a08\u30fb\u767d\u66f8', u'http://www.zou3.net/php/rss/nikkei2rss.php?head=report'), + (u'\u30e9\u30f3\u30ad\u30f3\u30b0', u'http://www.zou3.net/php/rss/nikkei2rss.php?head=ranking'), + (u'\u4f1a\u898b', u'http://www.zou3.net/php/rss/nikkei2rss.php?head=interview'), + (u'\u793e\u8aac\u30fb\u6625\u79cb', u'http://www.zou3.net/php/rss/nikkei2rss.php?head=shasetsu'), + (u'\u30b9\u30dd\u30fc\u30c4\uff1a\u30d7\u30ed\u91ce\u7403', u'http://www.zou3.net/php/rss/nikkei2rss.php?head=baseball'), + (u'\u30b9\u30dd\u30fc\u30c4\uff1a\u5927\u30ea\u30fc\u30b0', u'http://www.zou3.net/php/rss/nikkei2rss.php?head=mlb'), + (u'\u30b9\u30dd\u30fc\u30c4\uff1a\u30b5\u30c3\u30ab\u30fc', u'http://www.zou3.net/php/rss/nikkei2rss.php?head=soccer'), + (u'\u30b9\u30dd\u30fc\u30c4\uff1a\u30b4\u30eb\u30d5', u'http://www.zou3.net/php/rss/nikkei2rss.php?head=golf'), + (u'\u30b9\u30dd\u30fc\u30c4\uff1a\u76f8\u64b2', u'http://www.zou3.net/php/rss/nikkei2rss.php?head=sumou'), + (u'\u30b9\u30dd\u30fc\u30c4\uff1a\u7af6\u99ac', u'http://www.zou3.net/php/rss/nikkei2rss.php?head=keiba'), + (u'\u8abf\u67fb\u30fb\u30a2\u30f3\u30b1\u30fc\u30c8', u'http://www.zou3.net/php/rss/nikkei2rss.php?head=research') + ] + + def get_browser(self): + br = BasicNewsRecipe.get_browser() + if self.username is not None and self.password is not None: + br.open('https://id.nikkei.com/lounge/nl/base/LA0010.seam') + response = br.response() + response.set_data(response.get_data().replace("", " -->")) + br.set_response(response) + br.select_form(name='LA0010Form01') + br['LA0010Form01:LA0010Email'] = self.username + br['LA0010Form01:LA0010Password'] = self.password + res = br.submit() + raw = res.read() + if '日経IDのサービス一覧へ' not in raw: + raise ValueError('Failed to log in to nikkei.net, check your username(email address) and password') + br.open('http://www.nikkei.com/') + br.select_form(nr=0) + res = br.submit() + print res.read() + return br + + + diff --git a/resources/recipes/the_h.recipe b/resources/recipes/the_h.recipe new file mode 100644 index 0000000000..c25a39ca99 --- /dev/null +++ b/resources/recipes/the_h.recipe @@ -0,0 +1,23 @@ +class AdvancedUserRecipe1289003166(BasicNewsRecipe): + title = u'The H' + __author__ = 'Hiroshi Miura' + oldest_article = 3 + description = 'In association with Heise Online' + publisher = 'Heise Media UK Ltd.' + category = 'news, technology, security' + max_articles_per_feed = 100 + language = 'en' + encoding = 'utf-8' + conversion_options = { + 'comment' : description + ,'tags' : category + ,'publisher': publisher + ,'language' : language + } + feeds = [ + (u'The H News Feed', u'http://www.h-online.com/news/atom.xml') + ] + + def print_version(self, url): + return url + '?view=print' + From e9dec414a78d5d5c7a383130ce6d183953877075 Mon Sep 17 00:00:00 2001 From: Hiroshi Miura Date: Tue, 9 Nov 2010 07:48:06 +0900 Subject: [PATCH 02/18] introduce mainichi news paper and cnet japan renames --- .../recipes/{cnet.recipe => cnetjapan.recipe} | 2 +- resources/recipes/mainichi.recipe | 16 ++++++++++++++++ 2 files changed, 17 insertions(+), 1 deletion(-) rename resources/recipes/{cnet.recipe => cnetjapan.recipe} (95%) create mode 100644 resources/recipes/mainichi.recipe diff --git a/resources/recipes/cnet.recipe b/resources/recipes/cnetjapan.recipe similarity index 95% rename from resources/recipes/cnet.recipe rename to resources/recipes/cnetjapan.recipe index 8f81630762..03a4c2ce2d 100644 --- a/resources/recipes/cnet.recipe +++ b/resources/recipes/cnetjapan.recipe @@ -1,6 +1,6 @@ import re; -class AdvancedUserRecipe1287588358(BasicNewsRecipe): +class CNetJapan(BasicNewsRecipe): title = u'CNET Japan' oldest_article = 3 max_articles_per_feed = 30 diff --git a/resources/recipes/mainichi.recipe b/resources/recipes/mainichi.recipe new file mode 100644 index 0000000000..3653c2e252 --- /dev/null +++ b/resources/recipes/mainichi.recipe @@ -0,0 +1,16 @@ +class MainichiDailyNews(BasicNewsRecipe): + title = u'Mainichi News' + __author__ = 'Hiroshi Miura' + oldest_article = 2 + max_articles_per_feed = 100 + description = 'Japanese traditional newspaper Mainichi Daily News' + publisher = 'Mainichi Daily News' + category = 'news, japan' + language = 'ja' + + feeds = [(u'mainichi IT', u'http://mainichi.pheedo.jp/f/mainichijp_electronics')] + + remove_tags_before = {'class':"NewsTitle"} + remove_tags = [{'class':"RelatedArticle"}] + remove_tags_after = {'class':"ArticleLower"} + From b6403e541544c6ac656f80ba8921a8a5a08a45be Mon Sep 17 00:00:00 2001 From: Hiroshi Miura Date: Sun, 21 Nov 2010 20:36:22 +0900 Subject: [PATCH 03/18] introduce several categories for nikkei news --- resources/recipes/nikkei_sports_sub.recipe | 99 +++++++++++----- resources/recipes/nikkei_sub.recipe | 112 +++++++++++-------- resources/recipes/nikkei_sub_economy.recipe | 101 +++++++++++++++++ resources/recipes/nikkei_sub_industry.recipe | 99 ++++++++++++++++ resources/recipes/nikkei_sub_life.recipe | 100 +++++++++++++++++ resources/recipes/nikkei_sub_sports.recipe | 100 +++++++++++++++++ 6 files changed, 540 insertions(+), 71 deletions(-) create mode 100644 resources/recipes/nikkei_sub_economy.recipe create mode 100644 resources/recipes/nikkei_sub_industry.recipe create mode 100644 resources/recipes/nikkei_sub_life.recipe create mode 100644 resources/recipes/nikkei_sub_sports.recipe diff --git a/resources/recipes/nikkei_sports_sub.recipe b/resources/recipes/nikkei_sports_sub.recipe index 350a749dc6..c08404dac5 100644 --- a/resources/recipes/nikkei_sports_sub.recipe +++ b/resources/recipes/nikkei_sports_sub.recipe @@ -11,6 +11,79 @@ class NikkeiNet_subscription(BasicNewsRecipe): max_articles_per_feed = 20 language = 'ja' + remove_tags_before = {'class':"cmn-section cmn-indent"} + remove_tags = [ + {'class':"JSID_basePageMove JSID_baseAsyncSubmit cmn-form_area JSID_optForm_utoken"}, + {'class':"cmn-article_keyword cmn-clearfix"}, + {'class':"cmn-print_headline cmn-clearfix"}, + ] + remove_tags_after = {'class':"cmn-pr_list"} + + + def get_browser(self): + br = BasicNewsRecipe.get_browser() + + cj = mechanize.LWPCookieJar() + br.set_cookiejar(cj) + + #br.set_debug_http(True) + #br.set_debug_redirects(True) + #br.set_debug_responses(True) + + if self.username is not None and self.password is not None: + #print "----------------------------get login form--------------------------------------------" + # open login form + br.open('https://id.nikkei.com/lounge/nl/base/LA0010.seam') + response = br.response() + #print "----------------------------get login form---------------------------------------------" + #print "----------------------------set login form---------------------------------------------" + # remove disabled input which brings error on mechanize + response.set_data(response.get_data().replace("", " -->")) + br.set_response(response) + br.select_form(name='LA0010Form01') + br['LA0010Form01:LA0010Email'] = self.username + br['LA0010Form01:LA0010Password'] = self.password + br.form.find_control(id='LA0010Form01:LA0010AutoLoginOn',type="checkbox").get(nr=0).selected = True + br.submit() + response1 = br.response() + #print "----------------------------send login form---------------------------------------------" + #print "----------------------------open news main page-----------------------------------------" + # open news site + br.open('http://www.nikkei.com/') + response2 = br.response() + #print "----------------------------www.nikkei.com BODY --------------------------------------" + #print response2.get_data() + #print "-------------------------^^-got auto redirect form----^^--------------------------------" + # forced redirect in default + br.select_form(nr=0) + br.submit() + response3 = br.response() + # return some cookie which should be set by Javascript + #print response3.geturl() + raw = response3.get_data() + #print "---------------------------response to form --------------------------------------------" + # grab cookie from JS and set it + redirectflag = re.search(r"var checkValue = '(\d+)';", raw, re.M).group(1) + br.select_form(nr=0) + + self.temp_files.append(PersistentTemporaryFile('_fa.html')) + self.temp_files[-1].write("#LWP-Cookies-2.0\n") + + self.temp_files[-1].write("Set-Cookie3: Cookie-dummy=Cookie-value; domain=\".nikkei.com\"; path=\"/\"; path_spec; secure; expires=\"2029-12-21 05:07:59Z\"; version=0\n") + self.temp_files[-1].write("Set-Cookie3: redirectFlag="+redirectflag+"; domain=\".nikkei.com\"; path=\"/\"; path_spec; secure; expires=\"2029-12-21 05:07:59Z\"; version=0\n") + self.temp_files[-1].close() + cj.load(self.temp_files[-1].name) + + br.submit() + + #br.set_debug_http(False) + #br.set_debug_redirects(False) + #br.set_debug_responses(False) + return br + + + feeds = [ (u'\u30b9\u30dd\u30fc\u30c4\uff1a\u30d7\u30ed\u91ce\u7403', u'http://www.zou3.net/php/rss/nikkei2rss.php?head=baseball'), (u'\u30b9\u30dd\u30fc\u30c4\uff1a\u5927\u30ea\u30fc\u30b0', u'http://www.zou3.net/php/rss/nikkei2rss.php?head=mlb'), @@ -20,29 +93,3 @@ class NikkeiNet_subscription(BasicNewsRecipe): (u'\u30b9\u30dd\u30fc\u30c4\uff1a\u7af6\u99ac', u'http://www.zou3.net/php/rss/nikkei2rss.php?head=keiba') ] - remove_tags_before = dict(id="CONTENTS") - remove_tags = [ - dict(name="form"), - {'class':"cmn-hide"}, - ] - remove_tags_after = {'class':"cmn-pr_list"} - recursions = 4 - - def get_browser(self): - br = BasicNewsRecipe.get_browser() - if self.username is not None and self.password is not None: - br.open('https://id.nikkei.com/lounge/nl/base/LA0010.seam?cid=123456&flashId=654321') - assert br.viewing_html() - print br.title() - response = br.response() - response.set_data(response.get_data().replace("", " -->")) - br.set_response(response) - br.select_form(name='LA0010Form01') - br['LA0010Form01:LA0010Email'] = self.username - br['LA0010Form01:LA0010Password'] = self.password - res = br.submit() - raw = res.read() - if '日経IDのサービス一覧へ' not in raw: - raise ValueError('Failed to log in to nikkei.net, check your username(email address) and password') - return br diff --git a/resources/recipes/nikkei_sub.recipe b/resources/recipes/nikkei_sub.recipe index 8eab6ec328..fda9c828e5 100644 --- a/resources/recipes/nikkei_sub.recipe +++ b/resources/recipes/nikkei_sub.recipe @@ -1,71 +1,93 @@ import string, re, sys from calibre import strftime from calibre.web.feeds.recipes import BasicNewsRecipe +import mechanize +from calibre.ptempfile import PersistentTemporaryFile -class NikkeiNet_subscription(BasicNewsRecipe): - title = u'\u65e5\u7d4c\u65b0\u805e\u96fb\u5b50\u7248' - __author__ = 'Hiroshi Miura' - description = 'News and current market affairs from Japan' + +class NikkeiNet_sub_main(BasicNewsRecipe): + title = u'\u65e5\u7d4c\u65b0\u805e\u96fb\u5b50\u7248' + __author__ = 'Hiroshi Miura' + description = 'News and current market affairs from Japan' needs_subscription = True - oldest_article = 2 + oldest_article = 2 max_articles_per_feed = 20 - language = 'ja' - recursions = 3 + language = 'ja' remove_javascript = False + temp_files = [] - feeds = [ (u'\u65e5\u7d4c\u4f01\u696d', u'http://www.zou3.net/php/rss/nikkei2rss.php?head=sangyo'), - (u'\u65e5\u7d4c\u88fd\u54c1', u'http://www.zou3.net/php/rss/nikkei2rss.php?head=newpro'), - (u'internet', u'http://www.zou3.net/php/rss/nikkei2rss.php?head=internet'), - (u'\u653f\u6cbb', u'http://www.zou3.net/php/rss/nikkei2rss.php?head=seiji'), - (u'\u8ca1\u52d9', u'http://www.zou3.net/php/rss/nikkei2rss.php?head=zaimu'), - (u'\u7d4c\u6e08', u'http://www.zou3.net/php/rss/nikkei2rss.php?head=keizai'), - (u'\u56fd\u969b', u'http://www.zou3.net/php/rss/nikkei2rss.php?head=kaigai'), - (u'\u79d1\u5b66', u'http://www.zou3.net/php/rss/nikkei2rss.php?head=kagaku'), - (u'\u30de\u30fc\u30b1\u30c3\u30c8', u'http://www.zou3.net/php/rss/nikkei2rss.php?head=market'), - (u'\u304f\u3089\u3057', u'http://www.zou3.net/php/rss/nikkei2rss.php?head=kurashi'), - (u'\u30b9\u30dd\u30fc\u30c4', u'http://www.zou3.net/php/rss/nikkei2rss.php?head=sports'), - (u'\u793e\u4f1a', u'http://www.zou3.net/php/rss/nikkei2rss.php?head=shakai'), - (u'\u30a8\u30b3', u'http://www.zou3.net/php/rss/nikkei2rss.php?head=eco'), - (u'\u5065\u5eb7', u'http://www.zou3.net/php/rss/nikkei2rss.php?head=kenkou'), - (u'\u96c7\u7528', u'http://www.zou3.net/php/rss/nikkei2rss.php?head=koyou'), - (u'\u6559\u80b2', u'http://www.zou3.net/php/rss/nikkei2rss.php?head=kyouiku'), - (u'\u304a\u304f\u3084\u307f', u'http://www.zou3.net/php/rss/nikkei2rss.php?head=okuyami'), - (u'\u4eba\u4e8b', u'http://www.zou3.net/php/rss/nikkei2rss.php?head=zinzi'), - (u'\u7279\u96c6', u'http://www.zou3.net/php/rss/nikkei2rss.php?head=special'), - (u'\u5730\u57df\u30cb\u30e5\u30fc\u30b9', u'http://www.zou3.net/php/rss/nikkei2rss.php?head=local'), - (u'\u7d71\u8a08\u30fb\u767d\u66f8', u'http://www.zou3.net/php/rss/nikkei2rss.php?head=report'), - (u'\u30e9\u30f3\u30ad\u30f3\u30b0', u'http://www.zou3.net/php/rss/nikkei2rss.php?head=ranking'), - (u'\u4f1a\u898b', u'http://www.zou3.net/php/rss/nikkei2rss.php?head=interview'), - (u'\u793e\u8aac\u30fb\u6625\u79cb', u'http://www.zou3.net/php/rss/nikkei2rss.php?head=shasetsu'), - (u'\u30b9\u30dd\u30fc\u30c4\uff1a\u30d7\u30ed\u91ce\u7403', u'http://www.zou3.net/php/rss/nikkei2rss.php?head=baseball'), - (u'\u30b9\u30dd\u30fc\u30c4\uff1a\u5927\u30ea\u30fc\u30b0', u'http://www.zou3.net/php/rss/nikkei2rss.php?head=mlb'), - (u'\u30b9\u30dd\u30fc\u30c4\uff1a\u30b5\u30c3\u30ab\u30fc', u'http://www.zou3.net/php/rss/nikkei2rss.php?head=soccer'), - (u'\u30b9\u30dd\u30fc\u30c4\uff1a\u30b4\u30eb\u30d5', u'http://www.zou3.net/php/rss/nikkei2rss.php?head=golf'), - (u'\u30b9\u30dd\u30fc\u30c4\uff1a\u76f8\u64b2', u'http://www.zou3.net/php/rss/nikkei2rss.php?head=sumou'), - (u'\u30b9\u30dd\u30fc\u30c4\uff1a\u7af6\u99ac', u'http://www.zou3.net/php/rss/nikkei2rss.php?head=keiba'), - (u'\u8abf\u67fb\u30fb\u30a2\u30f3\u30b1\u30fc\u30c8', u'http://www.zou3.net/php/rss/nikkei2rss.php?head=research') - ] + remove_tags_before = {'class':"cmn-section cmn-indent"} + remove_tags = [ + {'class':"JSID_basePageMove JSID_baseAsyncSubmit cmn-form_area JSID_optForm_utoken"}, + {'class':"cmn-article_keyword cmn-clearfix"}, + {'class':"cmn-print_headline cmn-clearfix"}, + ] + remove_tags_after = {'class':"cmn-pr_list"} + feeds = [ (u'NIKKEI', u'http://www.zou3.net/php/rss/nikkei2rss.php?head=main')] + def get_browser(self): br = BasicNewsRecipe.get_browser() + + cj = mechanize.LWPCookieJar() + br.set_cookiejar(cj) + + #br.set_debug_http(True) + #br.set_debug_redirects(True) + #br.set_debug_responses(True) + if self.username is not None and self.password is not None: + #print "----------------------------get login form--------------------------------------------" + # open login form br.open('https://id.nikkei.com/lounge/nl/base/LA0010.seam') response = br.response() + #print "----------------------------get login form---------------------------------------------" + #print "----------------------------set login form---------------------------------------------" + # remove disabled input which brings error on mechanize response.set_data(response.get_data().replace("", " -->")) br.set_response(response) br.select_form(name='LA0010Form01') br['LA0010Form01:LA0010Email'] = self.username br['LA0010Form01:LA0010Password'] = self.password - res = br.submit() - raw = res.read() - if '日経IDのサービス一覧へ' not in raw: - raise ValueError('Failed to log in to nikkei.net, check your username(email address) and password') + br.form.find_control(id='LA0010Form01:LA0010AutoLoginOn',type="checkbox").get(nr=0).selected = True + br.submit() + response1 = br.response() + #print "----------------------------send login form---------------------------------------------" + #print "----------------------------open news main page-----------------------------------------" + # open news site br.open('http://www.nikkei.com/') + response2 = br.response() + #print "----------------------------www.nikkei.com BODY --------------------------------------" + #print response2.get_data() + #print "-------------------------^^-got auto redirect form----^^--------------------------------" + # forced redirect in default br.select_form(nr=0) - res = br.submit() - print res.read() + br.submit() + response3 = br.response() + # return some cookie which should be set by Javascript + #print response3.geturl() + raw = response3.get_data() + #print "---------------------------response to form --------------------------------------------" + # grab cookie from JS and set it + redirectflag = re.search(r"var checkValue = '(\d+)';", raw, re.M).group(1) + br.select_form(nr=0) + + self.temp_files.append(PersistentTemporaryFile('_fa.html')) + self.temp_files[-1].write("#LWP-Cookies-2.0\n") + + self.temp_files[-1].write("Set-Cookie3: Cookie-dummy=Cookie-value; domain=\".nikkei.com\"; path=\"/\"; path_spec; secure; expires=\"2029-12-21 05:07:59Z\"; version=0\n") + self.temp_files[-1].write("Set-Cookie3: redirectFlag="+redirectflag+"; domain=\".nikkei.com\"; path=\"/\"; path_spec; secure; expires=\"2029-12-21 05:07:59Z\"; version=0\n") + self.temp_files[-1].close() + cj.load(self.temp_files[-1].name) + + br.submit() + + #br.set_debug_http(False) + #br.set_debug_redirects(False) + #br.set_debug_responses(False) return br + diff --git a/resources/recipes/nikkei_sub_economy.recipe b/resources/recipes/nikkei_sub_economy.recipe new file mode 100644 index 0000000000..aa32467c51 --- /dev/null +++ b/resources/recipes/nikkei_sub_economy.recipe @@ -0,0 +1,101 @@ +import string, re, sys +from calibre import strftime +from calibre.web.feeds.recipes import BasicNewsRecipe +import mechanize +from calibre.ptempfile import PersistentTemporaryFile + + +class NikkeiNet_sub_economy(BasicNewsRecipe): + title = u'\u65e5\u7d4c\u65b0\u805e\u96fb\u5b50\u7248(Sports)' + __author__ = 'Hiroshi Miura' + description = 'News and current market affairs from Japan' + needs_subscription = True + oldest_article = 2 + max_articles_per_feed = 20 + language = 'ja' + remove_javascript = False + temp_files = [] + + remove_tags_before = {'class':"cmn-section cmn-indent"} + remove_tags = [ + {'class':"JSID_basePageMove JSID_baseAsyncSubmit cmn-form_area JSID_optForm_utoken"}, + {'class':"cmn-article_keyword cmn-clearfix"}, + {'class':"cmn-print_headline cmn-clearfix"}, + ] + remove_tags_after = {'class':"cmn-pr_list"} + + feeds = [ (u'\u653f\u6cbb', u'http://www.zou3.net/php/rss/nikkei2rss.php?head=seiji'), + (u'\u8ca1\u52d9', u'http://www.zou3.net/php/rss/nikkei2rss.php?head=zaimu'), + (u'\u7d4c\u6e08', u'http://www.zou3.net/php/rss/nikkei2rss.php?head=keizai'), + (u'\u30de\u30fc\u30b1\u30c3\u30c8', u'http://www.zou3.net/php/rss/nikkei2rss.php?head=market'), + (u'\u96c7\u7528', u'http://www.zou3.net/php/rss/nikkei2rss.php?head=koyou'), + (u'\u6559\u80b2', u'http://www.zou3.net/php/rss/nikkei2rss.php?head=kyouiku'), + (u'\u304a\u304f\u3084\u307f', u'http://www.zou3.net/php/rss/nikkei2rss.php?head=okuyami'), + (u'\u4eba\u4e8b', u'http://www.zou3.net/php/rss/nikkei2rss.php?head=zinzi'), + ] + + def get_browser(self): + br = BasicNewsRecipe.get_browser() + + cj = mechanize.LWPCookieJar() + br.set_cookiejar(cj) + + #br.set_debug_http(True) + #br.set_debug_redirects(True) + #br.set_debug_responses(True) + + if self.username is not None and self.password is not None: + #print "----------------------------get login form--------------------------------------------" + # open login form + br.open('https://id.nikkei.com/lounge/nl/base/LA0010.seam') + response = br.response() + #print "----------------------------get login form---------------------------------------------" + #print "----------------------------set login form---------------------------------------------" + # remove disabled input which brings error on mechanize + response.set_data(response.get_data().replace("", " -->")) + br.set_response(response) + br.select_form(name='LA0010Form01') + br['LA0010Form01:LA0010Email'] = self.username + br['LA0010Form01:LA0010Password'] = self.password + br.form.find_control(id='LA0010Form01:LA0010AutoLoginOn',type="checkbox").get(nr=0).selected = True + br.submit() + response1 = br.response() + #print "----------------------------send login form---------------------------------------------" + #print "----------------------------open news main page-----------------------------------------" + # open news site + br.open('http://www.nikkei.com/') + response2 = br.response() + #print "----------------------------www.nikkei.com BODY --------------------------------------" + #print response2.get_data() + #print "-------------------------^^-got auto redirect form----^^--------------------------------" + # forced redirect in default + br.select_form(nr=0) + br.submit() + response3 = br.response() + # return some cookie which should be set by Javascript + #print response3.geturl() + raw = response3.get_data() + #print "---------------------------response to form --------------------------------------------" + # grab cookie from JS and set it + redirectflag = re.search(r"var checkValue = '(\d+)';", raw, re.M).group(1) + br.select_form(nr=0) + + self.temp_files.append(PersistentTemporaryFile('_fa.html')) + self.temp_files[-1].write("#LWP-Cookies-2.0\n") + + self.temp_files[-1].write("Set-Cookie3: Cookie-dummy=Cookie-value; domain=\".nikkei.com\"; path=\"/\"; path_spec; secure; expires=\"2029-12-21 05:07:59Z\"; version=0\n") + self.temp_files[-1].write("Set-Cookie3: redirectFlag="+redirectflag+"; domain=\".nikkei.com\"; path=\"/\"; path_spec; secure; expires=\"2029-12-21 05:07:59Z\"; version=0\n") + self.temp_files[-1].close() + cj.load(self.temp_files[-1].name) + + br.submit() + + #br.set_debug_http(False) + #br.set_debug_redirects(False) + #br.set_debug_responses(False) + return br + + + + diff --git a/resources/recipes/nikkei_sub_industry.recipe b/resources/recipes/nikkei_sub_industry.recipe new file mode 100644 index 0000000000..682a237b33 --- /dev/null +++ b/resources/recipes/nikkei_sub_industry.recipe @@ -0,0 +1,99 @@ +import string, re, sys +from calibre import strftime +from calibre.web.feeds.recipes import BasicNewsRecipe +import mechanize +from calibre.ptempfile import PersistentTemporaryFile + + +class NikkeiNet_sub_industory(BasicNewsRecipe): + title = u'\u65e5\u7d4c\u65b0\u805e\u96fb\u5b50\u7248(Sports)' + __author__ = 'Hiroshi Miura' + description = 'News and current market affairs from Japan' + needs_subscription = True + oldest_article = 2 + max_articles_per_feed = 20 + language = 'ja' + remove_javascript = False + temp_files = [] + + remove_tags_before = {'class':"cmn-section cmn-indent"} + remove_tags = [ + {'class':"JSID_basePageMove JSID_baseAsyncSubmit cmn-form_area JSID_optForm_utoken"}, + {'class':"cmn-article_keyword cmn-clearfix"}, + {'class':"cmn-print_headline cmn-clearfix"}, + ] + remove_tags_after = {'class':"cmn-pr_list"} + + feeds = [ (u'\u65e5\u7d4c\u4f01\u696d', u'http://www.zou3.net/php/rss/nikkei2rss.php?head=sangyo'), + (u'\u65e5\u7d4c\u88fd\u54c1', u'http://www.zou3.net/php/rss/nikkei2rss.php?head=newpro'), + (u'internet', u'http://www.zou3.net/php/rss/nikkei2rss.php?head=internet'), + (u'\u56fd\u969b', u'http://www.zou3.net/php/rss/nikkei2rss.php?head=kaigai'), + (u'\u79d1\u5b66', u'http://www.zou3.net/php/rss/nikkei2rss.php?head=kagaku'), + + ] + + def get_browser(self): + br = BasicNewsRecipe.get_browser() + + cj = mechanize.LWPCookieJar() + br.set_cookiejar(cj) + + #br.set_debug_http(True) + #br.set_debug_redirects(True) + #br.set_debug_responses(True) + + if self.username is not None and self.password is not None: + #print "----------------------------get login form--------------------------------------------" + # open login form + br.open('https://id.nikkei.com/lounge/nl/base/LA0010.seam') + response = br.response() + #print "----------------------------get login form---------------------------------------------" + #print "----------------------------set login form---------------------------------------------" + # remove disabled input which brings error on mechanize + response.set_data(response.get_data().replace("", " -->")) + br.set_response(response) + br.select_form(name='LA0010Form01') + br['LA0010Form01:LA0010Email'] = self.username + br['LA0010Form01:LA0010Password'] = self.password + br.form.find_control(id='LA0010Form01:LA0010AutoLoginOn',type="checkbox").get(nr=0).selected = True + br.submit() + response1 = br.response() + #print "----------------------------send login form---------------------------------------------" + #print "----------------------------open news main page-----------------------------------------" + # open news site + br.open('http://www.nikkei.com/') + response2 = br.response() + #print "----------------------------www.nikkei.com BODY --------------------------------------" + #print response2.get_data() + #print "-------------------------^^-got auto redirect form----^^--------------------------------" + # forced redirect in default + br.select_form(nr=0) + br.submit() + response3 = br.response() + # return some cookie which should be set by Javascript + #print response3.geturl() + raw = response3.get_data() + #print "---------------------------response to form --------------------------------------------" + # grab cookie from JS and set it + redirectflag = re.search(r"var checkValue = '(\d+)';", raw, re.M).group(1) + br.select_form(nr=0) + + self.temp_files.append(PersistentTemporaryFile('_fa.html')) + self.temp_files[-1].write("#LWP-Cookies-2.0\n") + + self.temp_files[-1].write("Set-Cookie3: Cookie-dummy=Cookie-value; domain=\".nikkei.com\"; path=\"/\"; path_spec; secure; expires=\"2029-12-21 05:07:59Z\"; version=0\n") + self.temp_files[-1].write("Set-Cookie3: redirectFlag="+redirectflag+"; domain=\".nikkei.com\"; path=\"/\"; path_spec; secure; expires=\"2029-12-21 05:07:59Z\"; version=0\n") + self.temp_files[-1].close() + cj.load(self.temp_files[-1].name) + + br.submit() + + #br.set_debug_http(False) + #br.set_debug_redirects(False) + #br.set_debug_responses(False) + return br + + + + diff --git a/resources/recipes/nikkei_sub_life.recipe b/resources/recipes/nikkei_sub_life.recipe new file mode 100644 index 0000000000..aff88db851 --- /dev/null +++ b/resources/recipes/nikkei_sub_life.recipe @@ -0,0 +1,100 @@ +import string, re, sys +from calibre import strftime +from calibre.web.feeds.recipes import BasicNewsRecipe +import mechanize +from calibre.ptempfile import PersistentTemporaryFile + + +class NikkeiNet_sub_life(BasicNewsRecipe): + title = u'\u65e5\u7d4c\u65b0\u805e\u96fb\u5b50\u7248(Sports)' + __author__ = 'Hiroshi Miura' + description = 'News and current market affairs from Japan' + needs_subscription = True + oldest_article = 2 + max_articles_per_feed = 20 + language = 'ja' + remove_javascript = False + temp_files = [] + + remove_tags_before = {'class':"cmn-section cmn-indent"} + remove_tags = [ + {'class':"JSID_basePageMove JSID_baseAsyncSubmit cmn-form_area JSID_optForm_utoken"}, + {'class':"cmn-article_keyword cmn-clearfix"}, + {'class':"cmn-print_headline cmn-clearfix"}, + ] + remove_tags_after = {'class':"cmn-pr_list"} + + feeds = [ (u'\u304f\u3089\u3057', u'http://www.zou3.net/php/rss/nikkei2rss.php?head=kurashi'), + (u'\u30b9\u30dd\u30fc\u30c4', u'http://www.zou3.net/php/rss/nikkei2rss.php?head=sports'), + (u'\u793e\u4f1a', u'http://www.zou3.net/php/rss/nikkei2rss.php?head=shakai'), + (u'\u30a8\u30b3', u'http://www.zou3.net/php/rss/nikkei2rss.php?head=eco'), + (u'\u5065\u5eb7', u'http://www.zou3.net/php/rss/nikkei2rss.php?head=kenkou'), + (u'\u7279\u96c6', u'http://www.zou3.net/php/rss/nikkei2rss.php?head=special'), + (u'\u30e9\u30f3\u30ad\u30f3\u30b0', u'http://www.zou3.net/php/rss/nikkei2rss.php?head=ranking') + ] + + def get_browser(self): + br = BasicNewsRecipe.get_browser() + + cj = mechanize.LWPCookieJar() + br.set_cookiejar(cj) + + #br.set_debug_http(True) + #br.set_debug_redirects(True) + #br.set_debug_responses(True) + + if self.username is not None and self.password is not None: + #print "----------------------------get login form--------------------------------------------" + # open login form + br.open('https://id.nikkei.com/lounge/nl/base/LA0010.seam') + response = br.response() + #print "----------------------------get login form---------------------------------------------" + #print "----------------------------set login form---------------------------------------------" + # remove disabled input which brings error on mechanize + response.set_data(response.get_data().replace("", " -->")) + br.set_response(response) + br.select_form(name='LA0010Form01') + br['LA0010Form01:LA0010Email'] = self.username + br['LA0010Form01:LA0010Password'] = self.password + br.form.find_control(id='LA0010Form01:LA0010AutoLoginOn',type="checkbox").get(nr=0).selected = True + br.submit() + response1 = br.response() + #print "----------------------------send login form---------------------------------------------" + #print "----------------------------open news main page-----------------------------------------" + # open news site + br.open('http://www.nikkei.com/') + response2 = br.response() + #print "----------------------------www.nikkei.com BODY --------------------------------------" + #print response2.get_data() + #print "-------------------------^^-got auto redirect form----^^--------------------------------" + # forced redirect in default + br.select_form(nr=0) + br.submit() + response3 = br.response() + # return some cookie which should be set by Javascript + #print response3.geturl() + raw = response3.get_data() + #print "---------------------------response to form --------------------------------------------" + # grab cookie from JS and set it + redirectflag = re.search(r"var checkValue = '(\d+)';", raw, re.M).group(1) + br.select_form(nr=0) + + self.temp_files.append(PersistentTemporaryFile('_fa.html')) + self.temp_files[-1].write("#LWP-Cookies-2.0\n") + + self.temp_files[-1].write("Set-Cookie3: Cookie-dummy=Cookie-value; domain=\".nikkei.com\"; path=\"/\"; path_spec; secure; expires=\"2029-12-21 05:07:59Z\"; version=0\n") + self.temp_files[-1].write("Set-Cookie3: redirectFlag="+redirectflag+"; domain=\".nikkei.com\"; path=\"/\"; path_spec; secure; expires=\"2029-12-21 05:07:59Z\"; version=0\n") + self.temp_files[-1].close() + cj.load(self.temp_files[-1].name) + + br.submit() + + #br.set_debug_http(False) + #br.set_debug_redirects(False) + #br.set_debug_responses(False) + return br + + + + diff --git a/resources/recipes/nikkei_sub_sports.recipe b/resources/recipes/nikkei_sub_sports.recipe new file mode 100644 index 0000000000..74a47e9223 --- /dev/null +++ b/resources/recipes/nikkei_sub_sports.recipe @@ -0,0 +1,100 @@ +import string, re, sys +from calibre import strftime +from calibre.web.feeds.recipes import BasicNewsRecipe +import mechanize +from calibre.ptempfile import PersistentTemporaryFile + + +class NikkeiNet_sub_sports(BasicNewsRecipe): + title = u'\u65e5\u7d4c\u65b0\u805e\u96fb\u5b50\u7248(Sports)' + __author__ = 'Hiroshi Miura' + description = 'News and current market affairs from Japan' + needs_subscription = True + oldest_article = 2 + max_articles_per_feed = 20 + language = 'ja' + remove_javascript = False + temp_files = [] + + remove_tags_before = {'class':"cmn-section cmn-indent"} + remove_tags = [ + {'class':"JSID_basePageMove JSID_baseAsyncSubmit cmn-form_area JSID_optForm_utoken"}, + {'class':"cmn-article_keyword cmn-clearfix"}, + {'class':"cmn-print_headline cmn-clearfix"}, + ] + remove_tags_after = {'class':"cmn-pr_list"} + + feeds = [ + (u'\u30b9\u30dd\u30fc\u30c4\uff1a\u30d7\u30ed\u91ce\u7403', u'http://www.zou3.net/php/rss/nikkei2rss.php?head=baseball'), + (u'\u30b9\u30dd\u30fc\u30c4\uff1a\u5927\u30ea\u30fc\u30b0', u'http://www.zou3.net/php/rss/nikkei2rss.php?head=mlb'), + (u'\u30b9\u30dd\u30fc\u30c4\uff1a\u30b5\u30c3\u30ab\u30fc', u'http://www.zou3.net/php/rss/nikkei2rss.php?head=soccer'), + (u'\u30b9\u30dd\u30fc\u30c4\uff1a\u30b4\u30eb\u30d5', u'http://www.zou3.net/php/rss/nikkei2rss.php?head=golf'), + (u'\u30b9\u30dd\u30fc\u30c4\uff1a\u76f8\u64b2', u'http://www.zou3.net/php/rss/nikkei2rss.php?head=sumou'), + (u'\u30b9\u30dd\u30fc\u30c4\uff1a\u7af6\u99ac', u'http://www.zou3.net/php/rss/nikkei2rss.php?head=keiba') + ] + + def get_browser(self): + br = BasicNewsRecipe.get_browser() + + cj = mechanize.LWPCookieJar() + br.set_cookiejar(cj) + + #br.set_debug_http(True) + #br.set_debug_redirects(True) + #br.set_debug_responses(True) + + if self.username is not None and self.password is not None: + #print "----------------------------get login form--------------------------------------------" + # open login form + br.open('https://id.nikkei.com/lounge/nl/base/LA0010.seam') + response = br.response() + #print "----------------------------get login form---------------------------------------------" + #print "----------------------------set login form---------------------------------------------" + # remove disabled input which brings error on mechanize + response.set_data(response.get_data().replace("", " -->")) + br.set_response(response) + br.select_form(name='LA0010Form01') + br['LA0010Form01:LA0010Email'] = self.username + br['LA0010Form01:LA0010Password'] = self.password + br.form.find_control(id='LA0010Form01:LA0010AutoLoginOn',type="checkbox").get(nr=0).selected = True + br.submit() + response1 = br.response() + #print "----------------------------send login form---------------------------------------------" + #print "----------------------------open news main page-----------------------------------------" + # open news site + br.open('http://www.nikkei.com/') + response2 = br.response() + #print "----------------------------www.nikkei.com BODY --------------------------------------" + #print response2.get_data() + #print "-------------------------^^-got auto redirect form----^^--------------------------------" + # forced redirect in default + br.select_form(nr=0) + br.submit() + response3 = br.response() + # return some cookie which should be set by Javascript + #print response3.geturl() + raw = response3.get_data() + #print "---------------------------response to form --------------------------------------------" + # grab cookie from JS and set it + redirectflag = re.search(r"var checkValue = '(\d+)';", raw, re.M).group(1) + br.select_form(nr=0) + + self.temp_files.append(PersistentTemporaryFile('_fa.html')) + self.temp_files[-1].write("#LWP-Cookies-2.0\n") + + self.temp_files[-1].write("Set-Cookie3: Cookie-dummy=Cookie-value; domain=\".nikkei.com\"; path=\"/\"; path_spec; secure; expires=\"2029-12-21 05:07:59Z\"; version=0\n") + self.temp_files[-1].write("Set-Cookie3: redirectFlag="+redirectflag+"; domain=\".nikkei.com\"; path=\"/\"; path_spec; secure; expires=\"2029-12-21 05:07:59Z\"; version=0\n") + self.temp_files[-1].close() + cj.load(self.temp_files[-1].name) + + br.submit() + + #br.set_debug_http(False) + #br.set_debug_redirects(False) + #br.set_debug_responses(False) + return br + + + + From 439c1548c2c20533d1f0e4dac7fdd985e11ab954 Mon Sep 17 00:00:00 2001 From: Hiroshi Miura Date: Sun, 21 Nov 2010 20:49:38 +0900 Subject: [PATCH 04/18] add more japanese news source - Mainichi Daily news IT and electoronics - Endgadget Japan edition --- resources/recipes/endgadget_ja.recipe | 18 ++++++++++++++++++ resources/recipes/mainichi.recipe | 12 ++++++++++-- resources/recipes/mainichi_it_news.recipe | 16 ++++++++++++++++ resources/recipes/nikkei_sub.recipe | 8 ++++++++ resources/recipes/nikkei_sub_economy.recipe | 8 ++++++++ resources/recipes/nikkei_sub_industry.recipe | 8 ++++++++ resources/recipes/nikkei_sub_life.recipe | 8 ++++++++ resources/recipes/nikkei_sub_sports.recipe | 8 ++++++++ 8 files changed, 84 insertions(+), 2 deletions(-) create mode 100644 resources/recipes/endgadget_ja.recipe create mode 100644 resources/recipes/mainichi_it_news.recipe diff --git a/resources/recipes/endgadget_ja.recipe b/resources/recipes/endgadget_ja.recipe new file mode 100644 index 0000000000..f887b5f5ad --- /dev/null +++ b/resources/recipes/endgadget_ja.recipe @@ -0,0 +1,18 @@ +#!/usr/bin/env python + +__license__ = 'GPL v3' +__copyright__ = '2010, Hiroshi Miura ' +''' +japan.engadget.com +''' + +from calibre.web.feeds.news import BasicNewsRecipe + +class EndgadgetJapan(BasicNewsRecipe): + title = u'Engadget\u65e5\u672c\u7248' + oldest_article = 7 + max_articles_per_feed = 100 + no_stylesheets = True + language = _('Japanese') + encoding = 'utf-8' + feeds = [(u'engadget', u'http://japanese.engadget.com/rss.xml')] diff --git a/resources/recipes/mainichi.recipe b/resources/recipes/mainichi.recipe index 3653c2e252..510c03a333 100644 --- a/resources/recipes/mainichi.recipe +++ b/resources/recipes/mainichi.recipe @@ -1,14 +1,22 @@ +#!/usr/bin/env python + +__license__ = 'GPL v3' +__copyright__ = '2010, Hiroshi Miura ' +''' +www.mainichi.jp +''' + class MainichiDailyNews(BasicNewsRecipe): title = u'Mainichi News' __author__ = 'Hiroshi Miura' oldest_article = 2 - max_articles_per_feed = 100 + max_articles_per_feed = 20 description = 'Japanese traditional newspaper Mainichi Daily News' publisher = 'Mainichi Daily News' category = 'news, japan' language = 'ja' - feeds = [(u'mainichi IT', u'http://mainichi.pheedo.jp/f/mainichijp_electronics')] + feeds = [(u'daily news', u'http://mainichi.jp/rss/etc/flash.rss')] remove_tags_before = {'class':"NewsTitle"} remove_tags = [{'class':"RelatedArticle"}] diff --git a/resources/recipes/mainichi_it_news.recipe b/resources/recipes/mainichi_it_news.recipe new file mode 100644 index 0000000000..ce0b11f77e --- /dev/null +++ b/resources/recipes/mainichi_it_news.recipe @@ -0,0 +1,16 @@ +class MainichiDailyITNews(BasicNewsRecipe): + title = u'Mainichi News' + __author__ = 'Hiroshi Miura' + oldest_article = 2 + max_articles_per_feed = 100 + description = 'Japanese traditional newspaper Mainichi Daily News' + publisher = 'Mainichi Daily News' + category = 'news, japan' + language = 'ja' + + feeds = [(u'IT News', u'http://mainichi.pheedo.jp/f/mainichijp_electronics')] + + remove_tags_before = {'class':"NewsTitle"} + remove_tags = [{'class':"RelatedArticle"}] + remove_tags_after = {'class':"ArticleLower"} + diff --git a/resources/recipes/nikkei_sub.recipe b/resources/recipes/nikkei_sub.recipe index fda9c828e5..9c5576389d 100644 --- a/resources/recipes/nikkei_sub.recipe +++ b/resources/recipes/nikkei_sub.recipe @@ -1,3 +1,11 @@ +#!/usr/bin/env python + +__license__ = 'GPL v3' +__copyright__ = '2010, Hiroshi Miura ' +''' +www.nikkei.com +''' + import string, re, sys from calibre import strftime from calibre.web.feeds.recipes import BasicNewsRecipe diff --git a/resources/recipes/nikkei_sub_economy.recipe b/resources/recipes/nikkei_sub_economy.recipe index aa32467c51..0ec21f5571 100644 --- a/resources/recipes/nikkei_sub_economy.recipe +++ b/resources/recipes/nikkei_sub_economy.recipe @@ -1,3 +1,11 @@ +#!/usr/bin/env python + +__license__ = 'GPL v3' +__copyright__ = '2010, Hiroshi Miura ' +''' +www.nikkei.com +''' + import string, re, sys from calibre import strftime from calibre.web.feeds.recipes import BasicNewsRecipe diff --git a/resources/recipes/nikkei_sub_industry.recipe b/resources/recipes/nikkei_sub_industry.recipe index 682a237b33..5c8a7adb70 100644 --- a/resources/recipes/nikkei_sub_industry.recipe +++ b/resources/recipes/nikkei_sub_industry.recipe @@ -1,3 +1,11 @@ +#!/usr/bin/env python + +__license__ = 'GPL v3' +__copyright__ = '2010, Hiroshi Miura ' +''' +www.nikkei.com +''' + import string, re, sys from calibre import strftime from calibre.web.feeds.recipes import BasicNewsRecipe diff --git a/resources/recipes/nikkei_sub_life.recipe b/resources/recipes/nikkei_sub_life.recipe index aff88db851..00fe053ac9 100644 --- a/resources/recipes/nikkei_sub_life.recipe +++ b/resources/recipes/nikkei_sub_life.recipe @@ -1,3 +1,11 @@ +#!/usr/bin/env python + +__license__ = 'GPL v3' +__copyright__ = '2010, Hiroshi Miura ' +''' +www.nikkei.com +''' + import string, re, sys from calibre import strftime from calibre.web.feeds.recipes import BasicNewsRecipe diff --git a/resources/recipes/nikkei_sub_sports.recipe b/resources/recipes/nikkei_sub_sports.recipe index 74a47e9223..38f843bbbe 100644 --- a/resources/recipes/nikkei_sub_sports.recipe +++ b/resources/recipes/nikkei_sub_sports.recipe @@ -1,3 +1,11 @@ +#!/usr/bin/env python + +__license__ = 'GPL v3' +__copyright__ = '2010, Hiroshi Miura ' +''' +www.nikkei.com +''' + import string, re, sys from calibre import strftime from calibre.web.feeds.recipes import BasicNewsRecipe From bedcdac5fb3551f1d5a9e0c9b2367a43f0ea085f Mon Sep 17 00:00:00 2001 From: Hiroshi Miura Date: Sun, 21 Nov 2010 21:33:44 +0900 Subject: [PATCH 05/18] add reuters japan recipe fix recipe names - nikkei news - mainichi news --- resources/recipes/mainichi.recipe | 2 +- resources/recipes/mainichi_it_news.recipe | 2 +- resources/recipes/nikkei_sub_economy.recipe | 2 +- resources/recipes/nikkei_sub_industry.recipe | 2 +- resources/recipes/nikkei_sub_life.recipe | 2 +- resources/recipes/reuters_ja.recipe | 25 ++++++++++++++++++++ 6 files changed, 30 insertions(+), 5 deletions(-) create mode 100644 resources/recipes/reuters_ja.recipe diff --git a/resources/recipes/mainichi.recipe b/resources/recipes/mainichi.recipe index 510c03a333..552e81bef5 100644 --- a/resources/recipes/mainichi.recipe +++ b/resources/recipes/mainichi.recipe @@ -20,5 +20,5 @@ class MainichiDailyNews(BasicNewsRecipe): remove_tags_before = {'class':"NewsTitle"} remove_tags = [{'class':"RelatedArticle"}] - remove_tags_after = {'class':"ArticleLower"} + remove_tags_after = {'class':"Credit"} diff --git a/resources/recipes/mainichi_it_news.recipe b/resources/recipes/mainichi_it_news.recipe index ce0b11f77e..49ab104bff 100644 --- a/resources/recipes/mainichi_it_news.recipe +++ b/resources/recipes/mainichi_it_news.recipe @@ -12,5 +12,5 @@ class MainichiDailyITNews(BasicNewsRecipe): remove_tags_before = {'class':"NewsTitle"} remove_tags = [{'class':"RelatedArticle"}] - remove_tags_after = {'class':"ArticleLower"} + remove_tags_after = {'class':"Credit"} diff --git a/resources/recipes/nikkei_sub_economy.recipe b/resources/recipes/nikkei_sub_economy.recipe index 0ec21f5571..764ad7d008 100644 --- a/resources/recipes/nikkei_sub_economy.recipe +++ b/resources/recipes/nikkei_sub_economy.recipe @@ -14,7 +14,7 @@ from calibre.ptempfile import PersistentTemporaryFile class NikkeiNet_sub_economy(BasicNewsRecipe): - title = u'\u65e5\u7d4c\u65b0\u805e\u96fb\u5b50\u7248(Sports)' + title = u'\u65e5\u7d4c\u65b0\u805e\u96fb\u5b50\u7248(Economy)' __author__ = 'Hiroshi Miura' description = 'News and current market affairs from Japan' needs_subscription = True diff --git a/resources/recipes/nikkei_sub_industry.recipe b/resources/recipes/nikkei_sub_industry.recipe index 5c8a7adb70..cfcf92b805 100644 --- a/resources/recipes/nikkei_sub_industry.recipe +++ b/resources/recipes/nikkei_sub_industry.recipe @@ -14,7 +14,7 @@ from calibre.ptempfile import PersistentTemporaryFile class NikkeiNet_sub_industory(BasicNewsRecipe): - title = u'\u65e5\u7d4c\u65b0\u805e\u96fb\u5b50\u7248(Sports)' + title = u'\u65e5\u7d4c\u65b0\u805e\u96fb\u5b50\u7248(Industory)' __author__ = 'Hiroshi Miura' description = 'News and current market affairs from Japan' needs_subscription = True diff --git a/resources/recipes/nikkei_sub_life.recipe b/resources/recipes/nikkei_sub_life.recipe index 00fe053ac9..65a32eecac 100644 --- a/resources/recipes/nikkei_sub_life.recipe +++ b/resources/recipes/nikkei_sub_life.recipe @@ -14,7 +14,7 @@ from calibre.ptempfile import PersistentTemporaryFile class NikkeiNet_sub_life(BasicNewsRecipe): - title = u'\u65e5\u7d4c\u65b0\u805e\u96fb\u5b50\u7248(Sports)' + title = u'\u65e5\u7d4c\u65b0\u805e\u96fb\u5b50\u7248(Life)' __author__ = 'Hiroshi Miura' description = 'News and current market affairs from Japan' needs_subscription = True diff --git a/resources/recipes/reuters_ja.recipe b/resources/recipes/reuters_ja.recipe new file mode 100644 index 0000000000..d926c29096 --- /dev/null +++ b/resources/recipes/reuters_ja.recipe @@ -0,0 +1,25 @@ +from calibre.web.feeds.news import BasicNewsRecipe + +class ReutersJa(BasicNewsRecipe): + + title = 'Reuters(Japan)' + description = 'Global news in Japanese' + __author__ = 'Hiroshi Miura' + use_embedded_content = False + language = 'ja' + max_articles_per_feed = 10 + remove_javascript = True + + feeds = [ ('Top Stories', 'http://feeds.reuters.com/reuters/JPTopNews?format=xml'), + ('World News', 'http://feeds.reuters.com/reuters/JPWorldNews?format=xml'), + ('Business News', 'http://feeds.reuters.com/reuters/JPBusinessNews?format=xml'), + ('Technology News', 'http://feeds.reuters.com/reuters/JPTechnologyNews?format=xml'), + ('Oddly Enough News', 'http://feeds.reuters.com/reuters/JPOddlyEnoughNews?format=xml') + ] + + remove_tags_before = {'class':"article primaryContent"} + remove_tags = [ dict(id="banner"), + dict(id="autilities"), + dict(id="textSizer") + ] + remove_tags_after = dict(id="copyrightNotice") From cee89baa64b72918c6b0281ea2399433aea63e1b Mon Sep 17 00:00:00 2001 From: Hiroshi Miura Date: Sun, 21 Nov 2010 22:28:20 +0900 Subject: [PATCH 06/18] add: recipe - add MSN Sankei News product releases - fix reuters_ja recipe to remove several tags --- resources/recipes/msnsankei.recipe | 22 ++++++++++++++++++++++ resources/recipes/reuters_ja.recipe | 16 ++++++++++++++-- 2 files changed, 36 insertions(+), 2 deletions(-) create mode 100644 resources/recipes/msnsankei.recipe diff --git a/resources/recipes/msnsankei.recipe b/resources/recipes/msnsankei.recipe new file mode 100644 index 0000000000..61ba0de6dc --- /dev/null +++ b/resources/recipes/msnsankei.recipe @@ -0,0 +1,22 @@ +#!/usr/bin/env python + +__license__ = 'GPL v3' +__copyright__ = '2010, Hiroshi Miura ' +''' +sankei.jp.msn.com +''' + +class MSNSankeiNewsProduct(BasicNewsRecipe): + title = u'MSN\u7523\u7d4c\u30cb\u30e5\u30fc\u30b9(\u65b0\u5546\u54c1)' + __author__ = 'Hiroshi Miura' + description = 'Products release from Japan' + oldest_article = 7 + max_articles_per_feed = 100 + encoding = 'Shift_JIS' + language = 'ja' + + feeds = [(u'\u65b0\u5546\u54c1', u'http://sankei.jp.msn.com/rss/news/release.xml')] + + remove_tags_before = dict(id="__r_article_title__") + remove_tags_after = dict(id="ajax_release_news") + remove_tags = [{'class':"parent chromeCustom6G"}] diff --git a/resources/recipes/reuters_ja.recipe b/resources/recipes/reuters_ja.recipe index d926c29096..ffa084bc88 100644 --- a/resources/recipes/reuters_ja.recipe +++ b/resources/recipes/reuters_ja.recipe @@ -1,4 +1,5 @@ from calibre.web.feeds.news import BasicNewsRecipe +import re class ReutersJa(BasicNewsRecipe): @@ -20,6 +21,17 @@ class ReutersJa(BasicNewsRecipe): remove_tags_before = {'class':"article primaryContent"} remove_tags = [ dict(id="banner"), dict(id="autilities"), - dict(id="textSizer") + dict(id="textSizer"), + dict(id="shareFooter"), + dict(id="relatedNews"), + dict(id="editorsChoice"), + dict(id="ecArticles"), + {'class':"secondaryContent"}, + {'class':"module"}, ] - remove_tags_after = dict(id="copyrightNotice") + remove_tags_after = {'class':"assetBuddy"} + + def print_version(self, url): + m = re.search('(.*idJPJAPAN-[0-9]+)', url) + return m.group(0)+'?sp=true' + From d002d73a5a10ebab2a872800b553b80514a4d987 Mon Sep 17 00:00:00 2001 From: Hiroshi Miura Date: Sun, 21 Nov 2010 22:33:32 +0900 Subject: [PATCH 07/18] recipe: cosmetic work class name and copyright notice --- resources/recipes/nikkei_free.recipe | 10 +++++++++- resources/recipes/the_h.recipe | 10 +++++++++- 2 files changed, 18 insertions(+), 2 deletions(-) diff --git a/resources/recipes/nikkei_free.recipe b/resources/recipes/nikkei_free.recipe index f3a51e2fd6..3dfcee98b0 100644 --- a/resources/recipes/nikkei_free.recipe +++ b/resources/recipes/nikkei_free.recipe @@ -1,4 +1,12 @@ -class AdvancedUserRecipe1287958571(BasicNewsRecipe): +#!/usr/bin/env python + +__license__ = 'GPL v3' +__copyright__ = '2010, Hiroshi Miura ' +''' +www.nikkei.com +''' + +class NikkeiNet(BasicNewsRecipe): title = u'\u65e5\u7d4c\u65b0\u805e\u96fb\u5b50\u7248(Free)' __author__ = 'Hiroshi Miura' description = 'News and current market affairs from Japan' diff --git a/resources/recipes/the_h.recipe b/resources/recipes/the_h.recipe index c25a39ca99..7326efe9dd 100644 --- a/resources/recipes/the_h.recipe +++ b/resources/recipes/the_h.recipe @@ -1,4 +1,12 @@ -class AdvancedUserRecipe1289003166(BasicNewsRecipe): +#!/usr/bin/env python + +__license__ = 'GPL v3' +__copyright__ = '2010, Hiroshi Miura ' +''' +www.h-online.com +''' + +class TheHeiseOnline(BasicNewsRecipe): title = u'The H' __author__ = 'Hiroshi Miura' oldest_article = 3 From 37a682a8a2ad48b1ba5b45435bd02f4e8d6f1a43 Mon Sep 17 00:00:00 2001 From: Hiroshi Miura Date: Mon, 22 Nov 2010 09:09:36 +0900 Subject: [PATCH 08/18] recipes: japanese news fix titles and language code --- resources/recipes/endgadget_ja.recipe | 2 +- resources/recipes/mainichi.recipe | 2 +- resources/recipes/mainichi_it_news.recipe | 2 +- 3 files changed, 3 insertions(+), 3 deletions(-) diff --git a/resources/recipes/endgadget_ja.recipe b/resources/recipes/endgadget_ja.recipe index f887b5f5ad..6e962931df 100644 --- a/resources/recipes/endgadget_ja.recipe +++ b/resources/recipes/endgadget_ja.recipe @@ -13,6 +13,6 @@ class EndgadgetJapan(BasicNewsRecipe): oldest_article = 7 max_articles_per_feed = 100 no_stylesheets = True - language = _('Japanese') + language = 'ja' encoding = 'utf-8' feeds = [(u'engadget', u'http://japanese.engadget.com/rss.xml')] diff --git a/resources/recipes/mainichi.recipe b/resources/recipes/mainichi.recipe index 552e81bef5..c47018bcff 100644 --- a/resources/recipes/mainichi.recipe +++ b/resources/recipes/mainichi.recipe @@ -7,7 +7,7 @@ www.mainichi.jp ''' class MainichiDailyNews(BasicNewsRecipe): - title = u'Mainichi News' + title = u'Mainichi Daily News' __author__ = 'Hiroshi Miura' oldest_article = 2 max_articles_per_feed = 20 diff --git a/resources/recipes/mainichi_it_news.recipe b/resources/recipes/mainichi_it_news.recipe index 49ab104bff..0397407087 100644 --- a/resources/recipes/mainichi_it_news.recipe +++ b/resources/recipes/mainichi_it_news.recipe @@ -1,5 +1,5 @@ class MainichiDailyITNews(BasicNewsRecipe): - title = u'Mainichi News' + title = u'Mainichi News - IT&Electronics' __author__ = 'Hiroshi Miura' oldest_article = 2 max_articles_per_feed = 100 From d88e0c5efaade44c2e0cfcb88dba6f9e2e26319a Mon Sep 17 00:00:00 2001 From: Hiroshi Miura Date: Tue, 23 Nov 2010 14:00:29 +0900 Subject: [PATCH 09/18] recipe: Japanese recipes: rename titles in Japanese --- resources/recipes/mainichi.recipe | 2 +- resources/recipes/mainichi_it_news.recipe | 6 +++--- resources/recipes/nikkei_sub_economy.recipe | 2 +- resources/recipes/nikkei_sub_industry.recipe | 2 +- resources/recipes/nikkei_sub_life.recipe | 2 +- resources/recipes/nikkei_sub_main.recipe | 2 +- resources/recipes/nikkei_sub_sports.recipe | 2 +- 7 files changed, 9 insertions(+), 9 deletions(-) diff --git a/resources/recipes/mainichi.recipe b/resources/recipes/mainichi.recipe index c47018bcff..2afe10b33c 100644 --- a/resources/recipes/mainichi.recipe +++ b/resources/recipes/mainichi.recipe @@ -7,7 +7,7 @@ www.mainichi.jp ''' class MainichiDailyNews(BasicNewsRecipe): - title = u'Mainichi Daily News' + title = u'\u6bce\u65e5\u65b0\u805e' __author__ = 'Hiroshi Miura' oldest_article = 2 max_articles_per_feed = 20 diff --git a/resources/recipes/mainichi_it_news.recipe b/resources/recipes/mainichi_it_news.recipe index 0397407087..9cae836d35 100644 --- a/resources/recipes/mainichi_it_news.recipe +++ b/resources/recipes/mainichi_it_news.recipe @@ -1,11 +1,11 @@ class MainichiDailyITNews(BasicNewsRecipe): - title = u'Mainichi News - IT&Electronics' + title = u'\u6bce\u65e5\u65b0\u805e(IT&\u5bb6\u96fb)' __author__ = 'Hiroshi Miura' oldest_article = 2 max_articles_per_feed = 100 - description = 'Japanese traditional newspaper Mainichi Daily News' + description = 'Japanese traditional newspaper Mainichi Daily News - IT and electronics' publisher = 'Mainichi Daily News' - category = 'news, japan' + category = 'news, Japan, IT, Electronics' language = 'ja' feeds = [(u'IT News', u'http://mainichi.pheedo.jp/f/mainichijp_electronics')] diff --git a/resources/recipes/nikkei_sub_economy.recipe b/resources/recipes/nikkei_sub_economy.recipe index 764ad7d008..9de13ca674 100644 --- a/resources/recipes/nikkei_sub_economy.recipe +++ b/resources/recipes/nikkei_sub_economy.recipe @@ -14,7 +14,7 @@ from calibre.ptempfile import PersistentTemporaryFile class NikkeiNet_sub_economy(BasicNewsRecipe): - title = u'\u65e5\u7d4c\u65b0\u805e\u96fb\u5b50\u7248(Economy)' + title = u'\u65e5\u7d4c\u65b0\u805e\u96fb\u5b50\u7248(\u7523\u696d)' __author__ = 'Hiroshi Miura' description = 'News and current market affairs from Japan' needs_subscription = True diff --git a/resources/recipes/nikkei_sub_industry.recipe b/resources/recipes/nikkei_sub_industry.recipe index cfcf92b805..c2dc9f82a0 100644 --- a/resources/recipes/nikkei_sub_industry.recipe +++ b/resources/recipes/nikkei_sub_industry.recipe @@ -14,7 +14,7 @@ from calibre.ptempfile import PersistentTemporaryFile class NikkeiNet_sub_industory(BasicNewsRecipe): - title = u'\u65e5\u7d4c\u65b0\u805e\u96fb\u5b50\u7248(Industory)' + title = u'\u65e5\u7d4c\u65b0\u805e\u96fb\u5b50\u7248(\u7523\u696d)' __author__ = 'Hiroshi Miura' description = 'News and current market affairs from Japan' needs_subscription = True diff --git a/resources/recipes/nikkei_sub_life.recipe b/resources/recipes/nikkei_sub_life.recipe index 65a32eecac..6a37e4466a 100644 --- a/resources/recipes/nikkei_sub_life.recipe +++ b/resources/recipes/nikkei_sub_life.recipe @@ -14,7 +14,7 @@ from calibre.ptempfile import PersistentTemporaryFile class NikkeiNet_sub_life(BasicNewsRecipe): - title = u'\u65e5\u7d4c\u65b0\u805e\u96fb\u5b50\u7248(Life)' + title = u'\u65e5\u7d4c\u65b0\u805e\u96fb\u5b50\u7248(\u7523\u696d)' __author__ = 'Hiroshi Miura' description = 'News and current market affairs from Japan' needs_subscription = True diff --git a/resources/recipes/nikkei_sub_main.recipe b/resources/recipes/nikkei_sub_main.recipe index 9c5576389d..021eeff64a 100644 --- a/resources/recipes/nikkei_sub_main.recipe +++ b/resources/recipes/nikkei_sub_main.recipe @@ -14,7 +14,7 @@ from calibre.ptempfile import PersistentTemporaryFile class NikkeiNet_sub_main(BasicNewsRecipe): - title = u'\u65e5\u7d4c\u65b0\u805e\u96fb\u5b50\u7248' + title = u'\u65e5\u7d4c\u65b0\u805e\u96fb\u5b50\u7248(\u7dcf\u5408)' __author__ = 'Hiroshi Miura' description = 'News and current market affairs from Japan' needs_subscription = True diff --git a/resources/recipes/nikkei_sub_sports.recipe b/resources/recipes/nikkei_sub_sports.recipe index 38f843bbbe..e48ea17077 100644 --- a/resources/recipes/nikkei_sub_sports.recipe +++ b/resources/recipes/nikkei_sub_sports.recipe @@ -14,7 +14,7 @@ from calibre.ptempfile import PersistentTemporaryFile class NikkeiNet_sub_sports(BasicNewsRecipe): - title = u'\u65e5\u7d4c\u65b0\u805e\u96fb\u5b50\u7248(Sports)' + title = u'\u65e5\u7d4c\u65b0\u805e\u96fb\u5b50\u7248(\u30b9\u30dd\u30fc\u30c4)' __author__ = 'Hiroshi Miura' description = 'News and current market affairs from Japan' needs_subscription = True From e9775a5c548edebfe9fc0828bfaaacf144b65728 Mon Sep 17 00:00:00 2001 From: Hiroshi Miura Date: Tue, 23 Nov 2010 15:24:23 +0900 Subject: [PATCH 10/18] recipe: introduce jiji press recipe: add masthead images for japanese news --- resources/recipes/endgadget_ja.recipe | 2 ++ resources/recipes/jijinews.recipe | 24 ++++++++++++++++++++ resources/recipes/nikkei_free.recipe | 2 ++ resources/recipes/nikkei_sub_economy.recipe | 2 ++ resources/recipes/nikkei_sub_industry.recipe | 2 ++ resources/recipes/nikkei_sub_life.recipe | 2 ++ resources/recipes/nikkei_sub_main.recipe | 2 ++ resources/recipes/nikkei_sub_sports.recipe | 2 ++ 8 files changed, 38 insertions(+) create mode 100644 resources/recipes/jijinews.recipe diff --git a/resources/recipes/endgadget_ja.recipe b/resources/recipes/endgadget_ja.recipe index 6e962931df..cf2ed69e94 100644 --- a/resources/recipes/endgadget_ja.recipe +++ b/resources/recipes/endgadget_ja.recipe @@ -10,6 +10,8 @@ from calibre.web.feeds.news import BasicNewsRecipe class EndgadgetJapan(BasicNewsRecipe): title = u'Engadget\u65e5\u672c\u7248' + cover_url = 'http://skins18.wincustomize.com/1/49/149320/29/7578/preview-29-7578.jpg' + masthead_url = 'http://www.blogsmithmedia.com/japanese.engadget.com/media/eng-jp-logo-t.png' oldest_article = 7 max_articles_per_feed = 100 no_stylesheets = True diff --git a/resources/recipes/jijinews.recipe b/resources/recipes/jijinews.recipe new file mode 100644 index 0000000000..55f190726c --- /dev/null +++ b/resources/recipes/jijinews.recipe @@ -0,0 +1,24 @@ +#!/usr/bin/env python + +__license__ = 'GPL v3' +__copyright__ = '2010, Hiroshi Miura ' +''' +www.jiji.com +''' + +class JijiDotCom(BasicNewsRecipe): + title = u'\u6642\u4e8b\u901a\u4fe1' + __author__ = 'Hiroshi Miura' + description = 'World News from Jiji Press' + publisher = 'Jiji Press Ltd.' + category = 'news' + encoding = 'utf-8' + oldest_article = 6 + max_articles_per_feed = 100 + language = 'ja' + cover_url = 'http://www.jiji.com/img/top_header_logo2.gif' + masthead_url = 'http://jen.jiji.com/images/logo_jijipress.gif' + + feeds = [(u'\u30cb\u30e5\u30fc\u30b9', u'http://www.jiji.com/rss/ranking.rdf')] + remove_tags_after = dict(id="ad_google") + diff --git a/resources/recipes/nikkei_free.recipe b/resources/recipes/nikkei_free.recipe index 3dfcee98b0..56aeb9d79d 100644 --- a/resources/recipes/nikkei_free.recipe +++ b/resources/recipes/nikkei_free.recipe @@ -10,6 +10,8 @@ class NikkeiNet(BasicNewsRecipe): title = u'\u65e5\u7d4c\u65b0\u805e\u96fb\u5b50\u7248(Free)' __author__ = 'Hiroshi Miura' description = 'News and current market affairs from Japan' + cover_url = 'http://parts.nikkei.com/parts/ds/images/common/logo_r1.svg' + masthead_url = 'http://parts.nikkei.com/parts/ds/images/common/logo_r1.svg' oldest_article = 2 max_articles_per_feed = 20 language = 'ja' diff --git a/resources/recipes/nikkei_sub_economy.recipe b/resources/recipes/nikkei_sub_economy.recipe index 9de13ca674..8e107d15da 100644 --- a/resources/recipes/nikkei_sub_economy.recipe +++ b/resources/recipes/nikkei_sub_economy.recipe @@ -17,6 +17,8 @@ class NikkeiNet_sub_economy(BasicNewsRecipe): title = u'\u65e5\u7d4c\u65b0\u805e\u96fb\u5b50\u7248(\u7523\u696d)' __author__ = 'Hiroshi Miura' description = 'News and current market affairs from Japan' + cover_url = 'http://parts.nikkei.com/parts/ds/images/common/logo_r1.svg' + masthead_url = 'http://parts.nikkei.com/parts/ds/images/common/logo_r1.svg' needs_subscription = True oldest_article = 2 max_articles_per_feed = 20 diff --git a/resources/recipes/nikkei_sub_industry.recipe b/resources/recipes/nikkei_sub_industry.recipe index c2dc9f82a0..67c2a4aa8b 100644 --- a/resources/recipes/nikkei_sub_industry.recipe +++ b/resources/recipes/nikkei_sub_industry.recipe @@ -17,6 +17,8 @@ class NikkeiNet_sub_industory(BasicNewsRecipe): title = u'\u65e5\u7d4c\u65b0\u805e\u96fb\u5b50\u7248(\u7523\u696d)' __author__ = 'Hiroshi Miura' description = 'News and current market affairs from Japan' + cover_url = 'http://parts.nikkei.com/parts/ds/images/common/logo_r1.svg' + masthead_url = 'http://parts.nikkei.com/parts/ds/images/common/logo_r1.svg' needs_subscription = True oldest_article = 2 max_articles_per_feed = 20 diff --git a/resources/recipes/nikkei_sub_life.recipe b/resources/recipes/nikkei_sub_life.recipe index 6a37e4466a..da16f1694e 100644 --- a/resources/recipes/nikkei_sub_life.recipe +++ b/resources/recipes/nikkei_sub_life.recipe @@ -17,6 +17,8 @@ class NikkeiNet_sub_life(BasicNewsRecipe): title = u'\u65e5\u7d4c\u65b0\u805e\u96fb\u5b50\u7248(\u7523\u696d)' __author__ = 'Hiroshi Miura' description = 'News and current market affairs from Japan' + cover_url = 'http://parts.nikkei.com/parts/ds/images/common/logo_r1.svg' + masthead_url = 'http://parts.nikkei.com/parts/ds/images/common/logo_r1.svg' needs_subscription = True oldest_article = 2 max_articles_per_feed = 20 diff --git a/resources/recipes/nikkei_sub_main.recipe b/resources/recipes/nikkei_sub_main.recipe index 021eeff64a..695569958c 100644 --- a/resources/recipes/nikkei_sub_main.recipe +++ b/resources/recipes/nikkei_sub_main.recipe @@ -17,6 +17,8 @@ class NikkeiNet_sub_main(BasicNewsRecipe): title = u'\u65e5\u7d4c\u65b0\u805e\u96fb\u5b50\u7248(\u7dcf\u5408)' __author__ = 'Hiroshi Miura' description = 'News and current market affairs from Japan' + cover_url = 'http://parts.nikkei.com/parts/ds/images/common/logo_r1.svg' + masthead_url = 'http://parts.nikkei.com/parts/ds/images/common/logo_r1.svg' needs_subscription = True oldest_article = 2 max_articles_per_feed = 20 diff --git a/resources/recipes/nikkei_sub_sports.recipe b/resources/recipes/nikkei_sub_sports.recipe index e48ea17077..84336444df 100644 --- a/resources/recipes/nikkei_sub_sports.recipe +++ b/resources/recipes/nikkei_sub_sports.recipe @@ -17,6 +17,8 @@ class NikkeiNet_sub_sports(BasicNewsRecipe): title = u'\u65e5\u7d4c\u65b0\u805e\u96fb\u5b50\u7248(\u30b9\u30dd\u30fc\u30c4)' __author__ = 'Hiroshi Miura' description = 'News and current market affairs from Japan' + cover_url = 'http://parts.nikkei.com/parts/ds/images/common/logo_r1.svg' + masthead_url = 'http://parts.nikkei.com/parts/ds/images/common/logo_r1.svg' needs_subscription = True oldest_article = 2 max_articles_per_feed = 20 From 5b7c73c4d985e813668ca3858c343ac9b8a5f343 Mon Sep 17 00:00:00 2001 From: Hiroshi Miura Date: Tue, 23 Nov 2010 15:33:40 +0900 Subject: [PATCH 11/18] recipe: remove mis commited recipe --- resources/recipes/nikkei_sports_sub.recipe | 95 ---------------------- 1 file changed, 95 deletions(-) delete mode 100644 resources/recipes/nikkei_sports_sub.recipe diff --git a/resources/recipes/nikkei_sports_sub.recipe b/resources/recipes/nikkei_sports_sub.recipe deleted file mode 100644 index c08404dac5..0000000000 --- a/resources/recipes/nikkei_sports_sub.recipe +++ /dev/null @@ -1,95 +0,0 @@ -import string, re, sys -from calibre import strftime -from calibre.web.feeds.recipes import BasicNewsRecipe - -class NikkeiNet_subscription(BasicNewsRecipe): - title = u'\u65e5\u7d4c\u65b0\u805e\u96fb\u5b50\u7248' - __author__ = 'Hiroshi Miura' - description = 'News and current market affairs from Japan' - needs_subscription = True - oldest_article = 3 - max_articles_per_feed = 20 - language = 'ja' - - remove_tags_before = {'class':"cmn-section cmn-indent"} - remove_tags = [ - {'class':"JSID_basePageMove JSID_baseAsyncSubmit cmn-form_area JSID_optForm_utoken"}, - {'class':"cmn-article_keyword cmn-clearfix"}, - {'class':"cmn-print_headline cmn-clearfix"}, - ] - remove_tags_after = {'class':"cmn-pr_list"} - - - def get_browser(self): - br = BasicNewsRecipe.get_browser() - - cj = mechanize.LWPCookieJar() - br.set_cookiejar(cj) - - #br.set_debug_http(True) - #br.set_debug_redirects(True) - #br.set_debug_responses(True) - - if self.username is not None and self.password is not None: - #print "----------------------------get login form--------------------------------------------" - # open login form - br.open('https://id.nikkei.com/lounge/nl/base/LA0010.seam') - response = br.response() - #print "----------------------------get login form---------------------------------------------" - #print "----------------------------set login form---------------------------------------------" - # remove disabled input which brings error on mechanize - response.set_data(response.get_data().replace("", " -->")) - br.set_response(response) - br.select_form(name='LA0010Form01') - br['LA0010Form01:LA0010Email'] = self.username - br['LA0010Form01:LA0010Password'] = self.password - br.form.find_control(id='LA0010Form01:LA0010AutoLoginOn',type="checkbox").get(nr=0).selected = True - br.submit() - response1 = br.response() - #print "----------------------------send login form---------------------------------------------" - #print "----------------------------open news main page-----------------------------------------" - # open news site - br.open('http://www.nikkei.com/') - response2 = br.response() - #print "----------------------------www.nikkei.com BODY --------------------------------------" - #print response2.get_data() - #print "-------------------------^^-got auto redirect form----^^--------------------------------" - # forced redirect in default - br.select_form(nr=0) - br.submit() - response3 = br.response() - # return some cookie which should be set by Javascript - #print response3.geturl() - raw = response3.get_data() - #print "---------------------------response to form --------------------------------------------" - # grab cookie from JS and set it - redirectflag = re.search(r"var checkValue = '(\d+)';", raw, re.M).group(1) - br.select_form(nr=0) - - self.temp_files.append(PersistentTemporaryFile('_fa.html')) - self.temp_files[-1].write("#LWP-Cookies-2.0\n") - - self.temp_files[-1].write("Set-Cookie3: Cookie-dummy=Cookie-value; domain=\".nikkei.com\"; path=\"/\"; path_spec; secure; expires=\"2029-12-21 05:07:59Z\"; version=0\n") - self.temp_files[-1].write("Set-Cookie3: redirectFlag="+redirectflag+"; domain=\".nikkei.com\"; path=\"/\"; path_spec; secure; expires=\"2029-12-21 05:07:59Z\"; version=0\n") - self.temp_files[-1].close() - cj.load(self.temp_files[-1].name) - - br.submit() - - #br.set_debug_http(False) - #br.set_debug_redirects(False) - #br.set_debug_responses(False) - return br - - - - feeds = [ - (u'\u30b9\u30dd\u30fc\u30c4\uff1a\u30d7\u30ed\u91ce\u7403', u'http://www.zou3.net/php/rss/nikkei2rss.php?head=baseball'), - (u'\u30b9\u30dd\u30fc\u30c4\uff1a\u5927\u30ea\u30fc\u30b0', u'http://www.zou3.net/php/rss/nikkei2rss.php?head=mlb'), - (u'\u30b9\u30dd\u30fc\u30c4\uff1a\u30b5\u30c3\u30ab\u30fc', u'http://www.zou3.net/php/rss/nikkei2rss.php?head=soccer'), - (u'\u30b9\u30dd\u30fc\u30c4\uff1a\u30b4\u30eb\u30d5', u'http://www.zou3.net/php/rss/nikkei2rss.php?head=golf'), - (u'\u30b9\u30dd\u30fc\u30c4\uff1a\u76f8\u64b2', u'http://www.zou3.net/php/rss/nikkei2rss.php?head=sumou'), - (u'\u30b9\u30dd\u30fc\u30c4\uff1a\u7af6\u99ac', u'http://www.zou3.net/php/rss/nikkei2rss.php?head=keiba') - ] - From 479246931236f974f909f810a5321744ea1540f7 Mon Sep 17 00:00:00 2001 From: Hiroshi Miura Date: Tue, 23 Nov 2010 15:40:06 +0900 Subject: [PATCH 12/18] recipe: introduce several icons for recipes --- resources/images/news/cnetjapan.png | Bin 0 -> 892 bytes resources/images/news/endgadget_ja.png | Bin 0 -> 698 bytes resources/images/news/jijinews.png | Bin 0 -> 919 bytes resources/images/news/msnsankei.png | Bin 0 -> 543 bytes resources/images/news/nikkei_free.png | Bin 0 -> 948 bytes resources/images/news/nikkei_sub_economy.png | Bin 0 -> 948 bytes resources/images/news/nikkei_sub_industory.png | Bin 0 -> 948 bytes resources/images/news/nikkei_sub_life.png | Bin 0 -> 948 bytes resources/images/news/nikkei_sub_main.png | Bin 0 -> 948 bytes resources/images/news/nikkei_sub_sports.png | Bin 0 -> 948 bytes resources/images/news/reuters.png | Bin 0 -> 693 bytes resources/images/news/reuters_ja.png | Bin 0 -> 693 bytes 12 files changed, 0 insertions(+), 0 deletions(-) create mode 100644 resources/images/news/cnetjapan.png create mode 100644 resources/images/news/endgadget_ja.png create mode 100644 resources/images/news/jijinews.png create mode 100644 resources/images/news/msnsankei.png create mode 100644 resources/images/news/nikkei_free.png create mode 100644 resources/images/news/nikkei_sub_economy.png create mode 100644 resources/images/news/nikkei_sub_industory.png create mode 100644 resources/images/news/nikkei_sub_life.png create mode 100644 resources/images/news/nikkei_sub_main.png create mode 100644 resources/images/news/nikkei_sub_sports.png create mode 100644 resources/images/news/reuters.png create mode 100644 resources/images/news/reuters_ja.png diff --git a/resources/images/news/cnetjapan.png b/resources/images/news/cnetjapan.png new file mode 100644 index 0000000000000000000000000000000000000000..9a0dcc8f7fa7645db5f97ef0b5e2978133e05732 GIT binary patch literal 892 zcmeAS@N?(olHy`uVBq!ia0vp^0wB!61|;P_|4#%`Y)RhkE)4%caKYZ?lYt_f1s;*b zK-vS0-A-oPfdtD69Mgd`SU*F|v9*U87?_rNx;TbdoKBs5HhW6A$g%qG_pTQj9XoPS zgUd~N@st@t=De* zacN=G@pZC`V(gs?9`1fEy;#%tp}*Ix>$`Wk&56!=&G>S*>#pqU6K6%ir>>}4qV!Kd6U%X$;Qn!IYp}f_bzlv?~~pAUA{rt zJu!*>^=(atyg!Ri&9Y#8@Nc6qb6I?^7PF}Y?}fTwQok29zZPMbvi?zH+gkJgi}qH2 z=kl)+EuDBoKbpJ#(w^-OYu{YzZCk}#v*OXdy$mMe(Li*^OP;yFZHd*A^ux3DuN0N9 z+n3H?wZgID6OW>?VH@oX5I){wfOzj_lxQS|M!^K8!q~{E;65oOVqVR zpkU&$wn7D=tq3fJd+cw3`+^8Y@;2bVf#Zap-8>51olH?)FK#IZ0z`#=1&`8(FG{nf* z$^?ikbPdd{3=Fup)uf?l$jwj5OshoGU~FM!XaLc0!9Hs(Py>UftDnm{r-UW|`oMrT literal 0 HcmV?d00001 diff --git a/resources/images/news/endgadget_ja.png b/resources/images/news/endgadget_ja.png new file mode 100644 index 0000000000000000000000000000000000000000..94e8f1219c849379759e6479c74027c3428813fb GIT binary patch literal 698 zcmeAS@N?(olHy`uVBq!ia0vp^0wB!61|;P_|4#%`Y)RhkE)4%b@U_I1CZGssfk$L9 zkoEv$x0Bg+Ai=T%$8;bK*3S@gZ0%tN2FAypE{-7WP_3o7xoZCMa9QgO<>o1+@ ztM+J%+h~+Jr zeB=&aT(0PfEnDlgSIpJBx#-a5ueoJWTHBwWEBKppN#@_ee@`9;^@_km@0}ZIj#}woLcnO|G+;Fr-J7-Qr#0OHX3F<-0Exaws7O!jEgV%`PdkK zvobQbGB(#Wu&^>P_-f3)6s93JKP5A*5>tb@6;P?E2}FbDmq;g|1_n=8KbLh* G2~7ag8zjE~ literal 0 HcmV?d00001 diff --git a/resources/images/news/jijinews.png b/resources/images/news/jijinews.png new file mode 100644 index 0000000000000000000000000000000000000000..b87865fc349dd9565c0a467a3f7197dd20d48f7a GIT binary patch literal 919 zcmeAS@N?(olHy`uVBq!ia0vp^0wB!61|;P_|4#%`Y)RhkE)4%caKYZ?lYt_f1s;*b zK-vS0-A-oPfdtD69Mgd`SU*F|v9*U87?=)tx;TbdoX(vbo#PWI(zd_$nUAmaj4gAQ zZ3#6w+o7Q~g-h@vyNQP|XJAXmjy53v;(BRE%aVy5E-m8IIlWw4*aZz=G;8Xv?P^O? z%WIzb>P}VKvomwvE-ak#YyP!g-}nEUI(6@~2lf%=2eswvUMS1?J1IH$nXFi=^XySy z*2neV^cVfRQ(S2}hasKsr&B=U)&!TyE0{eG**_}z{p;qwE3TK@W+u%z*fd%0W#Nql zCgMq++r)YoZ9BlX%aM<*U9bE>tvE|);@LePca)zOX5c!o&Rc)!@d*J2LWeljmhkqS z(@1iyw9{4-pWr1`vWK5{)zZe_Gy9d#&J}WCwRr!r`84N2izB6GXEhbiFPpV^Zq`eu z|1wOo8D=sjEd$%L^V~H5^P%_MuiFWe2EIBvV zm7`HaEN*?B>O|*)A94LLuj_25Osdx3cIuwPjWg_%c1cRl>&;R6#@1Ej~ z{)}_@cAwN-|K~4*g?rk@&CAZzWYk@0oOek>{}i*rTtkcM&xX;jR&CU%zveEp=exDpHFaf7GG2cR-2B*Na*XPe8zGC7`njv8EtXrqUL^H) zRLcA50!x%8a5%rv{=NPBtfLIR*X0B|k9;hvV^|qrSITFs_Kb`5!I5J#6Qmi4{BIJ3hbRm3wb}Vq5dx3CDc9Gb1d0d90V+zSqC$;N3oZfg@QN9DOxUU= zt`Q|Ei6yC4$wjF^iowXh&_LJFP}j&f#L(2r$lS`%LfgR7%D|xas-YK(hTQy=%(O~$ b4Q5t`rVtH9bDKGU8W=oX{an^LB{Ts5^bLmp literal 0 HcmV?d00001 diff --git a/resources/images/news/msnsankei.png b/resources/images/news/msnsankei.png new file mode 100644 index 0000000000000000000000000000000000000000..7e92af7b20bbcd14658e7c00707b9ffa9bc19f40 GIT binary patch literal 543 zcmeAS@N?(olHy`uVBq!ia0vp^0wB!61SBU+%rFB|oCO|{#X#BvjNMLV+kphj3LMjc zG*~r5%(1nH85kI&JY5_^G|o?*w6XB80gvm#J4-zUGniL-dFMOiD&!^ru|s|WkPU?S z4%@xBR(NaPw7mZ_h($BQxXWMj6StUx{D=3#p8VWaj5demUop^4}7xe*S)oE?W(MMC9{^TTD5CkGlz-q=G$){b?cmd<`6mUO=OmO{ zCIo3MeYHz%(aytld=c4<3VVXRET3e6z>KvsXLdLRX7e&!yH$N4 zD*Afqinu-hH%yy(ywK%{!E-qdxfwqvuV8<{cu(N`xz>ALD<8a4zqaCdv|ar1$18K4 zjVkB)Ulw^4p0G8F_ppJwyukdt`NbA8@9JlDv-OKzF)Iv?oBTvvK(>AB$xUl?PMi2X zzAE}-`aVX+J?04(af(ZUL9SZj8c~vxSdwa$T$Bo=7>o=I4Rj3+b&ZTe3{99nO2EKgDFJAX5p9oe`5NbF}_`?Nhax&jQyz zdn%Hiv2^LytJWG*r5riioPsX7?iRe#Yor~$O2b5N%S*R{hgty_YhA8zG?@gV43}NidsXG`|@pjZ5JyqUG&hi zCO)ju^BLFU@3)THKF|obGjns>=E}(`jY~8=g)6>S$OjvTraiZvrhRg|v|~JThV+LS zx!o@h8LvIeT*-WXa?OWJ40*nfuB#}YFZ*=vv~5Z5<(pqSB+PhCdxNEZ+cR|io*UK^ z))+n6=2~L$4~Mpm%Xe2A-EUjR&G#lxnM2ixSHu53qsBURi=~}&EwfK#HM_+MT@h{n zww?LIz1Mr6{EB)Pw^RTAsipn0a#aq?cPE!}EH_#B(kgwEWpmr|jH)h^kR>mz5@t$q zcl*Ao^ZHrlA)(18x9wt!+p?uRD~#RJm-|iPn$@Vn>A1|m`u)d@{YPIUh~JMb-MMYU zG5-B#8EZTl#Qt}A2=$9??s{SWYxAqqQgh^9K0I>a+wRx%{x51u<^x6y-$1*|5L#qCE)St_M6`c4cGhLCcJXuJuM((H{;>diR_W9j_mmxIMaaV z<6X8u<3k)wCm#q)Zo9Pb`0dKCe`mird-G0Ir^UBx^Id%9AJrSSc5m5r;gG|_^-H7K zbpFadUhi4r8&Dd&NP{WTb%m&juIf^8U(IuK|E9Fgntg|T{hDKQ{uhY`q*UwtDxdMX zU{}XEqe~TB0?$o=F&O6@cSe40df7U^v)wayKDzY#QD}hS3bk#zH@2U4QsK^cWmV;E zZg|cWsPx#e1D@Lz=gHfCwGDhBbd+fdn}(gOwa1JWiLfI#COYn$8L1}Z&Y)&K{UuAl zmb0gdcSR|5+APT_di_Z{W0#A*IV=)1N=*K52aoe`5NbF}_`?Nhax&jQyz zdn%Hiv2^LytJWG*r5riioPsX7?iRe#Yor~$O2b5N%S*R{hgty_YhA8zG?@gV43}NidsXG`|@pjZ5JyqUG&hi zCO)ju^BLFU@3)THKF|obGjns>=E}(`jY~8=g)6>S$OjvTraiZvrhRg|v|~JThV+LS zx!o@h8LvIeT*-WXa?OWJ40*nfuB#}YFZ*=vv~5Z5<(pqSB+PhCdxNEZ+cR|io*UK^ z))+n6=2~L$4~Mpm%Xe2A-EUjR&G#lxnM2ixSHu53qsBURi=~}&EwfK#HM_+MT@h{n zww?LIz1Mr6{EB)Pw^RTAsipn0a#aq?cPE!}EH_#B(kgwEWpmr|jH)h^kR>mz5@t$q zcl*Ao^ZHrlA)(18x9wt!+p?uRD~#RJm-|iPn$@Vn>A1|m`u)d@{YPIUh~JMb-MMYU zG5-B#8EZTl#Qt}A2=$9??s{SWYxAqqQgh^9K0I>a+wRx%{x51u<^x6y-$1*|5L#qCE)St_M6`c4cGhLCcJXuJuM((H{;>diR_W9j_mmxIMaaV z<6X8u<3k)wCm#q)Zo9Pb`0dKCe`mird-G0Ir^UBx^Id%9AJrSSc5m5r;gG|_^-H7K zbpFadUhi4r8&Dd&NP{WTb%m&juIf^8U(IuK|E9Fgntg|T{hDKQ{uhY`q*UwtDxdMX zU{}XEqe~TB0?$o=F&O6@cSe40df7U^v)wayKDzY#QD}hS3bk#zH@2U4QsK^cWmV;E zZg|cWsPx#e1D@Lz=gHfCwGDhBbd+fdn}(gOwa1JWiLfI#COYn$8L1}Z&Y)&K{UuAl zmb0gdcSR|5+APT_di_Z{W0#A*IV=)1N=*K52aoe`5NbF}_`?Nhax&jQyz zdn%Hiv2^LytJWG*r5riioPsX7?iRe#Yor~$O2b5N%S*R{hgty_YhA8zG?@gV43}NidsXG`|@pjZ5JyqUG&hi zCO)ju^BLFU@3)THKF|obGjns>=E}(`jY~8=g)6>S$OjvTraiZvrhRg|v|~JThV+LS zx!o@h8LvIeT*-WXa?OWJ40*nfuB#}YFZ*=vv~5Z5<(pqSB+PhCdxNEZ+cR|io*UK^ z))+n6=2~L$4~Mpm%Xe2A-EUjR&G#lxnM2ixSHu53qsBURi=~}&EwfK#HM_+MT@h{n zww?LIz1Mr6{EB)Pw^RTAsipn0a#aq?cPE!}EH_#B(kgwEWpmr|jH)h^kR>mz5@t$q zcl*Ao^ZHrlA)(18x9wt!+p?uRD~#RJm-|iPn$@Vn>A1|m`u)d@{YPIUh~JMb-MMYU zG5-B#8EZTl#Qt}A2=$9??s{SWYxAqqQgh^9K0I>a+wRx%{x51u<^x6y-$1*|5L#qCE)St_M6`c4cGhLCcJXuJuM((H{;>diR_W9j_mmxIMaaV z<6X8u<3k)wCm#q)Zo9Pb`0dKCe`mird-G0Ir^UBx^Id%9AJrSSc5m5r;gG|_^-H7K zbpFadUhi4r8&Dd&NP{WTb%m&juIf^8U(IuK|E9Fgntg|T{hDKQ{uhY`q*UwtDxdMX zU{}XEqe~TB0?$o=F&O6@cSe40df7U^v)wayKDzY#QD}hS3bk#zH@2U4QsK^cWmV;E zZg|cWsPx#e1D@Lz=gHfCwGDhBbd+fdn}(gOwa1JWiLfI#COYn$8L1}Z&Y)&K{UuAl zmb0gdcSR|5+APT_di_Z{W0#A*IV=)1N=*K52aoe`5NbF}_`?Nhax&jQyz zdn%Hiv2^LytJWG*r5riioPsX7?iRe#Yor~$O2b5N%S*R{hgty_YhA8zG?@gV43}NidsXG`|@pjZ5JyqUG&hi zCO)ju^BLFU@3)THKF|obGjns>=E}(`jY~8=g)6>S$OjvTraiZvrhRg|v|~JThV+LS zx!o@h8LvIeT*-WXa?OWJ40*nfuB#}YFZ*=vv~5Z5<(pqSB+PhCdxNEZ+cR|io*UK^ z))+n6=2~L$4~Mpm%Xe2A-EUjR&G#lxnM2ixSHu53qsBURi=~}&EwfK#HM_+MT@h{n zww?LIz1Mr6{EB)Pw^RTAsipn0a#aq?cPE!}EH_#B(kgwEWpmr|jH)h^kR>mz5@t$q zcl*Ao^ZHrlA)(18x9wt!+p?uRD~#RJm-|iPn$@Vn>A1|m`u)d@{YPIUh~JMb-MMYU zG5-B#8EZTl#Qt}A2=$9??s{SWYxAqqQgh^9K0I>a+wRx%{x51u<^x6y-$1*|5L#qCE)St_M6`c4cGhLCcJXuJuM((H{;>diR_W9j_mmxIMaaV z<6X8u<3k)wCm#q)Zo9Pb`0dKCe`mird-G0Ir^UBx^Id%9AJrSSc5m5r;gG|_^-H7K zbpFadUhi4r8&Dd&NP{WTb%m&juIf^8U(IuK|E9Fgntg|T{hDKQ{uhY`q*UwtDxdMX zU{}XEqe~TB0?$o=F&O6@cSe40df7U^v)wayKDzY#QD}hS3bk#zH@2U4QsK^cWmV;E zZg|cWsPx#e1D@Lz=gHfCwGDhBbd+fdn}(gOwa1JWiLfI#COYn$8L1}Z&Y)&K{UuAl zmb0gdcSR|5+APT_di_Z{W0#A*IV=)1N=*K52aoe`5NbF}_`?Nhax&jQyz zdn%Hiv2^LytJWG*r5riioPsX7?iRe#Yor~$O2b5N%S*R{hgty_YhA8zG?@gV43}NidsXG`|@pjZ5JyqUG&hi zCO)ju^BLFU@3)THKF|obGjns>=E}(`jY~8=g)6>S$OjvTraiZvrhRg|v|~JThV+LS zx!o@h8LvIeT*-WXa?OWJ40*nfuB#}YFZ*=vv~5Z5<(pqSB+PhCdxNEZ+cR|io*UK^ z))+n6=2~L$4~Mpm%Xe2A-EUjR&G#lxnM2ixSHu53qsBURi=~}&EwfK#HM_+MT@h{n zww?LIz1Mr6{EB)Pw^RTAsipn0a#aq?cPE!}EH_#B(kgwEWpmr|jH)h^kR>mz5@t$q zcl*Ao^ZHrlA)(18x9wt!+p?uRD~#RJm-|iPn$@Vn>A1|m`u)d@{YPIUh~JMb-MMYU zG5-B#8EZTl#Qt}A2=$9??s{SWYxAqqQgh^9K0I>a+wRx%{x51u<^x6y-$1*|5L#qCE)St_M6`c4cGhLCcJXuJuM((H{;>diR_W9j_mmxIMaaV z<6X8u<3k)wCm#q)Zo9Pb`0dKCe`mird-G0Ir^UBx^Id%9AJrSSc5m5r;gG|_^-H7K zbpFadUhi4r8&Dd&NP{WTb%m&juIf^8U(IuK|E9Fgntg|T{hDKQ{uhY`q*UwtDxdMX zU{}XEqe~TB0?$o=F&O6@cSe40df7U^v)wayKDzY#QD}hS3bk#zH@2U4QsK^cWmV;E zZg|cWsPx#e1D@Lz=gHfCwGDhBbd+fdn}(gOwa1JWiLfI#COYn$8L1}Z&Y)&K{UuAl zmb0gdcSR|5+APT_di_Z{W0#A*IV=)1N=*K52aoe`5NbF}_`?Nhax&jQyz zdn%Hiv2^LytJWG*r5riioPsX7?iRe#Yor~$O2b5N%S*R{hgty_YhA8zG?@gV43}NidsXG`|@pjZ5JyqUG&hi zCO)ju^BLFU@3)THKF|obGjns>=E}(`jY~8=g)6>S$OjvTraiZvrhRg|v|~JThV+LS zx!o@h8LvIeT*-WXa?OWJ40*nfuB#}YFZ*=vv~5Z5<(pqSB+PhCdxNEZ+cR|io*UK^ z))+n6=2~L$4~Mpm%Xe2A-EUjR&G#lxnM2ixSHu53qsBURi=~}&EwfK#HM_+MT@h{n zww?LIz1Mr6{EB)Pw^RTAsipn0a#aq?cPE!}EH_#B(kgwEWpmr|jH)h^kR>mz5@t$q zcl*Ao^ZHrlA)(18x9wt!+p?uRD~#RJm-|iPn$@Vn>A1|m`u)d@{YPIUh~JMb-MMYU zG5-B#8EZTl#Qt}A2=$9??s{SWYxAqqQgh^9K0I>a+wRx%{x51u<^x6y-$1*|5L#qCE)St_M6`c4cGhLCcJXuJuM((H{;>diR_W9j_mmxIMaaV z<6X8u<3k)wCm#q)Zo9Pb`0dKCe`mird-G0Ir^UBx^Id%9AJrSSc5m5r;gG|_^-H7K zbpFadUhi4r8&Dd&NP{WTb%m&juIf^8U(IuK|E9Fgntg|T{hDKQ{uhY`q*UwtDxdMX zU{}XEqe~TB0?$o=F&O6@cSe40df7U^v)wayKDzY#QD}hS3bk#zH@2U4QsK^cWmV;E zZg|cWsPx#e1D@Lz=gHfCwGDhBbd+fdn}(gOwa1JWiLfI#COYn$8L1}Z&Y)&K{UuAl zmb0gdcSR|5+APT_di_Z{W0#A*IV=)1N=*K52a3W(_)UssCo+&aOf?FrcPT2k^!L>`+OJ<|6&>~G44HuWodvv6egiMPc z%$j$tF8zP~_xJx|uYce7&Z&O$!gFRe$FgD$+08k?zDHMIhbLQob+-vWe$Q}@vwhgYg%|Fxk*#Kaqms2jsn_}gyYdI-%^&%07m9uM zmJnMn&R*mCt$L1Dv$atC2hPQYg1oL*k{u8Dcm|61P4|5sC$;Cm9gWJaQ%T1zP7Y6Z z3A`_GbHyS5J*PbC7cQI8UHW5&;I+rn&z>Z)Mg^St?ksak`MUQDz8PUT2c=4uT{O|* zIq{C+-H~%kHf@{IG0(4t|FM%zS)v%%j_&uFo7mGYw9RzA<5q1WceLuxkrhkwiWuw; zMVYj_O8;V86eIg9+R&6U$5(9L12(hvqnlP_gs?3$*T0tNEcUF(?W;{ebb*~`u8hRl z2f1qwl&taob&utm%KvGvQ=j~|uV=i&SE4&ne8PKRjH;HnMwFx^mZVxG7o`Fz1|tJQ z16@NyT_fWVLsKhbODjWjZ39ay1A`^E)?Gl+kei>9nO2EKgE2&d;t8XApaw~h4Z-3W(_)UssCo+&aOf?FrcPT2k^!L>`+OJ<|6&>~G44HuWodvv6egiMPc z%$j$tF8zP~_xJx|uYce7&Z&O$!gFRe$FgD$+08k?zDHMIhbLQob+-vWe$Q}@vwhgYg%|Fxk*#Kaqms2jsn_}gyYdI-%^&%07m9uM zmJnMn&R*mCt$L1Dv$atC2hPQYg1oL*k{u8Dcm|61P4|5sC$;Cm9gWJaQ%T1zP7Y6Z z3A`_GbHyS5J*PbC7cQI8UHW5&;I+rn&z>Z)Mg^St?ksak`MUQDz8PUT2c=4uT{O|* zIq{C+-H~%kHf@{IG0(4t|FM%zS)v%%j_&uFo7mGYw9RzA<5q1WceLuxkrhkwiWuw; zMVYj_O8;V86eIg9+R&6U$5(9L12(hvqnlP_gs?3$*T0tNEcUF(?W;{ebb*~`u8hRl z2f1qwl&taob&utm%KvGvQ=j~|uV=i&SE4&ne8PKRjH;HnMwFx^mZVxG7o`Fz1|tJQ z16@NyT_fWVLsKhbODjWjZ39ay1A`^E)?Gl+kei>9nO2EKgE2&d;t8XApaw~h4Z- Date: Tue, 23 Nov 2010 15:51:28 +0900 Subject: [PATCH 13/18] recipe: fix typo --- resources/recipes/nikkei_sub_economy.recipe | 2 +- resources/recipes/nikkei_sub_life.recipe | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/resources/recipes/nikkei_sub_economy.recipe b/resources/recipes/nikkei_sub_economy.recipe index 8e107d15da..834c1975ad 100644 --- a/resources/recipes/nikkei_sub_economy.recipe +++ b/resources/recipes/nikkei_sub_economy.recipe @@ -14,7 +14,7 @@ from calibre.ptempfile import PersistentTemporaryFile class NikkeiNet_sub_economy(BasicNewsRecipe): - title = u'\u65e5\u7d4c\u65b0\u805e\u96fb\u5b50\u7248(\u7523\u696d)' + title = u'\u65e5\u7d4c\u65b0\u805e\u96fb\u5b50\u7248(\u7d4c\u6e08)' __author__ = 'Hiroshi Miura' description = 'News and current market affairs from Japan' cover_url = 'http://parts.nikkei.com/parts/ds/images/common/logo_r1.svg' diff --git a/resources/recipes/nikkei_sub_life.recipe b/resources/recipes/nikkei_sub_life.recipe index da16f1694e..16b7f99d4f 100644 --- a/resources/recipes/nikkei_sub_life.recipe +++ b/resources/recipes/nikkei_sub_life.recipe @@ -14,7 +14,7 @@ from calibre.ptempfile import PersistentTemporaryFile class NikkeiNet_sub_life(BasicNewsRecipe): - title = u'\u65e5\u7d4c\u65b0\u805e\u96fb\u5b50\u7248(\u7523\u696d)' + title = u'\u65e5\u7d4c\u65b0\u805e\u96fb\u5b50\u7248(\u751f\u6d3b)' __author__ = 'Hiroshi Miura' description = 'News and current market affairs from Japan' cover_url = 'http://parts.nikkei.com/parts/ds/images/common/logo_r1.svg' From 9af8b9c322800ce90c77420b364b94aefbf1a6c1 Mon Sep 17 00:00:00 2001 From: Hiroshi Miura Date: Tue, 23 Nov 2010 15:53:21 +0900 Subject: [PATCH 14/18] recipe: endgadget Japan: fix typo --- resources/recipes/endgadget_ja.recipe | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/resources/recipes/endgadget_ja.recipe b/resources/recipes/endgadget_ja.recipe index cf2ed69e94..7d2a57ea05 100644 --- a/resources/recipes/endgadget_ja.recipe +++ b/resources/recipes/endgadget_ja.recipe @@ -9,7 +9,7 @@ japan.engadget.com from calibre.web.feeds.news import BasicNewsRecipe class EndgadgetJapan(BasicNewsRecipe): - title = u'Engadget\u65e5\u672c\u7248' + title = u'Endgadget\u65e5\u672c\u7248' cover_url = 'http://skins18.wincustomize.com/1/49/149320/29/7578/preview-29-7578.jpg' masthead_url = 'http://www.blogsmithmedia.com/japanese.engadget.com/media/eng-jp-logo-t.png' oldest_article = 7 From 3118a5dac32ccc994ac504be42dbcd7fdd29b860 Mon Sep 17 00:00:00 2001 From: Kovid Goyal Date: Tue, 23 Nov 2010 08:26:06 -0700 Subject: [PATCH 15/18] ... --- resources/recipes/brand_eins.recipe | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/resources/recipes/brand_eins.recipe b/resources/recipes/brand_eins.recipe index c69dd693b2..71b6aa8cda 100644 --- a/resources/recipes/brand_eins.recipe +++ b/resources/recipes/brand_eins.recipe @@ -91,8 +91,8 @@ class BrandEins(BasicNewsRecipe): latest_jahrgang = soup.findAll('div', attrs={'class': re.compile(r'\bjahrgang-latest\b') })[0].findAll('ul')[0] pre_latest_issue = latest_jahrgang.findAll('a')[len(latest_jahrgang.findAll('a'))-issue] url = pre_latest_issue.get('href', False) - # Get the title for the magazin - build it out of the title of the cover - take the issue and year; - self.title = "brand eins "+ re.search(r"(?P\d\d\/\d\d\d\d)", pre_latest_issue.find('img').get('title', False)).group('date') + # Get month and year of the magazine issue - build it out of the title of the cover + self.timefmt = " " + re.search(r"(?P\d\d\/\d\d\d\d)", pre_latest_issue.find('img').get('title', False)).group('date') url = 'http://brandeins.de/'+url # url = "http://www.brandeins.de/archiv/magazin/tierisch.html" From eaeee277f0ae7cd0cf2331bbca68de1e31947233 Mon Sep 17 00:00:00 2001 From: Kovid Goyal Date: Tue, 23 Nov 2010 08:49:19 -0700 Subject: [PATCH 16/18] Update Mingpao --- resources/recipes/ming_pao.recipe | 296 ++++++++++++++++++++++++++++-- 1 file changed, 284 insertions(+), 12 deletions(-) diff --git a/resources/recipes/ming_pao.recipe b/resources/recipes/ming_pao.recipe index 162a3c774e..385dbdbdb7 100644 --- a/resources/recipes/ming_pao.recipe +++ b/resources/recipes/ming_pao.recipe @@ -3,13 +3,28 @@ __copyright__ = '2010, Eddie Lau' ''' modified from Singtao Toronto calibre recipe by rty Change Log: +2010/11/22: add English section, remove eco-news section which is not updated daily, correct + ordering of articles +2010/11/12: add news image and eco-news section +2010/11/08: add parsing of finance section +2010/11/06: temporary work-around for Kindle device having no capability to display unicode + in section/article list. 2010/10/31: skip repeated articles in section pages ''' -import datetime +import os, datetime, re from calibre.web.feeds.recipes import BasicNewsRecipe +from contextlib import nested -class AdvancedUserRecipe1278063072(BasicNewsRecipe): + +from calibre import __appname__, strftime +from calibre.ebooks.BeautifulSoup import BeautifulSoup +from calibre.ebooks.metadata.opf2 import OPFCreator +from calibre.ebooks.metadata.toc import TOC +from calibre.ebooks.metadata import MetaInformation +from calibre.utils.date import now as nowf + +class MPHKRecipe(BasicNewsRecipe): title = 'Ming Pao - Hong Kong' oldest_article = 1 max_articles_per_feed = 100 @@ -24,27 +39,131 @@ class AdvancedUserRecipe1278063072(BasicNewsRecipe): encoding = 'Big5-HKSCS' recursions = 0 conversion_options = {'linearize_tables':True} + extra_css = 'img {display: block; margin-left: auto; margin-right: auto; margin-top: 10px; margin-bottom: 10px;}' + #extra_css = 'img {float:right; margin:4px;}' masthead_url = 'http://news.mingpao.com/image/portals_top_logo_news.gif' keep_only_tags = [dict(name='h1'), + #dict(name='font', attrs={'style':['font-size:14pt; line-height:160%;']}), # for entertainment page + dict(attrs={'class':['photo']}), + dict(attrs={'id':['newscontent']}), dict(attrs={'id':['newscontent01','newscontent02']})] + remove_tags = [dict(name='style'), + dict(attrs={'id':['newscontent135']})] # for the finance page + remove_attributes = ['width'] + preprocess_regexps = [ + (re.compile(r'
', re.DOTALL|re.IGNORECASE), + lambda match: '

'), + (re.compile(r'

', re.DOTALL|re.IGNORECASE), + lambda match: ''), + ] + + def image_url_processor(cls, baseurl, url): + # trick: break the url at the first occurance of digit, add an additional + # '_' at the front + # not working, may need to move this to preprocess_html() method + #minIdx = 10000 + #i0 = url.find('0') + #if i0 >= 0 and i0 < minIdx: + # minIdx = i0 + #i1 = url.find('1') + #if i1 >= 0 and i1 < minIdx: + # minIdx = i1 + #i2 = url.find('2') + #if i2 >= 0 and i2 < minIdx: + # minIdx = i2 + #i3 = url.find('3') + #if i3 >= 0 and i0 < minIdx: + # minIdx = i3 + #i4 = url.find('4') + #if i4 >= 0 and i4 < minIdx: + # minIdx = i4 + #i5 = url.find('5') + #if i5 >= 0 and i5 < minIdx: + # minIdx = i5 + #i6 = url.find('6') + #if i6 >= 0 and i6 < minIdx: + # minIdx = i6 + #i7 = url.find('7') + #if i7 >= 0 and i7 < minIdx: + # minIdx = i7 + #i8 = url.find('8') + #if i8 >= 0 and i8 < minIdx: + # minIdx = i8 + #i9 = url.find('9') + #if i9 >= 0 and i9 < minIdx: + # minIdx = i9 + #return url[0:minIdx] + '_' + url[minIdx+1:] + return url def get_fetchdate(self): dt_utc = datetime.datetime.utcnow() - # convert UTC to local hk time - at around HKT 5.30am, all news are available - dt_local = dt_utc - datetime.timedelta(-2.5/24) + # convert UTC to local hk time - at around HKT 6.00am, all news are available + dt_local = dt_utc - datetime.timedelta(-2.0/24) return dt_local.strftime("%Y%m%d") def parse_index(self): - feeds = [] - dateStr = self.get_fetchdate() - for title, url in [(u'\u8981\u805e Headline', 'http://news.mingpao.com/' + dateStr + '/gaindex.htm'), (u'\u6559\u80b2 Education', 'http://news.mingpao.com/' + dateStr + '/gfindex.htm'), (u'\u6e2f\u805e Local', 'http://news.mingpao.com/' + dateStr + '/gbindex.htm'), (u'\u793e\u8a55\u2027\u7b46\u9663 Editorial', 'http://news.mingpao.com/' + dateStr + '/mrindex.htm'), (u'\u8ad6\u58c7 Forum', 'http://news.mingpao.com/' + dateStr + '/faindex.htm'), (u'\u4e2d\u570b China', 'http://news.mingpao.com/' + dateStr + '/caindex.htm'), (u'\u570b\u969b World', 'http://news.mingpao.com/' + dateStr + '/taindex.htm'), ('Tech News', 'http://news.mingpao.com/' + dateStr + '/naindex.htm'), (u'\u9ad4\u80b2 Sport', 'http://news.mingpao.com/' + dateStr + '/spindex.htm'), (u'\u526f\u520a Supplement', 'http://news.mingpao.com/' + dateStr + '/jaindex.htm'),]: - articles = self.parse_section(url) - if articles: - feeds.append((title, articles)) + feeds = [] + dateStr = self.get_fetchdate() + for title, url in [(u'\u8981\u805e Headline', 'http://news.mingpao.com/' + dateStr + '/gaindex.htm'), + (u'\u6559\u80b2 Education', 'http://news.mingpao.com/' + dateStr + '/gfindex.htm'), + (u'\u6e2f\u805e Local', 'http://news.mingpao.com/' + dateStr + '/gbindex.htm'), + (u'\u793e\u8a55\u2027\u7b46\u9663 Editorial', 'http://news.mingpao.com/' + dateStr + '/mrindex.htm'), + (u'\u8ad6\u58c7 Forum', 'http://news.mingpao.com/' + dateStr + '/faindex.htm'), + (u'\u4e2d\u570b China', 'http://news.mingpao.com/' + dateStr + '/caindex.htm'), + (u'\u570b\u969b World', 'http://news.mingpao.com/' + dateStr + '/taindex.htm'), + ('Tech News', 'http://news.mingpao.com/' + dateStr + '/naindex.htm'), + (u'\u9ad4\u80b2 Sport', 'http://news.mingpao.com/' + dateStr + '/spindex.htm'), + (u'\u526f\u520a Supplement', 'http://news.mingpao.com/' + dateStr + '/jaindex.htm'), + (u'\u82f1\u6587 English', 'http://news.mingpao.com/' + dateStr + '/emindex.htm')]: + articles = self.parse_section(url) + if articles: + feeds.append((title, articles)) + # special - finance + fin_articles = self.parse_fin_section('http://www.mpfinance.com/htm/Finance/' + dateStr + '/News/ea,eb,ecindex.htm') + if fin_articles: + feeds.append((u'\u7d93\u6fdf Finance', fin_articles)) + # special - eco-friendly + # eco_articles = self.parse_eco_section('http://tssl.mingpao.com/htm/marketing/eco/cfm/Eco1.cfm') + # if eco_articles: + # feeds.append((u'\u74b0\u4fdd Eco News', eco_articles)) + # special - entertainment + #ent_articles = self.parse_ent_section('http://ol.mingpao.com/cfm/star1.cfm') + #if ent_articles: + # feeds.append(('Entertainment', ent_articles)) return feeds def parse_section(self, url): + dateStr = self.get_fetchdate() + soup = self.index_to_soup(url) + divs = soup.findAll(attrs={'class': ['bullet','bullet_grey']}) + current_articles = [] + included_urls = [] + divs.reverse() + for i in divs: + a = i.find('a', href = True) + title = self.tag_to_string(a) + url = a.get('href', False) + url = 'http://news.mingpao.com/' + dateStr + '/' +url + if url not in included_urls and url.rfind('Redirect') == -1: + current_articles.append({'title': title, 'url': url, 'description':'', 'date':''}) + included_urls.append(url) + current_articles.reverse() + return current_articles + + def parse_fin_section(self, url): dateStr = self.get_fetchdate() + soup = self.index_to_soup(url) + a = soup.findAll('a', href= True) + current_articles = [] + for i in a: + url = i.get('href', False) + if not url.rfind(dateStr) == -1 and url.rfind('index') == -1: + title = self.tag_to_string(i) + url = 'http://www.mpfinance.com/cfm/' +url + current_articles.append({'title': title, 'url': url, 'description':''}) + return current_articles + + def parse_eco_section(self, url): soup = self.index_to_soup(url) divs = soup.findAll(attrs={'class': ['bullet']}) current_articles = [] @@ -53,9 +172,162 @@ class AdvancedUserRecipe1278063072(BasicNewsRecipe): a = i.find('a', href = True) title = self.tag_to_string(a) url = a.get('href', False) - url = 'http://news.mingpao.com/' + dateStr + '/' +url - if url not in included_urls: + url = 'http://tssl.mingpao.com/htm/marketing/eco/cfm/' +url + if url not in included_urls and url.rfind('Redirect') == -1: current_articles.append({'title': title, 'url': url, 'description':''}) included_urls.append(url) return current_articles + #def parse_ent_section(self, url): + # dateStr = self.get_fetchdate() + # soup = self.index_to_soup(url) + # a = soup.findAll('a', href=True) + # current_articles = [] + # included_urls = [] + # for i in a: + # title = self.tag_to_string(i) + # url = 'http://ol.mingpao.com/cfm/' + i.get('href', False) + # if url not in included_urls and not url.rfind('.txt') == -1 and not url.rfind(dateStr) == -1 and not title == '': + # current_articles.append({'title': title, 'url': url, 'description': ''}) + # return current_articles + + def preprocess_html(self, soup): + for item in soup.findAll(style=True): + del item['style'] + for item in soup.findAll(style=True): + del item['width'] + for item in soup.findAll(stype=True): + del item['absmiddle'] + return soup + + def create_opf(self, feeds, dir=None): + #super(MPHKRecipe,self).create_opf(feeds, dir) + if dir is None: + dir = self.output_dir + title = self.short_title() + if self.output_profile.periodical_date_in_title: + title += strftime(self.timefmt) + mi = MetaInformation(title, [__appname__]) + mi.publisher = __appname__ + mi.author_sort = __appname__ + mi.publication_type = self.publication_type+':'+self.short_title() + mi.timestamp = nowf() + mi.comments = self.description + if not isinstance(mi.comments, unicode): + mi.comments = mi.comments.decode('utf-8', 'replace') + mi.pubdate = nowf() + opf_path = os.path.join(dir, 'index.opf') + ncx_path = os.path.join(dir, 'index.ncx') + opf = OPFCreator(dir, mi) + # Add mastheadImage entry to section + mp = getattr(self, 'masthead_path', None) + if mp is not None and os.access(mp, os.R_OK): + from calibre.ebooks.metadata.opf2 import Guide + ref = Guide.Reference(os.path.basename(self.masthead_path), os.getcwdu()) + ref.type = 'masthead' + ref.title = 'Masthead Image' + opf.guide.append(ref) + + manifest = [os.path.join(dir, 'feed_%d'%i) for i in range(len(feeds))] + manifest.append(os.path.join(dir, 'index.html')) + manifest.append(os.path.join(dir, 'index.ncx')) + + # Get cover + cpath = getattr(self, 'cover_path', None) + if cpath is None: + pf = open(os.path.join(dir, 'cover.jpg'), 'wb') + if self.default_cover(pf): + cpath = pf.name + if cpath is not None and os.access(cpath, os.R_OK): + opf.cover = cpath + manifest.append(cpath) + + # Get masthead + mpath = getattr(self, 'masthead_path', None) + if mpath is not None and os.access(mpath, os.R_OK): + manifest.append(mpath) + + opf.create_manifest_from_files_in(manifest) + for mani in opf.manifest: + if mani.path.endswith('.ncx'): + mani.id = 'ncx' + if mani.path.endswith('mastheadImage.jpg'): + mani.id = 'masthead-image' + entries = ['index.html'] + toc = TOC(base_path=dir) + self.play_order_counter = 0 + self.play_order_map = {} + + def feed_index(num, parent): + f = feeds[num] + for j, a in enumerate(f): + if getattr(a, 'downloaded', False): + adir = 'feed_%d/article_%d/'%(num, j) + auth = a.author + if not auth: + auth = None + desc = a.text_summary + if not desc: + desc = None + else: + desc = self.description_limiter(desc) + entries.append('%sindex.html'%adir) + po = self.play_order_map.get(entries[-1], None) + if po is None: + self.play_order_counter += 1 + po = self.play_order_counter + parent.add_item('%sindex.html'%adir, None, a.title if a.title else _('Untitled Article'), + play_order=po, author=auth, description=desc) + last = os.path.join(self.output_dir, ('%sindex.html'%adir).replace('/', os.sep)) + for sp in a.sub_pages: + prefix = os.path.commonprefix([opf_path, sp]) + relp = sp[len(prefix):] + entries.append(relp.replace(os.sep, '/')) + last = sp + + if os.path.exists(last): + with open(last, 'rb') as fi: + src = fi.read().decode('utf-8') + soup = BeautifulSoup(src) + body = soup.find('body') + if body is not None: + prefix = '/'.join('..'for i in range(2*len(re.findall(r'link\d+', last)))) + templ = self.navbar.generate(True, num, j, len(f), + not self.has_single_feed, + a.orig_url, __appname__, prefix=prefix, + center=self.center_navbar) + elem = BeautifulSoup(templ.render(doctype='xhtml').decode('utf-8')).find('div') + body.insert(len(body.contents), elem) + with open(last, 'wb') as fi: + fi.write(unicode(soup).encode('utf-8')) + if len(feeds) == 0: + raise Exception('All feeds are empty, aborting.') + + if len(feeds) > 1: + for i, f in enumerate(feeds): + entries.append('feed_%d/index.html'%i) + po = self.play_order_map.get(entries[-1], None) + if po is None: + self.play_order_counter += 1 + po = self.play_order_counter + auth = getattr(f, 'author', None) + if not auth: + auth = None + desc = getattr(f, 'description', None) + if not desc: + desc = None + feed_index(i, toc.add_item('feed_%d/index.html'%i, None, + f.title, play_order=po, description=desc, author=auth)) + + else: + entries.append('feed_%d/index.html'%0) + feed_index(0, toc) + + for i, p in enumerate(entries): + entries[i] = os.path.join(dir, p.replace('/', os.sep)) + opf.create_spine(entries) + opf.set_toc(toc) + + with nested(open(opf_path, 'wb'), open(ncx_path, 'wb')) as (opf_file, ncx_file): + opf.render(opf_file, ncx_file) + From 1080fdab8fe63e67c3ecde0361a209c4d076ed16 Mon Sep 17 00:00:00 2001 From: Kovid Goyal Date: Tue, 23 Nov 2010 11:24:38 -0700 Subject: [PATCH 17/18] Reduce startup times for users with huge calibre libraries by caching the cover check filesystem queries. The reduction will be most pronounced on filesystems that store unindexed directory names. --- src/calibre/library/caches.py | 3 -- src/calibre/library/database2.py | 45 +++++++++++++------------- src/calibre/library/schema_upgrades.py | 16 +++++++++ src/calibre/library/sqlite.py | 3 +- 4 files changed, 40 insertions(+), 27 deletions(-) diff --git a/src/calibre/library/caches.py b/src/calibre/library/caches.py index 5f3e66beef..075fbe664a 100644 --- a/src/calibre/library/caches.py +++ b/src/calibre/library/caches.py @@ -670,7 +670,6 @@ class ResultCache(SearchQueryParser): # {{{ for id in ids: try: self._data[id] = db.conn.get('SELECT * from meta2 WHERE id=?', (id,))[0] - self._data[id].append(db.has_cover(id, index_is_id=True)) self._data[id].append(db.book_on_device_string(id)) self._data[id].append(None) if len(self.composites) > 0: @@ -691,7 +690,6 @@ class ResultCache(SearchQueryParser): # {{{ self._data.extend(repeat(None, max(ids)-len(self._data)+2)) for id in ids: self._data[id] = db.conn.get('SELECT * from meta2 WHERE id=?', (id,))[0] - self._data[id].append(db.has_cover(id, index_is_id=True)) self._data[id].append(db.book_on_device_string(id)) self._data[id].append(None) if len(self.composites) > 0: @@ -721,7 +719,6 @@ class ResultCache(SearchQueryParser): # {{{ self._data[r[0]] = r for item in self._data: if item is not None: - item.append(db.has_cover(item[0], index_is_id=True)) item.append(db.book_on_device_string(item[0])) item.append(None) if len(self.composites) > 0: diff --git a/src/calibre/library/database2.py b/src/calibre/library/database2.py index 21a54a4dd6..18161d2230 100644 --- a/src/calibre/library/database2.py +++ b/src/calibre/library/database2.py @@ -226,7 +226,8 @@ class LibraryDatabase2(LibraryDatabase, SchemaUpgrade, CustomColumns): 'lccn', 'pubdate', 'flags', - 'uuid' + 'uuid', + 'has_cover' ] lines = [] for col in columns: @@ -245,7 +246,7 @@ class LibraryDatabase2(LibraryDatabase, SchemaUpgrade, CustomColumns): 'size':4, 'rating':5, 'tags':6, 'comments':7, 'series':8, 'publisher':9, 'series_index':10, 'sort':11, 'author_sort':12, 'formats':13, 'isbn':14, 'path':15, - 'lccn':16, 'pubdate':17, 'flags':18, 'uuid':19} + 'lccn':16, 'pubdate':17, 'flags':18, 'uuid':19, 'cover':20} for k,v in self.FIELD_MAP.iteritems(): self.field_metadata.set_field_record_index(k, v, prefer_custom=False) @@ -267,12 +268,12 @@ class LibraryDatabase2(LibraryDatabase, SchemaUpgrade, CustomColumns): base, prefer_custom=True) - self.FIELD_MAP['cover'] = base+1 - self.field_metadata.set_field_record_index('cover', base+1, prefer_custom=False) - self.FIELD_MAP['ondevice'] = base+2 - self.field_metadata.set_field_record_index('ondevice', base+2, prefer_custom=False) - self.FIELD_MAP['all_metadata'] = base+3 - self.field_metadata.set_field_record_index('all_metadata', base+3, prefer_custom=False) + self.field_metadata.set_field_record_index('cover', + self.FIELD_MAP['cover'], prefer_custom=False) + self.FIELD_MAP['ondevice'] = base+1 + self.field_metadata.set_field_record_index('ondevice', base+1, prefer_custom=False) + self.FIELD_MAP['all_metadata'] = base+2 + self.field_metadata.set_field_record_index('all_metadata', base+2, prefer_custom=False) script = ''' DROP VIEW IF EXISTS meta2; @@ -332,7 +333,9 @@ class LibraryDatabase2(LibraryDatabase, SchemaUpgrade, CustomColumns): self.dirtied_cache = set([x[0] for x in d]) self.refresh_ondevice = functools.partial(self.data.refresh_ondevice, self) + st = time.time() self.refresh() + print 'refresh time:', time.time() - st self.last_update_check = self.last_modified() @@ -763,17 +766,7 @@ class LibraryDatabase2(LibraryDatabase, SchemaUpgrade, CustomColumns): identical_book_ids.add(book_id) return identical_book_ids - def has_cover(self, index, index_is_id=False): - id = index if index_is_id else self.id(index) - try: - path = os.path.join(self.abspath(id, index_is_id=True, - create_dirs=False), 'cover.jpg') - except: - # Can happen if path has not yet been set - return False - return os.path.exists(path) - - def remove_cover(self, id, notify=True): + def remove_cover(self, id, notify=True, commit=True): path = os.path.join(self.library_path, self.path(id, index_is_id=True), 'cover.jpg') if os.path.exists(path): try: @@ -781,11 +774,14 @@ class LibraryDatabase2(LibraryDatabase, SchemaUpgrade, CustomColumns): except (IOError, OSError): time.sleep(0.2) os.remove(path) + self.conn.execute('UPDATE books SET has_cover=0 WHERE id=?', (id,)) + if commit: + self.conn.commit() self.data.set(id, self.FIELD_MAP['cover'], False, row_is_id=True) if notify: self.notify('cover', [id]) - def set_cover(self, id, data, notify=True): + def set_cover(self, id, data, notify=True, commit=True): ''' Set the cover for this book. @@ -802,6 +798,9 @@ class LibraryDatabase2(LibraryDatabase, SchemaUpgrade, CustomColumns): except (IOError, OSError): time.sleep(0.2) save_cover_data_to(data, path) + self.conn.execute('UPDATE books SET has_cover=1 WHERE id=?', (id,)) + if commit: + self.conn.commit() self.data.set(id, self.FIELD_MAP['cover'], True, row_is_id=True) if notify: self.notify('cover', [id]) @@ -1273,11 +1272,11 @@ class LibraryDatabase2(LibraryDatabase, SchemaUpgrade, CustomColumns): if mi.series: doit(self.set_series, id, mi.series, notify=False, commit=False) if mi.cover_data[1] is not None: - doit(self.set_cover, id, mi.cover_data[1]) # doesn't use commit + doit(self.set_cover, id, mi.cover_data[1], commit=False) elif mi.cover is not None: if os.access(mi.cover, os.R_OK): with lopen(mi.cover, 'rb') as f: - doit(self.set_cover, id, f) + doit(self.set_cover, id, f, commit=False) if mi.tags: doit(self.set_tags, id, mi.tags, notify=False, commit=False) if mi.comments: @@ -2291,7 +2290,7 @@ class LibraryDatabase2(LibraryDatabase, SchemaUpgrade, CustomColumns): x['tags'] = [i.replace('|', ',').strip() for i in x['tags'].split(',')] if x['tags'] else [] path = os.path.join(prefix, self.path(record[self.FIELD_MAP['id']], index_is_id=True)) x['cover'] = os.path.join(path, 'cover.jpg') - if not self.has_cover(x['id'], index_is_id=True): + if not record[self.FIELD_MAP['cover']]: x['cover'] = None formats = self.formats(record[self.FIELD_MAP['id']], index_is_id=True) if formats: diff --git a/src/calibre/library/schema_upgrades.py b/src/calibre/library/schema_upgrades.py index 167cc0a327..e35c8521ce 100644 --- a/src/calibre/library/schema_upgrades.py +++ b/src/calibre/library/schema_upgrades.py @@ -6,6 +6,8 @@ __license__ = 'GPL v3' __copyright__ = '2010, Kovid Goyal ' __docformat__ = 'restructuredtext en' +import os + class SchemaUpgrade(object): def __init__(self): @@ -409,3 +411,17 @@ class SchemaUpgrade(object): ''' self.conn.executescript(script) + def upgrade_version_14(self): + 'Cache has_cover' + self.conn.execute('ALTER TABLE books ADD COLUMN has_cover BOOL DEFAULT 0') + data = self.conn.get('SELECT id,path FROM books', all=True) + def has_cover(path): + if path: + path = os.path.join(self.library_path, path.replace('/', os.sep), + 'cover.jpg') + return os.path.exists(path) + return False + + ids = [(x[0],) for x in data if has_cover(x[1])] + self.conn.executemany('UPDATE books SET has_cover=1 WHERE id=?', ids) + diff --git a/src/calibre/library/sqlite.py b/src/calibre/library/sqlite.py index 1242d0bf7b..eb3302086d 100644 --- a/src/calibre/library/sqlite.py +++ b/src/calibre/library/sqlite.py @@ -34,10 +34,11 @@ sqlite.register_adapter(datetime, adapt_datetime) sqlite.register_converter('timestamp', convert_timestamp) def convert_bool(val): - return bool(int(val)) + return val != '0' sqlite.register_adapter(bool, lambda x : 1 if x else 0) sqlite.register_converter('bool', convert_bool) +sqlite.register_converter('BOOL', convert_bool) class DynamicFilter(object): From f7c49d16091fadfafb405226c812ca52430bdf00 Mon Sep 17 00:00:00 2001 From: Kovid Goyal Date: Tue, 23 Nov 2010 12:30:27 -0700 Subject: [PATCH 18/18] Add check for has_cover cache consistency to check db integrity --- src/calibre/gui2/actions/choose_library.py | 4 ++-- src/calibre/library/database2.py | 11 ++++++++++- 2 files changed, 12 insertions(+), 3 deletions(-) diff --git a/src/calibre/gui2/actions/choose_library.py b/src/calibre/gui2/actions/choose_library.py index 01babc8e67..eb5902be48 100644 --- a/src/calibre/gui2/actions/choose_library.py +++ b/src/calibre/gui2/actions/choose_library.py @@ -132,9 +132,9 @@ class CheckIntegrity(QProgressDialog): titles = [self.db.title(x, index_is_id=True) for x in bad] det_msg = '\n'.join(titles) warning_dialog(self, _('Some inconsistencies found'), - _('The following books had formats listed in the ' + _('The following books had formats or covers listed in the ' 'database that are not actually available. ' - 'The entries for the formats have been removed. ' + 'The entries for the formats/covers have been removed. ' 'You should check them manually. This can ' 'happen if you manipulate the files in the ' 'library folder directly.'), det_msg=det_msg, show=True) diff --git a/src/calibre/library/database2.py b/src/calibre/library/database2.py index 18161d2230..d1d11a70ba 100644 --- a/src/calibre/library/database2.py +++ b/src/calibre/library/database2.py @@ -2509,11 +2509,20 @@ books_series_link feeds if id not in bad: bad[id] = [] bad[id].append(fmt) + has_cover = self.data.get(id, self.FIELD_MAP['cover'], + row_is_id=True) + if has_cover and self.cover(id, index_is_id=True, as_path=True) is None: + if id not in bad: + bad[id] = [] + bad[id].append('COVER') callback(0.1+0.9*(1+i)/total, _('Checked id') + ' %d'%id) for id in bad: for fmt in bad[id]: - self.conn.execute('DELETE FROM data WHERE book=? AND format=?', (id, fmt.upper())) + if fmt != 'COVER': + self.conn.execute('DELETE FROM data WHERE book=? AND format=?', (id, fmt.upper())) + else: + self.conn.execute('UPDATE books SET has_cover=0 WHERE id=?', (id,)) self.conn.commit() self.refresh_ids(list(bad.keys()))