diff --git a/resources/images/news/cnetjapan.png b/resources/images/news/cnetjapan.png new file mode 100644 index 0000000000..9a0dcc8f7f Binary files /dev/null and b/resources/images/news/cnetjapan.png differ diff --git a/resources/images/news/endgadget_ja.png b/resources/images/news/endgadget_ja.png new file mode 100644 index 0000000000..94e8f1219c Binary files /dev/null and b/resources/images/news/endgadget_ja.png differ diff --git a/resources/images/news/jijinews.png b/resources/images/news/jijinews.png new file mode 100644 index 0000000000..b87865fc34 Binary files /dev/null and b/resources/images/news/jijinews.png differ diff --git a/resources/images/news/msnsankei.png b/resources/images/news/msnsankei.png new file mode 100644 index 0000000000..7e92af7b20 Binary files /dev/null and b/resources/images/news/msnsankei.png differ diff --git a/resources/images/news/nikkei_free.png b/resources/images/news/nikkei_free.png new file mode 100644 index 0000000000..308f4b3085 Binary files /dev/null and b/resources/images/news/nikkei_free.png differ diff --git a/resources/images/news/nikkei_sub_economy.png b/resources/images/news/nikkei_sub_economy.png new file mode 100644 index 0000000000..308f4b3085 Binary files /dev/null and b/resources/images/news/nikkei_sub_economy.png differ diff --git a/resources/images/news/nikkei_sub_industory.png b/resources/images/news/nikkei_sub_industory.png new file mode 100644 index 0000000000..308f4b3085 Binary files /dev/null and b/resources/images/news/nikkei_sub_industory.png differ diff --git a/resources/images/news/nikkei_sub_life.png b/resources/images/news/nikkei_sub_life.png new file mode 100644 index 0000000000..308f4b3085 Binary files /dev/null and b/resources/images/news/nikkei_sub_life.png differ diff --git a/resources/images/news/nikkei_sub_main.png b/resources/images/news/nikkei_sub_main.png new file mode 100644 index 0000000000..308f4b3085 Binary files /dev/null and b/resources/images/news/nikkei_sub_main.png differ diff --git a/resources/images/news/nikkei_sub_sports.png b/resources/images/news/nikkei_sub_sports.png new file mode 100644 index 0000000000..308f4b3085 Binary files /dev/null and b/resources/images/news/nikkei_sub_sports.png differ diff --git a/resources/images/news/reuters.png b/resources/images/news/reuters.png new file mode 100644 index 0000000000..f13abce7b6 Binary files /dev/null and b/resources/images/news/reuters.png differ diff --git a/resources/images/news/reuters_ja.png b/resources/images/news/reuters_ja.png new file mode 100644 index 0000000000..f13abce7b6 Binary files /dev/null and b/resources/images/news/reuters_ja.png differ diff --git a/resources/recipes/avto-magazin.recipe b/resources/recipes/avto-magazin.recipe index 6464588acc..adaf74546e 100644 --- a/resources/recipes/avto-magazin.recipe +++ b/resources/recipes/avto-magazin.recipe @@ -13,6 +13,7 @@ class Dnevnik(BasicNewsRecipe): labguage = 'sl' no_stylesheets = True use_embedded_content = False + language = 'sl' conversion_options = {'linearize_tables' : True} diff --git a/resources/recipes/brand_eins.recipe b/resources/recipes/brand_eins.recipe index c69dd693b2..71b6aa8cda 100644 --- a/resources/recipes/brand_eins.recipe +++ b/resources/recipes/brand_eins.recipe @@ -91,8 +91,8 @@ class BrandEins(BasicNewsRecipe): latest_jahrgang = soup.findAll('div', attrs={'class': re.compile(r'\bjahrgang-latest\b') })[0].findAll('ul')[0] pre_latest_issue = latest_jahrgang.findAll('a')[len(latest_jahrgang.findAll('a'))-issue] url = pre_latest_issue.get('href', False) - # Get the 
title for the magazin - build it out of the title of the cover - take the issue and year; - self.title = "brand eins "+ re.search(r"(?P\d\d\/\d\d\d\d)", pre_latest_issue.find('img').get('title', False)).group('date') + # Get month and year of the magazine issue - build it out of the title of the cover + self.timefmt = " " + re.search(r"(?P\d\d\/\d\d\d\d)", pre_latest_issue.find('img').get('title', False)).group('date') url = 'http://brandeins.de/'+url # url = "http://www.brandeins.de/archiv/magazin/tierisch.html" diff --git a/resources/recipes/cnetjapan.recipe b/resources/recipes/cnetjapan.recipe new file mode 100644 index 0000000000..e0178c1ff2 --- /dev/null +++ b/resources/recipes/cnetjapan.recipe @@ -0,0 +1,32 @@ +import re +from calibre.web.feeds.news import BasicNewsRecipe + +class CNetJapan(BasicNewsRecipe): + title = u'CNET Japan' + oldest_article = 3 + max_articles_per_feed = 30 + __author__ = 'Hiroshi Miura' + + feeds = [(u'cnet rss', u'http://feeds.japan.cnet.com/cnet/rss')] + language = 'ja' + encoding = 'Shift_JIS' + remove_javascript = True + + preprocess_regexps = [ + (re.compile(ur'.*', re.DOTALL|re.IGNORECASE|re.UNICODE), + lambda match: ''), + (re.compile(r'.*', re.DOTALL|re.IGNORECASE), + lambda match: ''), + (re.compile(ur'.*', re.UNICODE), + lambda match: ''), + ] + + remove_tags_before = dict(name="h2") + remove_tags = [ + {'class':"social_bkm_share"}, + {'class':"social_bkm_print"}, + {'class':"block20 clearfix"}, + dict(name="div",attrs={'id':'bookreview'}), + ] + remove_tags_after = {'class':"block20"} + diff --git a/resources/recipes/endgadget_ja.recipe b/resources/recipes/endgadget_ja.recipe new file mode 100644 index 0000000000..443a85905d --- /dev/null +++ b/resources/recipes/endgadget_ja.recipe @@ -0,0 +1,22 @@ +#!/usr/bin/env python + +__license__ = 'GPL v3' +__copyright__ = '2010, Hiroshi Miura ' +''' +japan.engadget.com +''' + +from calibre.web.feeds.news import BasicNewsRecipe + +class EndgadgetJapan(BasicNewsRecipe): + title = u'Endgadget\u65e5\u672c\u7248' + language = 'ja' + __author__ = 'Hiroshi Miura' + cover_url = 'http://skins18.wincustomize.com/1/49/149320/29/7578/preview-29-7578.jpg' + masthead_url = 'http://www.blogsmithmedia.com/japanese.engadget.com/media/eng-jp-logo-t.png' + oldest_article = 7 + max_articles_per_feed = 100 + no_stylesheets = True + language = 'ja' + encoding = 'utf-8' + feeds = [(u'engadget', u'http://japanese.engadget.com/rss.xml')] diff --git a/resources/recipes/jijinews.recipe b/resources/recipes/jijinews.recipe new file mode 100644 index 0000000000..f74864365d --- /dev/null +++ b/resources/recipes/jijinews.recipe @@ -0,0 +1,26 @@ +#!/usr/bin/env python + +__license__ = 'GPL v3' +__copyright__ = '2010, Hiroshi Miura ' +''' +www.jiji.com +''' + +from calibre.web.feeds.news import BasicNewsRecipe + +class JijiDotCom(BasicNewsRecipe): + title = u'\u6642\u4e8b\u901a\u4fe1' + __author__ = 'Hiroshi Miura' + description = 'World News from Jiji Press' + publisher = 'Jiji Press Ltd.' 
+ category = 'news' + encoding = 'utf-8' + oldest_article = 6 + max_articles_per_feed = 100 + language = 'ja' + cover_url = 'http://www.jiji.com/img/top_header_logo2.gif' + masthead_url = 'http://jen.jiji.com/images/logo_jijipress.gif' + + feeds = [(u'\u30cb\u30e5\u30fc\u30b9', u'http://www.jiji.com/rss/ranking.rdf')] + remove_tags_after = dict(id="ad_google") + diff --git a/resources/recipes/mainichi.recipe b/resources/recipes/mainichi.recipe new file mode 100644 index 0000000000..47dc7d0ebc --- /dev/null +++ b/resources/recipes/mainichi.recipe @@ -0,0 +1,26 @@ +#!/usr/bin/env python + +__license__ = 'GPL v3' +__copyright__ = '2010, Hiroshi Miura ' +''' +www.mainichi.jp +''' + +from calibre.web.feeds.news import BasicNewsRecipe + +class MainichiDailyNews(BasicNewsRecipe): + title = u'\u6bce\u65e5\u65b0\u805e' + __author__ = 'Hiroshi Miura' + oldest_article = 2 + max_articles_per_feed = 20 + description = 'Japanese traditional newspaper Mainichi Daily News' + publisher = 'Mainichi Daily News' + category = 'news, japan' + language = 'ja' + + feeds = [(u'daily news', u'http://mainichi.jp/rss/etc/flash.rss')] + + remove_tags_before = {'class':"NewsTitle"} + remove_tags = [{'class':"RelatedArticle"}] + remove_tags_after = {'class':"Credit"} + diff --git a/resources/recipes/mainichi_it_news.recipe b/resources/recipes/mainichi_it_news.recipe new file mode 100644 index 0000000000..8e15496e57 --- /dev/null +++ b/resources/recipes/mainichi_it_news.recipe @@ -0,0 +1,18 @@ +from calibre.web.feeds.news import BasicNewsRecipe + +class MainichiDailyITNews(BasicNewsRecipe): + title = u'\u6bce\u65e5\u65b0\u805e(IT&\u5bb6\u96fb)' + __author__ = 'Hiroshi Miura' + oldest_article = 2 + max_articles_per_feed = 100 + description = 'Japanese traditional newspaper Mainichi Daily News - IT and electronics' + publisher = 'Mainichi Daily News' + category = 'news, Japan, IT, Electronics' + language = 'ja' + + feeds = [(u'IT News', u'http://mainichi.pheedo.jp/f/mainichijp_electronics')] + + remove_tags_before = {'class':"NewsTitle"} + remove_tags = [{'class':"RelatedArticle"}] + remove_tags_after = {'class':"Credit"} + diff --git a/resources/recipes/ming_pao.recipe b/resources/recipes/ming_pao.recipe index 162a3c774e..385dbdbdb7 100644 --- a/resources/recipes/ming_pao.recipe +++ b/resources/recipes/ming_pao.recipe @@ -3,13 +3,28 @@ __copyright__ = '2010, Eddie Lau' ''' modified from Singtao Toronto calibre recipe by rty Change Log: +2010/11/22: add English section, remove eco-news section which is not updated daily, correct + ordering of articles +2010/11/12: add news image and eco-news section +2010/11/08: add parsing of finance section +2010/11/06: temporary work-around for Kindle device having no capability to display unicode + in section/article list. 
2010/10/31: skip repeated articles in section pages ''' -import datetime +import os, datetime, re from calibre.web.feeds.recipes import BasicNewsRecipe +from contextlib import nested -class AdvancedUserRecipe1278063072(BasicNewsRecipe): + +from calibre import __appname__, strftime +from calibre.ebooks.BeautifulSoup import BeautifulSoup +from calibre.ebooks.metadata.opf2 import OPFCreator +from calibre.ebooks.metadata.toc import TOC +from calibre.ebooks.metadata import MetaInformation +from calibre.utils.date import now as nowf + +class MPHKRecipe(BasicNewsRecipe): title = 'Ming Pao - Hong Kong' oldest_article = 1 max_articles_per_feed = 100 @@ -24,27 +39,131 @@ class AdvancedUserRecipe1278063072(BasicNewsRecipe): encoding = 'Big5-HKSCS' recursions = 0 conversion_options = {'linearize_tables':True} + extra_css = 'img {display: block; margin-left: auto; margin-right: auto; margin-top: 10px; margin-bottom: 10px;}' + #extra_css = 'img {float:right; margin:4px;}' masthead_url = 'http://news.mingpao.com/image/portals_top_logo_news.gif' keep_only_tags = [dict(name='h1'), + #dict(name='font', attrs={'style':['font-size:14pt; line-height:160%;']}), # for entertainment page + dict(attrs={'class':['photo']}), + dict(attrs={'id':['newscontent']}), dict(attrs={'id':['newscontent01','newscontent02']})] + remove_tags = [dict(name='style'), + dict(attrs={'id':['newscontent135']})] # for the finance page + remove_attributes = ['width'] + preprocess_regexps = [ + (re.compile(r'
', re.DOTALL|re.IGNORECASE), + lambda match: ''), + (re.compile(r'
', re.DOTALL|re.IGNORECASE), + lambda match: ''), + ] + + def image_url_processor(cls, baseurl, url): + # trick: break the url at the first occurance of digit, add an additional + # '_' at the front + # not working, may need to move this to preprocess_html() method + #minIdx = 10000 + #i0 = url.find('0') + #if i0 >= 0 and i0 < minIdx: + # minIdx = i0 + #i1 = url.find('1') + #if i1 >= 0 and i1 < minIdx: + # minIdx = i1 + #i2 = url.find('2') + #if i2 >= 0 and i2 < minIdx: + # minIdx = i2 + #i3 = url.find('3') + #if i3 >= 0 and i0 < minIdx: + # minIdx = i3 + #i4 = url.find('4') + #if i4 >= 0 and i4 < minIdx: + # minIdx = i4 + #i5 = url.find('5') + #if i5 >= 0 and i5 < minIdx: + # minIdx = i5 + #i6 = url.find('6') + #if i6 >= 0 and i6 < minIdx: + # minIdx = i6 + #i7 = url.find('7') + #if i7 >= 0 and i7 < minIdx: + # minIdx = i7 + #i8 = url.find('8') + #if i8 >= 0 and i8 < minIdx: + # minIdx = i8 + #i9 = url.find('9') + #if i9 >= 0 and i9 < minIdx: + # minIdx = i9 + #return url[0:minIdx] + '_' + url[minIdx+1:] + return url def get_fetchdate(self): dt_utc = datetime.datetime.utcnow() - # convert UTC to local hk time - at around HKT 5.30am, all news are available - dt_local = dt_utc - datetime.timedelta(-2.5/24) + # convert UTC to local hk time - at around HKT 6.00am, all news are available + dt_local = dt_utc - datetime.timedelta(-2.0/24) return dt_local.strftime("%Y%m%d") def parse_index(self): - feeds = [] - dateStr = self.get_fetchdate() - for title, url in [(u'\u8981\u805e Headline', 'http://news.mingpao.com/' + dateStr + '/gaindex.htm'), (u'\u6559\u80b2 Education', 'http://news.mingpao.com/' + dateStr + '/gfindex.htm'), (u'\u6e2f\u805e Local', 'http://news.mingpao.com/' + dateStr + '/gbindex.htm'), (u'\u793e\u8a55\u2027\u7b46\u9663 Editorial', 'http://news.mingpao.com/' + dateStr + '/mrindex.htm'), (u'\u8ad6\u58c7 Forum', 'http://news.mingpao.com/' + dateStr + '/faindex.htm'), (u'\u4e2d\u570b China', 'http://news.mingpao.com/' + dateStr + '/caindex.htm'), (u'\u570b\u969b World', 'http://news.mingpao.com/' + dateStr + '/taindex.htm'), ('Tech News', 'http://news.mingpao.com/' + dateStr + '/naindex.htm'), (u'\u9ad4\u80b2 Sport', 'http://news.mingpao.com/' + dateStr + '/spindex.htm'), (u'\u526f\u520a Supplement', 'http://news.mingpao.com/' + dateStr + '/jaindex.htm'),]: - articles = self.parse_section(url) - if articles: - feeds.append((title, articles)) + feeds = [] + dateStr = self.get_fetchdate() + for title, url in [(u'\u8981\u805e Headline', 'http://news.mingpao.com/' + dateStr + '/gaindex.htm'), + (u'\u6559\u80b2 Education', 'http://news.mingpao.com/' + dateStr + '/gfindex.htm'), + (u'\u6e2f\u805e Local', 'http://news.mingpao.com/' + dateStr + '/gbindex.htm'), + (u'\u793e\u8a55\u2027\u7b46\u9663 Editorial', 'http://news.mingpao.com/' + dateStr + '/mrindex.htm'), + (u'\u8ad6\u58c7 Forum', 'http://news.mingpao.com/' + dateStr + '/faindex.htm'), + (u'\u4e2d\u570b China', 'http://news.mingpao.com/' + dateStr + '/caindex.htm'), + (u'\u570b\u969b World', 'http://news.mingpao.com/' + dateStr + '/taindex.htm'), + ('Tech News', 'http://news.mingpao.com/' + dateStr + '/naindex.htm'), + (u'\u9ad4\u80b2 Sport', 'http://news.mingpao.com/' + dateStr + '/spindex.htm'), + (u'\u526f\u520a Supplement', 'http://news.mingpao.com/' + dateStr + '/jaindex.htm'), + (u'\u82f1\u6587 English', 'http://news.mingpao.com/' + dateStr + '/emindex.htm')]: + articles = self.parse_section(url) + if articles: + feeds.append((title, articles)) + # special - finance + fin_articles = 
self.parse_fin_section('http://www.mpfinance.com/htm/Finance/' + dateStr + '/News/ea,eb,ecindex.htm') + if fin_articles: + feeds.append((u'\u7d93\u6fdf Finance', fin_articles)) + # special - eco-friendly + # eco_articles = self.parse_eco_section('http://tssl.mingpao.com/htm/marketing/eco/cfm/Eco1.cfm') + # if eco_articles: + # feeds.append((u'\u74b0\u4fdd Eco News', eco_articles)) + # special - entertainment + #ent_articles = self.parse_ent_section('http://ol.mingpao.com/cfm/star1.cfm') + #if ent_articles: + # feeds.append(('Entertainment', ent_articles)) return feeds def parse_section(self, url): + dateStr = self.get_fetchdate() + soup = self.index_to_soup(url) + divs = soup.findAll(attrs={'class': ['bullet','bullet_grey']}) + current_articles = [] + included_urls = [] + divs.reverse() + for i in divs: + a = i.find('a', href = True) + title = self.tag_to_string(a) + url = a.get('href', False) + url = 'http://news.mingpao.com/' + dateStr + '/' +url + if url not in included_urls and url.rfind('Redirect') == -1: + current_articles.append({'title': title, 'url': url, 'description':'', 'date':''}) + included_urls.append(url) + current_articles.reverse() + return current_articles + + def parse_fin_section(self, url): dateStr = self.get_fetchdate() + soup = self.index_to_soup(url) + a = soup.findAll('a', href= True) + current_articles = [] + for i in a: + url = i.get('href', False) + if not url.rfind(dateStr) == -1 and url.rfind('index') == -1: + title = self.tag_to_string(i) + url = 'http://www.mpfinance.com/cfm/' +url + current_articles.append({'title': title, 'url': url, 'description':''}) + return current_articles + + def parse_eco_section(self, url): soup = self.index_to_soup(url) divs = soup.findAll(attrs={'class': ['bullet']}) current_articles = [] @@ -53,9 +172,162 @@ class AdvancedUserRecipe1278063072(BasicNewsRecipe): a = i.find('a', href = True) title = self.tag_to_string(a) url = a.get('href', False) - url = 'http://news.mingpao.com/' + dateStr + '/' +url - if url not in included_urls: + url = 'http://tssl.mingpao.com/htm/marketing/eco/cfm/' +url + if url not in included_urls and url.rfind('Redirect') == -1: current_articles.append({'title': title, 'url': url, 'description':''}) included_urls.append(url) return current_articles + #def parse_ent_section(self, url): + # dateStr = self.get_fetchdate() + # soup = self.index_to_soup(url) + # a = soup.findAll('a', href=True) + # current_articles = [] + # included_urls = [] + # for i in a: + # title = self.tag_to_string(i) + # url = 'http://ol.mingpao.com/cfm/' + i.get('href', False) + # if url not in included_urls and not url.rfind('.txt') == -1 and not url.rfind(dateStr) == -1 and not title == '': + # current_articles.append({'title': title, 'url': url, 'description': ''}) + # return current_articles + + def preprocess_html(self, soup): + for item in soup.findAll(style=True): + del item['style'] + for item in soup.findAll(style=True): + del item['width'] + for item in soup.findAll(stype=True): + del item['absmiddle'] + return soup + + def create_opf(self, feeds, dir=None): + #super(MPHKRecipe,self).create_opf(feeds, dir) + if dir is None: + dir = self.output_dir + title = self.short_title() + if self.output_profile.periodical_date_in_title: + title += strftime(self.timefmt) + mi = MetaInformation(title, [__appname__]) + mi.publisher = __appname__ + mi.author_sort = __appname__ + mi.publication_type = self.publication_type+':'+self.short_title() + mi.timestamp = nowf() + mi.comments = self.description + if not isinstance(mi.comments, 
unicode): + mi.comments = mi.comments.decode('utf-8', 'replace') + mi.pubdate = nowf() + opf_path = os.path.join(dir, 'index.opf') + ncx_path = os.path.join(dir, 'index.ncx') + opf = OPFCreator(dir, mi) + # Add mastheadImage entry to section + mp = getattr(self, 'masthead_path', None) + if mp is not None and os.access(mp, os.R_OK): + from calibre.ebooks.metadata.opf2 import Guide + ref = Guide.Reference(os.path.basename(self.masthead_path), os.getcwdu()) + ref.type = 'masthead' + ref.title = 'Masthead Image' + opf.guide.append(ref) + + manifest = [os.path.join(dir, 'feed_%d'%i) for i in range(len(feeds))] + manifest.append(os.path.join(dir, 'index.html')) + manifest.append(os.path.join(dir, 'index.ncx')) + + # Get cover + cpath = getattr(self, 'cover_path', None) + if cpath is None: + pf = open(os.path.join(dir, 'cover.jpg'), 'wb') + if self.default_cover(pf): + cpath = pf.name + if cpath is not None and os.access(cpath, os.R_OK): + opf.cover = cpath + manifest.append(cpath) + + # Get masthead + mpath = getattr(self, 'masthead_path', None) + if mpath is not None and os.access(mpath, os.R_OK): + manifest.append(mpath) + + opf.create_manifest_from_files_in(manifest) + for mani in opf.manifest: + if mani.path.endswith('.ncx'): + mani.id = 'ncx' + if mani.path.endswith('mastheadImage.jpg'): + mani.id = 'masthead-image' + entries = ['index.html'] + toc = TOC(base_path=dir) + self.play_order_counter = 0 + self.play_order_map = {} + + def feed_index(num, parent): + f = feeds[num] + for j, a in enumerate(f): + if getattr(a, 'downloaded', False): + adir = 'feed_%d/article_%d/'%(num, j) + auth = a.author + if not auth: + auth = None + desc = a.text_summary + if not desc: + desc = None + else: + desc = self.description_limiter(desc) + entries.append('%sindex.html'%adir) + po = self.play_order_map.get(entries[-1], None) + if po is None: + self.play_order_counter += 1 + po = self.play_order_counter + parent.add_item('%sindex.html'%adir, None, a.title if a.title else _('Untitled Article'), + play_order=po, author=auth, description=desc) + last = os.path.join(self.output_dir, ('%sindex.html'%adir).replace('/', os.sep)) + for sp in a.sub_pages: + prefix = os.path.commonprefix([opf_path, sp]) + relp = sp[len(prefix):] + entries.append(relp.replace(os.sep, '/')) + last = sp + + if os.path.exists(last): + with open(last, 'rb') as fi: + src = fi.read().decode('utf-8') + soup = BeautifulSoup(src) + body = soup.find('body') + if body is not None: + prefix = '/'.join('..'for i in range(2*len(re.findall(r'link\d+', last)))) + templ = self.navbar.generate(True, num, j, len(f), + not self.has_single_feed, + a.orig_url, __appname__, prefix=prefix, + center=self.center_navbar) + elem = BeautifulSoup(templ.render(doctype='xhtml').decode('utf-8')).find('div') + body.insert(len(body.contents), elem) + with open(last, 'wb') as fi: + fi.write(unicode(soup).encode('utf-8')) + if len(feeds) == 0: + raise Exception('All feeds are empty, aborting.') + + if len(feeds) > 1: + for i, f in enumerate(feeds): + entries.append('feed_%d/index.html'%i) + po = self.play_order_map.get(entries[-1], None) + if po is None: + self.play_order_counter += 1 + po = self.play_order_counter + auth = getattr(f, 'author', None) + if not auth: + auth = None + desc = getattr(f, 'description', None) + if not desc: + desc = None + feed_index(i, toc.add_item('feed_%d/index.html'%i, None, + f.title, play_order=po, description=desc, author=auth)) + + else: + entries.append('feed_%d/index.html'%0) + feed_index(0, toc) + + for i, p in enumerate(entries): + 
entries[i] = os.path.join(dir, p.replace('/', os.sep)) + opf.create_spine(entries) + opf.set_toc(toc) + + with nested(open(opf_path, 'wb'), open(ncx_path, 'wb')) as (opf_file, ncx_file): + opf.render(opf_file, ncx_file) + diff --git a/resources/recipes/msnsankei.recipe b/resources/recipes/msnsankei.recipe new file mode 100644 index 0000000000..4c79771945 --- /dev/null +++ b/resources/recipes/msnsankei.recipe @@ -0,0 +1,24 @@ +#!/usr/bin/env python + +__license__ = 'GPL v3' +__copyright__ = '2010, Hiroshi Miura ' +''' +sankei.jp.msn.com +''' + +from calibre.web.feeds.news import BasicNewsRecipe + +class MSNSankeiNewsProduct(BasicNewsRecipe): + title = u'MSN\u7523\u7d4c\u30cb\u30e5\u30fc\u30b9(\u65b0\u5546\u54c1)' + __author__ = 'Hiroshi Miura' + description = 'Products release from Japan' + oldest_article = 7 + max_articles_per_feed = 100 + encoding = 'Shift_JIS' + language = 'ja' + + feeds = [(u'\u65b0\u5546\u54c1', u'http://sankei.jp.msn.com/rss/news/release.xml')] + + remove_tags_before = dict(id="__r_article_title__") + remove_tags_after = dict(id="ajax_release_news") + remove_tags = [{'class':"parent chromeCustom6G"}] diff --git a/resources/recipes/nikkei_free.recipe b/resources/recipes/nikkei_free.recipe new file mode 100644 index 0000000000..d84aaa279b --- /dev/null +++ b/resources/recipes/nikkei_free.recipe @@ -0,0 +1,60 @@ +#!/usr/bin/env python + +__license__ = 'GPL v3' +__copyright__ = '2010, Hiroshi Miura ' +''' +www.nikkei.com +''' + +from calibre.web.feeds.news import BasicNewsRecipe + +class NikkeiNet(BasicNewsRecipe): + title = u'\u65e5\u7d4c\u65b0\u805e\u96fb\u5b50\u7248(Free)' + __author__ = 'Hiroshi Miura' + description = 'News and current market affairs from Japan' + cover_url = 'http://parts.nikkei.com/parts/ds/images/common/logo_r1.svg' + masthead_url = 'http://parts.nikkei.com/parts/ds/images/common/logo_r1.svg' + oldest_article = 2 + max_articles_per_feed = 20 + language = 'ja' + + feeds = [ (u'\u65e5\u7d4c\u4f01\u696d', u'http://www.zou3.net/php/rss/nikkei2rss.php?head=sangyo'), + (u'\u65e5\u7d4c\u88fd\u54c1', u'http://www.zou3.net/php/rss/nikkei2rss.php?head=newpro'), + (u'internet', u'http://www.zou3.net/php/rss/nikkei2rss.php?head=internet'), + (u'\u653f\u6cbb', u'http://www.zou3.net/php/rss/nikkei2rss.php?head=seiji'), + (u'\u8ca1\u52d9', u'http://www.zou3.net/php/rss/nikkei2rss.php?head=zaimu'), + (u'\u7d4c\u6e08', u'http://www.zou3.net/php/rss/nikkei2rss.php?head=keizai'), + (u'\u56fd\u969b', u'http://www.zou3.net/php/rss/nikkei2rss.php?head=kaigai'), + (u'\u79d1\u5b66', u'http://www.zou3.net/php/rss/nikkei2rss.php?head=kagaku'), + (u'\u30de\u30fc\u30b1\u30c3\u30c8', u'http://www.zou3.net/php/rss/nikkei2rss.php?head=market'), + (u'\u304f\u3089\u3057', u'http://www.zou3.net/php/rss/nikkei2rss.php?head=kurashi'), + (u'\u30b9\u30dd\u30fc\u30c4', u'http://www.zou3.net/php/rss/nikkei2rss.php?head=sports'), + (u'\u793e\u4f1a', u'http://www.zou3.net/php/rss/nikkei2rss.php?head=shakai'), + (u'\u30a8\u30b3', u'http://www.zou3.net/php/rss/nikkei2rss.php?head=eco'), + (u'\u5065\u5eb7', u'http://www.zou3.net/php/rss/nikkei2rss.php?head=kenkou'), + (u'\u96c7\u7528', u'http://www.zou3.net/php/rss/nikkei2rss.php?head=koyou'), + (u'\u6559\u80b2', u'http://www.zou3.net/php/rss/nikkei2rss.php?head=kyouiku'), + (u'\u304a\u304f\u3084\u307f', u'http://www.zou3.net/php/rss/nikkei2rss.php?head=okuyami'), + (u'\u4eba\u4e8b', u'http://www.zou3.net/php/rss/nikkei2rss.php?head=zinzi'), + (u'\u7279\u96c6', u'http://www.zou3.net/php/rss/nikkei2rss.php?head=special'), + 
(u'\u5730\u57df\u30cb\u30e5\u30fc\u30b9', u'http://www.zou3.net/php/rss/nikkei2rss.php?head=local'), + (u'\u7d71\u8a08\u30fb\u767d\u66f8', u'http://www.zou3.net/php/rss/nikkei2rss.php?head=report'), + (u'\u30e9\u30f3\u30ad\u30f3\u30b0', u'http://www.zou3.net/php/rss/nikkei2rss.php?head=ranking'), + (u'\u4f1a\u898b', u'http://www.zou3.net/php/rss/nikkei2rss.php?head=interview'), + (u'\u793e\u8aac\u30fb\u6625\u79cb', u'http://www.zou3.net/php/rss/nikkei2rss.php?head=shasetsu'), + (u'\u30b9\u30dd\u30fc\u30c4\uff1a\u30d7\u30ed\u91ce\u7403', u'http://www.zou3.net/php/rss/nikkei2rss.php?head=baseball'), + (u'\u30b9\u30dd\u30fc\u30c4\uff1a\u5927\u30ea\u30fc\u30b0', u'http://www.zou3.net/php/rss/nikkei2rss.php?head=mlb'), + (u'\u30b9\u30dd\u30fc\u30c4\uff1a\u30b5\u30c3\u30ab\u30fc', u'http://www.zou3.net/php/rss/nikkei2rss.php?head=soccer'), + (u'\u30b9\u30dd\u30fc\u30c4\uff1a\u30b4\u30eb\u30d5', u'http://www.zou3.net/php/rss/nikkei2rss.php?head=golf'), + (u'\u30b9\u30dd\u30fc\u30c4\uff1a\u76f8\u64b2', u'http://www.zou3.net/php/rss/nikkei2rss.php?head=sumou'), + (u'\u30b9\u30dd\u30fc\u30c4\uff1a\u7af6\u99ac', u'http://www.zou3.net/php/rss/nikkei2rss.php?head=keiba'), + (u'\u8abf\u67fb\u30fb\u30a2\u30f3\u30b1\u30fc\u30c8', u'http://www.zou3.net/php/rss/nikkei2rss.php?head=research') + ] + + remove_tags_before = dict(id="CONTENTS") + remove_tags = [ + dict(name="form"), + {'class':"cmn-hide"}, + ] + remove_tags_after = {'class':"cmn-pr_list"} + diff --git a/resources/recipes/nikkei_sub_economy.recipe b/resources/recipes/nikkei_sub_economy.recipe new file mode 100644 index 0000000000..d762f505d1 --- /dev/null +++ b/resources/recipes/nikkei_sub_economy.recipe @@ -0,0 +1,109 @@ +#!/usr/bin/env python + +__license__ = 'GPL v3' +__copyright__ = '2010, Hiroshi Miura ' +''' +www.nikkei.com +''' + +import re +from calibre.web.feeds.recipes import BasicNewsRecipe +import mechanize +from calibre.ptempfile import PersistentTemporaryFile + +class NikkeiNet_sub_economy(BasicNewsRecipe): + title = u'\u65e5\u7d4c\u65b0\u805e\u96fb\u5b50\u7248(\u7d4c\u6e08)' + __author__ = 'Hiroshi Miura' + description = 'News and current market affairs from Japan' + cover_url = 'http://parts.nikkei.com/parts/ds/images/common/logo_r1.svg' + masthead_url = 'http://parts.nikkei.com/parts/ds/images/common/logo_r1.svg' + needs_subscription = True + oldest_article = 2 + max_articles_per_feed = 20 + language = 'ja' + remove_javascript = False + temp_files = [] + + remove_tags_before = {'class':"cmn-section cmn-indent"} + remove_tags = [ + {'class':"JSID_basePageMove JSID_baseAsyncSubmit cmn-form_area JSID_optForm_utoken"}, + {'class':"cmn-article_keyword cmn-clearfix"}, + {'class':"cmn-print_headline cmn-clearfix"}, + ] + remove_tags_after = {'class':"cmn-pr_list"} + + feeds = [ (u'\u653f\u6cbb', u'http://www.zou3.net/php/rss/nikkei2rss.php?head=seiji'), + (u'\u8ca1\u52d9', u'http://www.zou3.net/php/rss/nikkei2rss.php?head=zaimu'), + (u'\u7d4c\u6e08', u'http://www.zou3.net/php/rss/nikkei2rss.php?head=keizai'), + (u'\u30de\u30fc\u30b1\u30c3\u30c8', u'http://www.zou3.net/php/rss/nikkei2rss.php?head=market'), + (u'\u96c7\u7528', u'http://www.zou3.net/php/rss/nikkei2rss.php?head=koyou'), + (u'\u6559\u80b2', u'http://www.zou3.net/php/rss/nikkei2rss.php?head=kyouiku'), + (u'\u304a\u304f\u3084\u307f', u'http://www.zou3.net/php/rss/nikkei2rss.php?head=okuyami'), + (u'\u4eba\u4e8b', u'http://www.zou3.net/php/rss/nikkei2rss.php?head=zinzi'), + ] + + def get_browser(self): + br = BasicNewsRecipe.get_browser() + + cj = mechanize.LWPCookieJar() + 
br.set_cookiejar(cj) + + #br.set_debug_http(True) + #br.set_debug_redirects(True) + #br.set_debug_responses(True) + + if self.username is not None and self.password is not None: + #print "----------------------------get login form--------------------------------------------" + # open login form + br.open('https://id.nikkei.com/lounge/nl/base/LA0010.seam') + response = br.response() + #print "----------------------------get login form---------------------------------------------" + #print "----------------------------set login form---------------------------------------------" + # remove disabled input which brings error on mechanize + response.set_data(response.get_data().replace("", " -->")) + br.set_response(response) + br.select_form(name='LA0010Form01') + br['LA0010Form01:LA0010Email'] = self.username + br['LA0010Form01:LA0010Password'] = self.password + br.form.find_control(id='LA0010Form01:LA0010AutoLoginOn',type="checkbox").get(nr=0).selected = True + br.submit() + br.response() + #print "----------------------------send login form---------------------------------------------" + #print "----------------------------open news main page-----------------------------------------" + # open news site + br.open('http://www.nikkei.com/') + br.response() + #print "----------------------------www.nikkei.com BODY --------------------------------------" + #print response2.get_data() + #print "-------------------------^^-got auto redirect form----^^--------------------------------" + # forced redirect in default + br.select_form(nr=0) + br.submit() + response3 = br.response() + # return some cookie which should be set by Javascript + #print response3.geturl() + raw = response3.get_data() + #print "---------------------------response to form --------------------------------------------" + # grab cookie from JS and set it + redirectflag = re.search(r"var checkValue = '(\d+)';", raw, re.M).group(1) + br.select_form(nr=0) + + self.temp_files.append(PersistentTemporaryFile('_fa.html')) + self.temp_files[-1].write("#LWP-Cookies-2.0\n") + + self.temp_files[-1].write("Set-Cookie3: Cookie-dummy=Cookie-value; domain=\".nikkei.com\"; path=\"/\"; path_spec; secure; expires=\"2029-12-21 05:07:59Z\"; version=0\n") + self.temp_files[-1].write("Set-Cookie3: redirectFlag="+redirectflag+"; domain=\".nikkei.com\"; path=\"/\"; path_spec; secure; expires=\"2029-12-21 05:07:59Z\"; version=0\n") + self.temp_files[-1].close() + cj.load(self.temp_files[-1].name) + + br.submit() + + #br.set_debug_http(False) + #br.set_debug_redirects(False) + #br.set_debug_responses(False) + return br + + + + diff --git a/resources/recipes/nikkei_sub_industry.recipe b/resources/recipes/nikkei_sub_industry.recipe new file mode 100644 index 0000000000..da04bbb5f3 --- /dev/null +++ b/resources/recipes/nikkei_sub_industry.recipe @@ -0,0 +1,108 @@ +#!/usr/bin/env python + +__license__ = 'GPL v3' +__copyright__ = '2010, Hiroshi Miura ' +''' +www.nikkei.com +''' + +import re +from calibre.web.feeds.recipes import BasicNewsRecipe +import mechanize +from calibre.ptempfile import PersistentTemporaryFile + + +class NikkeiNet_sub_industory(BasicNewsRecipe): + title = u'\u65e5\u7d4c\u65b0\u805e\u96fb\u5b50\u7248(\u7523\u696d)' + __author__ = 'Hiroshi Miura' + description = 'News and current market affairs from Japan' + cover_url = 'http://parts.nikkei.com/parts/ds/images/common/logo_r1.svg' + masthead_url = 'http://parts.nikkei.com/parts/ds/images/common/logo_r1.svg' + needs_subscription = True + oldest_article = 2 + max_articles_per_feed = 20 + 
language = 'ja' + remove_javascript = False + temp_files = [] + + remove_tags_before = {'class':"cmn-section cmn-indent"} + remove_tags = [ + {'class':"JSID_basePageMove JSID_baseAsyncSubmit cmn-form_area JSID_optForm_utoken"}, + {'class':"cmn-article_keyword cmn-clearfix"}, + {'class':"cmn-print_headline cmn-clearfix"}, + ] + remove_tags_after = {'class':"cmn-pr_list"} + + feeds = [ (u'\u65e5\u7d4c\u4f01\u696d', u'http://www.zou3.net/php/rss/nikkei2rss.php?head=sangyo'), + (u'\u65e5\u7d4c\u88fd\u54c1', u'http://www.zou3.net/php/rss/nikkei2rss.php?head=newpro'), + (u'internet', u'http://www.zou3.net/php/rss/nikkei2rss.php?head=internet'), + (u'\u56fd\u969b', u'http://www.zou3.net/php/rss/nikkei2rss.php?head=kaigai'), + (u'\u79d1\u5b66', u'http://www.zou3.net/php/rss/nikkei2rss.php?head=kagaku'), + + ] + + def get_browser(self): + br = BasicNewsRecipe.get_browser() + + cj = mechanize.LWPCookieJar() + br.set_cookiejar(cj) + + #br.set_debug_http(True) + #br.set_debug_redirects(True) + #br.set_debug_responses(True) + + if self.username is not None and self.password is not None: + #print "----------------------------get login form--------------------------------------------" + # open login form + br.open('https://id.nikkei.com/lounge/nl/base/LA0010.seam') + response = br.response() + #print "----------------------------get login form---------------------------------------------" + #print "----------------------------set login form---------------------------------------------" + # remove disabled input which brings error on mechanize + response.set_data(response.get_data().replace("", " -->")) + br.set_response(response) + br.select_form(name='LA0010Form01') + br['LA0010Form01:LA0010Email'] = self.username + br['LA0010Form01:LA0010Password'] = self.password + br.form.find_control(id='LA0010Form01:LA0010AutoLoginOn',type="checkbox").get(nr=0).selected = True + br.submit() + br.response() + #print "----------------------------send login form---------------------------------------------" + #print "----------------------------open news main page-----------------------------------------" + # open news site + br.open('http://www.nikkei.com/') + br.response() + #print "----------------------------www.nikkei.com BODY --------------------------------------" + #print response2.get_data() + #print "-------------------------^^-got auto redirect form----^^--------------------------------" + # forced redirect in default + br.select_form(nr=0) + br.submit() + response3 = br.response() + # return some cookie which should be set by Javascript + #print response3.geturl() + raw = response3.get_data() + #print "---------------------------response to form --------------------------------------------" + # grab cookie from JS and set it + redirectflag = re.search(r"var checkValue = '(\d+)';", raw, re.M).group(1) + br.select_form(nr=0) + + self.temp_files.append(PersistentTemporaryFile('_fa.html')) + self.temp_files[-1].write("#LWP-Cookies-2.0\n") + + self.temp_files[-1].write("Set-Cookie3: Cookie-dummy=Cookie-value; domain=\".nikkei.com\"; path=\"/\"; path_spec; secure; expires=\"2029-12-21 05:07:59Z\"; version=0\n") + self.temp_files[-1].write("Set-Cookie3: redirectFlag="+redirectflag+"; domain=\".nikkei.com\"; path=\"/\"; path_spec; secure; expires=\"2029-12-21 05:07:59Z\"; version=0\n") + self.temp_files[-1].close() + cj.load(self.temp_files[-1].name) + + br.submit() + + #br.set_debug_http(False) + #br.set_debug_redirects(False) + #br.set_debug_responses(False) + return br + + + + diff --git 
a/resources/recipes/nikkei_sub_life.recipe b/resources/recipes/nikkei_sub_life.recipe new file mode 100644 index 0000000000..2da5b13834 --- /dev/null +++ b/resources/recipes/nikkei_sub_life.recipe @@ -0,0 +1,109 @@ +#!/usr/bin/env python + +__license__ = 'GPL v3' +__copyright__ = '2010, Hiroshi Miura ' +''' +www.nikkei.com +''' + +import re +from calibre.web.feeds.recipes import BasicNewsRecipe +import mechanize +from calibre.ptempfile import PersistentTemporaryFile + + +class NikkeiNet_sub_life(BasicNewsRecipe): + title = u'\u65e5\u7d4c\u65b0\u805e\u96fb\u5b50\u7248(\u751f\u6d3b)' + __author__ = 'Hiroshi Miura' + description = 'News and current market affairs from Japan' + cover_url = 'http://parts.nikkei.com/parts/ds/images/common/logo_r1.svg' + masthead_url = 'http://parts.nikkei.com/parts/ds/images/common/logo_r1.svg' + needs_subscription = True + oldest_article = 2 + max_articles_per_feed = 20 + language = 'ja' + remove_javascript = False + temp_files = [] + + remove_tags_before = {'class':"cmn-section cmn-indent"} + remove_tags = [ + {'class':"JSID_basePageMove JSID_baseAsyncSubmit cmn-form_area JSID_optForm_utoken"}, + {'class':"cmn-article_keyword cmn-clearfix"}, + {'class':"cmn-print_headline cmn-clearfix"}, + ] + remove_tags_after = {'class':"cmn-pr_list"} + + feeds = [ (u'\u304f\u3089\u3057', u'http://www.zou3.net/php/rss/nikkei2rss.php?head=kurashi'), + (u'\u30b9\u30dd\u30fc\u30c4', u'http://www.zou3.net/php/rss/nikkei2rss.php?head=sports'), + (u'\u793e\u4f1a', u'http://www.zou3.net/php/rss/nikkei2rss.php?head=shakai'), + (u'\u30a8\u30b3', u'http://www.zou3.net/php/rss/nikkei2rss.php?head=eco'), + (u'\u5065\u5eb7', u'http://www.zou3.net/php/rss/nikkei2rss.php?head=kenkou'), + (u'\u7279\u96c6', u'http://www.zou3.net/php/rss/nikkei2rss.php?head=special'), + (u'\u30e9\u30f3\u30ad\u30f3\u30b0', u'http://www.zou3.net/php/rss/nikkei2rss.php?head=ranking') + ] + + def get_browser(self): + br = BasicNewsRecipe.get_browser() + + cj = mechanize.LWPCookieJar() + br.set_cookiejar(cj) + + #br.set_debug_http(True) + #br.set_debug_redirects(True) + #br.set_debug_responses(True) + + if self.username is not None and self.password is not None: + #print "----------------------------get login form--------------------------------------------" + # open login form + br.open('https://id.nikkei.com/lounge/nl/base/LA0010.seam') + response = br.response() + #print "----------------------------get login form---------------------------------------------" + #print "----------------------------set login form---------------------------------------------" + # remove disabled input which brings error on mechanize + response.set_data(response.get_data().replace("", " -->")) + br.set_response(response) + br.select_form(name='LA0010Form01') + br['LA0010Form01:LA0010Email'] = self.username + br['LA0010Form01:LA0010Password'] = self.password + br.form.find_control(id='LA0010Form01:LA0010AutoLoginOn',type="checkbox").get(nr=0).selected = True + br.submit() + br.response() + #print "----------------------------send login form---------------------------------------------" + #print "----------------------------open news main page-----------------------------------------" + # open news site + br.open('http://www.nikkei.com/') + br.response() + #print "----------------------------www.nikkei.com BODY --------------------------------------" + #print response2.get_data() + #print "-------------------------^^-got auto redirect form----^^--------------------------------" + # forced redirect in default + br.select_form(nr=0) + 
br.submit() + response3 = br.response() + # return some cookie which should be set by Javascript + #print response3.geturl() + raw = response3.get_data() + #print "---------------------------response to form --------------------------------------------" + # grab cookie from JS and set it + redirectflag = re.search(r"var checkValue = '(\d+)';", raw, re.M).group(1) + br.select_form(nr=0) + + self.temp_files.append(PersistentTemporaryFile('_fa.html')) + self.temp_files[-1].write("#LWP-Cookies-2.0\n") + + self.temp_files[-1].write("Set-Cookie3: Cookie-dummy=Cookie-value; domain=\".nikkei.com\"; path=\"/\"; path_spec; secure; expires=\"2029-12-21 05:07:59Z\"; version=0\n") + self.temp_files[-1].write("Set-Cookie3: redirectFlag="+redirectflag+"; domain=\".nikkei.com\"; path=\"/\"; path_spec; secure; expires=\"2029-12-21 05:07:59Z\"; version=0\n") + self.temp_files[-1].close() + cj.load(self.temp_files[-1].name) + + br.submit() + + #br.set_debug_http(False) + #br.set_debug_redirects(False) + #br.set_debug_responses(False) + return br + + + + diff --git a/resources/recipes/nikkei_sub_main.recipe b/resources/recipes/nikkei_sub_main.recipe new file mode 100644 index 0000000000..142edf624d --- /dev/null +++ b/resources/recipes/nikkei_sub_main.recipe @@ -0,0 +1,102 @@ +#!/usr/bin/env python + +__license__ = 'GPL v3' +__copyright__ = '2010, Hiroshi Miura ' +''' +www.nikkei.com +''' + +import re +from calibre.web.feeds.recipes import BasicNewsRecipe +import mechanize +from calibre.ptempfile import PersistentTemporaryFile + + +class NikkeiNet_sub_main(BasicNewsRecipe): + title = u'\u65e5\u7d4c\u65b0\u805e\u96fb\u5b50\u7248(\u7dcf\u5408)' + __author__ = 'Hiroshi Miura' + description = 'News and current market affairs from Japan' + cover_url = 'http://parts.nikkei.com/parts/ds/images/common/logo_r1.svg' + masthead_url = 'http://parts.nikkei.com/parts/ds/images/common/logo_r1.svg' + needs_subscription = True + oldest_article = 2 + max_articles_per_feed = 20 + language = 'ja' + remove_javascript = False + temp_files = [] + + remove_tags_before = {'class':"cmn-section cmn-indent"} + remove_tags = [ + {'class':"JSID_basePageMove JSID_baseAsyncSubmit cmn-form_area JSID_optForm_utoken"}, + {'class':"cmn-article_keyword cmn-clearfix"}, + {'class':"cmn-print_headline cmn-clearfix"}, + ] + remove_tags_after = {'class':"cmn-pr_list"} + + feeds = [ (u'NIKKEI', u'http://www.zou3.net/php/rss/nikkei2rss.php?head=main')] + + def get_browser(self): + br = BasicNewsRecipe.get_browser() + + cj = mechanize.LWPCookieJar() + br.set_cookiejar(cj) + + #br.set_debug_http(True) + #br.set_debug_redirects(True) + #br.set_debug_responses(True) + + if self.username is not None and self.password is not None: + #print "----------------------------get login form--------------------------------------------" + # open login form + br.open('https://id.nikkei.com/lounge/nl/base/LA0010.seam') + response = br.response() + #print "----------------------------get login form---------------------------------------------" + #print "----------------------------set login form---------------------------------------------" + # remove disabled input which brings error on mechanize + response.set_data(response.get_data().replace("", " -->")) + br.set_response(response) + br.select_form(name='LA0010Form01') + br['LA0010Form01:LA0010Email'] = self.username + br['LA0010Form01:LA0010Password'] = self.password + br.form.find_control(id='LA0010Form01:LA0010AutoLoginOn',type="checkbox").get(nr=0).selected = True + br.submit() + br.response() + #print 
"----------------------------send login form---------------------------------------------" + #print "----------------------------open news main page-----------------------------------------" + # open news site + br.open('http://www.nikkei.com/') + br.response() + #print "----------------------------www.nikkei.com BODY --------------------------------------" + #print response2.get_data() + #print "-------------------------^^-got auto redirect form----^^--------------------------------" + # forced redirect in default + br.select_form(nr=0) + br.submit() + response3 = br.response() + # return some cookie which should be set by Javascript + #print response3.geturl() + raw = response3.get_data() + #print "---------------------------response to form --------------------------------------------" + # grab cookie from JS and set it + redirectflag = re.search(r"var checkValue = '(\d+)';", raw, re.M).group(1) + br.select_form(nr=0) + + self.temp_files.append(PersistentTemporaryFile('_fa.html')) + self.temp_files[-1].write("#LWP-Cookies-2.0\n") + + self.temp_files[-1].write("Set-Cookie3: Cookie-dummy=Cookie-value; domain=\".nikkei.com\"; path=\"/\"; path_spec; secure; expires=\"2029-12-21 05:07:59Z\"; version=0\n") + self.temp_files[-1].write("Set-Cookie3: redirectFlag="+redirectflag+"; domain=\".nikkei.com\"; path=\"/\"; path_spec; secure; expires=\"2029-12-21 05:07:59Z\"; version=0\n") + self.temp_files[-1].close() + cj.load(self.temp_files[-1].name) + + br.submit() + + #br.set_debug_http(False) + #br.set_debug_redirects(False) + #br.set_debug_responses(False) + return br + + + + diff --git a/resources/recipes/nikkei_sub_sports.recipe b/resources/recipes/nikkei_sub_sports.recipe new file mode 100644 index 0000000000..6e5a1c6bb2 --- /dev/null +++ b/resources/recipes/nikkei_sub_sports.recipe @@ -0,0 +1,109 @@ +#!/usr/bin/env python + +__license__ = 'GPL v3' +__copyright__ = '2010, Hiroshi Miura ' +''' +www.nikkei.com +''' + +import re +from calibre.web.feeds.recipes import BasicNewsRecipe +import mechanize +from calibre.ptempfile import PersistentTemporaryFile + + +class NikkeiNet_sub_sports(BasicNewsRecipe): + title = u'\u65e5\u7d4c\u65b0\u805e\u96fb\u5b50\u7248(\u30b9\u30dd\u30fc\u30c4)' + __author__ = 'Hiroshi Miura' + description = 'News and current market affairs from Japan' + cover_url = 'http://parts.nikkei.com/parts/ds/images/common/logo_r1.svg' + masthead_url = 'http://parts.nikkei.com/parts/ds/images/common/logo_r1.svg' + needs_subscription = True + oldest_article = 2 + max_articles_per_feed = 20 + language = 'ja' + remove_javascript = False + temp_files = [] + + remove_tags_before = {'class':"cmn-section cmn-indent"} + remove_tags = [ + {'class':"JSID_basePageMove JSID_baseAsyncSubmit cmn-form_area JSID_optForm_utoken"}, + {'class':"cmn-article_keyword cmn-clearfix"}, + {'class':"cmn-print_headline cmn-clearfix"}, + ] + remove_tags_after = {'class':"cmn-pr_list"} + + feeds = [ + (u'\u30b9\u30dd\u30fc\u30c4\uff1a\u30d7\u30ed\u91ce\u7403', u'http://www.zou3.net/php/rss/nikkei2rss.php?head=baseball'), + (u'\u30b9\u30dd\u30fc\u30c4\uff1a\u5927\u30ea\u30fc\u30b0', u'http://www.zou3.net/php/rss/nikkei2rss.php?head=mlb'), + (u'\u30b9\u30dd\u30fc\u30c4\uff1a\u30b5\u30c3\u30ab\u30fc', u'http://www.zou3.net/php/rss/nikkei2rss.php?head=soccer'), + (u'\u30b9\u30dd\u30fc\u30c4\uff1a\u30b4\u30eb\u30d5', u'http://www.zou3.net/php/rss/nikkei2rss.php?head=golf'), + (u'\u30b9\u30dd\u30fc\u30c4\uff1a\u76f8\u64b2', u'http://www.zou3.net/php/rss/nikkei2rss.php?head=sumou'), + 
(u'\u30b9\u30dd\u30fc\u30c4\uff1a\u7af6\u99ac', u'http://www.zou3.net/php/rss/nikkei2rss.php?head=keiba') + ] + + def get_browser(self): + br = BasicNewsRecipe.get_browser() + + cj = mechanize.LWPCookieJar() + br.set_cookiejar(cj) + + #br.set_debug_http(True) + #br.set_debug_redirects(True) + #br.set_debug_responses(True) + + if self.username is not None and self.password is not None: + #print "----------------------------get login form--------------------------------------------" + # open login form + br.open('https://id.nikkei.com/lounge/nl/base/LA0010.seam') + response = br.response() + #print "----------------------------get login form---------------------------------------------" + #print "----------------------------set login form---------------------------------------------" + # remove disabled input which brings error on mechanize + response.set_data(response.get_data().replace("", " -->")) + br.set_response(response) + br.select_form(name='LA0010Form01') + br['LA0010Form01:LA0010Email'] = self.username + br['LA0010Form01:LA0010Password'] = self.password + br.form.find_control(id='LA0010Form01:LA0010AutoLoginOn',type="checkbox").get(nr=0).selected = True + br.submit() + br.response() + #print "----------------------------send login form---------------------------------------------" + #print "----------------------------open news main page-----------------------------------------" + # open news site + br.open('http://www.nikkei.com/') + br.response() + #print "----------------------------www.nikkei.com BODY --------------------------------------" + #print response2.get_data() + #print "-------------------------^^-got auto redirect form----^^--------------------------------" + # forced redirect in default + br.select_form(nr=0) + br.submit() + response3 = br.response() + # return some cookie which should be set by Javascript + #print response3.geturl() + raw = response3.get_data() + #print "---------------------------response to form --------------------------------------------" + # grab cookie from JS and set it + redirectflag = re.search(r"var checkValue = '(\d+)';", raw, re.M).group(1) + br.select_form(nr=0) + + self.temp_files.append(PersistentTemporaryFile('_fa.html')) + self.temp_files[-1].write("#LWP-Cookies-2.0\n") + + self.temp_files[-1].write("Set-Cookie3: Cookie-dummy=Cookie-value; domain=\".nikkei.com\"; path=\"/\"; path_spec; secure; expires=\"2029-12-21 05:07:59Z\"; version=0\n") + self.temp_files[-1].write("Set-Cookie3: redirectFlag="+redirectflag+"; domain=\".nikkei.com\"; path=\"/\"; path_spec; secure; expires=\"2029-12-21 05:07:59Z\"; version=0\n") + self.temp_files[-1].close() + cj.load(self.temp_files[-1].name) + + br.submit() + + #br.set_debug_http(False) + #br.set_debug_redirects(False) + #br.set_debug_responses(False) + return br + + + + diff --git a/resources/recipes/now_toronto.recipe b/resources/recipes/now_toronto.recipe index 41741dbccb..52a4619266 100644 --- a/resources/recipes/now_toronto.recipe +++ b/resources/recipes/now_toronto.recipe @@ -13,6 +13,7 @@ class NowToronto(BasicNewsRecipe): title = u'Now Toronto' description = u'Now Toronto' __author__ = 'Starson17' + language = 'en_CA' conversion_options = { 'no_default_epub_cover' : True } diff --git a/resources/recipes/reuters_ja.recipe b/resources/recipes/reuters_ja.recipe new file mode 100644 index 0000000000..ffa084bc88 --- /dev/null +++ b/resources/recipes/reuters_ja.recipe @@ -0,0 +1,37 @@ +from calibre.web.feeds.news import BasicNewsRecipe +import re + +class ReutersJa(BasicNewsRecipe): + + title 
= 'Reuters(Japan)' + description = 'Global news in Japanese' + __author__ = 'Hiroshi Miura' + use_embedded_content = False + language = 'ja' + max_articles_per_feed = 10 + remove_javascript = True + + feeds = [ ('Top Stories', 'http://feeds.reuters.com/reuters/JPTopNews?format=xml'), + ('World News', 'http://feeds.reuters.com/reuters/JPWorldNews?format=xml'), + ('Business News', 'http://feeds.reuters.com/reuters/JPBusinessNews?format=xml'), + ('Technology News', 'http://feeds.reuters.com/reuters/JPTechnologyNews?format=xml'), + ('Oddly Enough News', 'http://feeds.reuters.com/reuters/JPOddlyEnoughNews?format=xml') + ] + + remove_tags_before = {'class':"article primaryContent"} + remove_tags = [ dict(id="banner"), + dict(id="autilities"), + dict(id="textSizer"), + dict(id="shareFooter"), + dict(id="relatedNews"), + dict(id="editorsChoice"), + dict(id="ecArticles"), + {'class':"secondaryContent"}, + {'class':"module"}, + ] + remove_tags_after = {'class':"assetBuddy"} + + def print_version(self, url): + m = re.search('(.*idJPJAPAN-[0-9]+)', url) + return m.group(0)+'?sp=true' + diff --git a/resources/recipes/the_h.recipe b/resources/recipes/the_h.recipe new file mode 100644 index 0000000000..dbfad7e32a --- /dev/null +++ b/resources/recipes/the_h.recipe @@ -0,0 +1,33 @@ +#!/usr/bin/env python + +__license__ = 'GPL v3' +__copyright__ = '2010, Hiroshi Miura ' +''' +www.h-online.com +''' + +from calibre.web.feeds.news import BasicNewsRecipe + +class TheHeiseOnline(BasicNewsRecipe): + title = u'The H' + __author__ = 'Hiroshi Miura' + oldest_article = 3 + description = 'In association with Heise Online' + publisher = 'Heise Media UK Ltd.' + category = 'news, technology, security' + max_articles_per_feed = 100 + language = 'en' + encoding = 'utf-8' + conversion_options = { + 'comment' : description + ,'tags' : category + ,'publisher': publisher + ,'language' : language + } + feeds = [ + (u'The H News Feed', u'http://www.h-online.com/news/atom.xml') + ] + + def print_version(self, url): + return url + '?view=print' + diff --git a/src/calibre/gui2/actions/choose_library.py b/src/calibre/gui2/actions/choose_library.py index 01babc8e67..eb5902be48 100644 --- a/src/calibre/gui2/actions/choose_library.py +++ b/src/calibre/gui2/actions/choose_library.py @@ -132,9 +132,9 @@ class CheckIntegrity(QProgressDialog): titles = [self.db.title(x, index_is_id=True) for x in bad] det_msg = '\n'.join(titles) warning_dialog(self, _('Some inconsistencies found'), - _('The following books had formats listed in the ' + _('The following books had formats or covers listed in the ' 'database that are not actually available. ' - 'The entries for the formats have been removed. ' + 'The entries for the formats/covers have been removed. ' 'You should check them manually. 
This can ' 'happen if you manipulate the files in the ' 'library folder directly.'), det_msg=det_msg, show=True) diff --git a/src/calibre/library/caches.py b/src/calibre/library/caches.py index 5f3e66beef..075fbe664a 100644 --- a/src/calibre/library/caches.py +++ b/src/calibre/library/caches.py @@ -670,7 +670,6 @@ class ResultCache(SearchQueryParser): # {{{ for id in ids: try: self._data[id] = db.conn.get('SELECT * from meta2 WHERE id=?', (id,))[0] - self._data[id].append(db.has_cover(id, index_is_id=True)) self._data[id].append(db.book_on_device_string(id)) self._data[id].append(None) if len(self.composites) > 0: @@ -691,7 +690,6 @@ class ResultCache(SearchQueryParser): # {{{ self._data.extend(repeat(None, max(ids)-len(self._data)+2)) for id in ids: self._data[id] = db.conn.get('SELECT * from meta2 WHERE id=?', (id,))[0] - self._data[id].append(db.has_cover(id, index_is_id=True)) self._data[id].append(db.book_on_device_string(id)) self._data[id].append(None) if len(self.composites) > 0: @@ -721,7 +719,6 @@ class ResultCache(SearchQueryParser): # {{{ self._data[r[0]] = r for item in self._data: if item is not None: - item.append(db.has_cover(item[0], index_is_id=True)) item.append(db.book_on_device_string(item[0])) item.append(None) if len(self.composites) > 0: diff --git a/src/calibre/library/database2.py b/src/calibre/library/database2.py index 21a54a4dd6..d1d11a70ba 100644 --- a/src/calibre/library/database2.py +++ b/src/calibre/library/database2.py @@ -226,7 +226,8 @@ class LibraryDatabase2(LibraryDatabase, SchemaUpgrade, CustomColumns): 'lccn', 'pubdate', 'flags', - 'uuid' + 'uuid', + 'has_cover' ] lines = [] for col in columns: @@ -245,7 +246,7 @@ class LibraryDatabase2(LibraryDatabase, SchemaUpgrade, CustomColumns): 'size':4, 'rating':5, 'tags':6, 'comments':7, 'series':8, 'publisher':9, 'series_index':10, 'sort':11, 'author_sort':12, 'formats':13, 'isbn':14, 'path':15, - 'lccn':16, 'pubdate':17, 'flags':18, 'uuid':19} + 'lccn':16, 'pubdate':17, 'flags':18, 'uuid':19, 'cover':20} for k,v in self.FIELD_MAP.iteritems(): self.field_metadata.set_field_record_index(k, v, prefer_custom=False) @@ -267,12 +268,12 @@ class LibraryDatabase2(LibraryDatabase, SchemaUpgrade, CustomColumns): base, prefer_custom=True) - self.FIELD_MAP['cover'] = base+1 - self.field_metadata.set_field_record_index('cover', base+1, prefer_custom=False) - self.FIELD_MAP['ondevice'] = base+2 - self.field_metadata.set_field_record_index('ondevice', base+2, prefer_custom=False) - self.FIELD_MAP['all_metadata'] = base+3 - self.field_metadata.set_field_record_index('all_metadata', base+3, prefer_custom=False) + self.field_metadata.set_field_record_index('cover', + self.FIELD_MAP['cover'], prefer_custom=False) + self.FIELD_MAP['ondevice'] = base+1 + self.field_metadata.set_field_record_index('ondevice', base+1, prefer_custom=False) + self.FIELD_MAP['all_metadata'] = base+2 + self.field_metadata.set_field_record_index('all_metadata', base+2, prefer_custom=False) script = ''' DROP VIEW IF EXISTS meta2; @@ -332,7 +333,9 @@ class LibraryDatabase2(LibraryDatabase, SchemaUpgrade, CustomColumns): self.dirtied_cache = set([x[0] for x in d]) self.refresh_ondevice = functools.partial(self.data.refresh_ondevice, self) + st = time.time() self.refresh() + print 'refresh time:', time.time() - st self.last_update_check = self.last_modified() @@ -763,17 +766,7 @@ class LibraryDatabase2(LibraryDatabase, SchemaUpgrade, CustomColumns): identical_book_ids.add(book_id) return identical_book_ids - def has_cover(self, index, index_is_id=False): 
- id = index if index_is_id else self.id(index) - try: - path = os.path.join(self.abspath(id, index_is_id=True, - create_dirs=False), 'cover.jpg') - except: - # Can happen if path has not yet been set - return False - return os.path.exists(path) - - def remove_cover(self, id, notify=True): + def remove_cover(self, id, notify=True, commit=True): path = os.path.join(self.library_path, self.path(id, index_is_id=True), 'cover.jpg') if os.path.exists(path): try: @@ -781,11 +774,14 @@ class LibraryDatabase2(LibraryDatabase, SchemaUpgrade, CustomColumns): except (IOError, OSError): time.sleep(0.2) os.remove(path) + self.conn.execute('UPDATE books SET has_cover=0 WHERE id=?', (id,)) + if commit: + self.conn.commit() self.data.set(id, self.FIELD_MAP['cover'], False, row_is_id=True) if notify: self.notify('cover', [id]) - def set_cover(self, id, data, notify=True): + def set_cover(self, id, data, notify=True, commit=True): ''' Set the cover for this book. @@ -802,6 +798,9 @@ class LibraryDatabase2(LibraryDatabase, SchemaUpgrade, CustomColumns): except (IOError, OSError): time.sleep(0.2) save_cover_data_to(data, path) + self.conn.execute('UPDATE books SET has_cover=1 WHERE id=?', (id,)) + if commit: + self.conn.commit() self.data.set(id, self.FIELD_MAP['cover'], True, row_is_id=True) if notify: self.notify('cover', [id]) @@ -1273,11 +1272,11 @@ class LibraryDatabase2(LibraryDatabase, SchemaUpgrade, CustomColumns): if mi.series: doit(self.set_series, id, mi.series, notify=False, commit=False) if mi.cover_data[1] is not None: - doit(self.set_cover, id, mi.cover_data[1]) # doesn't use commit + doit(self.set_cover, id, mi.cover_data[1], commit=False) elif mi.cover is not None: if os.access(mi.cover, os.R_OK): with lopen(mi.cover, 'rb') as f: - doit(self.set_cover, id, f) + doit(self.set_cover, id, f, commit=False) if mi.tags: doit(self.set_tags, id, mi.tags, notify=False, commit=False) if mi.comments: @@ -2291,7 +2290,7 @@ class LibraryDatabase2(LibraryDatabase, SchemaUpgrade, CustomColumns): x['tags'] = [i.replace('|', ',').strip() for i in x['tags'].split(',')] if x['tags'] else [] path = os.path.join(prefix, self.path(record[self.FIELD_MAP['id']], index_is_id=True)) x['cover'] = os.path.join(path, 'cover.jpg') - if not self.has_cover(x['id'], index_is_id=True): + if not record[self.FIELD_MAP['cover']]: x['cover'] = None formats = self.formats(record[self.FIELD_MAP['id']], index_is_id=True) if formats: @@ -2510,11 +2509,20 @@ books_series_link feeds if id not in bad: bad[id] = [] bad[id].append(fmt) + has_cover = self.data.get(id, self.FIELD_MAP['cover'], + row_is_id=True) + if has_cover and self.cover(id, index_is_id=True, as_path=True) is None: + if id not in bad: + bad[id] = [] + bad[id].append('COVER') callback(0.1+0.9*(1+i)/total, _('Checked id') + ' %d'%id) for id in bad: for fmt in bad[id]: - self.conn.execute('DELETE FROM data WHERE book=? AND format=?', (id, fmt.upper())) + if fmt != 'COVER': + self.conn.execute('DELETE FROM data WHERE book=? 
AND format=?', (id, fmt.upper())) + else: + self.conn.execute('UPDATE books SET has_cover=0 WHERE id=?', (id,)) self.conn.commit() self.refresh_ids(list(bad.keys())) diff --git a/src/calibre/library/schema_upgrades.py b/src/calibre/library/schema_upgrades.py index 167cc0a327..e35c8521ce 100644 --- a/src/calibre/library/schema_upgrades.py +++ b/src/calibre/library/schema_upgrades.py @@ -6,6 +6,8 @@ __license__ = 'GPL v3' __copyright__ = '2010, Kovid Goyal ' __docformat__ = 'restructuredtext en' +import os + class SchemaUpgrade(object): def __init__(self): @@ -409,3 +411,17 @@ class SchemaUpgrade(object): ''' self.conn.executescript(script) + def upgrade_version_14(self): + 'Cache has_cover' + self.conn.execute('ALTER TABLE books ADD COLUMN has_cover BOOL DEFAULT 0') + data = self.conn.get('SELECT id,path FROM books', all=True) + def has_cover(path): + if path: + path = os.path.join(self.library_path, path.replace('/', os.sep), + 'cover.jpg') + return os.path.exists(path) + return False + + ids = [(x[0],) for x in data if has_cover(x[1])] + self.conn.executemany('UPDATE books SET has_cover=1 WHERE id=?', ids) + diff --git a/src/calibre/library/sqlite.py b/src/calibre/library/sqlite.py index 1242d0bf7b..eb3302086d 100644 --- a/src/calibre/library/sqlite.py +++ b/src/calibre/library/sqlite.py @@ -34,10 +34,11 @@ sqlite.register_adapter(datetime, adapt_datetime) sqlite.register_converter('timestamp', convert_timestamp) def convert_bool(val): - return bool(int(val)) + return val != '0' sqlite.register_adapter(bool, lambda x : 1 if x else 0) sqlite.register_converter('bool', convert_bool) +sqlite.register_converter('BOOL', convert_bool) class DynamicFilter(object):
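
The substantive library change in this patch is the new has_cover cache: instead of stat()ing cover.jpg for every row on each refresh (the old LibraryDatabase2.has_cover() removed above), cover existence is stored in a has_cover BOOL column on the books table, kept in sync by set_cover()/remove_cover(), backfilled once by upgrade_version_14(), and repaired by the integrity check. Below is a minimal standalone sketch of that backfill pattern using only the Python standard library; the database file name, library root, and books(id, path) layout are assumptions for illustration, not calibre's actual API.

import os
import sqlite3

library_path = '/path/to/library'      # placeholder library root
conn = sqlite3.connect('metadata.db')  # placeholder database file

# Add the cache column; every book starts out marked as having no cover.
conn.execute('ALTER TABLE books ADD COLUMN has_cover BOOL DEFAULT 0')

def has_cover(path):
    # Mirrors upgrade_version_14: a cover lives at <library>/<book path>/cover.jpg
    if path:
        return os.path.exists(os.path.join(
            library_path, path.replace('/', os.sep), 'cover.jpg'))
    return False

rows = conn.execute('SELECT id, path FROM books').fetchall()
# Hit the filesystem once per book here, then never again at query time.
ids = [(book_id,) for (book_id, path) in rows if has_cover(path)]
conn.executemany('UPDATE books SET has_cover=1 WHERE id=?', ids)
conn.commit()

The trade-off is the usual one for a derived column: one filesystem scan at migration time (plus a small UPDATE in set_cover/remove_cover) buys refresh() out of an os.path.exists() call per book, which is what the temporary refresh-timing print added to database2.py appears to be measuring.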