diff --git a/resources/recipes/ming_pao.recipe b/resources/recipes/ming_pao.recipe index 726181f57b..9febcec0e5 100644 --- a/resources/recipes/ming_pao.recipe +++ b/resources/recipes/ming_pao.recipe @@ -1,7 +1,9 @@ __license__ = 'GPL v3' -__copyright__ = '2010, Eddie Lau' +__copyright__ = '2010-2011, Eddie Lau' ''' Change Log: +2011/02/20: skip duplicated links in finance section, put photos which may extend a whole page to the back of the articles + clean up the indentation 2010/12/07: add entertainment section, use newspaper front page as ebook cover, suppress date display in section list (to avoid wrong date display in case the user generates the ebook in a time zone different from HKT) 2010/11/22: add English section, remove eco-news section which is not updated daily, correct @@ -18,21 +20,19 @@ from calibre.web.feeds.recipes import BasicNewsRecipe from contextlib import nested -from calibre import __appname__ from calibre.ebooks.BeautifulSoup import BeautifulSoup from calibre.ebooks.metadata.opf2 import OPFCreator from calibre.ebooks.metadata.toc import TOC from calibre.ebooks.metadata import MetaInformation class MPHKRecipe(BasicNewsRecipe): - IsKindleUsed = True # to avoid generating periodical in which CJK characters can't be displayed in section/article view - + IsCJKWellSupported = True # Set to False to avoid generating periodical in which CJK characters can't be displayed in section/article view title = 'Ming Pao - Hong Kong' oldest_article = 1 max_articles_per_feed = 100 __author__ = 'Eddie Lau' - description = 'Hong Kong Chinese Newspaper' - publisher = 'news.mingpao.com' + description = 'Hong Kong Chinese Newspaper (http://news.mingpao.com)' + publisher = 'MingPao' category = 'Chinese, News, Hong Kong' remove_javascript = True use_embedded_content = False @@ -46,19 +46,20 @@ class MPHKRecipe(BasicNewsRecipe): masthead_url = 'http://news.mingpao.com/image/portals_top_logo_news.gif' keep_only_tags = [dict(name='h1'), dict(name='font', attrs={'style':['font-size:14pt; line-height:160%;']}), # for entertainment page title - dict(attrs={'class':['photo']}), dict(attrs={'id':['newscontent']}), # entertainment page content - dict(attrs={'id':['newscontent01','newscontent02']})] + dict(attrs={'id':['newscontent01','newscontent02']}), + dict(attrs={'class':['photo']}) + ] remove_tags = [dict(name='style'), dict(attrs={'id':['newscontent135']})] # for the finance page remove_attributes = ['width'] preprocess_regexps = [ - (re.compile(r'
', re.DOTALL|re.IGNORECASE), - lambda match: '

'), - (re.compile(r'

', re.DOTALL|re.IGNORECASE), - lambda match: ''), - (re.compile(r'

', re.DOTALL|re.IGNORECASE), # for entertainment page - lambda match: '') + (re.compile(r'
', re.DOTALL|re.IGNORECASE), + lambda match: '

'), + (re.compile(r'

', re.DOTALL|re.IGNORECASE), + lambda match: ''), + (re.compile(r'

', re.DOTALL|re.IGNORECASE), # for entertainment page + lambda match: '') ] def image_url_processor(cls, baseurl, url): @@ -107,6 +108,9 @@ class MPHKRecipe(BasicNewsRecipe): def get_fetchdate(self): return self.get_dtlocal().strftime("%Y%m%d") + def get_fetchformatteddate(self): + return self.get_dtlocal().strftime("%Y-%m-%d") + def get_fetchday(self): # convert UTC to local hk time - at around HKT 6.00am, all news are available return self.get_dtlocal().strftime("%d") @@ -121,84 +125,66 @@ class MPHKRecipe(BasicNewsRecipe): return cover def parse_index(self): - feeds = [] - dateStr = self.get_fetchdate() - for title, url in [(u'\u8981\u805e Headline', 'http://news.mingpao.com/' + dateStr + '/gaindex.htm'), - (u'\u6559\u80b2 Education', 'http://news.mingpao.com/' + dateStr + '/gfindex.htm'), - (u'\u6e2f\u805e Local', 'http://news.mingpao.com/' + dateStr + '/gbindex.htm'), - (u'\u793e\u8a55\u2027\u7b46\u9663 Editorial', 'http://news.mingpao.com/' + dateStr + '/mrindex.htm'), - (u'\u8ad6\u58c7 Forum', 'http://news.mingpao.com/' + dateStr + '/faindex.htm'), - (u'\u4e2d\u570b China', 'http://news.mingpao.com/' + dateStr + '/caindex.htm'), - (u'\u570b\u969b World', 'http://news.mingpao.com/' + dateStr + '/taindex.htm'), - ('Tech News', 'http://news.mingpao.com/' + dateStr + '/naindex.htm'), - (u'\u9ad4\u80b2 Sport', 'http://news.mingpao.com/' + dateStr + '/spindex.htm'), - (u'\u526f\u520a Supplement', 'http://news.mingpao.com/' + dateStr + '/jaindex.htm'), - (u'\u82f1\u6587 English', 'http://news.mingpao.com/' + dateStr + '/emindex.htm')]: - articles = self.parse_section(url) - if articles: - feeds.append((title, articles)) - # special - finance - fin_articles = self.parse_fin_section('http://www.mpfinance.com/htm/Finance/' + dateStr + '/News/ea,eb,ecindex.htm') - if fin_articles: - feeds.append((u'\u7d93\u6fdf Finance', fin_articles)) - # special - eco-friendly - # eco_articles = self.parse_eco_section('http://tssl.mingpao.com/htm/marketing/eco/cfm/Eco1.cfm') - # if eco_articles: - # feeds.append((u'\u74b0\u4fdd Eco News', eco_articles)) - # special - entertainment - ent_articles = self.parse_ent_section('http://ol.mingpao.com/cfm/star1.cfm') - if ent_articles: - feeds.append((u'\u5f71\u8996 Entertainment', ent_articles)) - return feeds + feeds = [] + dateStr = self.get_fetchdate() + for title, url in [(u'\u8981\u805e Headline', 'http://news.mingpao.com/' + dateStr + '/gaindex.htm'), + (u'\u6e2f\u805e Local', 'http://news.mingpao.com/' + dateStr + '/gbindex.htm'), + (u'\u793e\u8a55/\u7b46\u9663 Editorial', 'http://news.mingpao.com/' + dateStr + '/mrindex.htm'), + (u'\u8ad6\u58c7 Forum', 'http://news.mingpao.com/' + dateStr + '/faindex.htm'), + (u'\u4e2d\u570b China', 'http://news.mingpao.com/' + dateStr + '/caindex.htm'), + (u'\u570b\u969b World', 'http://news.mingpao.com/' + dateStr + '/taindex.htm'), + ('Tech News', 'http://news.mingpao.com/' + dateStr + '/naindex.htm'), + (u'\u6559\u80b2 Education', 'http://news.mingpao.com/' + dateStr + '/gfindex.htm'), + (u'\u9ad4\u80b2 Sport', 'http://news.mingpao.com/' + dateStr + '/spindex.htm'), + (u'\u526f\u520a Supplement', 'http://news.mingpao.com/' + dateStr + '/jaindex.htm'), + (u'\u82f1\u6587 English', 'http://news.mingpao.com/' + dateStr + '/emindex.htm')]: + articles = self.parse_section(url) + if articles: + feeds.append((title, articles)) + # special - finance + fin_articles = self.parse_fin_section('http://www.mpfinance.com/htm/Finance/' + dateStr + '/News/ea,eb,ecindex.htm') + if fin_articles: + feeds.append((u'\u7d93\u6fdf Finance', fin_articles)) + # special - entertainment + ent_articles = self.parse_ent_section('http://ol.mingpao.com/cfm/star1.cfm') + if ent_articles: + feeds.append((u'\u5f71\u8996 Film/TV', ent_articles)) + return feeds def parse_section(self, url): - dateStr = self.get_fetchdate() - soup = self.index_to_soup(url) - divs = soup.findAll(attrs={'class': ['bullet','bullet_grey']}) - current_articles = [] - included_urls = [] - divs.reverse() - for i in divs: - a = i.find('a', href = True) - title = self.tag_to_string(a) - url = a.get('href', False) - url = 'http://news.mingpao.com/' + dateStr + '/' +url - if url not in included_urls and url.rfind('Redirect') == -1: - current_articles.append({'title': title, 'url': url, 'description':'', 'date':''}) - included_urls.append(url) - current_articles.reverse() - return current_articles + dateStr = self.get_fetchdate() + soup = self.index_to_soup(url) + divs = soup.findAll(attrs={'class': ['bullet','bullet_grey']}) + current_articles = [] + included_urls = [] + divs.reverse() + for i in divs: + a = i.find('a', href = True) + title = self.tag_to_string(a) + url = a.get('href', False) + url = 'http://news.mingpao.com/' + dateStr + '/' +url + if url not in included_urls and url.rfind('Redirect') == -1: + current_articles.append({'title': title, 'url': url, 'description':'', 'date':''}) + included_urls.append(url) + current_articles.reverse() + return current_articles def parse_fin_section(self, url): dateStr = self.get_fetchdate() soup = self.index_to_soup(url) a = soup.findAll('a', href= True) current_articles = [] - for i in a: - url = i.get('href', False) - if not url.rfind(dateStr) == -1 and url.rfind('index') == -1: - title = self.tag_to_string(i) - url = 'http://www.mpfinance.com/cfm/' +url - current_articles.append({'title': title, 'url': url, 'description':''}) - return current_articles - - def parse_eco_section(self, url): - dateStr = self.get_fetchdate() - soup = self.index_to_soup(url) - divs = soup.findAll(attrs={'class': ['bullet']}) - current_articles = [] included_urls = [] - for i in divs: - a = i.find('a', href = True) - title = self.tag_to_string(a) - url = a.get('href', False) - url = 'http://tssl.mingpao.com/htm/marketing/eco/cfm/' +url - if url not in included_urls and url.rfind('Redirect') == -1 and not url.rfind('.txt') == -1 and not url.rfind(dateStr) == -1: + for i in a: + url = 'http://www.mpfinance.com/cfm/' + i.get('href', False) + if url not in included_urls and not url.rfind(dateStr) == -1 and url.rfind('index') == -1: + title = self.tag_to_string(i) current_articles.append({'title': title, 'url': url, 'description':''}) included_urls.append(url) return current_articles def parse_ent_section(self, url): + self.get_fetchdate() soup = self.index_to_soup(url) a = soup.findAll('a', href=True) a.reverse() @@ -223,67 +209,71 @@ class MPHKRecipe(BasicNewsRecipe): return soup def create_opf(self, feeds, dir=None): - if self.IsKindleUsed == False: - super(MPHKRecipe,self).create_opf(feeds, dir) - return if dir is None: dir = self.output_dir - title = self.short_title() - title += ' ' + self.get_fetchdate() - #if self.output_profile.periodical_date_in_title: - # title += strftime(self.timefmt) - mi = MetaInformation(title, [__appname__]) - mi.publisher = __appname__ - mi.author_sort = __appname__ - mi.publication_type = self.publication_type+':'+self.short_title() - #mi.timestamp = nowf() - mi.timestamp = self.get_dtlocal() - mi.comments = self.description - if not isinstance(mi.comments, unicode): - mi.comments = mi.comments.decode('utf-8', 'replace') - #mi.pubdate = nowf() - mi.pubdate = self.get_dtlocal() - opf_path = os.path.join(dir, 'index.opf') - ncx_path = os.path.join(dir, 'index.ncx') - opf = OPFCreator(dir, mi) - # Add mastheadImage entry to section - mp = getattr(self, 'masthead_path', None) - if mp is not None and os.access(mp, os.R_OK): - from calibre.ebooks.metadata.opf2 import Guide - ref = Guide.Reference(os.path.basename(self.masthead_path), os.getcwdu()) - ref.type = 'masthead' - ref.title = 'Masthead Image' - opf.guide.append(ref) + if self.IsCJKWellSupported == True: + # use Chinese title + title = u'\u660e\u5831 (\u9999\u6e2f) ' + self.get_fetchformatteddate() + else: + # use English title + title = self.short_title() + ' ' + self.get_fetchformatteddate() + if True: # force date in title + # title += strftime(self.timefmt) + mi = MetaInformation(title, [self.publisher]) + mi.publisher = self.publisher + mi.author_sort = self.publisher + if self.IsCJKWellSupported == True: + mi.publication_type = 'periodical:'+self.publication_type+':'+self.short_title() + else: + mi.publication_type = self.publication_type+':'+self.short_title() + #mi.timestamp = nowf() + mi.timestamp = self.get_dtlocal() + mi.comments = self.description + if not isinstance(mi.comments, unicode): + mi.comments = mi.comments.decode('utf-8', 'replace') + #mi.pubdate = nowf() + mi.pubdate = self.get_dtlocal() + opf_path = os.path.join(dir, 'index.opf') + ncx_path = os.path.join(dir, 'index.ncx') + opf = OPFCreator(dir, mi) + # Add mastheadImage entry to section + mp = getattr(self, 'masthead_path', None) + if mp is not None and os.access(mp, os.R_OK): + from calibre.ebooks.metadata.opf2 import Guide + ref = Guide.Reference(os.path.basename(self.masthead_path), os.getcwdu()) + ref.type = 'masthead' + ref.title = 'Masthead Image' + opf.guide.append(ref) - manifest = [os.path.join(dir, 'feed_%d'%i) for i in range(len(feeds))] - manifest.append(os.path.join(dir, 'index.html')) - manifest.append(os.path.join(dir, 'index.ncx')) + manifest = [os.path.join(dir, 'feed_%d'%i) for i in range(len(feeds))] + manifest.append(os.path.join(dir, 'index.html')) + manifest.append(os.path.join(dir, 'index.ncx')) - # Get cover - cpath = getattr(self, 'cover_path', None) - if cpath is None: - pf = open(os.path.join(dir, 'cover.jpg'), 'wb') - if self.default_cover(pf): - cpath = pf.name - if cpath is not None and os.access(cpath, os.R_OK): - opf.cover = cpath - manifest.append(cpath) + # Get cover + cpath = getattr(self, 'cover_path', None) + if cpath is None: + pf = open(os.path.join(dir, 'cover.jpg'), 'wb') + if self.default_cover(pf): + cpath = pf.name + if cpath is not None and os.access(cpath, os.R_OK): + opf.cover = cpath + manifest.append(cpath) - # Get masthead - mpath = getattr(self, 'masthead_path', None) - if mpath is not None and os.access(mpath, os.R_OK): - manifest.append(mpath) + # Get masthead + mpath = getattr(self, 'masthead_path', None) + if mpath is not None and os.access(mpath, os.R_OK): + manifest.append(mpath) - opf.create_manifest_from_files_in(manifest) - for mani in opf.manifest: - if mani.path.endswith('.ncx'): - mani.id = 'ncx' - if mani.path.endswith('mastheadImage.jpg'): - mani.id = 'masthead-image' - entries = ['index.html'] - toc = TOC(base_path=dir) - self.play_order_counter = 0 - self.play_order_map = {} + opf.create_manifest_from_files_in(manifest) + for mani in opf.manifest: + if mani.path.endswith('.ncx'): + mani.id = 'ncx' + if mani.path.endswith('mastheadImage.jpg'): + mani.id = 'masthead-image' + entries = ['index.html'] + toc = TOC(base_path=dir) + self.play_order_counter = 0 + self.play_order_map = {} def feed_index(num, parent): f = feeds[num] @@ -321,7 +311,7 @@ class MPHKRecipe(BasicNewsRecipe): prefix = '/'.join('..'for i in range(2*len(re.findall(r'link\d+', last)))) templ = self.navbar.generate(True, num, j, len(f), not self.has_single_feed, - a.orig_url, __appname__, prefix=prefix, + a.orig_url, self.publisher, prefix=prefix, center=self.center_navbar) elem = BeautifulSoup(templ.render(doctype='xhtml').decode('utf-8')).find('div') body.insert(len(body.contents), elem) @@ -344,7 +334,7 @@ class MPHKRecipe(BasicNewsRecipe): if not desc: desc = None feed_index(i, toc.add_item('feed_%d/index.html'%i, None, - f.title, play_order=po, description=desc, author=auth)) + f.title, play_order=po, description=desc, author=auth)) else: entries.append('feed_%d/index.html'%0) @@ -357,4 +347,3 @@ class MPHKRecipe(BasicNewsRecipe): with nested(open(opf_path, 'wb'), open(ncx_path, 'wb')) as (opf_file, ncx_file): opf.render(opf_file, ncx_file) -