diff --git a/resources/recipes/ming_pao.recipe b/resources/recipes/ming_pao.recipe
index 726181f57b..9febcec0e5 100644
--- a/resources/recipes/ming_pao.recipe
+++ b/resources/recipes/ming_pao.recipe
@@ -1,7 +1,9 @@
__license__ = 'GPL v3'
-__copyright__ = '2010, Eddie Lau'
+__copyright__ = '2010-2011, Eddie Lau'
'''
Change Log:
+2011/02/20: skip duplicated links in the finance section; move photos, which may span a whole page, to the back of the articles
+ clean up the indentation
2010/12/07: add entertainment section, use newspaper front page as ebook cover, suppress date display in section list
(to avoid wrong date display in case the user generates the ebook in a time zone different from HKT)
2010/11/22: add English section, remove eco-news section which is not updated daily, correct
@@ -18,21 +20,19 @@ from calibre.web.feeds.recipes import BasicNewsRecipe
from contextlib import nested
-from calibre import __appname__
from calibre.ebooks.BeautifulSoup import BeautifulSoup
from calibre.ebooks.metadata.opf2 import OPFCreator
from calibre.ebooks.metadata.toc import TOC
from calibre.ebooks.metadata import MetaInformation
class MPHKRecipe(BasicNewsRecipe):
- IsKindleUsed = True # to avoid generating periodical in which CJK characters can't be displayed in section/article view
-
+ IsCJKWellSupported = True # Set to False to avoid generating a periodical in which CJK characters can't be displayed in section/article view
title = 'Ming Pao - Hong Kong'
oldest_article = 1
max_articles_per_feed = 100
__author__ = 'Eddie Lau'
- description = 'Hong Kong Chinese Newspaper'
- publisher = 'news.mingpao.com'
+ description = 'Hong Kong Chinese Newspaper (http://news.mingpao.com)'
+ publisher = 'MingPao'
category = 'Chinese, News, Hong Kong'
remove_javascript = True
use_embedded_content = False
@@ -46,19 +46,20 @@ class MPHKRecipe(BasicNewsRecipe):
masthead_url = 'http://news.mingpao.com/image/portals_top_logo_news.gif'
keep_only_tags = [dict(name='h1'),
dict(name='font', attrs={'style':['font-size:14pt; line-height:160%;']}), # for entertainment page title
- dict(attrs={'class':['photo']}),
dict(attrs={'id':['newscontent']}), # entertainment page content
- dict(attrs={'id':['newscontent01','newscontent02']})]
+ dict(attrs={'id':['newscontent01','newscontent02']}),
+ dict(attrs={'class':['photo']})
+ ]
remove_tags = [dict(name='style'),
dict(attrs={'id':['newscontent135']})] # for the finance page
remove_attributes = ['width']
preprocess_regexps = [
- (re.compile(r'<h5>', re.DOTALL|re.IGNORECASE),
- lambda match: '<h1>'),
- (re.compile(r'</h5>', re.DOTALL|re.IGNORECASE),
- lambda match: '</h1>'),
- (re.compile(r'<p><a href=.+?</a></p>', re.DOTALL|re.IGNORECASE), # for entertainment page
- lambda match: '')
+ (re.compile(r'<h5>', re.DOTALL|re.IGNORECASE),
+ lambda match: '<h1>'),
+ (re.compile(r'</h5>', re.DOTALL|re.IGNORECASE),
+ lambda match: '</h1>'),
+ (re.compile(r'<p><a href=.+?</a></p>', re.DOTALL|re.IGNORECASE), # for entertainment page
+ lambda match: '')
]
def image_url_processor(cls, baseurl, url):
@@ -107,6 +108,9 @@ class MPHKRecipe(BasicNewsRecipe):
def get_fetchdate(self):
return self.get_dtlocal().strftime("%Y%m%d")
+ def get_fetchformatteddate(self):
+ return self.get_dtlocal().strftime("%Y-%m-%d")
+
def get_fetchday(self):
# convert UTC to local hk time - at around HKT 6.00am, all news are available
return self.get_dtlocal().strftime("%d")
@@ -121,84 +125,66 @@ class MPHKRecipe(BasicNewsRecipe):
return cover
def parse_index(self):
- feeds = []
- dateStr = self.get_fetchdate()
- for title, url in [(u'\u8981\u805e Headline', 'http://news.mingpao.com/' + dateStr + '/gaindex.htm'),
- (u'\u6559\u80b2 Education', 'http://news.mingpao.com/' + dateStr + '/gfindex.htm'),
- (u'\u6e2f\u805e Local', 'http://news.mingpao.com/' + dateStr + '/gbindex.htm'),
- (u'\u793e\u8a55\u2027\u7b46\u9663 Editorial', 'http://news.mingpao.com/' + dateStr + '/mrindex.htm'),
- (u'\u8ad6\u58c7 Forum', 'http://news.mingpao.com/' + dateStr + '/faindex.htm'),
- (u'\u4e2d\u570b China', 'http://news.mingpao.com/' + dateStr + '/caindex.htm'),
- (u'\u570b\u969b World', 'http://news.mingpao.com/' + dateStr + '/taindex.htm'),
- ('Tech News', 'http://news.mingpao.com/' + dateStr + '/naindex.htm'),
- (u'\u9ad4\u80b2 Sport', 'http://news.mingpao.com/' + dateStr + '/spindex.htm'),
- (u'\u526f\u520a Supplement', 'http://news.mingpao.com/' + dateStr + '/jaindex.htm'),
- (u'\u82f1\u6587 English', 'http://news.mingpao.com/' + dateStr + '/emindex.htm')]:
- articles = self.parse_section(url)
- if articles:
- feeds.append((title, articles))
- # special - finance
- fin_articles = self.parse_fin_section('http://www.mpfinance.com/htm/Finance/' + dateStr + '/News/ea,eb,ecindex.htm')
- if fin_articles:
- feeds.append((u'\u7d93\u6fdf Finance', fin_articles))
- # special - eco-friendly
- # eco_articles = self.parse_eco_section('http://tssl.mingpao.com/htm/marketing/eco/cfm/Eco1.cfm')
- # if eco_articles:
- # feeds.append((u'\u74b0\u4fdd Eco News', eco_articles))
- # special - entertainment
- ent_articles = self.parse_ent_section('http://ol.mingpao.com/cfm/star1.cfm')
- if ent_articles:
- feeds.append((u'\u5f71\u8996 Entertainment', ent_articles))
- return feeds
+ feeds = []
+ dateStr = self.get_fetchdate()
+ for title, url in [(u'\u8981\u805e Headline', 'http://news.mingpao.com/' + dateStr + '/gaindex.htm'),
+ (u'\u6e2f\u805e Local', 'http://news.mingpao.com/' + dateStr + '/gbindex.htm'),
+ (u'\u793e\u8a55/\u7b46\u9663 Editorial', 'http://news.mingpao.com/' + dateStr + '/mrindex.htm'),
+ (u'\u8ad6\u58c7 Forum', 'http://news.mingpao.com/' + dateStr + '/faindex.htm'),
+ (u'\u4e2d\u570b China', 'http://news.mingpao.com/' + dateStr + '/caindex.htm'),
+ (u'\u570b\u969b World', 'http://news.mingpao.com/' + dateStr + '/taindex.htm'),
+ ('Tech News', 'http://news.mingpao.com/' + dateStr + '/naindex.htm'),
+ (u'\u6559\u80b2 Education', 'http://news.mingpao.com/' + dateStr + '/gfindex.htm'),
+ (u'\u9ad4\u80b2 Sport', 'http://news.mingpao.com/' + dateStr + '/spindex.htm'),
+ (u'\u526f\u520a Supplement', 'http://news.mingpao.com/' + dateStr + '/jaindex.htm'),
+ (u'\u82f1\u6587 English', 'http://news.mingpao.com/' + dateStr + '/emindex.htm')]:
+ articles = self.parse_section(url)
+ if articles:
+ feeds.append((title, articles))
+ # special - finance
+ fin_articles = self.parse_fin_section('http://www.mpfinance.com/htm/Finance/' + dateStr + '/News/ea,eb,ecindex.htm')
+ if fin_articles:
+ feeds.append((u'\u7d93\u6fdf Finance', fin_articles))
+ # special - entertainment
+ ent_articles = self.parse_ent_section('http://ol.mingpao.com/cfm/star1.cfm')
+ if ent_articles:
+ feeds.append((u'\u5f71\u8996 Film/TV', ent_articles))
+ return feeds
def parse_section(self, url):
- dateStr = self.get_fetchdate()
- soup = self.index_to_soup(url)
- divs = soup.findAll(attrs={'class': ['bullet','bullet_grey']})
- current_articles = []
- included_urls = []
- divs.reverse()
- for i in divs:
- a = i.find('a', href = True)
- title = self.tag_to_string(a)
- url = a.get('href', False)
- url = 'http://news.mingpao.com/' + dateStr + '/' +url
- if url not in included_urls and url.rfind('Redirect') == -1:
- current_articles.append({'title': title, 'url': url, 'description':'', 'date':''})
- included_urls.append(url)
- current_articles.reverse()
- return current_articles
+ dateStr = self.get_fetchdate()
+ soup = self.index_to_soup(url)
+ divs = soup.findAll(attrs={'class': ['bullet','bullet_grey']})
+ current_articles = []
+ included_urls = []
+ divs.reverse()
+ for i in divs:
+ a = i.find('a', href = True)
+ title = self.tag_to_string(a)
+ url = a.get('href', False)
+ url = 'http://news.mingpao.com/' + dateStr + '/' +url
+ if url not in included_urls and url.rfind('Redirect') == -1:
+ current_articles.append({'title': title, 'url': url, 'description':'', 'date':''})
+ included_urls.append(url)
+ current_articles.reverse()
+ return current_articles
def parse_fin_section(self, url):
dateStr = self.get_fetchdate()
soup = self.index_to_soup(url)
a = soup.findAll('a', href= True)
current_articles = []
- for i in a:
- url = i.get('href', False)
- if not url.rfind(dateStr) == -1 and url.rfind('index') == -1:
- title = self.tag_to_string(i)
- url = 'http://www.mpfinance.com/cfm/' +url
- current_articles.append({'title': title, 'url': url, 'description':''})
- return current_articles
-
- def parse_eco_section(self, url):
- dateStr = self.get_fetchdate()
- soup = self.index_to_soup(url)
- divs = soup.findAll(attrs={'class': ['bullet']})
- current_articles = []
included_urls = []
- for i in divs:
- a = i.find('a', href = True)
- title = self.tag_to_string(a)
- url = a.get('href', False)
- url = 'http://tssl.mingpao.com/htm/marketing/eco/cfm/' +url
- if url not in included_urls and url.rfind('Redirect') == -1 and not url.rfind('.txt') == -1 and not url.rfind(dateStr) == -1:
+ for i in a:
+ url = 'http://www.mpfinance.com/cfm/' + i.get('href', False)
+ if url not in included_urls and not url.rfind(dateStr) == -1 and url.rfind('index') == -1:
+ title = self.tag_to_string(i)
current_articles.append({'title': title, 'url': url, 'description':''})
included_urls.append(url)
return current_articles
def parse_ent_section(self, url):
+ self.get_fetchdate()
soup = self.index_to_soup(url)
a = soup.findAll('a', href=True)
a.reverse()
@@ -223,67 +209,71 @@ class MPHKRecipe(BasicNewsRecipe):
return soup
def create_opf(self, feeds, dir=None):
- if self.IsKindleUsed == False:
- super(MPHKRecipe,self).create_opf(feeds, dir)
- return
if dir is None:
dir = self.output_dir
- title = self.short_title()
- title += ' ' + self.get_fetchdate()
- #if self.output_profile.periodical_date_in_title:
- # title += strftime(self.timefmt)
- mi = MetaInformation(title, [__appname__])
- mi.publisher = __appname__
- mi.author_sort = __appname__
- mi.publication_type = self.publication_type+':'+self.short_title()
- #mi.timestamp = nowf()
- mi.timestamp = self.get_dtlocal()
- mi.comments = self.description
- if not isinstance(mi.comments, unicode):
- mi.comments = mi.comments.decode('utf-8', 'replace')
- #mi.pubdate = nowf()
- mi.pubdate = self.get_dtlocal()
- opf_path = os.path.join(dir, 'index.opf')
- ncx_path = os.path.join(dir, 'index.ncx')
- opf = OPFCreator(dir, mi)
- # Add mastheadImage entry to section
- mp = getattr(self, 'masthead_path', None)
- if mp is not None and os.access(mp, os.R_OK):
- from calibre.ebooks.metadata.opf2 import Guide
- ref = Guide.Reference(os.path.basename(self.masthead_path), os.getcwdu())
- ref.type = 'masthead'
- ref.title = 'Masthead Image'
- opf.guide.append(ref)
+ if self.IsCJKWellSupported == True:
+ # use Chinese title
+ title = u'\u660e\u5831 (\u9999\u6e2f) ' + self.get_fetchformatteddate()
+ else:
+ # use English title
+ title = self.short_title() + ' ' + self.get_fetchformatteddate()
+ if True: # force date in title
+ # title += strftime(self.timefmt)
+ mi = MetaInformation(title, [self.publisher])
+ mi.publisher = self.publisher
+ mi.author_sort = self.publisher
+ if self.IsCJKWellSupported == True:
+ mi.publication_type = 'periodical:'+self.publication_type+':'+self.short_title()
+ else:
+ mi.publication_type = self.publication_type+':'+self.short_title()
+ #mi.timestamp = nowf()
+ mi.timestamp = self.get_dtlocal()
+ mi.comments = self.description
+ if not isinstance(mi.comments, unicode):
+ mi.comments = mi.comments.decode('utf-8', 'replace')
+ #mi.pubdate = nowf()
+ mi.pubdate = self.get_dtlocal()
+ opf_path = os.path.join(dir, 'index.opf')
+ ncx_path = os.path.join(dir, 'index.ncx')
+ opf = OPFCreator(dir, mi)
+ # Add mastheadImage entry to section
+ mp = getattr(self, 'masthead_path', None)
+ if mp is not None and os.access(mp, os.R_OK):
+ from calibre.ebooks.metadata.opf2 import Guide
+ ref = Guide.Reference(os.path.basename(self.masthead_path), os.getcwdu())
+ ref.type = 'masthead'
+ ref.title = 'Masthead Image'
+ opf.guide.append(ref)
- manifest = [os.path.join(dir, 'feed_%d'%i) for i in range(len(feeds))]
- manifest.append(os.path.join(dir, 'index.html'))
- manifest.append(os.path.join(dir, 'index.ncx'))
+ manifest = [os.path.join(dir, 'feed_%d'%i) for i in range(len(feeds))]
+ manifest.append(os.path.join(dir, 'index.html'))
+ manifest.append(os.path.join(dir, 'index.ncx'))
- # Get cover
- cpath = getattr(self, 'cover_path', None)
- if cpath is None:
- pf = open(os.path.join(dir, 'cover.jpg'), 'wb')
- if self.default_cover(pf):
- cpath = pf.name
- if cpath is not None and os.access(cpath, os.R_OK):
- opf.cover = cpath
- manifest.append(cpath)
+ # Get cover
+ cpath = getattr(self, 'cover_path', None)
+ if cpath is None:
+ pf = open(os.path.join(dir, 'cover.jpg'), 'wb')
+ if self.default_cover(pf):
+ cpath = pf.name
+ if cpath is not None and os.access(cpath, os.R_OK):
+ opf.cover = cpath
+ manifest.append(cpath)
- # Get masthead
- mpath = getattr(self, 'masthead_path', None)
- if mpath is not None and os.access(mpath, os.R_OK):
- manifest.append(mpath)
+ # Get masthead
+ mpath = getattr(self, 'masthead_path', None)
+ if mpath is not None and os.access(mpath, os.R_OK):
+ manifest.append(mpath)
- opf.create_manifest_from_files_in(manifest)
- for mani in opf.manifest:
- if mani.path.endswith('.ncx'):
- mani.id = 'ncx'
- if mani.path.endswith('mastheadImage.jpg'):
- mani.id = 'masthead-image'
- entries = ['index.html']
- toc = TOC(base_path=dir)
- self.play_order_counter = 0
- self.play_order_map = {}
+ opf.create_manifest_from_files_in(manifest)
+ for mani in opf.manifest:
+ if mani.path.endswith('.ncx'):
+ mani.id = 'ncx'
+ if mani.path.endswith('mastheadImage.jpg'):
+ mani.id = 'masthead-image'
+ entries = ['index.html']
+ toc = TOC(base_path=dir)
+ self.play_order_counter = 0
+ self.play_order_map = {}
def feed_index(num, parent):
f = feeds[num]
@@ -321,7 +311,7 @@ class MPHKRecipe(BasicNewsRecipe):
prefix = '/'.join('..'for i in range(2*len(re.findall(r'link\d+', last))))
templ = self.navbar.generate(True, num, j, len(f),
not self.has_single_feed,
- a.orig_url, __appname__, prefix=prefix,
+ a.orig_url, self.publisher, prefix=prefix,
center=self.center_navbar)
elem = BeautifulSoup(templ.render(doctype='xhtml').decode('utf-8')).find('div')
body.insert(len(body.contents), elem)
@@ -344,7 +334,7 @@ class MPHKRecipe(BasicNewsRecipe):
if not desc:
desc = None
feed_index(i, toc.add_item('feed_%d/index.html'%i, None,
- f.title, play_order=po, description=desc, author=auth))
+ f.title, play_order=po, description=desc, author=auth))
else:
entries.append('feed_%d/index.html'%0)
@@ -357,4 +347,3 @@ class MPHKRecipe(BasicNewsRecipe):
with nested(open(opf_path, 'wb'), open(ncx_path, 'wb')) as (opf_file, ncx_file):
opf.render(opf_file, ncx_file)
-