diff --git a/resources/recipes/ming_pao.recipe b/resources/recipes/ming_pao.recipe
index 162a3c774e..385dbdbdb7 100644
--- a/resources/recipes/ming_pao.recipe
+++ b/resources/recipes/ming_pao.recipe
@@ -3,13 +3,28 @@ __copyright__ = '2010, Eddie Lau'
'''
modified from Singtao Toronto calibre recipe by rty
Change Log:
+2010/11/22: add English section, remove eco-news section which is not updated daily, correct
+ ordering of articles
+2010/11/12: add news image and eco-news section
+2010/11/08: add parsing of finance section
+2010/11/06: temporary work-around for Kindle devices that cannot display unicode
+            in the section/article list.
2010/10/31: skip repeated articles in section pages
'''
-import datetime
+import os, datetime, re
from calibre.web.feeds.recipes import BasicNewsRecipe
-class AdvancedUserRecipe1278063072(BasicNewsRecipe):
+from contextlib import nested
+
+from calibre import __appname__, strftime
+from calibre.ebooks.BeautifulSoup import BeautifulSoup
+from calibre.ebooks.metadata.opf2 import OPFCreator
+from calibre.ebooks.metadata.toc import TOC
+from calibre.ebooks.metadata import MetaInformation
+from calibre.utils.date import now as nowf
+
+class MPHKRecipe(BasicNewsRecipe):
title = 'Ming Pao - Hong Kong'
oldest_article = 1
max_articles_per_feed = 100
@@ -24,27 +39,131 @@ class AdvancedUserRecipe1278063072(BasicNewsRecipe):
encoding = 'Big5-HKSCS'
recursions = 0
conversion_options = {'linearize_tables':True}
+ extra_css = 'img {display: block; margin-left: auto; margin-right: auto; margin-top: 10px; margin-bottom: 10px;}'
+ #extra_css = 'img {float:right; margin:4px;}'
masthead_url = 'http://news.mingpao.com/image/portals_top_logo_news.gif'
keep_only_tags = [dict(name='h1'),
+ #dict(name='font', attrs={'style':['font-size:14pt; line-height:160%;']}), # for entertainment page
+ dict(attrs={'class':['photo']}),
+ dict(attrs={'id':['newscontent']}),
dict(attrs={'id':['newscontent01','newscontent02']})]
+ remove_tags = [dict(name='style'),
+ dict(attrs={'id':['newscontent135']})] # for the finance page
+ remove_attributes = ['width']
+    preprocess_regexps = [
+            (re.compile(r'<h5>', re.DOTALL|re.IGNORECASE),
+             lambda match: '<h1>'),
+            (re.compile(r'</h5>', re.DOTALL|re.IGNORECASE),
+             lambda match: '</h1>'),
+            ]
+
+    def image_url_processor(cls, baseurl, url):
+        # trick: break the url at the first occurrence of a digit and add an
+        # additional '_' at the front
+        # not working yet, may need to move this to the preprocess_html() method
+        #m = re.search(r'\d', url)
+        #if m is not None:
+        #    return url[0:m.start()] + '_' + url[m.start()+1:]
+        return url
def get_fetchdate(self):
dt_utc = datetime.datetime.utcnow()
- # convert UTC to local hk time - at around HKT 5.30am, all news are available
- dt_local = dt_utc - datetime.timedelta(-2.5/24)
+ # convert UTC to local hk time - at around HKT 6.00am, all news are available
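+        # (subtracting a negative 2-hour timedelta shifts the clock forward by 2 hours,
+        # so the date only rolls over to the new day once UTC reaches 22:00, i.e. HKT 6.00am)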
+ dt_local = dt_utc - datetime.timedelta(-2.0/24)
return dt_local.strftime("%Y%m%d")
def parse_index(self):
- feeds = []
- dateStr = self.get_fetchdate()
- for title, url in [(u'\u8981\u805e Headline', 'http://news.mingpao.com/' + dateStr + '/gaindex.htm'), (u'\u6559\u80b2 Education', 'http://news.mingpao.com/' + dateStr + '/gfindex.htm'), (u'\u6e2f\u805e Local', 'http://news.mingpao.com/' + dateStr + '/gbindex.htm'), (u'\u793e\u8a55\u2027\u7b46\u9663 Editorial', 'http://news.mingpao.com/' + dateStr + '/mrindex.htm'), (u'\u8ad6\u58c7 Forum', 'http://news.mingpao.com/' + dateStr + '/faindex.htm'), (u'\u4e2d\u570b China', 'http://news.mingpao.com/' + dateStr + '/caindex.htm'), (u'\u570b\u969b World', 'http://news.mingpao.com/' + dateStr + '/taindex.htm'), ('Tech News', 'http://news.mingpao.com/' + dateStr + '/naindex.htm'), (u'\u9ad4\u80b2 Sport', 'http://news.mingpao.com/' + dateStr + '/spindex.htm'), (u'\u526f\u520a Supplement', 'http://news.mingpao.com/' + dateStr + '/jaindex.htm'),]:
- articles = self.parse_section(url)
- if articles:
- feeds.append((title, articles))
+ feeds = []
+ dateStr = self.get_fetchdate()
+ for title, url in [(u'\u8981\u805e Headline', 'http://news.mingpao.com/' + dateStr + '/gaindex.htm'),
+ (u'\u6559\u80b2 Education', 'http://news.mingpao.com/' + dateStr + '/gfindex.htm'),
+ (u'\u6e2f\u805e Local', 'http://news.mingpao.com/' + dateStr + '/gbindex.htm'),
+ (u'\u793e\u8a55\u2027\u7b46\u9663 Editorial', 'http://news.mingpao.com/' + dateStr + '/mrindex.htm'),
+ (u'\u8ad6\u58c7 Forum', 'http://news.mingpao.com/' + dateStr + '/faindex.htm'),
+ (u'\u4e2d\u570b China', 'http://news.mingpao.com/' + dateStr + '/caindex.htm'),
+ (u'\u570b\u969b World', 'http://news.mingpao.com/' + dateStr + '/taindex.htm'),
+ ('Tech News', 'http://news.mingpao.com/' + dateStr + '/naindex.htm'),
+ (u'\u9ad4\u80b2 Sport', 'http://news.mingpao.com/' + dateStr + '/spindex.htm'),
+ (u'\u526f\u520a Supplement', 'http://news.mingpao.com/' + dateStr + '/jaindex.htm'),
+ (u'\u82f1\u6587 English', 'http://news.mingpao.com/' + dateStr + '/emindex.htm')]:
+ articles = self.parse_section(url)
+ if articles:
+ feeds.append((title, articles))
+ # special - finance
+ fin_articles = self.parse_fin_section('http://www.mpfinance.com/htm/Finance/' + dateStr + '/News/ea,eb,ecindex.htm')
+ if fin_articles:
+ feeds.append((u'\u7d93\u6fdf Finance', fin_articles))
+ # special - eco-friendly
+ # eco_articles = self.parse_eco_section('http://tssl.mingpao.com/htm/marketing/eco/cfm/Eco1.cfm')
+ # if eco_articles:
+ # feeds.append((u'\u74b0\u4fdd Eco News', eco_articles))
+ # special - entertainment
+ #ent_articles = self.parse_ent_section('http://ol.mingpao.com/cfm/star1.cfm')
+ #if ent_articles:
+ # feeds.append(('Entertainment', ent_articles))
return feeds
def parse_section(self, url):
+ dateStr = self.get_fetchdate()
+ soup = self.index_to_soup(url)
+ divs = soup.findAll(attrs={'class': ['bullet','bullet_grey']})
+ current_articles = []
+ included_urls = []
+ divs.reverse()
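+        # the list was reversed above so that, when an article URL is repeated, the
+        # occurrence nearest the end of the page wins; current_articles is reversed
+        # back below to restore the on-page reading order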
+ for i in divs:
+            a = i.find('a', href=True)
+            title = self.tag_to_string(a)
+            url = a.get('href', False)
+            url = 'http://news.mingpao.com/' + dateStr + '/' + url
+ if url not in included_urls and url.rfind('Redirect') == -1:
+ current_articles.append({'title': title, 'url': url, 'description':'', 'date':''})
+ included_urls.append(url)
+ current_articles.reverse()
+ return current_articles
+
+ def parse_fin_section(self, url):
dateStr = self.get_fetchdate()
+ soup = self.index_to_soup(url)
+        a = soup.findAll('a', href=True)
+ current_articles = []
+ for i in a:
+ url = i.get('href', False)
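+            # keep only links that contain today's date and skip the section index pages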
+            if url.rfind(dateStr) != -1 and url.rfind('index') == -1:
+                title = self.tag_to_string(i)
+                url = 'http://www.mpfinance.com/cfm/' + url
+ current_articles.append({'title': title, 'url': url, 'description':''})
+ return current_articles
+
+ def parse_eco_section(self, url):
soup = self.index_to_soup(url)
divs = soup.findAll(attrs={'class': ['bullet']})
current_articles = []
@@ -53,9 +172,162 @@ class AdvancedUserRecipe1278063072(BasicNewsRecipe):
a = i.find('a', href = True)
title = self.tag_to_string(a)
url = a.get('href', False)
- url = 'http://news.mingpao.com/' + dateStr + '/' +url
- if url not in included_urls:
+            url = 'http://tssl.mingpao.com/htm/marketing/eco/cfm/' + url
+ if url not in included_urls and url.rfind('Redirect') == -1:
current_articles.append({'title': title, 'url': url, 'description':''})
included_urls.append(url)
return current_articles
+ #def parse_ent_section(self, url):
+ # dateStr = self.get_fetchdate()
+ # soup = self.index_to_soup(url)
+ # a = soup.findAll('a', href=True)
+ # current_articles = []
+ # included_urls = []
+ # for i in a:
+ # title = self.tag_to_string(i)
+ # url = 'http://ol.mingpao.com/cfm/' + i.get('href', False)
+ # if url not in included_urls and not url.rfind('.txt') == -1 and not url.rfind(dateStr) == -1 and not title == '':
+ # current_articles.append({'title': title, 'url': url, 'description': ''})
+ # return current_articles
+
+    def preprocess_html(self, soup):
+        # strip inline style, width and absmiddle-alignment attributes so that the
+        # recipe's extra_css controls the final layout
+        for item in soup.findAll(style=True):
+            del item['style']
+        for item in soup.findAll(width=True):
+            del item['width']
+        for item in soup.findAll(align='absmiddle'):
+            del item['align']
+        return soup
+
+ def create_opf(self, feeds, dir=None):
+ #super(MPHKRecipe,self).create_opf(feeds, dir)
+ if dir is None:
+ dir = self.output_dir
+ title = self.short_title()
+ if self.output_profile.periodical_date_in_title:
+ title += strftime(self.timefmt)
+ mi = MetaInformation(title, [__appname__])
+ mi.publisher = __appname__
+ mi.author_sort = __appname__
+ mi.publication_type = self.publication_type+':'+self.short_title()
+ mi.timestamp = nowf()
+ mi.comments = self.description
+ if not isinstance(mi.comments, unicode):
+ mi.comments = mi.comments.decode('utf-8', 'replace')
+ mi.pubdate = nowf()
+ opf_path = os.path.join(dir, 'index.opf')
+ ncx_path = os.path.join(dir, 'index.ncx')
+ opf = OPFCreator(dir, mi)
+        # Add mastheadImage entry to <guide> section
+ mp = getattr(self, 'masthead_path', None)
+ if mp is not None and os.access(mp, os.R_OK):
+ from calibre.ebooks.metadata.opf2 import Guide
+ ref = Guide.Reference(os.path.basename(self.masthead_path), os.getcwdu())
+ ref.type = 'masthead'
+ ref.title = 'Masthead Image'
+ opf.guide.append(ref)
+
+ manifest = [os.path.join(dir, 'feed_%d'%i) for i in range(len(feeds))]
+ manifest.append(os.path.join(dir, 'index.html'))
+ manifest.append(os.path.join(dir, 'index.ncx'))
+
+ # Get cover
+ cpath = getattr(self, 'cover_path', None)
+ if cpath is None:
+ pf = open(os.path.join(dir, 'cover.jpg'), 'wb')
+ if self.default_cover(pf):
+ cpath = pf.name
+ if cpath is not None and os.access(cpath, os.R_OK):
+ opf.cover = cpath
+ manifest.append(cpath)
+
+ # Get masthead
+ mpath = getattr(self, 'masthead_path', None)
+ if mpath is not None and os.access(mpath, os.R_OK):
+ manifest.append(mpath)
+
+ opf.create_manifest_from_files_in(manifest)
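+        # give the NCX file and the masthead image well-known ids in the manifest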
+ for mani in opf.manifest:
+ if mani.path.endswith('.ncx'):
+ mani.id = 'ncx'
+ if mani.path.endswith('mastheadImage.jpg'):
+ mani.id = 'masthead-image'
+ entries = ['index.html']
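+        # 'entries' (above) collects the HTML files in reading order for the spine;
+        # 'toc' (below) is filled in and then written out as the NCX table of contents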
+ toc = TOC(base_path=dir)
+ self.play_order_counter = 0
+ self.play_order_map = {}
+
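+        # helper: add every downloaded article of feed 'num' (and its sub-pages) to the
+        # TOC under 'parent' and record its HTML file in the spine entries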
+ def feed_index(num, parent):
+ f = feeds[num]
+ for j, a in enumerate(f):
+ if getattr(a, 'downloaded', False):
+ adir = 'feed_%d/article_%d/'%(num, j)
+ auth = a.author
+ if not auth:
+ auth = None
+ desc = a.text_summary
+ if not desc:
+ desc = None
+ else:
+ desc = self.description_limiter(desc)
+ entries.append('%sindex.html'%adir)
+ po = self.play_order_map.get(entries[-1], None)
+ if po is None:
+ self.play_order_counter += 1
+ po = self.play_order_counter
+ parent.add_item('%sindex.html'%adir, None, a.title if a.title else _('Untitled Article'),
+ play_order=po, author=auth, description=desc)
+ last = os.path.join(self.output_dir, ('%sindex.html'%adir).replace('/', os.sep))
+ for sp in a.sub_pages:
+ prefix = os.path.commonprefix([opf_path, sp])
+ relp = sp[len(prefix):]
+ entries.append(relp.replace(os.sep, '/'))
+ last = sp
+
+ if os.path.exists(last):
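+                    # read the article's last page back in and append the bottom
+                    # navigation bar to the end of its body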
+ with open(last, 'rb') as fi:
+ src = fi.read().decode('utf-8')
+ soup = BeautifulSoup(src)
+ body = soup.find('body')
+ if body is not None:
+ prefix = '/'.join('..'for i in range(2*len(re.findall(r'link\d+', last))))
+ templ = self.navbar.generate(True, num, j, len(f),
+ not self.has_single_feed,
+ a.orig_url, __appname__, prefix=prefix,
+ center=self.center_navbar)
+ elem = BeautifulSoup(templ.render(doctype='xhtml').decode('utf-8')).find('div')
+ body.insert(len(body.contents), elem)
+ with open(last, 'wb') as fi:
+ fi.write(unicode(soup).encode('utf-8'))
+ if len(feeds) == 0:
+ raise Exception('All feeds are empty, aborting.')
+
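+        # with more than one feed, build a two-level TOC (one entry per feed index page,
+        # each containing its articles); with a single feed, attach the articles directly
+        # to the TOC root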
+ if len(feeds) > 1:
+ for i, f in enumerate(feeds):
+ entries.append('feed_%d/index.html'%i)
+ po = self.play_order_map.get(entries[-1], None)
+ if po is None:
+ self.play_order_counter += 1
+ po = self.play_order_counter
+ auth = getattr(f, 'author', None)
+ if not auth:
+ auth = None
+ desc = getattr(f, 'description', None)
+ if not desc:
+ desc = None
+ feed_index(i, toc.add_item('feed_%d/index.html'%i, None,
+ f.title, play_order=po, description=desc, author=auth))
+
+ else:
+ entries.append('feed_%d/index.html'%0)
+ feed_index(0, toc)
+
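+        # turn the collected spine entries into absolute paths before building the spine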
+ for i, p in enumerate(entries):
+ entries[i] = os.path.join(dir, p.replace('/', os.sep))
+ opf.create_spine(entries)
+ opf.set_toc(toc)
+
+ with nested(open(opf_path, 'wb'), open(ncx_path, 'wb')) as (opf_file, ncx_file):
+ opf.render(opf_file, ncx_file)
+