diff --git a/resources/recipes/ming_pao.recipe b/resources/recipes/ming_pao.recipe
index 162a3c774e..385dbdbdb7 100644
--- a/resources/recipes/ming_pao.recipe
+++ b/resources/recipes/ming_pao.recipe
@@ -3,13 +3,28 @@ __copyright__ = '2010, Eddie Lau'
 '''
 modified from Singtao Toronto calibre recipe by rty
 Change Log:
+2010/11/22: add English section, remove eco-news section which is not updated daily,
+            correct ordering of articles
+2010/11/12: add news image and eco-news section
+2010/11/08: add parsing of finance section
+2010/11/06: temporary work-around for Kindle devices that cannot display unicode
+            in the section/article list
 2010/10/31: skip repeated articles in section pages
 '''
-import datetime
+import os, datetime, re
 from calibre.web.feeds.recipes import BasicNewsRecipe
+from contextlib import nested
-class AdvancedUserRecipe1278063072(BasicNewsRecipe):
+
+
+from calibre import __appname__, strftime
+from calibre.ebooks.BeautifulSoup import BeautifulSoup
+from calibre.ebooks.metadata.opf2 import OPFCreator
+from calibre.ebooks.metadata.toc import TOC
+from calibre.ebooks.metadata import MetaInformation
+from calibre.utils.date import now as nowf
+
+class MPHKRecipe(BasicNewsRecipe):
     title = 'Ming Pao - Hong Kong'
     oldest_article = 1
     max_articles_per_feed = 100
@@ -24,27 +39,131 @@ class AdvancedUserRecipe1278063072(BasicNewsRecipe):
     encoding = 'Big5-HKSCS'
     recursions = 0
     conversion_options = {'linearize_tables':True}
+    extra_css = 'img {display: block; margin-left: auto; margin-right: auto; margin-top: 10px; margin-bottom: 10px;}'
+    #extra_css = 'img {float:right; margin:4px;}'
     masthead_url = 'http://news.mingpao.com/image/portals_top_logo_news.gif'
     keep_only_tags = [dict(name='h1'),
+                      #dict(name='font', attrs={'style':['font-size:14pt; line-height:160%;']}), # for entertainment page
+                      dict(attrs={'class':['photo']}),
+                      dict(attrs={'id':['newscontent']}),
                       dict(attrs={'id':['newscontent01','newscontent02']})]
+    remove_tags = [dict(name='style'),
+                   dict(attrs={'id':['newscontent135']})] # for the finance page
+    remove_attributes = ['width']
+    preprocess_regexps = [
+                          (re.compile(r'<h5>', re.DOTALL|re.IGNORECASE),
+                           lambda match: '<h1>'),
+                          (re.compile(r'</h5>', re.DOTALL|re.IGNORECASE),
+                           lambda match: '</h1>'),
+                         ]
+
+    def image_url_processor(cls, baseurl, url):
+        # trick: break the url at the first occurrence of a digit, add an
+        # additional '_' at the front
+        # not working, may need to move this to the preprocess_html() method
+        #minIdx = 10000
+        #i0 = url.find('0')
+        #if i0 >= 0 and i0 < minIdx:
+        #    minIdx = i0
+        #i1 = url.find('1')
+        #if i1 >= 0 and i1 < minIdx:
+        #    minIdx = i1
+        #i2 = url.find('2')
+        #if i2 >= 0 and i2 < minIdx:
+        #    minIdx = i2
+        #i3 = url.find('3')
+        #if i3 >= 0 and i3 < minIdx:
+        #    minIdx = i3
+        #i4 = url.find('4')
+        #if i4 >= 0 and i4 < minIdx:
+        #    minIdx = i4
+        #i5 = url.find('5')
+        #if i5 >= 0 and i5 < minIdx:
+        #    minIdx = i5
+        #i6 = url.find('6')
+        #if i6 >= 0 and i6 < minIdx:
+        #    minIdx = i6
+        #i7 = url.find('7')
+        #if i7 >= 0 and i7 < minIdx:
+        #    minIdx = i7
+        #i8 = url.find('8')
+        #if i8 >= 0 and i8 < minIdx:
+        #    minIdx = i8
+        #i9 = url.find('9')
+        #if i9 >= 0 and i9 < minIdx:
+        #    minIdx = i9
+        #return url[0:minIdx] + '_' + url[minIdx+1:]
+        return url
 
     def get_fetchdate(self):
         dt_utc = datetime.datetime.utcnow()
-        # convert UTC to local hk time - at around HKT 5.30am, all news are available
-        dt_local = dt_utc - datetime.timedelta(-2.5/24)
+        # convert UTC to local hk time - at around HKT 6.00am, all news are available
+        dt_local = dt_utc - datetime.timedelta(-2.0/24)
         return dt_local.strftime("%Y%m%d")
 
     def parse_index(self):
-        feeds = []
-        dateStr = self.get_fetchdate()
-        for title, url in [(u'\u8981\u805e Headline', 'http://news.mingpao.com/' + dateStr + '/gaindex.htm'), (u'\u6559\u80b2 Education', 'http://news.mingpao.com/' + dateStr + '/gfindex.htm'), (u'\u6e2f\u805e Local', 'http://news.mingpao.com/' + dateStr + '/gbindex.htm'), (u'\u793e\u8a55\u2027\u7b46\u9663 Editorial', 'http://news.mingpao.com/' + dateStr + '/mrindex.htm'), (u'\u8ad6\u58c7 Forum', 'http://news.mingpao.com/' + dateStr + '/faindex.htm'), (u'\u4e2d\u570b China', 'http://news.mingpao.com/' + dateStr + '/caindex.htm'), (u'\u570b\u969b World', 'http://news.mingpao.com/' + dateStr + '/taindex.htm'), ('Tech News', 'http://news.mingpao.com/' + dateStr + '/naindex.htm'), (u'\u9ad4\u80b2 Sport', 'http://news.mingpao.com/' + dateStr + '/spindex.htm'), (u'\u526f\u520a Supplement', 'http://news.mingpao.com/' + dateStr + '/jaindex.htm'),]:
-            articles = self.parse_section(url)
-            if articles:
-                feeds.append((title, articles))
+        feeds = []
+        dateStr = self.get_fetchdate()
+        for title, url in [(u'\u8981\u805e Headline', 'http://news.mingpao.com/' + dateStr + '/gaindex.htm'),
+                           (u'\u6559\u80b2 Education', 'http://news.mingpao.com/' + dateStr + '/gfindex.htm'),
+                           (u'\u6e2f\u805e Local', 'http://news.mingpao.com/' + dateStr + '/gbindex.htm'),
+                           (u'\u793e\u8a55\u2027\u7b46\u9663 Editorial', 'http://news.mingpao.com/' + dateStr + '/mrindex.htm'),
+                           (u'\u8ad6\u58c7 Forum', 'http://news.mingpao.com/' + dateStr + '/faindex.htm'),
+                           (u'\u4e2d\u570b China', 'http://news.mingpao.com/' + dateStr + '/caindex.htm'),
+                           (u'\u570b\u969b World', 'http://news.mingpao.com/' + dateStr + '/taindex.htm'),
+                           ('Tech News', 'http://news.mingpao.com/' + dateStr + '/naindex.htm'),
+                           (u'\u9ad4\u80b2 Sport', 'http://news.mingpao.com/' + dateStr + '/spindex.htm'),
+                           (u'\u526f\u520a Supplement', 'http://news.mingpao.com/' + dateStr + '/jaindex.htm'),
+                           (u'\u82f1\u6587 English', 'http://news.mingpao.com/' + dateStr + '/emindex.htm')]:
+            articles = self.parse_section(url)
+            if articles:
+                feeds.append((title, articles))
+        # special - finance
+        fin_articles = self.parse_fin_section('http://www.mpfinance.com/htm/Finance/' + dateStr + '/News/ea,eb,ecindex.htm')
+        if fin_articles:
+            feeds.append((u'\u7d93\u6fdf Finance', fin_articles))
+        # special - eco-friendly
+        #eco_articles = self.parse_eco_section('http://tssl.mingpao.com/htm/marketing/eco/cfm/Eco1.cfm')
+        #if eco_articles:
+        #    feeds.append((u'\u74b0\u4fdd Eco News', eco_articles))
+        # special - entertainment
+        #ent_articles = self.parse_ent_section('http://ol.mingpao.com/cfm/star1.cfm')
+        #if ent_articles:
+        #    feeds.append(('Entertainment', ent_articles))
         return feeds
 
     def parse_section(self, url):
+        dateStr = self.get_fetchdate()
+        soup = self.index_to_soup(url)
+        divs = soup.findAll(attrs={'class': ['bullet','bullet_grey']})
+        current_articles = []
+        included_urls = []
+        divs.reverse()
+        for i in divs:
+            a = i.find('a', href = True)
+            title = self.tag_to_string(a)
+            url = a.get('href', False)
+            url = 'http://news.mingpao.com/' + dateStr + '/' +url
+            if url not in included_urls and url.rfind('Redirect') == -1:
+                current_articles.append({'title': title, 'url': url, 'description':'', 'date':''})
+                included_urls.append(url)
+        current_articles.reverse()
+        return current_articles
+
+    def parse_fin_section(self, url):
         dateStr = self.get_fetchdate()
+        soup = self.index_to_soup(url)
+        a = soup.findAll('a', href= True)
+        current_articles = []
+        for i in a:
+            url = i.get('href', False)
+            if not url.rfind(dateStr) == -1 and url.rfind('index') == -1:
+                title = self.tag_to_string(i)
+                url = 'http://www.mpfinance.com/cfm/' +url
+                current_articles.append({'title': title, 'url': url, 'description':''})
+        return current_articles
+
+    def parse_eco_section(self, url):
         soup = self.index_to_soup(url)
         divs = soup.findAll(attrs={'class': ['bullet']})
         current_articles = []
@@ -53,9 +172,162 @@ class AdvancedUserRecipe1278063072(BasicNewsRecipe):
             a = i.find('a', href = True)
             title = self.tag_to_string(a)
             url = a.get('href', False)
-            url = 'http://news.mingpao.com/' + dateStr + '/' +url
-            if url not in included_urls:
+            url = 'http://tssl.mingpao.com/htm/marketing/eco/cfm/' +url
+            if url not in included_urls and url.rfind('Redirect') == -1:
                 current_articles.append({'title': title, 'url': url, 'description':''})
                 included_urls.append(url)
         return current_articles
 
+    #def parse_ent_section(self, url):
+    #    dateStr = self.get_fetchdate()
+    #    soup = self.index_to_soup(url)
+    #    a = soup.findAll('a', href=True)
+    #    current_articles = []
+    #    included_urls = []
+    #    for i in a:
+    #        title = self.tag_to_string(i)
+    #        url = 'http://ol.mingpao.com/cfm/' + i.get('href', False)
+    #        if url not in included_urls and not url.rfind('.txt') == -1 and not url.rfind(dateStr) == -1 and not title == '':
+    #            current_articles.append({'title': title, 'url': url, 'description': ''})
+    #    return current_articles
+
+    def preprocess_html(self, soup):
+        for item in soup.findAll(style=True):
+            del item['style']
+        for item in soup.findAll(width=True):
+            del item['width']
+        for item in soup.findAll(align='absmiddle'):
+            del item['align']
+        return soup
+
+    def create_opf(self, feeds, dir=None):
+        #super(MPHKRecipe,self).create_opf(feeds, dir)
+        if dir is None:
+            dir = self.output_dir
+        title = self.short_title()
+        if self.output_profile.periodical_date_in_title:
+            title += strftime(self.timefmt)
+        mi = MetaInformation(title, [__appname__])
+        mi.publisher = __appname__
+        mi.author_sort = __appname__
+        mi.publication_type = self.publication_type+':'+self.short_title()
+        mi.timestamp = nowf()
+        mi.comments = self.description
+        if not isinstance(mi.comments, unicode):
+            mi.comments = mi.comments.decode('utf-8', 'replace')
+        mi.pubdate = nowf()
+        opf_path = os.path.join(dir, 'index.opf')
+        ncx_path = os.path.join(dir, 'index.ncx')
+        opf = OPFCreator(dir, mi)
+        # Add mastheadImage entry to <guide> section
+        mp = getattr(self, 'masthead_path', None)
+        if mp is not None and os.access(mp, os.R_OK):
+            from calibre.ebooks.metadata.opf2 import Guide
+            ref = Guide.Reference(os.path.basename(self.masthead_path), os.getcwdu())
+            ref.type = 'masthead'
+            ref.title = 'Masthead Image'
+            opf.guide.append(ref)
+
+        manifest = [os.path.join(dir, 'feed_%d'%i) for i in range(len(feeds))]
+        manifest.append(os.path.join(dir, 'index.html'))
+        manifest.append(os.path.join(dir, 'index.ncx'))
+
+        # Get cover
+        cpath = getattr(self, 'cover_path', None)
+        if cpath is None:
+            pf = open(os.path.join(dir, 'cover.jpg'), 'wb')
+            if self.default_cover(pf):
+                cpath = pf.name
+        if cpath is not None and os.access(cpath, os.R_OK):
+            opf.cover = cpath
+            manifest.append(cpath)
+
+        # Get masthead
+        mpath = getattr(self, 'masthead_path', None)
+        if mpath is not None and os.access(mpath, os.R_OK):
+            manifest.append(mpath)
+
+        opf.create_manifest_from_files_in(manifest)
+        for mani in opf.manifest:
+            if mani.path.endswith('.ncx'):
+                mani.id = 'ncx'
+            if mani.path.endswith('mastheadImage.jpg'):
+                mani.id = 'masthead-image'
+        entries = ['index.html']
+        toc = TOC(base_path=dir)
+        self.play_order_counter = 0
+        self.play_order_map = {}
+
+        def feed_index(num, parent):
+            f = feeds[num]
+            for j, a in enumerate(f):
+                if getattr(a, 'downloaded', False):
+                    adir = 'feed_%d/article_%d/'%(num, j)
+                    auth = a.author
+                    if not auth:
+                        auth = None
+                    desc = a.text_summary
+                    if not desc:
+                        desc = None
+                    else:
+                        desc = self.description_limiter(desc)
+                    entries.append('%sindex.html'%adir)
+                    po = self.play_order_map.get(entries[-1], None)
+                    if po is None:
+                        self.play_order_counter += 1
+                        po = self.play_order_counter
+                    parent.add_item('%sindex.html'%adir, None, a.title if a.title else _('Untitled Article'),
+                                    play_order=po, author=auth, description=desc)
+                    last = os.path.join(self.output_dir, ('%sindex.html'%adir).replace('/', os.sep))
+                    for sp in a.sub_pages:
+                        prefix = os.path.commonprefix([opf_path, sp])
+                        relp = sp[len(prefix):]
+                        entries.append(relp.replace(os.sep, '/'))
+                        last = sp
+
+                    if os.path.exists(last):
+                        with open(last, 'rb') as fi:
+                            src = fi.read().decode('utf-8')
+                        soup = BeautifulSoup(src)
+                        body = soup.find('body')
+                        if body is not None:
+                            prefix = '/'.join('..' for i in range(2*len(re.findall(r'link\d+', last))))
+                            templ = self.navbar.generate(True, num, j, len(f),
+                                                         not self.has_single_feed,
+                                                         a.orig_url, __appname__, prefix=prefix,
+                                                         center=self.center_navbar)
+                            elem = BeautifulSoup(templ.render(doctype='xhtml').decode('utf-8')).find('div')
+                            body.insert(len(body.contents), elem)
+                            with open(last, 'wb') as fi:
+                                fi.write(unicode(soup).encode('utf-8'))
+        if len(feeds) == 0:
+            raise Exception('All feeds are empty, aborting.')
+
+        if len(feeds) > 1:
+            for i, f in enumerate(feeds):
+                entries.append('feed_%d/index.html'%i)
+                po = self.play_order_map.get(entries[-1], None)
+                if po is None:
+                    self.play_order_counter += 1
+                    po = self.play_order_counter
+                auth = getattr(f, 'author', None)
+                if not auth:
+                    auth = None
+                desc = getattr(f, 'description', None)
+                if not desc:
+                    desc = None
+                feed_index(i, toc.add_item('feed_%d/index.html'%i, None,
+                           f.title, play_order=po, description=desc, author=auth))
+        else:
+            entries.append('feed_%d/index.html'%0)
+            feed_index(0, toc)
+
+        for i, p in enumerate(entries):
+            entries[i] = os.path.join(dir, p.replace('/', os.sep))
+        opf.create_spine(entries)
+        opf.set_toc(toc)
+
+        with nested(open(opf_path, 'wb'), open(ncx_path, 'wb')) as (opf_file, ncx_file):
+            opf.render(opf_file, ncx_file)
+
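---
Notes on the techniques used in this recipe (sketches, not part of the patch):

* preprocess_regexps: calibre runs each (pattern, replacement) pair over the
  raw page source before keep_only_tags/remove_tags are applied, which is why
  rewriting the site's <h5> headlines into <h1> lets the dict(name='h1') entry
  in keep_only_tags retain them. Conceptually, the hook amounts to:

      import re

      def apply_preprocess_regexps(raw_html, regexps):
          # regexps: a list of (compiled pattern, replacement function) pairs,
          # exactly the shape of the preprocess_regexps attribute above.
          for pattern, func in regexps:
              raw_html = pattern.sub(func, raw_html)
          return raw_html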
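* image_url_processor(): the commented-out block searches for the first digit
  in the image URL and replaces it with an underscore. A compact equivalent of
  what it attempts (underscore_first_digit is a hypothetical name, not in the
  recipe):

      import re

      def underscore_first_digit(url):
          # Replace only the first digit found anywhere in the URL.
          return re.sub(r'\d', '_', url, count=1)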
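* get_fetchdate(): dt_utc - datetime.timedelta(-2.0/24) is a double negative
  that adds two hours to the UTC clock, so the returned date string only rolls
  over to a new day at 22:00 UTC, i.e. 06:00 HKT, once that day's edition is
  online. A minimal sketch of the same logic, assuming the 06:00 HKT threshold
  stated in the comment (edition_date is a hypothetical name):

      import datetime

      def edition_date(now_utc=None):
          # The edition for day D is online by 06:00 HKT on day D, which is
          # 22:00 UTC on day D-1, so shifting UTC forward by 2 hours makes
          # the date roll over at exactly that moment.
          if now_utc is None:
              now_utc = datetime.datetime.utcnow()
          return (now_utc + datetime.timedelta(hours=2)).strftime('%Y%m%d')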
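* parse_section(): the divs.reverse()/current_articles.reverse() pair is the
  "correct ordering of articles" fix from the change log. Scanning the bullet
  list backwards and keeping the first hit means that when an article appears
  more than once, its last position on the page wins; the final reverse()
  restores page order. The same idea on a plain list (dedupe_keep_last is a
  hypothetical name):

      def dedupe_keep_last(items):
          # Drop duplicates, keeping each item's *last* occurrence, while
          # otherwise preserving the original order.
          seen = set()
          kept = []
          for item in reversed(items):
              if item not in seen:
                  kept.append(item)
                  seen.add(item)
          kept.reverse()
          return kept

      assert dedupe_keep_last(['a', 'b', 'a', 'c']) == ['b', 'a', 'c']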
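* create_opf(): contextlib.nested() opens the OPF and NCX output files
  together; it was deprecated in Python 2.7, where the multi-context form of
  the with statement does the same job:

      with open(opf_path, 'wb') as opf_file, open(ncx_path, 'wb') as ncx_file:
          opf.render(opf_file, ncx_file)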