From eb0358025d3337720a5c878fcaad41fd3b69803a Mon Sep 17 00:00:00 2001 From: Kovid Goyal Date: Sun, 26 Jun 2011 14:42:16 -0600 Subject: [PATCH] Add Ming Pao Vancouver and Toronto by Eddie Lau --- recipes/ming_pao.recipe | 334 +++++++++++------ recipes/ming_pao_toronto.recipe | 594 ++++++++++++++++++++++++++++++ recipes/ming_pao_vancouver.recipe | 594 ++++++++++++++++++++++++++++++ 3 files changed, 1414 insertions(+), 108 deletions(-) create mode 100644 recipes/ming_pao_toronto.recipe create mode 100644 recipes/ming_pao_vancouver.recipe diff --git a/recipes/ming_pao.recipe b/recipes/ming_pao.recipe index 08ee20cb15..3566fca667 100644 --- a/recipes/ming_pao.recipe +++ b/recipes/ming_pao.recipe @@ -1,17 +1,23 @@ -# -*- coding: utf-8 -*- __license__ = 'GPL v3' __copyright__ = '2010-2011, Eddie Lau' +# Region - Hong Kong, Vancouver, Toronto +__Region__ = 'Hong Kong' # Users of Kindle 3 with limited system-level CJK support # please replace the following "True" with "False". __MakePeriodical__ = True # Turn below to true if your device supports display of CJK titles __UseChineseTitle__ = False -# Trun below to true if you wish to use life.mingpao.com as the main article source +# Set it to False if you want to skip images +__KeepImages__ = True +# (HK only) Turn below to true if you wish to use life.mingpao.com as the main article source __UseLife__ = True + ''' Change Log: +2011/06/26: add fetching Vancouver and Toronto versions of the paper, also provide captions for images using life.mingpao fetch source + provide options to remove all images in the file 2011/05/12: switch the main parse source to life.mingpao.com, which has more photos on the article pages 2011/03/06: add new articles for finance section, also a new section "Columns" 2011/02/28: rearrange the sections @@ -34,21 +40,96 @@ Change Log: import os, datetime, re from calibre.web.feeds.recipes import BasicNewsRecipe from contextlib import nested - - from calibre.ebooks.BeautifulSoup import BeautifulSoup from calibre.ebooks.metadata.opf2 import OPFCreator from calibre.ebooks.metadata.toc import TOC from calibre.ebooks.metadata import MetaInformation -class MPHKRecipe(BasicNewsRecipe): - title = 'Ming Pao - Hong Kong' +# MAIN CLASS +class MPRecipe(BasicNewsRecipe): + if __Region__ == 'Hong Kong': + title = 'Ming Pao - Hong Kong' + description = 'Hong Kong Chinese Newspaper (http://news.mingpao.com)' + category = 'Chinese, News, Hong Kong' + extra_css = 'img {display: block; margin-left: auto; margin-right: auto; margin-top: 10px; margin-bottom: 10px;} font>b {font-size:200%; font-weight:bold;}' + masthead_url = 'http://news.mingpao.com/image/portals_top_logo_news.gif' + keep_only_tags = [dict(name='h1'), + dict(name='font', attrs={'style':['font-size:14pt; line-height:160%;']}), # for entertainment page title + dict(name='font', attrs={'color':['AA0000']}), # for column articles title + dict(attrs={'id':['newscontent']}), # entertainment and column page content + dict(attrs={'id':['newscontent01','newscontent02']}), + dict(attrs={'class':['photo']}), + dict(name='table', attrs={'width':['100%'], 'border':['0'], 'cellspacing':['5'], 'cellpadding':['0']}), # content in printed version of life.mingpao.com + dict(name='img', attrs={'width':['180'], 'alt':['按圖放大']}) # images for source from life.mingpao.com + ] + if __KeepImages__: + remove_tags = [dict(name='style'), + dict(attrs={'id':['newscontent135']}), # for the finance page from mpfinance.com + dict(name='font', attrs={'size':['2'], 'color':['666666']}), # article date in 
life.mingpao.com article
+                           #dict(name='table')  # for content fetched from life.mingpao.com
+                           ]
+        else:
+            remove_tags = [dict(name='style'),
+                           dict(attrs={'id':['newscontent135']}),  # for the finance page from mpfinance.com
+                           dict(name='font', attrs={'size':['2'], 'color':['666666']}),  # article date in life.mingpao.com article
+                           dict(name='img'),
+                           #dict(name='table')  # for content fetched from life.mingpao.com
+                           ]
+        remove_attributes = ['width']
+        preprocess_regexps = [
+                              (re.compile(r'<h5>', re.DOTALL|re.IGNORECASE),
+                               lambda match: '<h1>'),
+                              (re.compile(r'</h5>', re.DOTALL|re.IGNORECASE),
+                               lambda match: '</h1>'),
+                              (re.compile(r'<p><a href=.+?</a></p>', re.DOTALL|re.IGNORECASE),  # for entertainment page
+                               lambda match: ''),
+                              # skip <br> after title in life.mingpao.com fetched article
+                              (re.compile(r"<div id='newscontent'><br>", re.DOTALL|re.IGNORECASE),
+                               lambda match: "<div id='newscontent'>"),
+                              (re.compile(r"<br><br></b>", re.DOTALL|re.IGNORECASE),
+                               lambda match: "</b>")
+                              ]
+    elif __Region__ == 'Vancouver':
+        title = 'Ming Pao - Vancouver'
+        description = 'Vancouver Chinese Newspaper (http://www.mingpaovan.com)'
+        category = 'Chinese, News, Vancouver'
+        extra_css = 'img {display: block; margin-left: auto; margin-right: auto; margin-top: 10px; margin-bottom: 10px;} b>font {font-size:200%; font-weight:bold;}'
+        masthead_url = 'http://www.mingpaovan.com/image/mainlogo2_VAN2.gif'
+        keep_only_tags = [dict(name='table', attrs={'width':['450'], 'border':['0'], 'cellspacing':['0'], 'cellpadding':['1']}),
+                          dict(name='table', attrs={'width':['450'], 'border':['0'], 'cellspacing':['3'], 'cellpadding':['3'], 'id':['tblContent3']}),
+                          dict(name='table', attrs={'width':['180'], 'border':['0'], 'cellspacing':['0'], 'cellpadding':['0'], 'bgcolor':['F0F0F0']}),
+                          ]
+        if __KeepImages__:
+            remove_tags = [dict(name='img', attrs={'src':['../../../image/magnifier.gif']})]  # the magnifier icon
+        else:
+            remove_tags = [dict(name='img')]
+        remove_attributes = ['width']
+        preprocess_regexps = [(re.compile(r'&nbsp;', re.DOTALL|re.IGNORECASE),
+                               lambda match: ''),
+                              ]
+    elif __Region__ == 'Toronto':
+        title = 'Ming Pao - Toronto'
+        description = 'Toronto Chinese Newspaper (http://www.mingpaotor.com)'
+        category = 'Chinese, News, Toronto'
+        extra_css = 'img {display: block; margin-left: auto; margin-right: auto; margin-top: 10px; margin-bottom: 10px;} b>font {font-size:200%; font-weight:bold;}'
+        masthead_url = 'http://www.mingpaotor.com/image/mainlogo2_TOR2.gif'
+        keep_only_tags = [dict(name='table', attrs={'width':['450'], 'border':['0'], 'cellspacing':['0'], 'cellpadding':['1']}),
+                          dict(name='table', attrs={'width':['450'], 'border':['0'], 'cellspacing':['3'], 'cellpadding':['3'], 'id':['tblContent3']}),
+                          dict(name='table', attrs={'width':['180'], 'border':['0'], 'cellspacing':['0'], 'cellpadding':['0'], 'bgcolor':['F0F0F0']}),
+                          ]
+        if __KeepImages__:
+            remove_tags = [dict(name='img', attrs={'src':['../../../image/magnifier.gif']})]  # the magnifier icon
+        else:
+            remove_tags = [dict(name='img')]
+        remove_attributes = ['width']
+        preprocess_regexps = [(re.compile(r'&nbsp;', re.DOTALL|re.IGNORECASE),
+                               lambda match: ''),
+                              ]
+
     oldest_article = 1
     max_articles_per_feed = 100
     __author__ = 'Eddie Lau'
-    description = 'Hong Kong Chinese Newspaper (http://news.mingpao.com)'
     publisher = 'MingPao'
-    category = 'Chinese, News, Hong Kong'
     remove_javascript = True
     use_embedded_content = False
     no_stylesheets = True
@@ -57,33 +138,6 @@ class MPHKRecipe(BasicNewsRecipe):
     recursions = 0
     conversion_options = {'linearize_tables':True}
     timefmt = ''
-    extra_css = 'img {display: block; margin-left: auto; margin-right: auto; margin-top: 10px; margin-bottom: 10px;} font>b {font-size:200%; font-weight:bold;}'
-    masthead_url = 'http://news.mingpao.com/image/portals_top_logo_news.gif'
-    keep_only_tags = [dict(name='h1'),
-                      dict(name='font', attrs={'style':['font-size:14pt; line-height:160%;']}), # for entertainment page title
-                      dict(name='font', attrs={'color':['AA0000']}), # for column articles title
-                      dict(attrs={'id':['newscontent']}), # entertainment and column page content
-                      dict(attrs={'id':['newscontent01','newscontent02']}),
-                      dict(attrs={'class':['photo']}),
-                      dict(name='img', attrs={'width':['180'], 'alt':['按圖放大']}) # images for source from life.mingpao.com
-                      ]
-    remove_tags = [dict(name='style'),
-                   dict(attrs={'id':['newscontent135']}), # for the finance page from mpfinance.com
-                   dict(name='table')] # for content fetched from life.mingpao.com
-
-    remove_attributes = ['width']
-    preprocess_regexps = [
-                          (re.compile(r'<h5>', re.DOTALL|re.IGNORECASE),
-                           lambda match: '<h1>'),
-                          (re.compile(r'</h5>', re.DOTALL|re.IGNORECASE),
-                           lambda match: '</h1>'),
-                          (re.compile(r'<p><a href=.+?</a></p>', re.DOTALL|re.IGNORECASE), # for entertainment page
-                           lambda match: ''),
-                          # skip <br> after title in life.mingpao.com fetched article
-                          (re.compile(r"<div id='newscontent'><br>", re.DOTALL|re.IGNORECASE),
-                           lambda match: "<div id='newscontent'>"),
-                          (re.compile(r"<br><br></b>", re.DOTALL|re.IGNORECASE),
-                           lambda match: "</b>")
-                          ]

     def image_url_processor(cls, baseurl, url):
         # trick: break the url at the first occurance of digit, add an additional
@@ -124,8 +178,18 @@ class MPHKRecipe(BasicNewsRecipe):

     def get_dtlocal(self):
         dt_utc = datetime.datetime.utcnow()
-        # convert UTC to local hk time - at around HKT 6.00am, all news are available
-        dt_local = dt_utc - datetime.timedelta(-2.0/24)
+        if __Region__ == 'Hong Kong':
+            # convert UTC to local hk time - at HKT 4.30am, all news are available
+            dt_local = dt_utc + datetime.timedelta(8.0/24) - datetime.timedelta(4.5/24)
+            # dt_local = dt_utc.astimezone(pytz.timezone('Asia/Hong_Kong')) - datetime.timedelta(4.5/24)
+        elif __Region__ == 'Vancouver':
+            # convert UTC to local Vancouver time - at PST time 4.30am, all news are available
+            dt_local = dt_utc + datetime.timedelta(-8.0/24) - datetime.timedelta(4.5/24)
+            #dt_local = dt_utc.astimezone(pytz.timezone('America/Vancouver')) - datetime.timedelta(4.5/24)
+        elif __Region__ == 'Toronto':
+            # convert UTC to local Toronto time - at EST time 4.30am, all news are available
+            dt_local = dt_utc + datetime.timedelta(-5.0/24) - datetime.timedelta(4.5/24)
+            #dt_local = dt_utc.astimezone(pytz.timezone('America/Toronto')) - datetime.timedelta(4.5/24)
         return dt_local

     def get_fetchdate(self):
@@ -135,13 +199,15 @@
         return self.get_dtlocal().strftime("%Y-%m-%d")

     def get_fetchday(self):
-        # dt_utc = datetime.datetime.utcnow()
-        # convert UTC to local hk time - at around HKT 6.00am, all news are available
-        # dt_local = dt_utc - datetime.timedelta(-2.0/24)
         return self.get_dtlocal().strftime("%d")

     def get_cover_url(self):
-        cover = 'http://news.mingpao.com/' + self.get_fetchdate() + '/' + self.get_fetchdate() + '_' + self.get_fetchday() + 'gacov.jpg'
+        if __Region__ == 'Hong Kong':
+            cover = 'http://news.mingpao.com/' + self.get_fetchdate() + '/' + self.get_fetchdate() + '_' + self.get_fetchday() + 'gacov.jpg'
+        elif __Region__ == 'Vancouver':
+            cover = 'http://www.mingpaovan.com/ftp/News/' + self.get_fetchdate() + '/' + self.get_fetchday() + 'pgva1s.jpg'
+        elif __Region__ == 'Toronto':
+            cover = 'http://www.mingpaotor.com/ftp/News/' + self.get_fetchdate() + '/' + self.get_fetchday() + 'pgtas.jpg'
         br = BasicNewsRecipe.get_browser()
         try:
             br.open(cover)
         except:
             cover = None
         return cover

@@ -153,76 +219,104 @@
         feeds = []
         dateStr = self.get_fetchdate()
-        if __UseLife__:
-            for title, url, keystr in [(u'\u8981\u805e Headline', 'http://life.mingpao.com/cfm/dailynews2.cfm?Issue=' + dateStr + '&Category=nalga', 'nal'),
-                                       (u'\u6e2f\u805e Local', 'http://life.mingpao.com/cfm/dailynews2.cfm?Issue=' + dateStr + '&Category=nalgb', 'nal'),
-                                       (u'\u6559\u80b2 Education', 'http://life.mingpao.com/cfm/dailynews2.cfm?Issue=' + dateStr + '&Category=nalgf', 'nal'),
-                                       (u'\u793e\u8a55/\u7b46\u9663 Editorial', 'http://life.mingpao.com/cfm/dailynews2.cfm?Issue=' + dateStr +'&Category=nalmr', 'nal'),
-                                       (u'\u8ad6\u58c7 Forum', 'http://life.mingpao.com/cfm/dailynews2.cfm?Issue=' + dateStr +'&Category=nalfa', 'nal'),
-                                       (u'\u4e2d\u570b China', 'http://life.mingpao.com/cfm/dailynews2.cfm?Issue=' + dateStr +'&Category=nalca', 'nal'),
-                                       (u'\u570b\u969b World', 'http://life.mingpao.com/cfm/dailynews2.cfm?Issue=' + dateStr +'&Category=nalta', 'nal'),
-                                       (u'\u7d93\u6fdf Finance', 'http://life.mingpao.com/cfm/dailynews2.cfm?Issue=' + dateStr + '&Category=nalea', 'nal'),
-                                       (u'\u9ad4\u80b2 Sport', 'http://life.mingpao.com/cfm/dailynews2.cfm?Issue=' + dateStr + '&Category=nalsp', 'nal'),
-
(u'\u5f71\u8996 Film/TV', 'http://life.mingpao.com/cfm/dailynews2.cfm?Issue=' + dateStr + '&Category=nalma', 'nal'), - (u'\u5c08\u6b04 Columns', 'http://life.mingpao.com/cfm/dailynews2.cfm?Issue=' + dateStr +'&Category=ncolumn', 'ncl')]: - articles = self.parse_section2(url, keystr) + if __Region__ == 'Hong Kong': + if __UseLife__: + for title, url, keystr in [(u'\u8981\u805e Headline', 'http://life.mingpao.com/cfm/dailynews2.cfm?Issue=' + dateStr + '&Category=nalga', 'nal'), + (u'\u6e2f\u805e Local', 'http://life.mingpao.com/cfm/dailynews2.cfm?Issue=' + dateStr + '&Category=nalgb', 'nal'), + (u'\u6559\u80b2 Education', 'http://life.mingpao.com/cfm/dailynews2.cfm?Issue=' + dateStr + '&Category=nalgf', 'nal'), + (u'\u793e\u8a55/\u7b46\u9663 Editorial', 'http://life.mingpao.com/cfm/dailynews2.cfm?Issue=' + dateStr +'&Category=nalmr', 'nal'), + (u'\u8ad6\u58c7 Forum', 'http://life.mingpao.com/cfm/dailynews2.cfm?Issue=' + dateStr +'&Category=nalfa', 'nal'), + (u'\u4e2d\u570b China', 'http://life.mingpao.com/cfm/dailynews2.cfm?Issue=' + dateStr +'&Category=nalca', 'nal'), + (u'\u570b\u969b World', 'http://life.mingpao.com/cfm/dailynews2.cfm?Issue=' + dateStr +'&Category=nalta', 'nal'), + (u'\u7d93\u6fdf Finance', 'http://life.mingpao.com/cfm/dailynews2.cfm?Issue=' + dateStr + '&Category=nalea', 'nal'), + (u'\u9ad4\u80b2 Sport', 'http://life.mingpao.com/cfm/dailynews2.cfm?Issue=' + dateStr + '&Category=nalsp', 'nal'), + (u'\u5f71\u8996 Film/TV', 'http://life.mingpao.com/cfm/dailynews2.cfm?Issue=' + dateStr + '&Category=nalma', 'nal'), + (u'\u5c08\u6b04 Columns', 'http://life.mingpao.com/cfm/dailynews2.cfm?Issue=' + dateStr +'&Category=ncolumn', 'ncl')]: + articles = self.parse_section2(url, keystr) + if articles: + feeds.append((title, articles)) + + for title, url in [(u'\u526f\u520a Supplement', 'http://news.mingpao.com/' + dateStr + '/jaindex.htm'), + (u'\u82f1\u6587 English', 'http://news.mingpao.com/' + dateStr + '/emindex.htm')]: + articles = self.parse_section(url) + if articles: + feeds.append((title, articles)) + else: + for title, url in [(u'\u8981\u805e Headline', 'http://news.mingpao.com/' + dateStr + '/gaindex.htm'), + (u'\u6e2f\u805e Local', 'http://news.mingpao.com/' + dateStr + '/gbindex.htm'), + (u'\u6559\u80b2 Education', 'http://news.mingpao.com/' + dateStr + '/gfindex.htm')]: + articles = self.parse_section(url) + if articles: + feeds.append((title, articles)) + + # special- editorial + ed_articles = self.parse_ed_section('http://life.mingpao.com/cfm/dailynews2.cfm?Issue=' + dateStr +'&Category=nalmr') + if ed_articles: + feeds.append((u'\u793e\u8a55/\u7b46\u9663 Editorial', ed_articles)) + + for title, url in [(u'\u8ad6\u58c7 Forum', 'http://news.mingpao.com/' + dateStr + '/faindex.htm'), + (u'\u4e2d\u570b China', 'http://news.mingpao.com/' + dateStr + '/caindex.htm'), + (u'\u570b\u969b World', 'http://news.mingpao.com/' + dateStr + '/taindex.htm')]: + articles = self.parse_section(url) + if articles: + feeds.append((title, articles)) + + # special - finance + #fin_articles = self.parse_fin_section('http://www.mpfinance.com/htm/Finance/' + dateStr + '/News/ea,eb,ecindex.htm') + fin_articles = self.parse_fin_section('http://life.mingpao.com/cfm/dailynews2.cfm?Issue=' + dateStr + '&Category=nalea') + if fin_articles: + feeds.append((u'\u7d93\u6fdf Finance', fin_articles)) + + for title, url in [('Tech News', 'http://news.mingpao.com/' + dateStr + '/naindex.htm'), + (u'\u9ad4\u80b2 Sport', 'http://news.mingpao.com/' + dateStr + '/spindex.htm')]: + articles = 
self.parse_section(url) + if articles: + feeds.append((title, articles)) + + # special - entertainment + ent_articles = self.parse_ent_section('http://ol.mingpao.com/cfm/star1.cfm') + if ent_articles: + feeds.append((u'\u5f71\u8996 Film/TV', ent_articles)) + + for title, url in [(u'\u526f\u520a Supplement', 'http://news.mingpao.com/' + dateStr + '/jaindex.htm'), + (u'\u82f1\u6587 English', 'http://news.mingpao.com/' + dateStr + '/emindex.htm')]: + articles = self.parse_section(url) + if articles: + feeds.append((title, articles)) + + + # special- columns + col_articles = self.parse_col_section('http://life.mingpao.com/cfm/dailynews2.cfm?Issue=' + dateStr +'&Category=ncolumn') + if col_articles: + feeds.append((u'\u5c08\u6b04 Columns', col_articles)) + elif __Region__ == 'Vancouver': + for title, url in [(u'\u8981\u805e Headline', 'http://www.mingpaovan.com/htm/News/' + dateStr + '/VAindex.htm'), + (u'\u52a0\u570b Canada', 'http://www.mingpaovan.com/htm/News/' + dateStr + '/VBindex.htm'), + (u'\u793e\u5340 Local', 'http://www.mingpaovan.com/htm/News/' + dateStr + '/VDindex.htm'), + (u'\u6e2f\u805e Hong Kong', 'http://www.mingpaovan.com/htm/News/' + dateStr + '/HK-VGindex.htm'), + (u'\u570b\u969b World', 'http://www.mingpaovan.com/htm/News/' + dateStr + '/VTindex.htm'), + (u'\u4e2d\u570b China', 'http://www.mingpaovan.com/htm/News/' + dateStr + '/VCindex.htm'), + (u'\u7d93\u6fdf Economics', 'http://www.mingpaovan.com/htm/News/' + dateStr + '/VEindex.htm'), + (u'\u9ad4\u80b2 Sports', 'http://www.mingpaovan.com/htm/News/' + dateStr + '/VSindex.htm'), + (u'\u5f71\u8996 Film/TV', 'http://www.mingpaovan.com/htm/News/' + dateStr + '/HK-MAindex.htm'), + (u'\u526f\u520a Supplements', 'http://www.mingpaovan.com/htm/News/' + dateStr + '/WWindex.htm'),]: + articles = self.parse_section3(url, 'http://www.mingpaovan.com/') if articles: feeds.append((title, articles)) - - for title, url in [(u'\u526f\u520a Supplement', 'http://news.mingpao.com/' + dateStr + '/jaindex.htm'), - (u'\u82f1\u6587 English', 'http://news.mingpao.com/' + dateStr + '/emindex.htm')]: - articles = self.parse_section(url) + elif __Region__ == 'Toronto': + for title, url in [(u'\u8981\u805e Headline', 'http://www.mingpaotor.com/htm/News/' + dateStr + '/TAindex.htm'), + (u'\u52a0\u570b Canada', 'http://www.mingpaotor.com/htm/News/' + dateStr + '/TDindex.htm'), + (u'\u793e\u5340 Local', 'http://www.mingpaotor.com/htm/News/' + dateStr + '/TFindex.htm'), + (u'\u4e2d\u570b China', 'http://www.mingpaotor.com/htm/News/' + dateStr + '/TCAindex.htm'), + (u'\u570b\u969b World', 'http://www.mingpaotor.com/htm/News/' + dateStr + '/TTAindex.htm'), + (u'\u6e2f\u805e Hong Kong', 'http://www.mingpaotor.com/htm/News/' + dateStr + '/HK-GAindex.htm'), + (u'\u7d93\u6fdf Economics', 'http://www.mingpaotor.com/htm/News/' + dateStr + '/THindex.htm'), + (u'\u9ad4\u80b2 Sports', 'http://www.mingpaotor.com/htm/News/' + dateStr + '/TSindex.htm'), + (u'\u5f71\u8996 Film/TV', 'http://www.mingpaotor.com/htm/News/' + dateStr + '/HK-MAindex.htm'), + (u'\u526f\u520a Supplements', 'http://www.mingpaotor.com/htm/News/' + dateStr + '/WWindex.htm'),]: + articles = self.parse_section3(url, 'http://www.mingpaotor.com/') if articles: feeds.append((title, articles)) - else: - for title, url in [(u'\u8981\u805e Headline', 'http://news.mingpao.com/' + dateStr + '/gaindex.htm'), - (u'\u6e2f\u805e Local', 'http://news.mingpao.com/' + dateStr + '/gbindex.htm'), - (u'\u6559\u80b2 Education', 'http://news.mingpao.com/' + dateStr + '/gfindex.htm')]: - articles = 
self.parse_section(url) - if articles: - feeds.append((title, articles)) - - # special- editorial - ed_articles = self.parse_ed_section('http://life.mingpao.com/cfm/dailynews2.cfm?Issue=' + dateStr +'&Category=nalmr') - if ed_articles: - feeds.append((u'\u793e\u8a55/\u7b46\u9663 Editorial', ed_articles)) - - for title, url in [(u'\u8ad6\u58c7 Forum', 'http://news.mingpao.com/' + dateStr + '/faindex.htm'), - (u'\u4e2d\u570b China', 'http://news.mingpao.com/' + dateStr + '/caindex.htm'), - (u'\u570b\u969b World', 'http://news.mingpao.com/' + dateStr + '/taindex.htm')]: - articles = self.parse_section(url) - if articles: - feeds.append((title, articles)) - - # special - finance - #fin_articles = self.parse_fin_section('http://www.mpfinance.com/htm/Finance/' + dateStr + '/News/ea,eb,ecindex.htm') - fin_articles = self.parse_fin_section('http://life.mingpao.com/cfm/dailynews2.cfm?Issue=' + dateStr + '&Category=nalea') - if fin_articles: - feeds.append((u'\u7d93\u6fdf Finance', fin_articles)) - - for title, url in [('Tech News', 'http://news.mingpao.com/' + dateStr + '/naindex.htm'), - (u'\u9ad4\u80b2 Sport', 'http://news.mingpao.com/' + dateStr + '/spindex.htm')]: - articles = self.parse_section(url) - if articles: - feeds.append((title, articles)) - - # special - entertainment - ent_articles = self.parse_ent_section('http://ol.mingpao.com/cfm/star1.cfm') - if ent_articles: - feeds.append((u'\u5f71\u8996 Film/TV', ent_articles)) - - for title, url in [(u'\u526f\u520a Supplement', 'http://news.mingpao.com/' + dateStr + '/jaindex.htm'), - (u'\u82f1\u6587 English', 'http://news.mingpao.com/' + dateStr + '/emindex.htm')]: - articles = self.parse_section(url) - if articles: - feeds.append((title, articles)) - - - # special- columns - col_articles = self.parse_col_section('http://life.mingpao.com/cfm/dailynews2.cfm?Issue=' + dateStr +'&Category=ncolumn') - if col_articles: - feeds.append((u'\u5c08\u6b04 Columns', col_articles)) - return feeds # parse from news.mingpao.com @@ -256,11 +350,30 @@ class MPHKRecipe(BasicNewsRecipe): title = self.tag_to_string(i) url = 'http://life.mingpao.com/cfm/' + i.get('href', False) if (url not in included_urls) and (not url.rfind('.txt') == -1) and (not url.rfind(keystr) == -1): + url = url.replace('dailynews3.cfm', 'dailynews3a.cfm') # use printed version of the article current_articles.append({'title': title, 'url': url, 'description': ''}) included_urls.append(url) current_articles.reverse() return current_articles + # parse from www.mingpaovan.com + def parse_section3(self, url, baseUrl): + self.get_fetchdate() + soup = self.index_to_soup(url) + divs = soup.findAll(attrs={'class': ['ListContentLargeLink']}) + current_articles = [] + included_urls = [] + divs.reverse() + for i in divs: + title = self.tag_to_string(i) + urlstr = i.get('href', False) + urlstr = baseUrl + '/' + urlstr.replace('../../../', '') + if urlstr not in included_urls: + current_articles.append({'title': title, 'url': urlstr, 'description': '', 'date': ''}) + included_urls.append(urlstr) + current_articles.reverse() + return current_articles + def parse_ed_section(self, url): self.get_fetchdate() soup = self.index_to_soup(url) @@ -338,7 +451,12 @@ class MPHKRecipe(BasicNewsRecipe): if dir is None: dir = self.output_dir if __UseChineseTitle__ == True: - title = u'\u660e\u5831 (\u9999\u6e2f)' + if __Region__ == 'Hong Kong': + title = u'\u660e\u5831 (\u9999\u6e2f)' + elif __Region__ == 'Vancouver': + title = u'\u660e\u5831 (\u6eab\u54e5\u83ef)' + elif __Region__ == 'Toronto': + title = 
u'\u660e\u5831 (\u591a\u502b\u591a)' else: title = self.short_title() # if not generating a periodical, force date to apply in title diff --git a/recipes/ming_pao_toronto.recipe b/recipes/ming_pao_toronto.recipe new file mode 100644 index 0000000000..677a8272b0 --- /dev/null +++ b/recipes/ming_pao_toronto.recipe @@ -0,0 +1,594 @@ +__license__ = 'GPL v3' +__copyright__ = '2010-2011, Eddie Lau' + +# Region - Hong Kong, Vancouver, Toronto +__Region__ = 'Toronto' +# Users of Kindle 3 with limited system-level CJK support +# please replace the following "True" with "False". +__MakePeriodical__ = True +# Turn below to true if your device supports display of CJK titles +__UseChineseTitle__ = False +# Set it to False if you want to skip images +__KeepImages__ = True +# (HK only) Turn below to true if you wish to use life.mingpao.com as the main article source +__UseLife__ = True + + +''' +Change Log: +2011/06/26: add fetching Vancouver and Toronto versions of the paper, also provide captions for images using life.mingpao fetch source + provide options to remove all images in the file +2011/05/12: switch the main parse source to life.mingpao.com, which has more photos on the article pages +2011/03/06: add new articles for finance section, also a new section "Columns" +2011/02/28: rearrange the sections + [Disabled until Kindle has better CJK support and can remember last (section,article) read in Sections & Articles + View] make it the same title if generating a periodical, so past issue will be automatically put into "Past Issues" + folder in Kindle 3 +2011/02/20: skip duplicated links in finance section, put photos which may extend a whole page to the back of the articles + clean up the indentation +2010/12/07: add entertainment section, use newspaper front page as ebook cover, suppress date display in section list + (to avoid wrong date display in case the user generates the ebook in a time zone different from HKT) +2010/11/22: add English section, remove eco-news section which is not updated daily, correct + ordering of articles +2010/11/12: add news image and eco-news section +2010/11/08: add parsing of finance section +2010/11/06: temporary work-around for Kindle device having no capability to display unicode + in section/article list. 
+2010/10/31: skip repeated articles in section pages +''' + +import os, datetime, re +from calibre.web.feeds.recipes import BasicNewsRecipe +from contextlib import nested +from calibre.ebooks.BeautifulSoup import BeautifulSoup +from calibre.ebooks.metadata.opf2 import OPFCreator +from calibre.ebooks.metadata.toc import TOC +from calibre.ebooks.metadata import MetaInformation + +# MAIN CLASS +class MPRecipe(BasicNewsRecipe): + if __Region__ == 'Hong Kong': + title = 'Ming Pao - Hong Kong' + description = 'Hong Kong Chinese Newspaper (http://news.mingpao.com)' + category = 'Chinese, News, Hong Kong' + extra_css = 'img {display: block; margin-left: auto; margin-right: auto; margin-top: 10px; margin-bottom: 10px;} font>b {font-size:200%; font-weight:bold;}' + masthead_url = 'http://news.mingpao.com/image/portals_top_logo_news.gif' + keep_only_tags = [dict(name='h1'), + dict(name='font', attrs={'style':['font-size:14pt; line-height:160%;']}), # for entertainment page title + dict(name='font', attrs={'color':['AA0000']}), # for column articles title + dict(attrs={'id':['newscontent']}), # entertainment and column page content + dict(attrs={'id':['newscontent01','newscontent02']}), + dict(attrs={'class':['photo']}), + dict(name='table', attrs={'width':['100%'], 'border':['0'], 'cellspacing':['5'], 'cellpadding':['0']}), # content in printed version of life.mingpao.com + dict(name='img', attrs={'width':['180'], 'alt':['按圖放大']}) # images for source from life.mingpao.com + ] + if __KeepImages__: + remove_tags = [dict(name='style'), + dict(attrs={'id':['newscontent135']}), # for the finance page from mpfinance.com + dict(name='font', attrs={'size':['2'], 'color':['666666']}), # article date in life.mingpao.com article + #dict(name='table') # for content fetched from life.mingpao.com + ] + else: + remove_tags = [dict(name='style'), + dict(attrs={'id':['newscontent135']}), # for the finance page from mpfinance.com + dict(name='font', attrs={'size':['2'], 'color':['666666']}), # article date in life.mingpao.com article + dict(name='img'), + #dict(name='table') # for content fetched from life.mingpao.com + ] + remove_attributes = ['width'] + preprocess_regexps = [ + (re.compile(r'
<h5>', re.DOTALL|re.IGNORECASE),
+                               lambda match: '<h1>'),
+                              (re.compile(r'</h5>', re.DOTALL|re.IGNORECASE),
+                               lambda match: '</h1>'),
+                              (re.compile(r'<p><a href=.+?</a></p>', re.DOTALL|re.IGNORECASE),  # for entertainment page
+                               lambda match: ''),
+                              # skip <br> after title in life.mingpao.com fetched article
+                              (re.compile(r"<div id='newscontent'><br>", re.DOTALL|re.IGNORECASE),
+                               lambda match: "<div id='newscontent'>"),
+                              (re.compile(r"<br><br></b>", re.DOTALL|re.IGNORECASE),
+                               lambda match: "</b>")
+                              ]
+    elif __Region__ == 'Vancouver':
+        title = 'Ming Pao - Vancouver'
+        description = 'Vancouver Chinese Newspaper (http://www.mingpaovan.com)'
+        category = 'Chinese, News, Vancouver'
+        extra_css = 'img {display: block; margin-left: auto; margin-right: auto; margin-top: 10px; margin-bottom: 10px;} b>font {font-size:200%; font-weight:bold;}'
+        masthead_url = 'http://www.mingpaovan.com/image/mainlogo2_VAN2.gif'
+        keep_only_tags = [dict(name='table', attrs={'width':['450'], 'border':['0'], 'cellspacing':['0'], 'cellpadding':['1']}),
+                          dict(name='table', attrs={'width':['450'], 'border':['0'], 'cellspacing':['3'], 'cellpadding':['3'], 'id':['tblContent3']}),
+                          dict(name='table', attrs={'width':['180'], 'border':['0'], 'cellspacing':['0'], 'cellpadding':['0'], 'bgcolor':['F0F0F0']}),
+                          ]
+        if __KeepImages__:
+            remove_tags = [dict(name='img', attrs={'src':['../../../image/magnifier.gif']})]  # the magnifier icon
+        else:
+            remove_tags = [dict(name='img')]
+        remove_attributes = ['width']
+        preprocess_regexps = [(re.compile(r'&nbsp;', re.DOTALL|re.IGNORECASE),
+                               lambda match: ''),
+                              ]
+    elif __Region__ == 'Toronto':
+        title = 'Ming Pao - Toronto'
+        description = 'Toronto Chinese Newspaper (http://www.mingpaotor.com)'
+        category = 'Chinese, News, Toronto'
+        extra_css = 'img {display: block; margin-left: auto; margin-right: auto; margin-top: 10px; margin-bottom: 10px;} b>font {font-size:200%; font-weight:bold;}'
+        masthead_url = 'http://www.mingpaotor.com/image/mainlogo2_TOR2.gif'
+        keep_only_tags = [dict(name='table', attrs={'width':['450'], 'border':['0'], 'cellspacing':['0'], 'cellpadding':['1']}),
+                          dict(name='table', attrs={'width':['450'], 'border':['0'], 'cellspacing':['3'], 'cellpadding':['3'], 'id':['tblContent3']}),
+                          dict(name='table', attrs={'width':['180'], 'border':['0'], 'cellspacing':['0'], 'cellpadding':['0'], 'bgcolor':['F0F0F0']}),
+                          ]
+        if __KeepImages__:
+            remove_tags = [dict(name='img', attrs={'src':['../../../image/magnifier.gif']})]  # the magnifier icon
+        else:
+            remove_tags = [dict(name='img')]
+        remove_attributes = ['width']
+        preprocess_regexps = [(re.compile(r'&nbsp;', re.DOTALL|re.IGNORECASE),
+                               lambda match: ''),
+                              ]
+
+    oldest_article = 1
+    max_articles_per_feed = 100
+    __author__ = 'Eddie Lau'
+    publisher = 'MingPao'
+    remove_javascript = True
+    use_embedded_content = False
+    no_stylesheets = True
+    language = 'zh'
+    encoding = 'Big5-HKSCS'
+    recursions = 0
+    conversion_options = {'linearize_tables':True}
+    timefmt = ''
+
+    def image_url_processor(cls, baseurl, url):
+        # trick: break the url at the first occurance of digit, add an additional
+        # '_' at the front
+        # not working, may need to move this to preprocess_html() method
+#        minIdx = 10000
+#        i0 = url.find('0')
+#        if i0 >= 0 and i0 < minIdx:
+#            minIdx = i0
+#        i1 = url.find('1')
+#        if i1 >= 0 and i1 < minIdx:
+#            minIdx = i1
+#        i2 = url.find('2')
+#        if i2 >= 0 and i2 < minIdx:
+#            minIdx = i2
+#        i3 = url.find('3')
+#        if i3 >= 0 and i0 < minIdx:
+#            minIdx = i3
+#        i4 = url.find('4')
+#        if i4 >= 0 and i4 < minIdx:
+#            minIdx = i4
+#        i5 = url.find('5')
+#        if i5 >= 0 and i5 < minIdx:
+#            minIdx = i5
+#        i6 = url.find('6')
+#        if i6 >= 0 and i6 < minIdx:
+#            minIdx = i6
+#        i7 = url.find('7')
+#        if i7 >= 0 and i7 < minIdx:
+#            minIdx = i7
+#        i8 = url.find('8')
+#        if i8 >= 0 and i8 < minIdx:
+#            minIdx = i8
+#        i9 = url.find('9')
+#        if i9 >= 0 and i9 < minIdx:
+#            minIdx = i9
+        return url
+
+    def get_dtlocal(self):
+        dt_utc = datetime.datetime.utcnow()
+        if __Region__ == 'Hong Kong':
+            #
convert UTC to local hk time - at HKT 4.30am, all news are available + dt_local = dt_utc + datetime.timedelta(8.0/24) - datetime.timedelta(4.5/24) + # dt_local = dt_utc.astimezone(pytz.timezone('Asia/Hong_Kong')) - datetime.timedelta(4.5/24) + elif __Region__ == 'Vancouver': + # convert UTC to local Vancouver time - at PST time 4.30am, all news are available + dt_local = dt_utc + datetime.timedelta(-8.0/24) - datetime.timedelta(4.5/24) + #dt_local = dt_utc.astimezone(pytz.timezone('America/Vancouver')) - datetime.timedelta(4.5/24) + elif __Region__ == 'Toronto': + # convert UTC to local Toronto time - at EST time 4.30am, all news are available + dt_local = dt_utc + datetime.timedelta(-5.0/24) - datetime.timedelta(4.5/24) + #dt_local = dt_utc.astimezone(pytz.timezone('America/Toronto')) - datetime.timedelta(4.5/24) + return dt_local + + def get_fetchdate(self): + return self.get_dtlocal().strftime("%Y%m%d") + + def get_fetchformatteddate(self): + return self.get_dtlocal().strftime("%Y-%m-%d") + + def get_fetchday(self): + return self.get_dtlocal().strftime("%d") + + def get_cover_url(self): + if __Region__ == 'Hong Kong': + cover = 'http://news.mingpao.com/' + self.get_fetchdate() + '/' + self.get_fetchdate() + '_' + self.get_fetchday() + 'gacov.jpg' + elif __Region__ == 'Vancouver': + cover = 'http://www.mingpaovan.com/ftp/News/' + self.get_fetchdate() + '/' + self.get_fetchday() + 'pgva1s.jpg' + elif __Region__ == 'Toronto': + cover = 'http://www.mingpaotor.com/ftp/News/' + self.get_fetchdate() + '/' + self.get_fetchday() + 'pgtas.jpg' + br = BasicNewsRecipe.get_browser() + try: + br.open(cover) + except: + cover = None + return cover + + def parse_index(self): + feeds = [] + dateStr = self.get_fetchdate() + + if __Region__ == 'Hong Kong': + if __UseLife__: + for title, url, keystr in [(u'\u8981\u805e Headline', 'http://life.mingpao.com/cfm/dailynews2.cfm?Issue=' + dateStr + '&Category=nalga', 'nal'), + (u'\u6e2f\u805e Local', 'http://life.mingpao.com/cfm/dailynews2.cfm?Issue=' + dateStr + '&Category=nalgb', 'nal'), + (u'\u6559\u80b2 Education', 'http://life.mingpao.com/cfm/dailynews2.cfm?Issue=' + dateStr + '&Category=nalgf', 'nal'), + (u'\u793e\u8a55/\u7b46\u9663 Editorial', 'http://life.mingpao.com/cfm/dailynews2.cfm?Issue=' + dateStr +'&Category=nalmr', 'nal'), + (u'\u8ad6\u58c7 Forum', 'http://life.mingpao.com/cfm/dailynews2.cfm?Issue=' + dateStr +'&Category=nalfa', 'nal'), + (u'\u4e2d\u570b China', 'http://life.mingpao.com/cfm/dailynews2.cfm?Issue=' + dateStr +'&Category=nalca', 'nal'), + (u'\u570b\u969b World', 'http://life.mingpao.com/cfm/dailynews2.cfm?Issue=' + dateStr +'&Category=nalta', 'nal'), + (u'\u7d93\u6fdf Finance', 'http://life.mingpao.com/cfm/dailynews2.cfm?Issue=' + dateStr + '&Category=nalea', 'nal'), + (u'\u9ad4\u80b2 Sport', 'http://life.mingpao.com/cfm/dailynews2.cfm?Issue=' + dateStr + '&Category=nalsp', 'nal'), + (u'\u5f71\u8996 Film/TV', 'http://life.mingpao.com/cfm/dailynews2.cfm?Issue=' + dateStr + '&Category=nalma', 'nal'), + (u'\u5c08\u6b04 Columns', 'http://life.mingpao.com/cfm/dailynews2.cfm?Issue=' + dateStr +'&Category=ncolumn', 'ncl')]: + articles = self.parse_section2(url, keystr) + if articles: + feeds.append((title, articles)) + + for title, url in [(u'\u526f\u520a Supplement', 'http://news.mingpao.com/' + dateStr + '/jaindex.htm'), + (u'\u82f1\u6587 English', 'http://news.mingpao.com/' + dateStr + '/emindex.htm')]: + articles = self.parse_section(url) + if articles: + feeds.append((title, articles)) + else: + for title, url in [(u'\u8981\u805e 
Headline', 'http://news.mingpao.com/' + dateStr + '/gaindex.htm'), + (u'\u6e2f\u805e Local', 'http://news.mingpao.com/' + dateStr + '/gbindex.htm'), + (u'\u6559\u80b2 Education', 'http://news.mingpao.com/' + dateStr + '/gfindex.htm')]: + articles = self.parse_section(url) + if articles: + feeds.append((title, articles)) + + # special- editorial + ed_articles = self.parse_ed_section('http://life.mingpao.com/cfm/dailynews2.cfm?Issue=' + dateStr +'&Category=nalmr') + if ed_articles: + feeds.append((u'\u793e\u8a55/\u7b46\u9663 Editorial', ed_articles)) + + for title, url in [(u'\u8ad6\u58c7 Forum', 'http://news.mingpao.com/' + dateStr + '/faindex.htm'), + (u'\u4e2d\u570b China', 'http://news.mingpao.com/' + dateStr + '/caindex.htm'), + (u'\u570b\u969b World', 'http://news.mingpao.com/' + dateStr + '/taindex.htm')]: + articles = self.parse_section(url) + if articles: + feeds.append((title, articles)) + + # special - finance + #fin_articles = self.parse_fin_section('http://www.mpfinance.com/htm/Finance/' + dateStr + '/News/ea,eb,ecindex.htm') + fin_articles = self.parse_fin_section('http://life.mingpao.com/cfm/dailynews2.cfm?Issue=' + dateStr + '&Category=nalea') + if fin_articles: + feeds.append((u'\u7d93\u6fdf Finance', fin_articles)) + + for title, url in [('Tech News', 'http://news.mingpao.com/' + dateStr + '/naindex.htm'), + (u'\u9ad4\u80b2 Sport', 'http://news.mingpao.com/' + dateStr + '/spindex.htm')]: + articles = self.parse_section(url) + if articles: + feeds.append((title, articles)) + + # special - entertainment + ent_articles = self.parse_ent_section('http://ol.mingpao.com/cfm/star1.cfm') + if ent_articles: + feeds.append((u'\u5f71\u8996 Film/TV', ent_articles)) + + for title, url in [(u'\u526f\u520a Supplement', 'http://news.mingpao.com/' + dateStr + '/jaindex.htm'), + (u'\u82f1\u6587 English', 'http://news.mingpao.com/' + dateStr + '/emindex.htm')]: + articles = self.parse_section(url) + if articles: + feeds.append((title, articles)) + + + # special- columns + col_articles = self.parse_col_section('http://life.mingpao.com/cfm/dailynews2.cfm?Issue=' + dateStr +'&Category=ncolumn') + if col_articles: + feeds.append((u'\u5c08\u6b04 Columns', col_articles)) + elif __Region__ == 'Vancouver': + for title, url in [(u'\u8981\u805e Headline', 'http://www.mingpaovan.com/htm/News/' + dateStr + '/VAindex.htm'), + (u'\u52a0\u570b Canada', 'http://www.mingpaovan.com/htm/News/' + dateStr + '/VBindex.htm'), + (u'\u793e\u5340 Local', 'http://www.mingpaovan.com/htm/News/' + dateStr + '/VDindex.htm'), + (u'\u6e2f\u805e Hong Kong', 'http://www.mingpaovan.com/htm/News/' + dateStr + '/HK-VGindex.htm'), + (u'\u570b\u969b World', 'http://www.mingpaovan.com/htm/News/' + dateStr + '/VTindex.htm'), + (u'\u4e2d\u570b China', 'http://www.mingpaovan.com/htm/News/' + dateStr + '/VCindex.htm'), + (u'\u7d93\u6fdf Economics', 'http://www.mingpaovan.com/htm/News/' + dateStr + '/VEindex.htm'), + (u'\u9ad4\u80b2 Sports', 'http://www.mingpaovan.com/htm/News/' + dateStr + '/VSindex.htm'), + (u'\u5f71\u8996 Film/TV', 'http://www.mingpaovan.com/htm/News/' + dateStr + '/HK-MAindex.htm'), + (u'\u526f\u520a Supplements', 'http://www.mingpaovan.com/htm/News/' + dateStr + '/WWindex.htm'),]: + articles = self.parse_section3(url, 'http://www.mingpaovan.com/') + if articles: + feeds.append((title, articles)) + elif __Region__ == 'Toronto': + for title, url in [(u'\u8981\u805e Headline', 'http://www.mingpaotor.com/htm/News/' + dateStr + '/TAindex.htm'), + (u'\u52a0\u570b Canada', 'http://www.mingpaotor.com/htm/News/' + dateStr + 
'/TDindex.htm'), + (u'\u793e\u5340 Local', 'http://www.mingpaotor.com/htm/News/' + dateStr + '/TFindex.htm'), + (u'\u4e2d\u570b China', 'http://www.mingpaotor.com/htm/News/' + dateStr + '/TCAindex.htm'), + (u'\u570b\u969b World', 'http://www.mingpaotor.com/htm/News/' + dateStr + '/TTAindex.htm'), + (u'\u6e2f\u805e Hong Kong', 'http://www.mingpaotor.com/htm/News/' + dateStr + '/HK-GAindex.htm'), + (u'\u7d93\u6fdf Economics', 'http://www.mingpaotor.com/htm/News/' + dateStr + '/THindex.htm'), + (u'\u9ad4\u80b2 Sports', 'http://www.mingpaotor.com/htm/News/' + dateStr + '/TSindex.htm'), + (u'\u5f71\u8996 Film/TV', 'http://www.mingpaotor.com/htm/News/' + dateStr + '/HK-MAindex.htm'), + (u'\u526f\u520a Supplements', 'http://www.mingpaotor.com/htm/News/' + dateStr + '/WWindex.htm'),]: + articles = self.parse_section3(url, 'http://www.mingpaotor.com/') + if articles: + feeds.append((title, articles)) + return feeds + + # parse from news.mingpao.com + def parse_section(self, url): + dateStr = self.get_fetchdate() + soup = self.index_to_soup(url) + divs = soup.findAll(attrs={'class': ['bullet','bullet_grey']}) + current_articles = [] + included_urls = [] + divs.reverse() + for i in divs: + a = i.find('a', href = True) + title = self.tag_to_string(a) + url = a.get('href', False) + url = 'http://news.mingpao.com/' + dateStr + '/' +url + if url not in included_urls and url.rfind('Redirect') == -1: + current_articles.append({'title': title, 'url': url, 'description':'', 'date':''}) + included_urls.append(url) + current_articles.reverse() + return current_articles + + # parse from life.mingpao.com + def parse_section2(self, url, keystr): + self.get_fetchdate() + soup = self.index_to_soup(url) + a = soup.findAll('a', href=True) + a.reverse() + current_articles = [] + included_urls = [] + for i in a: + title = self.tag_to_string(i) + url = 'http://life.mingpao.com/cfm/' + i.get('href', False) + if (url not in included_urls) and (not url.rfind('.txt') == -1) and (not url.rfind(keystr) == -1): + url = url.replace('dailynews3.cfm', 'dailynews3a.cfm') # use printed version of the article + current_articles.append({'title': title, 'url': url, 'description': ''}) + included_urls.append(url) + current_articles.reverse() + return current_articles + + # parse from www.mingpaovan.com + def parse_section3(self, url, baseUrl): + self.get_fetchdate() + soup = self.index_to_soup(url) + divs = soup.findAll(attrs={'class': ['ListContentLargeLink']}) + current_articles = [] + included_urls = [] + divs.reverse() + for i in divs: + title = self.tag_to_string(i) + urlstr = i.get('href', False) + urlstr = baseUrl + '/' + urlstr.replace('../../../', '') + if urlstr not in included_urls: + current_articles.append({'title': title, 'url': urlstr, 'description': '', 'date': ''}) + included_urls.append(urlstr) + current_articles.reverse() + return current_articles + + def parse_ed_section(self, url): + self.get_fetchdate() + soup = self.index_to_soup(url) + a = soup.findAll('a', href=True) + a.reverse() + current_articles = [] + included_urls = [] + for i in a: + title = self.tag_to_string(i) + url = 'http://life.mingpao.com/cfm/' + i.get('href', False) + if (url not in included_urls) and (not url.rfind('.txt') == -1) and (not url.rfind('nal') == -1): + current_articles.append({'title': title, 'url': url, 'description': ''}) + included_urls.append(url) + current_articles.reverse() + return current_articles + + def parse_fin_section(self, url): + self.get_fetchdate() + soup = self.index_to_soup(url) + a = soup.findAll('a', href= True) 
+ current_articles = [] + included_urls = [] + for i in a: + #url = 'http://www.mpfinance.com/cfm/' + i.get('href', False) + url = 'http://life.mingpao.com/cfm/' + i.get('href', False) + #if url not in included_urls and not url.rfind(dateStr) == -1 and url.rfind('index') == -1: + if url not in included_urls and (not url.rfind('txt') == -1) and (not url.rfind('nal') == -1): + title = self.tag_to_string(i) + current_articles.append({'title': title, 'url': url, 'description':''}) + included_urls.append(url) + return current_articles + + def parse_ent_section(self, url): + self.get_fetchdate() + soup = self.index_to_soup(url) + a = soup.findAll('a', href=True) + a.reverse() + current_articles = [] + included_urls = [] + for i in a: + title = self.tag_to_string(i) + url = 'http://ol.mingpao.com/cfm/' + i.get('href', False) + if (url not in included_urls) and (not url.rfind('.txt') == -1) and (not url.rfind('star') == -1): + current_articles.append({'title': title, 'url': url, 'description': ''}) + included_urls.append(url) + current_articles.reverse() + return current_articles + + def parse_col_section(self, url): + self.get_fetchdate() + soup = self.index_to_soup(url) + a = soup.findAll('a', href=True) + a.reverse() + current_articles = [] + included_urls = [] + for i in a: + title = self.tag_to_string(i) + url = 'http://life.mingpao.com/cfm/' + i.get('href', False) + if (url not in included_urls) and (not url.rfind('.txt') == -1) and (not url.rfind('ncl') == -1): + current_articles.append({'title': title, 'url': url, 'description': ''}) + included_urls.append(url) + current_articles.reverse() + return current_articles + + def preprocess_html(self, soup): + for item in soup.findAll(style=True): + del item['style'] + for item in soup.findAll(style=True): + del item['width'] + for item in soup.findAll(stype=True): + del item['absmiddle'] + return soup + + def create_opf(self, feeds, dir=None): + if dir is None: + dir = self.output_dir + if __UseChineseTitle__ == True: + if __Region__ == 'Hong Kong': + title = u'\u660e\u5831 (\u9999\u6e2f)' + elif __Region__ == 'Vancouver': + title = u'\u660e\u5831 (\u6eab\u54e5\u83ef)' + elif __Region__ == 'Toronto': + title = u'\u660e\u5831 (\u591a\u502b\u591a)' + else: + title = self.short_title() + # if not generating a periodical, force date to apply in title + if __MakePeriodical__ == False: + title = title + ' ' + self.get_fetchformatteddate() + if True: + mi = MetaInformation(title, [self.publisher]) + mi.publisher = self.publisher + mi.author_sort = self.publisher + if __MakePeriodical__ == True: + mi.publication_type = 'periodical:'+self.publication_type+':'+self.short_title() + else: + mi.publication_type = self.publication_type+':'+self.short_title() + #mi.timestamp = nowf() + mi.timestamp = self.get_dtlocal() + mi.comments = self.description + if not isinstance(mi.comments, unicode): + mi.comments = mi.comments.decode('utf-8', 'replace') + #mi.pubdate = nowf() + mi.pubdate = self.get_dtlocal() + opf_path = os.path.join(dir, 'index.opf') + ncx_path = os.path.join(dir, 'index.ncx') + opf = OPFCreator(dir, mi) + # Add mastheadImage entry to section + mp = getattr(self, 'masthead_path', None) + if mp is not None and os.access(mp, os.R_OK): + from calibre.ebooks.metadata.opf2 import Guide + ref = Guide.Reference(os.path.basename(self.masthead_path), os.getcwdu()) + ref.type = 'masthead' + ref.title = 'Masthead Image' + opf.guide.append(ref) + + manifest = [os.path.join(dir, 'feed_%d'%i) for i in range(len(feeds))] + manifest.append(os.path.join(dir, 
'index.html')) + manifest.append(os.path.join(dir, 'index.ncx')) + + # Get cover + cpath = getattr(self, 'cover_path', None) + if cpath is None: + pf = open(os.path.join(dir, 'cover.jpg'), 'wb') + if self.default_cover(pf): + cpath = pf.name + if cpath is not None and os.access(cpath, os.R_OK): + opf.cover = cpath + manifest.append(cpath) + + # Get masthead + mpath = getattr(self, 'masthead_path', None) + if mpath is not None and os.access(mpath, os.R_OK): + manifest.append(mpath) + + opf.create_manifest_from_files_in(manifest) + for mani in opf.manifest: + if mani.path.endswith('.ncx'): + mani.id = 'ncx' + if mani.path.endswith('mastheadImage.jpg'): + mani.id = 'masthead-image' + entries = ['index.html'] + toc = TOC(base_path=dir) + self.play_order_counter = 0 + self.play_order_map = {} + + def feed_index(num, parent): + f = feeds[num] + for j, a in enumerate(f): + if getattr(a, 'downloaded', False): + adir = 'feed_%d/article_%d/'%(num, j) + auth = a.author + if not auth: + auth = None + desc = a.text_summary + if not desc: + desc = None + else: + desc = self.description_limiter(desc) + entries.append('%sindex.html'%adir) + po = self.play_order_map.get(entries[-1], None) + if po is None: + self.play_order_counter += 1 + po = self.play_order_counter + parent.add_item('%sindex.html'%adir, None, a.title if a.title else _('Untitled Article'), + play_order=po, author=auth, description=desc) + last = os.path.join(self.output_dir, ('%sindex.html'%adir).replace('/', os.sep)) + for sp in a.sub_pages: + prefix = os.path.commonprefix([opf_path, sp]) + relp = sp[len(prefix):] + entries.append(relp.replace(os.sep, '/')) + last = sp + + if os.path.exists(last): + with open(last, 'rb') as fi: + src = fi.read().decode('utf-8') + soup = BeautifulSoup(src) + body = soup.find('body') + if body is not None: + prefix = '/'.join('..'for i in range(2*len(re.findall(r'link\d+', last)))) + templ = self.navbar.generate(True, num, j, len(f), + not self.has_single_feed, + a.orig_url, self.publisher, prefix=prefix, + center=self.center_navbar) + elem = BeautifulSoup(templ.render(doctype='xhtml').decode('utf-8')).find('div') + body.insert(len(body.contents), elem) + with open(last, 'wb') as fi: + fi.write(unicode(soup).encode('utf-8')) + if len(feeds) == 0: + raise Exception('All feeds are empty, aborting.') + + if len(feeds) > 1: + for i, f in enumerate(feeds): + entries.append('feed_%d/index.html'%i) + po = self.play_order_map.get(entries[-1], None) + if po is None: + self.play_order_counter += 1 + po = self.play_order_counter + auth = getattr(f, 'author', None) + if not auth: + auth = None + desc = getattr(f, 'description', None) + if not desc: + desc = None + feed_index(i, toc.add_item('feed_%d/index.html'%i, None, + f.title, play_order=po, description=desc, author=auth)) + + else: + entries.append('feed_%d/index.html'%0) + feed_index(0, toc) + + for i, p in enumerate(entries): + entries[i] = os.path.join(dir, p.replace('/', os.sep)) + opf.create_spine(entries) + opf.set_toc(toc) + + with nested(open(opf_path, 'wb'), open(ncx_path, 'wb')) as (opf_file, ncx_file): + opf.render(opf_file, ncx_file) + diff --git a/recipes/ming_pao_vancouver.recipe b/recipes/ming_pao_vancouver.recipe new file mode 100644 index 0000000000..3312c8f7b8 --- /dev/null +++ b/recipes/ming_pao_vancouver.recipe @@ -0,0 +1,594 @@ +__license__ = 'GPL v3' +__copyright__ = '2010-2011, Eddie Lau' + +# Region - Hong Kong, Vancouver, Toronto +__Region__ = 'Vancouver' +# Users of Kindle 3 with limited system-level CJK support +# please replace the 
following "True" with "False". +__MakePeriodical__ = True +# Turn below to true if your device supports display of CJK titles +__UseChineseTitle__ = False +# Set it to False if you want to skip images +__KeepImages__ = True +# (HK only) Turn below to true if you wish to use life.mingpao.com as the main article source +__UseLife__ = True + + +''' +Change Log: +2011/06/26: add fetching Vancouver and Toronto versions of the paper, also provide captions for images using life.mingpao fetch source + provide options to remove all images in the file +2011/05/12: switch the main parse source to life.mingpao.com, which has more photos on the article pages +2011/03/06: add new articles for finance section, also a new section "Columns" +2011/02/28: rearrange the sections + [Disabled until Kindle has better CJK support and can remember last (section,article) read in Sections & Articles + View] make it the same title if generating a periodical, so past issue will be automatically put into "Past Issues" + folder in Kindle 3 +2011/02/20: skip duplicated links in finance section, put photos which may extend a whole page to the back of the articles + clean up the indentation +2010/12/07: add entertainment section, use newspaper front page as ebook cover, suppress date display in section list + (to avoid wrong date display in case the user generates the ebook in a time zone different from HKT) +2010/11/22: add English section, remove eco-news section which is not updated daily, correct + ordering of articles +2010/11/12: add news image and eco-news section +2010/11/08: add parsing of finance section +2010/11/06: temporary work-around for Kindle device having no capability to display unicode + in section/article list. +2010/10/31: skip repeated articles in section pages +''' + +import os, datetime, re +from calibre.web.feeds.recipes import BasicNewsRecipe +from contextlib import nested +from calibre.ebooks.BeautifulSoup import BeautifulSoup +from calibre.ebooks.metadata.opf2 import OPFCreator +from calibre.ebooks.metadata.toc import TOC +from calibre.ebooks.metadata import MetaInformation + +# MAIN CLASS +class MPRecipe(BasicNewsRecipe): + if __Region__ == 'Hong Kong': + title = 'Ming Pao - Hong Kong' + description = 'Hong Kong Chinese Newspaper (http://news.mingpao.com)' + category = 'Chinese, News, Hong Kong' + extra_css = 'img {display: block; margin-left: auto; margin-right: auto; margin-top: 10px; margin-bottom: 10px;} font>b {font-size:200%; font-weight:bold;}' + masthead_url = 'http://news.mingpao.com/image/portals_top_logo_news.gif' + keep_only_tags = [dict(name='h1'), + dict(name='font', attrs={'style':['font-size:14pt; line-height:160%;']}), # for entertainment page title + dict(name='font', attrs={'color':['AA0000']}), # for column articles title + dict(attrs={'id':['newscontent']}), # entertainment and column page content + dict(attrs={'id':['newscontent01','newscontent02']}), + dict(attrs={'class':['photo']}), + dict(name='table', attrs={'width':['100%'], 'border':['0'], 'cellspacing':['5'], 'cellpadding':['0']}), # content in printed version of life.mingpao.com + dict(name='img', attrs={'width':['180'], 'alt':['按圖放大']}) # images for source from life.mingpao.com + ] + if __KeepImages__: + remove_tags = [dict(name='style'), + dict(attrs={'id':['newscontent135']}), # for the finance page from mpfinance.com + dict(name='font', attrs={'size':['2'], 'color':['666666']}), # article date in life.mingpao.com article + #dict(name='table') # for content fetched from life.mingpao.com + ] + else: + 
remove_tags = [dict(name='style'), + dict(attrs={'id':['newscontent135']}), # for the finance page from mpfinance.com + dict(name='font', attrs={'size':['2'], 'color':['666666']}), # article date in life.mingpao.com article + dict(name='img'), + #dict(name='table') # for content fetched from life.mingpao.com + ] + remove_attributes = ['width'] + preprocess_regexps = [ + (re.compile(r'
<h5>', re.DOTALL|re.IGNORECASE),
+                               lambda match: '<h1>'),
+                              (re.compile(r'</h5>', re.DOTALL|re.IGNORECASE),
+                               lambda match: '</h1>'),
+                              (re.compile(r'<p><a href=.+?</a></p>', re.DOTALL|re.IGNORECASE),  # for entertainment page
+                               lambda match: ''),
+                              # skip <br> after title in life.mingpao.com fetched article
+                              (re.compile(r"<div id='newscontent'><br>", re.DOTALL|re.IGNORECASE),
+                               lambda match: "<div id='newscontent'>"),
+                              (re.compile(r"<br><br></b>", re.DOTALL|re.IGNORECASE),
+                               lambda match: "</b>")
+                              ]
+    elif __Region__ == 'Vancouver':
+        title = 'Ming Pao - Vancouver'
+        description = 'Vancouver Chinese Newspaper (http://www.mingpaovan.com)'
+        category = 'Chinese, News, Vancouver'
+        extra_css = 'img {display: block; margin-left: auto; margin-right: auto; margin-top: 10px; margin-bottom: 10px;} b>font {font-size:200%; font-weight:bold;}'
+        masthead_url = 'http://www.mingpaovan.com/image/mainlogo2_VAN2.gif'
+        keep_only_tags = [dict(name='table', attrs={'width':['450'], 'border':['0'], 'cellspacing':['0'], 'cellpadding':['1']}),
+                          dict(name='table', attrs={'width':['450'], 'border':['0'], 'cellspacing':['3'], 'cellpadding':['3'], 'id':['tblContent3']}),
+                          dict(name='table', attrs={'width':['180'], 'border':['0'], 'cellspacing':['0'], 'cellpadding':['0'], 'bgcolor':['F0F0F0']}),
+                          ]
+        if __KeepImages__:
+            remove_tags = [dict(name='img', attrs={'src':['../../../image/magnifier.gif']})]  # the magnifier icon
+        else:
+            remove_tags = [dict(name='img')]
+        remove_attributes = ['width']
+        preprocess_regexps = [(re.compile(r'&nbsp;', re.DOTALL|re.IGNORECASE),
+                               lambda match: ''),
+                              ]
+    elif __Region__ == 'Toronto':
+        title = 'Ming Pao - Toronto'
+        description = 'Toronto Chinese Newspaper (http://www.mingpaotor.com)'
+        category = 'Chinese, News, Toronto'
+        extra_css = 'img {display: block; margin-left: auto; margin-right: auto; margin-top: 10px; margin-bottom: 10px;} b>font {font-size:200%; font-weight:bold;}'
+        masthead_url = 'http://www.mingpaotor.com/image/mainlogo2_TOR2.gif'
+        keep_only_tags = [dict(name='table', attrs={'width':['450'], 'border':['0'], 'cellspacing':['0'], 'cellpadding':['1']}),
+                          dict(name='table', attrs={'width':['450'], 'border':['0'], 'cellspacing':['3'], 'cellpadding':['3'], 'id':['tblContent3']}),
+                          dict(name='table', attrs={'width':['180'], 'border':['0'], 'cellspacing':['0'], 'cellpadding':['0'], 'bgcolor':['F0F0F0']}),
+                          ]
+        if __KeepImages__:
+            remove_tags = [dict(name='img', attrs={'src':['../../../image/magnifier.gif']})]  # the magnifier icon
+        else:
+            remove_tags = [dict(name='img')]
+        remove_attributes = ['width']
+        preprocess_regexps = [(re.compile(r'&nbsp;', re.DOTALL|re.IGNORECASE),
+                               lambda match: ''),
+                              ]
+
+    oldest_article = 1
+    max_articles_per_feed = 100
+    __author__ = 'Eddie Lau'
+    publisher = 'MingPao'
+    remove_javascript = True
+    use_embedded_content = False
+    no_stylesheets = True
+    language = 'zh'
+    encoding = 'Big5-HKSCS'
+    recursions = 0
+    conversion_options = {'linearize_tables':True}
+    timefmt = ''
+
+    def image_url_processor(cls, baseurl, url):
+        # trick: break the url at the first occurance of digit, add an additional
+        # '_' at the front
+        # not working, may need to move this to preprocess_html() method
+#        minIdx = 10000
+#        i0 = url.find('0')
+#        if i0 >= 0 and i0 < minIdx:
+#            minIdx = i0
+#        i1 = url.find('1')
+#        if i1 >= 0 and i1 < minIdx:
+#            minIdx = i1
+#        i2 = url.find('2')
+#        if i2 >= 0 and i2 < minIdx:
+#            minIdx = i2
+#        i3 = url.find('3')
+#        if i3 >= 0 and i0 < minIdx:
+#            minIdx = i3
+#        i4 = url.find('4')
+#        if i4 >= 0 and i4 < minIdx:
+#            minIdx = i4
+#        i5 = url.find('5')
+#        if i5 >= 0 and i5 < minIdx:
+#            minIdx = i5
+#        i6 = url.find('6')
+#        if i6 >= 0 and i6 < minIdx:
+#            minIdx = i6
+#        i7 = url.find('7')
+#        if i7 >= 0 and i7 < minIdx:
+#            minIdx = i7
+#        i8 = url.find('8')
+#        if i8 >= 0 and i8 < minIdx:
+#            minIdx = i8
+#        i9 = url.find('9')
+#        if i9 >= 0 and i9 < minIdx:
+#            minIdx = i9
+        return url
+
+    def get_dtlocal(self):
+        dt_utc = datetime.datetime.utcnow()
+        if __Region__ == 'Hong Kong':
+            #
convert UTC to local hk time - at HKT 4.30am, all news are available + dt_local = dt_utc + datetime.timedelta(8.0/24) - datetime.timedelta(4.5/24) + # dt_local = dt_utc.astimezone(pytz.timezone('Asia/Hong_Kong')) - datetime.timedelta(4.5/24) + elif __Region__ == 'Vancouver': + # convert UTC to local Vancouver time - at PST time 4.30am, all news are available + dt_local = dt_utc + datetime.timedelta(-8.0/24) - datetime.timedelta(4.5/24) + #dt_local = dt_utc.astimezone(pytz.timezone('America/Vancouver')) - datetime.timedelta(4.5/24) + elif __Region__ == 'Toronto': + # convert UTC to local Toronto time - at EST time 4.30am, all news are available + dt_local = dt_utc + datetime.timedelta(-5.0/24) - datetime.timedelta(4.5/24) + #dt_local = dt_utc.astimezone(pytz.timezone('America/Toronto')) - datetime.timedelta(4.5/24) + return dt_local + + def get_fetchdate(self): + return self.get_dtlocal().strftime("%Y%m%d") + + def get_fetchformatteddate(self): + return self.get_dtlocal().strftime("%Y-%m-%d") + + def get_fetchday(self): + return self.get_dtlocal().strftime("%d") + + def get_cover_url(self): + if __Region__ == 'Hong Kong': + cover = 'http://news.mingpao.com/' + self.get_fetchdate() + '/' + self.get_fetchdate() + '_' + self.get_fetchday() + 'gacov.jpg' + elif __Region__ == 'Vancouver': + cover = 'http://www.mingpaovan.com/ftp/News/' + self.get_fetchdate() + '/' + self.get_fetchday() + 'pgva1s.jpg' + elif __Region__ == 'Toronto': + cover = 'http://www.mingpaotor.com/ftp/News/' + self.get_fetchdate() + '/' + self.get_fetchday() + 'pgtas.jpg' + br = BasicNewsRecipe.get_browser() + try: + br.open(cover) + except: + cover = None + return cover + + def parse_index(self): + feeds = [] + dateStr = self.get_fetchdate() + + if __Region__ == 'Hong Kong': + if __UseLife__: + for title, url, keystr in [(u'\u8981\u805e Headline', 'http://life.mingpao.com/cfm/dailynews2.cfm?Issue=' + dateStr + '&Category=nalga', 'nal'), + (u'\u6e2f\u805e Local', 'http://life.mingpao.com/cfm/dailynews2.cfm?Issue=' + dateStr + '&Category=nalgb', 'nal'), + (u'\u6559\u80b2 Education', 'http://life.mingpao.com/cfm/dailynews2.cfm?Issue=' + dateStr + '&Category=nalgf', 'nal'), + (u'\u793e\u8a55/\u7b46\u9663 Editorial', 'http://life.mingpao.com/cfm/dailynews2.cfm?Issue=' + dateStr +'&Category=nalmr', 'nal'), + (u'\u8ad6\u58c7 Forum', 'http://life.mingpao.com/cfm/dailynews2.cfm?Issue=' + dateStr +'&Category=nalfa', 'nal'), + (u'\u4e2d\u570b China', 'http://life.mingpao.com/cfm/dailynews2.cfm?Issue=' + dateStr +'&Category=nalca', 'nal'), + (u'\u570b\u969b World', 'http://life.mingpao.com/cfm/dailynews2.cfm?Issue=' + dateStr +'&Category=nalta', 'nal'), + (u'\u7d93\u6fdf Finance', 'http://life.mingpao.com/cfm/dailynews2.cfm?Issue=' + dateStr + '&Category=nalea', 'nal'), + (u'\u9ad4\u80b2 Sport', 'http://life.mingpao.com/cfm/dailynews2.cfm?Issue=' + dateStr + '&Category=nalsp', 'nal'), + (u'\u5f71\u8996 Film/TV', 'http://life.mingpao.com/cfm/dailynews2.cfm?Issue=' + dateStr + '&Category=nalma', 'nal'), + (u'\u5c08\u6b04 Columns', 'http://life.mingpao.com/cfm/dailynews2.cfm?Issue=' + dateStr +'&Category=ncolumn', 'ncl')]: + articles = self.parse_section2(url, keystr) + if articles: + feeds.append((title, articles)) + + for title, url in [(u'\u526f\u520a Supplement', 'http://news.mingpao.com/' + dateStr + '/jaindex.htm'), + (u'\u82f1\u6587 English', 'http://news.mingpao.com/' + dateStr + '/emindex.htm')]: + articles = self.parse_section(url) + if articles: + feeds.append((title, articles)) + else: + for title, url in [(u'\u8981\u805e 
+    def parse_index(self):
+        feeds = []
+        dateStr = self.get_fetchdate()
+
+        if __Region__ == 'Hong Kong':
+            if __UseLife__:
+                for title, url, keystr in [(u'\u8981\u805e Headline', 'http://life.mingpao.com/cfm/dailynews2.cfm?Issue=' + dateStr + '&Category=nalga', 'nal'),
+                                           (u'\u6e2f\u805e Local', 'http://life.mingpao.com/cfm/dailynews2.cfm?Issue=' + dateStr + '&Category=nalgb', 'nal'),
+                                           (u'\u6559\u80b2 Education', 'http://life.mingpao.com/cfm/dailynews2.cfm?Issue=' + dateStr + '&Category=nalgf', 'nal'),
+                                           (u'\u793e\u8a55/\u7b46\u9663 Editorial', 'http://life.mingpao.com/cfm/dailynews2.cfm?Issue=' + dateStr + '&Category=nalmr', 'nal'),
+                                           (u'\u8ad6\u58c7 Forum', 'http://life.mingpao.com/cfm/dailynews2.cfm?Issue=' + dateStr + '&Category=nalfa', 'nal'),
+                                           (u'\u4e2d\u570b China', 'http://life.mingpao.com/cfm/dailynews2.cfm?Issue=' + dateStr + '&Category=nalca', 'nal'),
+                                           (u'\u570b\u969b World', 'http://life.mingpao.com/cfm/dailynews2.cfm?Issue=' + dateStr + '&Category=nalta', 'nal'),
+                                           (u'\u7d93\u6fdf Finance', 'http://life.mingpao.com/cfm/dailynews2.cfm?Issue=' + dateStr + '&Category=nalea', 'nal'),
+                                           (u'\u9ad4\u80b2 Sport', 'http://life.mingpao.com/cfm/dailynews2.cfm?Issue=' + dateStr + '&Category=nalsp', 'nal'),
+                                           (u'\u5f71\u8996 Film/TV', 'http://life.mingpao.com/cfm/dailynews2.cfm?Issue=' + dateStr + '&Category=nalma', 'nal'),
+                                           (u'\u5c08\u6b04 Columns', 'http://life.mingpao.com/cfm/dailynews2.cfm?Issue=' + dateStr + '&Category=ncolumn', 'ncl')]:
+                    articles = self.parse_section2(url, keystr)
+                    if articles:
+                        feeds.append((title, articles))
+
+                for title, url in [(u'\u526f\u520a Supplement', 'http://news.mingpao.com/' + dateStr + '/jaindex.htm'),
+                                   (u'\u82f1\u6587 English', 'http://news.mingpao.com/' + dateStr + '/emindex.htm')]:
+                    articles = self.parse_section(url)
+                    if articles:
+                        feeds.append((title, articles))
+            else:
+                for title, url in [(u'\u8981\u805e Headline', 'http://news.mingpao.com/' + dateStr + '/gaindex.htm'),
+                                   (u'\u6e2f\u805e Local', 'http://news.mingpao.com/' + dateStr + '/gbindex.htm'),
+                                   (u'\u6559\u80b2 Education', 'http://news.mingpao.com/' + dateStr + '/gfindex.htm')]:
+                    articles = self.parse_section(url)
+                    if articles:
+                        feeds.append((title, articles))
+
+                # special - editorial
+                ed_articles = self.parse_ed_section('http://life.mingpao.com/cfm/dailynews2.cfm?Issue=' + dateStr + '&Category=nalmr')
+                if ed_articles:
+                    feeds.append((u'\u793e\u8a55/\u7b46\u9663 Editorial', ed_articles))
+
+                for title, url in [(u'\u8ad6\u58c7 Forum', 'http://news.mingpao.com/' + dateStr + '/faindex.htm'),
+                                   (u'\u4e2d\u570b China', 'http://news.mingpao.com/' + dateStr + '/caindex.htm'),
+                                   (u'\u570b\u969b World', 'http://news.mingpao.com/' + dateStr + '/taindex.htm')]:
+                    articles = self.parse_section(url)
+                    if articles:
+                        feeds.append((title, articles))
+
+                # special - finance
+                #fin_articles = self.parse_fin_section('http://www.mpfinance.com/htm/Finance/' + dateStr + '/News/ea,eb,ecindex.htm')
+                fin_articles = self.parse_fin_section('http://life.mingpao.com/cfm/dailynews2.cfm?Issue=' + dateStr + '&Category=nalea')
+                if fin_articles:
+                    feeds.append((u'\u7d93\u6fdf Finance', fin_articles))
+
+                for title, url in [('Tech News', 'http://news.mingpao.com/' + dateStr + '/naindex.htm'),
+                                   (u'\u9ad4\u80b2 Sport', 'http://news.mingpao.com/' + dateStr + '/spindex.htm')]:
+                    articles = self.parse_section(url)
+                    if articles:
+                        feeds.append((title, articles))
+
+                # special - entertainment
+                ent_articles = self.parse_ent_section('http://ol.mingpao.com/cfm/star1.cfm')
+                if ent_articles:
+                    feeds.append((u'\u5f71\u8996 Film/TV', ent_articles))
+
+                for title, url in [(u'\u526f\u520a Supplement', 'http://news.mingpao.com/' + dateStr + '/jaindex.htm'),
+                                   (u'\u82f1\u6587 English', 'http://news.mingpao.com/' + dateStr + '/emindex.htm')]:
+                    articles = self.parse_section(url)
+                    if articles:
+                        feeds.append((title, articles))
+
+                # special - columns
+                col_articles = self.parse_col_section('http://life.mingpao.com/cfm/dailynews2.cfm?Issue=' + dateStr + '&Category=ncolumn')
+                if col_articles:
+                    feeds.append((u'\u5c08\u6b04 Columns', col_articles))
+        elif __Region__ == 'Vancouver':
+            for title, url in [(u'\u8981\u805e Headline', 'http://www.mingpaovan.com/htm/News/' + dateStr + '/VAindex.htm'),
+                               (u'\u52a0\u570b Canada', 'http://www.mingpaovan.com/htm/News/' + dateStr + '/VBindex.htm'),
+                               (u'\u793e\u5340 Local', 'http://www.mingpaovan.com/htm/News/' + dateStr + '/VDindex.htm'),
+                               (u'\u6e2f\u805e Hong Kong', 'http://www.mingpaovan.com/htm/News/' + dateStr + '/HK-VGindex.htm'),
+                               (u'\u570b\u969b World', 'http://www.mingpaovan.com/htm/News/' + dateStr + '/VTindex.htm'),
+                               (u'\u4e2d\u570b China', 'http://www.mingpaovan.com/htm/News/' + dateStr + '/VCindex.htm'),
+                               (u'\u7d93\u6fdf Economics', 'http://www.mingpaovan.com/htm/News/' + dateStr + '/VEindex.htm'),
+                               (u'\u9ad4\u80b2 Sports', 'http://www.mingpaovan.com/htm/News/' + dateStr + '/VSindex.htm'),
+                               (u'\u5f71\u8996 Film/TV', 'http://www.mingpaovan.com/htm/News/' + dateStr + '/HK-MAindex.htm'),
+                               (u'\u526f\u520a Supplements', 'http://www.mingpaovan.com/htm/News/' + dateStr + '/WWindex.htm')]:
+                articles = self.parse_section3(url, 'http://www.mingpaovan.com/')
+                if articles:
+                    feeds.append((title, articles))
+        elif __Region__ == 'Toronto':
+            for title, url in [(u'\u8981\u805e Headline', 'http://www.mingpaotor.com/htm/News/' + dateStr + '/TAindex.htm'),
+                               (u'\u52a0\u570b Canada', 'http://www.mingpaotor.com/htm/News/' + dateStr + '/TDindex.htm'),
+                               (u'\u793e\u5340 Local', 'http://www.mingpaotor.com/htm/News/' + dateStr + '/TFindex.htm'),
+                               (u'\u4e2d\u570b China', 'http://www.mingpaotor.com/htm/News/' + dateStr + '/TCAindex.htm'),
+                               (u'\u570b\u969b World', 'http://www.mingpaotor.com/htm/News/' + dateStr + '/TTAindex.htm'),
+                               (u'\u6e2f\u805e Hong Kong', 'http://www.mingpaotor.com/htm/News/' + dateStr + '/HK-GAindex.htm'),
+                               (u'\u7d93\u6fdf Economics', 'http://www.mingpaotor.com/htm/News/' + dateStr + '/THindex.htm'),
+                               (u'\u9ad4\u80b2 Sports', 'http://www.mingpaotor.com/htm/News/' + dateStr + '/TSindex.htm'),
+                               (u'\u5f71\u8996 Film/TV', 'http://www.mingpaotor.com/htm/News/' + dateStr + '/HK-MAindex.htm'),
+                               (u'\u526f\u520a Supplements', 'http://www.mingpaotor.com/htm/News/' + dateStr + '/WWindex.htm')]:
+                articles = self.parse_section3(url, 'http://www.mingpaotor.com/')
+                if articles:
+                    feeds.append((title, articles))
+        return feeds
+
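+    # For reference, each feed appended above is the (title, articles) pair
+    # that calibre expects back from parse_index(); the article values in
+    # this example are illustrative only:
+    #
+    #   (u'\u8981\u805e Headline',
+    #    [{'title': 'Some article', 'url': 'http://...', 'description': '', 'date': ''}])
+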
+    # parse from news.mingpao.com
+    def parse_section(self, url):
+        dateStr = self.get_fetchdate()
+        soup = self.index_to_soup(url)
+        divs = soup.findAll(attrs={'class': ['bullet', 'bullet_grey']})
+        current_articles = []
+        included_urls = []
+        divs.reverse()
+        for i in divs:
+            a = i.find('a', href=True)
+            title = self.tag_to_string(a)
+            url = a.get('href', False)
+            url = 'http://news.mingpao.com/' + dateStr + '/' + url
+            if url not in included_urls and url.rfind('Redirect') == -1:
+                current_articles.append({'title': title, 'url': url, 'description': '', 'date': ''})
+                included_urls.append(url)
+        current_articles.reverse()
+        return current_articles
+
+    # parse from life.mingpao.com
+    def parse_section2(self, url, keystr):
+        self.get_fetchdate()
+        soup = self.index_to_soup(url)
+        a = soup.findAll('a', href=True)
+        a.reverse()
+        current_articles = []
+        included_urls = []
+        for i in a:
+            title = self.tag_to_string(i)
+            url = 'http://life.mingpao.com/cfm/' + i.get('href', False)
+            if (url not in included_urls) and (url.rfind('.txt') != -1) and (url.rfind(keystr) != -1):
+                url = url.replace('dailynews3.cfm', 'dailynews3a.cfm')  # use printed version of the article
+                current_articles.append({'title': title, 'url': url, 'description': ''})
+                included_urls.append(url)
+        current_articles.reverse()
+        return current_articles
+
+    # parse from www.mingpaovan.com
+    def parse_section3(self, url, baseUrl):
+        self.get_fetchdate()
+        soup = self.index_to_soup(url)
+        divs = soup.findAll(attrs={'class': ['ListContentLargeLink']})
+        current_articles = []
+        included_urls = []
+        divs.reverse()
+        for i in divs:
+            title = self.tag_to_string(i)
+            urlstr = i.get('href', False)
+            urlstr = baseUrl + '/' + urlstr.replace('../../../', '')
+            if urlstr not in included_urls:
+                current_articles.append({'title': title, 'url': urlstr, 'description': '', 'date': ''})
+                included_urls.append(urlstr)
+        current_articles.reverse()
+        return current_articles
+
+    def parse_ed_section(self, url):
+        self.get_fetchdate()
+        soup = self.index_to_soup(url)
+        a = soup.findAll('a', href=True)
+        a.reverse()
+        current_articles = []
+        included_urls = []
+        for i in a:
+            title = self.tag_to_string(i)
+            url = 'http://life.mingpao.com/cfm/' + i.get('href', False)
+            if (url not in included_urls) and (url.rfind('.txt') != -1) and (url.rfind('nal') != -1):
+                current_articles.append({'title': title, 'url': url, 'description': ''})
+                included_urls.append(url)
+        current_articles.reverse()
+        return current_articles
+
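+    # Note, illustrative only: the url.rfind(x) != -1 tests in the parsers
+    # above are plain substring checks, so an equivalent and more readable
+    # form would be:
+    #
+    #   if url not in included_urls and '.txt' in url and keystr in url:
+    #       ...
+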
+    def parse_fin_section(self, url):
+        self.get_fetchdate()
+        soup = self.index_to_soup(url)
+        a = soup.findAll('a', href=True)
+        current_articles = []
+        included_urls = []
+        for i in a:
+            #url = 'http://www.mpfinance.com/cfm/' + i.get('href', False)
+            url = 'http://life.mingpao.com/cfm/' + i.get('href', False)
+            #if url not in included_urls and not url.rfind(dateStr) == -1 and url.rfind('index') == -1:
+            if url not in included_urls and (url.rfind('txt') != -1) and (url.rfind('nal') != -1):
+                title = self.tag_to_string(i)
+                current_articles.append({'title': title, 'url': url, 'description': ''})
+                included_urls.append(url)
+        return current_articles
+
+    def parse_ent_section(self, url):
+        self.get_fetchdate()
+        soup = self.index_to_soup(url)
+        a = soup.findAll('a', href=True)
+        a.reverse()
+        current_articles = []
+        included_urls = []
+        for i in a:
+            title = self.tag_to_string(i)
+            url = 'http://ol.mingpao.com/cfm/' + i.get('href', False)
+            if (url not in included_urls) and (url.rfind('.txt') != -1) and (url.rfind('star') != -1):
+                current_articles.append({'title': title, 'url': url, 'description': ''})
+                included_urls.append(url)
+        current_articles.reverse()
+        return current_articles
+
+    def parse_col_section(self, url):
+        self.get_fetchdate()
+        soup = self.index_to_soup(url)
+        a = soup.findAll('a', href=True)
+        a.reverse()
+        current_articles = []
+        included_urls = []
+        for i in a:
+            title = self.tag_to_string(i)
+            url = 'http://life.mingpao.com/cfm/' + i.get('href', False)
+            if (url not in included_urls) and (url.rfind('.txt') != -1) and (url.rfind('ncl') != -1):
+                current_articles.append({'title': title, 'url': url, 'description': ''})
+                included_urls.append(url)
+        current_articles.reverse()
+        return current_articles
+
+    def preprocess_html(self, soup):
+        # strip inline styling and legacy layout attributes so that extra_css
+        # controls the final appearance
+        for item in soup.findAll(style=True):
+            del item['style']
+        for item in soup.findAll(width=True):
+            del item['width']
+        for item in soup.findAll(align=True):
+            del item['align']
+        return soup
+
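+    # For example, preprocess_html() above reduces markup such as
+    #   <img src="photo.jpg" style="float:left" width="180" align="absmiddle">
+    # to
+    #   <img src="photo.jpg">
+    # before conversion (the sample tag is illustrative only).
+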
+    def create_opf(self, feeds, dir=None):
+        if dir is None:
+            dir = self.output_dir
+        if __UseChineseTitle__ == True:
+            if __Region__ == 'Hong Kong':
+                title = u'\u660e\u5831 (\u9999\u6e2f)'
+            elif __Region__ == 'Vancouver':
+                title = u'\u660e\u5831 (\u6eab\u54e5\u83ef)'
+            elif __Region__ == 'Toronto':
+                title = u'\u660e\u5831 (\u591a\u502b\u591a)'
+        else:
+            title = self.short_title()
+        # if not generating a periodical, force date to apply in title
+        if __MakePeriodical__ == False:
+            title = title + ' ' + self.get_fetchformatteddate()
+        mi = MetaInformation(title, [self.publisher])
+        mi.publisher = self.publisher
+        mi.author_sort = self.publisher
+        if __MakePeriodical__ == True:
+            mi.publication_type = 'periodical:' + self.publication_type + ':' + self.short_title()
+        else:
+            mi.publication_type = self.publication_type + ':' + self.short_title()
+        #mi.timestamp = nowf()
+        mi.timestamp = self.get_dtlocal()
+        mi.comments = self.description
+        if not isinstance(mi.comments, unicode):
+            mi.comments = mi.comments.decode('utf-8', 'replace')
+        #mi.pubdate = nowf()
+        mi.pubdate = self.get_dtlocal()
+        opf_path = os.path.join(dir, 'index.opf')
+        ncx_path = os.path.join(dir, 'index.ncx')
+        opf = OPFCreator(dir, mi)
+        # Add mastheadImage entry to <guide> section
+        mp = getattr(self, 'masthead_path', None)
+        if mp is not None and os.access(mp, os.R_OK):
+            from calibre.ebooks.metadata.opf2 import Guide
+            ref = Guide.Reference(os.path.basename(self.masthead_path), os.getcwdu())
+            ref.type = 'masthead'
+            ref.title = 'Masthead Image'
+            opf.guide.append(ref)
+
+        manifest = [os.path.join(dir, 'feed_%d'%i) for i in range(len(feeds))]
+        manifest.append(os.path.join(dir, 'index.html'))
+        manifest.append(os.path.join(dir, 'index.ncx'))
+
+        # Get cover
+        cpath = getattr(self, 'cover_path', None)
+        if cpath is None:
+            pf = open(os.path.join(dir, 'cover.jpg'), 'wb')
+            if self.default_cover(pf):
+                cpath = pf.name
+        if cpath is not None and os.access(cpath, os.R_OK):
+            opf.cover = cpath
+            manifest.append(cpath)
+
+        # Get masthead
+        mpath = getattr(self, 'masthead_path', None)
+        if mpath is not None and os.access(mpath, os.R_OK):
+            manifest.append(mpath)
+
+        opf.create_manifest_from_files_in(manifest)
+        for mani in opf.manifest:
+            if mani.path.endswith('.ncx'):
+                mani.id = 'ncx'
+            if mani.path.endswith('mastheadImage.jpg'):
+                mani.id = 'masthead-image'
+        entries = ['index.html']
+        toc = TOC(base_path=dir)
+        self.play_order_counter = 0
+        self.play_order_map = {}
+
+        def feed_index(num, parent):
+            f = feeds[num]
+            for j, a in enumerate(f):
+                if getattr(a, 'downloaded', False):
+                    adir = 'feed_%d/article_%d/'%(num, j)
+                    auth = a.author
+                    if not auth:
+                        auth = None
+                    desc = a.text_summary
+                    if not desc:
+                        desc = None
+                    else:
+                        desc = self.description_limiter(desc)
+                    entries.append('%sindex.html'%adir)
+                    po = self.play_order_map.get(entries[-1], None)
+                    if po is None:
+                        self.play_order_counter += 1
+                        po = self.play_order_counter
+                    parent.add_item('%sindex.html'%adir, None, a.title if a.title else _('Untitled Article'),
+                                    play_order=po, author=auth, description=desc)
+                    last = os.path.join(self.output_dir, ('%sindex.html'%adir).replace('/', os.sep))
+                    for sp in a.sub_pages:
+                        prefix = os.path.commonprefix([opf_path, sp])
+                        relp = sp[len(prefix):]
+                        entries.append(relp.replace(os.sep, '/'))
+                        last = sp
+
+                    if os.path.exists(last):
+                        with open(last, 'rb') as fi:
+                            src = fi.read().decode('utf-8')
+                        soup = BeautifulSoup(src)
+                        body = soup.find('body')
+                        if body is not None:
+                            prefix = '/'.join('..' for i in range(2*len(re.findall(r'link\d+', last))))
+                            templ = self.navbar.generate(True, num, j, len(f),
+                                                         not self.has_single_feed,
+                                                         a.orig_url, self.publisher, prefix=prefix,
+                                                         center=self.center_navbar)
+                            elem = BeautifulSoup(templ.render(doctype='xhtml').decode('utf-8')).find('div')
+                            body.insert(len(body.contents), elem)
+                            with open(last, 'wb') as fi:
+                                fi.write(unicode(soup).encode('utf-8'))
+        if len(feeds) == 0:
+            raise Exception('All feeds are empty, aborting.')
+
+        if len(feeds) > 1:
+            for i, f in enumerate(feeds):
+                entries.append('feed_%d/index.html'%i)
+                po = self.play_order_map.get(entries[-1], None)
+                if po is None:
+                    self.play_order_counter += 1
+                    po = self.play_order_counter
+                auth = getattr(f, 'author', None)
+                if not auth:
+                    auth = None
+                desc = getattr(f, 'description', None)
+                if not desc:
+                    desc = None
+                feed_index(i, toc.add_item('feed_%d/index.html'%i, None,
+                                           f.title, play_order=po, description=desc, author=auth))
+        else:
+            entries.append('feed_%d/index.html'%0)
+            feed_index(0, toc)
+
+        for i, p in enumerate(entries):
+            entries[i] = os.path.join(dir, p.replace('/', os.sep))
+        opf.create_spine(entries)
+        opf.set_toc(toc)
+
+        with nested(open(opf_path, 'wb'), open(ncx_path, 'wb')) as (opf_file, ncx_file):
+            opf.render(opf_file, ncx_file)
+
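+# Note, illustrative only: with __MakePeriodical__ = True, create_opf() above
+# marks the output as a periodical via its publication_type
+# ('periodical:<type>:<short title>'); with False, the fetch date is appended
+# to the title instead, e.g. 'Ming Pao - Vancouver 2011-06-26'.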