__license__ = 'GPL v3'
__copyright__ = '2010-2013, Eddie Lau'

# Region - Hong Kong, Vancouver, Toronto
__Region__ = 'Hong Kong'
# Users of Kindle 3 with limited system-level CJK support
# should set the following to False. (Default: True)
__MakePeriodical__ = True
# Set to True if your device can display CJK titles (Default: False)
__UseChineseTitle__ = False
# Set to False if you want to skip images (Default: True)
__KeepImages__ = True
# Set to True if you want a summary in Kindle's article view (Default: True)
__IncludeSummary__ = True
# Set to True if you want thumbnail images in Kindle's article view (Default: True)
__IncludeThumbnails__ = True
# (HK only) Set to True to use life.mingpao.com as the main article source (Default: True)
__UseLife__ = True
# (HK only) Set to True to include premium content (Default: False)
__InclPremium__ = False
# (HK only) Set to True to parse articles on news1.mingpao.com in their
# printer-friendly format (Default: False)
__ParsePF__ = False
# (HK only) Set to True to parse articles on news1.mingpao.com in text format
# (Default: True) -- overrides __ParsePF__
__ParseTxt__ = True
# (HK only) Use the mobile text version for some articles (Default: False)
__ParseSelectedMobile__ = False
# (HK only) Set to True if you wish to fetch hi-res images (Default: True)
__HiResImg__ = True
# Override the date returned by the program by specifying a YYYYMMDD string below
# (has no effect if __ParseSelectedMobile__ is True and __UseLife__ is False)
__Date__ = ''

'''
Change Log:
2014/10/19: update urls of some web locations and the top logo
2013/09/28: allow thumbnails even with hi-res images
2012/04/24: improved parsing of news1.mingpao.com content
2011/12/18: update the overridden create_opf(.) routine with the one from
            Calibre version 0.8.31; move __UseChineseTitle__ usage away from
            create_opf(.); optional support of text_summary and thumbnail
            images in Kindle's article view; start new-day download of the
            Hong Kong Mingpao at 4.30am; set the actual publication date
            shown on the kindle device
2011/12/01: take care of the situation that, in txt source parsing, the
            article content may not start with the special character u'\u3010'
2011/10/21: fix a bug that hi-res img is unavailable in pages parsed from source txt
2011/10/19: fix a bug in txt source parsing
2011/10/17: disable fetching of premium content, also improved txt source parsing
2011/10/04: option to get hi-res photos for the articles
2011/09/21: fetching "column" section is made optional
2011/09/18: parse "column" section stuff from source text file directly
2011/09/07: disable "column" section as it is no longer offered free
2011/06/26: add fetching of the Vancouver and Toronto versions of the paper;
            provide captions for images when using the life.mingpao fetch
            source; provide options to remove all images in the file
2011/05/12: switch the main parse source to life.mingpao.com, which has more
            photos on the article pages
2011/03/06: add new articles for the finance section, also a new section "Columns"
2011/02/28: rearrange the sections
            [Disabled until Kindle has better CJK support and can remember the
            last (section, article) read in Sections & Articles View];
            use the same title when generating a periodical, so past issues
            are automatically put into the "Past Issues" folder on Kindle 3
2011/02/20: skip duplicated links in the finance section; put photos which may
            extend over a whole page at the back of the articles; clean up
            the indentation
2010/12/07: add entertainment section; use the newspaper front page as the
            ebook cover; suppress date display in the section list (to avoid a
            wrong date in case the user generates the ebook in a time zone
            different from HKT)
2010/11/22: add English section; remove the eco-news section which is not
            updated daily; correct ordering of articles
2010/11/12: add news image and eco-news section
2010/11/08: add parsing of the finance section
2010/11/06: temporary work-around for Kindle devices having no capability to
            display unicode in the section/article list
2010/10/31: skip repeated articles in section pages
'''

from calibre.utils.date import now as nowf
import os
import datetime
import re
import mechanize
from calibre.web.feeds.recipes import BasicNewsRecipe
from contextlib import nested
from calibre.ebooks.BeautifulSoup import BeautifulSoup
from calibre.ebooks.metadata.opf2 import OPFCreator
from calibre.ebooks.metadata.toc import TOC
from calibre.ebooks.metadata import MetaInformation
from calibre.utils.localization import canonicalize_lang


# MAIN CLASS
class MPRecipe(BasicNewsRecipe):
    if __Region__ == 'Hong Kong':
        if __UseChineseTitle__ is True:
            title = u'\u660e\u5831 (\u9999\u6e2f)'
        else:
            title = 'Ming Pao - Hong Kong'
        description = 'Hong Kong Chinese Newspaper (http://news1.mingpao.com)'
        category = 'Chinese, News, Hong Kong'
        extra_css = 'img {display: block; margin-left: auto; margin-right: auto; margin-top: 10px; margin-bottom: 10px; max-height:90%;} font>b {font-size:200%; font-weight:bold;} div[class=heading] {font-size:200%; font-weight:bold;} div[class=images] {font-size:50%;}'  # noqa
        masthead_url = 'http://news.mingpao.com/image/mingpaonews_logo.png'
        remove_tags_before = dict(name='font', attrs={'color': ['navy']})
        keep_only_tags = [dict(name='h1'),  # for entertainment page title
                          dict(name='font', attrs={'style': ['font-size:14pt; line-height:160%;']}),  # for column articles title
                          dict(name='font', attrs={'color': ['AA0000']}),  # for heading from txt
                          dict(attrs={'class': ['heading']}),
                          # entertainment and column page content
                          dict(attrs={'id': ['newscontent']}),
                          dict(attrs={'id': ['newscontent01', 'newscontent02']}),
                          # for content from txt
                          dict(attrs={'class': ['content']}),
                          dict(attrs={'class': ['photo']}),
                          dict(name='table', attrs={'width': ['100%'], 'border': ['0'], 'cellspacing': ['5'], 'cellpadding': ['0']}),
                          # content in printed version of life.mingpao.com
                          # images for source from life.mingpao.com
                          dict(name='img', attrs={'width': ['180'], 'alt': ['????']}),
                          # for images from txt
                          dict(attrs={'class': ['images']}),
                          dict(name='table', attrs={'width': ['100%'], 'cellspacing': ['0'], 'cellpadding': ['0'], 'border': ['0']})  # content table in pda site
                          ]
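        # keep_only_tags and remove_tags (below) are BeautifulSoup-style matchers:
        # calibre keeps only the parts of a page matched by keep_only_tags and then
        # strips anything matched by remove_tags.  A minimal, illustrative sketch of
        # what one of the matchers above selects (standalone; the sample markup is
        # made up and not part of the recipe):
        #
        #   from calibre.ebooks.BeautifulSoup import BeautifulSoup
        #   sample = BeautifulSoup('<font color="AA0000">headline</font><font>body</font>')
        #   print sample.findAll(name='font', attrs={'color': ['AA0000']})
        #   # -> [<font color="AA0000">headline</font>]   (the heading on txt-based pages)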
        if __KeepImages__:
            remove_tags = [dict(name='style'),
                           # for the finance page from mpfinance.com
                           dict(attrs={'id': ['newscontent135']}),
                           # article date in life.mingpao.com article
                           dict(name='font', attrs={'size': ['2'], 'color': ['666666']}),
                           # non-article images in life.mingpao.com article
                           dict(name='img', attrs={'alt': ["明報網站", "按此列印", "關閉本視窗"]}),
                           dict(name='img', attrs={'src': ["../image/top_2.gif"]})
                           # dict(name='table')  # for content fetched from life.mingpao.com
                           # dict(name='table', attrs={'width':['98%'], 'cellspacing':['0'], 'cellpadding':['0'], 'border':['0']})
                           ]
        else:
            remove_tags = [dict(name='style'),
                           # for the finance page from mpfinance.com
                           dict(attrs={'id': ['newscontent135']}),
                           # article date in life.mingpao.com article
                           dict(name='font', attrs={'size': ['2'], 'color': ['666666']}),
                           dict(name='img'),
                           # dict(name='table')  # for content fetched from life.mingpao.com
                           # dict(name='table', attrs={'width':['98%'], 'cellspacing':['0'], 'cellpadding':['0'], 'border':['0']})
                           ]
        remove_attributes = ['width']
        preprocess_regexps = [
            (re.compile(r'<h5>', re.DOTALL | re.IGNORECASE),
             lambda match: '<h1>'),
            (re.compile(r'</h5>', re.DOTALL | re.IGNORECASE),
             lambda match: '</h1>'),
            (re.compile(r'<p><a href=.+?</a></p>', re.DOTALL | re.IGNORECASE),  # for entertainment page
             lambda match: ''),
            # skip <br> after title in life.mingpao.com fetched article
            (re.compile(r"<div id='newscontent'><br>", re.DOTALL | re.IGNORECASE),
             lambda match: "<div id='newscontent'>"),
            (re.compile(r"", re.DOTALL | re.IGNORECASE),
             lambda match: ""),
            (re.compile(r'', re.DOTALL | re.IGNORECASE),
             lambda match: ''),
            (re.compile(r'', re.DOTALL | re.IGNORECASE),
             lambda match: ''),
            (re.compile(r'', re.DOTALL | re.IGNORECASE),
             lambda match: ''),
            # (re.compile(r'[.+?]', re.DOTALL|re.IGNORECASE),
            #  lambda match: '')
        ]
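        # Each (pattern, handler) pair above is applied to the raw page source before
        # it is parsed, essentially pattern.sub(handler, source) run in order.  A
        # rough, self-contained sketch of the mechanism (illustrative only; the
        # sample markup and rules below are made up):
        #
        #   import re
        #   rules = [(re.compile(r'<h5>', re.IGNORECASE), lambda m: '<h1>'),
        #            (re.compile(r'</h5>', re.IGNORECASE), lambda m: '</h1>')]
        #   source = '<h5>Headline</h5>'
        #   for pat, handler in rules:
        #       source = pat.sub(handler, source)
        #   # source is now '<h1>Headline</h1>'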
    elif __Region__ == 'Vancouver':
        if __UseChineseTitle__ is True:
            title = u'\u660e\u5831 (\u6eab\u54e5\u83ef)'
        else:
            title = 'Ming Pao - Vancouver'
        description = 'Vancouver Chinese Newspaper (http://www.mingpaovan.com)'
        category = 'Chinese, News, Vancouver'
        extra_css = 'img {display: block; margin-left: auto; margin-right: auto; margin-top: 10px; margin-bottom: 10px;} b>font {font-size:200%; font-weight:bold;}'  # noqa
        masthead_url = 'http://www.mingpaovan.com/image/mainlogo2_VAN2.gif'
        keep_only_tags = [dict(name='table', attrs={'width': ['450'], 'border': ['0'], 'cellspacing': ['0'], 'cellpadding': ['1']}),
                          dict(name='table', attrs={'width': ['450'], 'border': ['0'], 'cellspacing': ['3'], 'cellpadding': ['3'], 'id': ['tblContent3']}),
                          dict(name='table', attrs={'width': ['180'], 'border': ['0'], 'cellspacing': ['0'], 'cellpadding': ['0'], 'bgcolor': ['F0F0F0']}),
                          ]
        if __KeepImages__:
            # the magnifier icon
            remove_tags = [
                dict(name='img', attrs={'src': ['../../../image/magnifier.gif']})]
        else:
            remove_tags = [dict(name='img')]
        remove_attributes = ['width']
        preprocess_regexps = [(re.compile(r'&nbsp;', re.DOTALL | re.IGNORECASE),
                               lambda match: ''),
                              ]
    elif __Region__ == 'Toronto':
        if __UseChineseTitle__ is True:
            title = u'\u660e\u5831 (\u591a\u502b\u591a)'
        else:
            title = 'Ming Pao - Toronto'
        description = 'Toronto Chinese Newspaper (http://www.mingpaotor.com)'
        category = 'Chinese, News, Toronto'
        extra_css = 'img {display: block; margin-left: auto; margin-right: auto; margin-top: 10px; margin-bottom: 10px;} b>font {font-size:200%; font-weight:bold;}'  # noqa
        masthead_url = 'http://www.mingpaotor.com/image/mainlogo2_TOR2.gif'
        keep_only_tags = [dict(name='table', attrs={'width': ['450'], 'border': ['0'], 'cellspacing': ['0'], 'cellpadding': ['1']}),
                          dict(name='table', attrs={'width': ['450'], 'border': ['0'], 'cellspacing': ['3'], 'cellpadding': ['3'], 'id': ['tblContent3']}),
                          dict(name='table', attrs={'width': ['180'], 'border': ['0'], 'cellspacing': ['0'], 'cellpadding': ['0'], 'bgcolor': ['F0F0F0']}),
                          ]
        if __KeepImages__:
            # the magnifier icon
            remove_tags = [
                dict(name='img', attrs={'src': ['../../../image/magnifier.gif']})]
        else:
            remove_tags = [dict(name='img')]
        remove_attributes = ['width']
        preprocess_regexps = [(re.compile(r'&nbsp;', re.DOTALL | re.IGNORECASE),
                               lambda match: ''),
                              ]

    oldest_article = 1
    max_articles_per_feed = 100
    __author__ = 'Eddie Lau'
    publisher = 'MingPao'
    remove_javascript = True
    use_embedded_content = False
    no_stylesheets = True
    language = 'zh'
    encoding = 'Big5-HKSCS'
    recursions = 0
    conversion_options = {'linearize_tables': True}
    timefmt = ''

    def get_dtlocal(self):
        dt_utc = datetime.datetime.utcnow()
        if __Region__ == 'Hong Kong':
            # convert UTC to local hk time - at HKT 4.30am, all news are
            # available
            dt_local = dt_utc + \
                datetime.timedelta(8.0 / 24) - datetime.timedelta(4.5 / 24)
            # dt_local = dt_utc.astimezone(pytz.timezone('Asia/Hong_Kong')) - datetime.timedelta(4.5/24)
        elif __Region__ == 'Vancouver':
            # convert UTC to local Vancouver time - at PST time 5.30am, all
            # news are available
            dt_local = dt_utc + \
                datetime.timedelta(-8.0 / 24) - datetime.timedelta(5.5 / 24)
            # dt_local = dt_utc.astimezone(pytz.timezone('America/Vancouver')) - datetime.timedelta(5.5/24)
        elif __Region__ == 'Toronto':
            # convert UTC to local Toronto time - at EST time 8.30am, all news
            # are available
            dt_local = dt_utc + \
                datetime.timedelta(-5.0 / 24) - datetime.timedelta(8.5 / 24)
            # dt_local = dt_utc.astimezone(pytz.timezone('America/Toronto')) - datetime.timedelta(8.5/24)
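        # Worked example of the offset arithmetic above (Hong Kong case): the net
        # shift is dt_utc + 3.5h, so a run at 20:00 UTC (04:00 HKT the next day)
        # still yields the previous day's edition, while a run at 21:00 UTC
        # (05:00 HKT) rolls over to the new one.  Illustrative only:
        #
        #   import datetime
        #   utc = datetime.datetime(2014, 10, 18, 20, 0)
        #   local = utc + datetime.timedelta(8.0 / 24) - datetime.timedelta(4.5 / 24)
        #   print local.strftime("%Y%m%d")   # -> 20141018 (still the 18th's paper)
        #   utc = datetime.datetime(2014, 10, 18, 21, 0)
        #   local = utc + datetime.timedelta(8.0 / 24) - datetime.timedelta(4.5 / 24)
        #   print local.strftime("%Y%m%d")   # -> 20141019 (the 19th's paper)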
return dt_local def get_fetchdate(self): if __Date__ != '': return __Date__ else: return self.get_dtlocal().strftime("%Y%m%d") def get_fetchformatteddate(self): if __Date__ != '': return __Date__[0:4] + '-' + __Date__[4:6] + '-' + __Date__[6:8] else: return self.get_dtlocal().strftime("%Y-%m-%d") def get_fetchyear(self): if __Date__ != '': return __Date__[0:4] else: return self.get_dtlocal().strftime("%Y") def get_fetchmonth(self): if __Date__ != '': return __Date__[4:6] else: return self.get_dtlocal().strftime("%m") def get_fetchday(self): if __Date__ != '': return __Date__[6:8] else: return self.get_dtlocal().strftime("%d") # Note: does not work with custom date given by __Date__ def get_weekday(self): return self.get_dtlocal().weekday() def get_cover_url(self): if __Region__ == 'Hong Kong': cover = 'http://news1.mingpao.com/' + self.get_fetchdate() + '/' + \ self.get_fetchdate() + '_' + self.get_fetchday() + 'gacov.jpg' elif __Region__ == 'Vancouver': cover = 'http://www.mingpaovan.com/ftp/News/' + \ self.get_fetchdate() + '/' + self.get_fetchday() + 'pgva1s.jpg' elif __Region__ == 'Toronto': cover = 'http://www.mingpaotor.com/ftp/News/' + \ self.get_fetchdate() + '/' + self.get_fetchday() + 'pgtas.jpg' br = BasicNewsRecipe.get_browser(self) try: br.open(cover) except: cover = None return cover def parse_index(self): feeds = [] dateStr = self.get_fetchdate() if __Region__ == 'Hong Kong': if __UseLife__: for title, url, keystr in [(u'\u8981\u805e Headline', 'http://life.mingpao.com/cfm/dailynews2.cfm?Issue=' + dateStr + '&Category=nalga', 'nal'), (u'\u6e2f\u805e Local', 'http://life.mingpao.com/cfm/dailynews2.cfm?Issue=' + dateStr + '&Category=nalgb', 'nal'), (u'\u6559\u80b2 Education', 'http://life.mingpao.com/cfm/dailynews2.cfm?Issue=' + dateStr + '&Category=nalgf', 'nal'), (u'\u793e\u8a55/\u7b46\u9663 Editorial', 'http://life.mingpao.com/cfm/dailynews2.cfm?Issue=' + dateStr + '&Category=nalmr', 'nal'), (u'\u8ad6\u58c7 Forum', 'http://life.mingpao.com/cfm/dailynews2.cfm?Issue=' + dateStr + '&Category=nalfa', 'nal'), (u'\u4e2d\u570b China', 'http://life.mingpao.com/cfm/dailynews2.cfm?Issue=' + dateStr + '&Category=nalca', 'nal'), (u'\u570b\u969b World', 'http://life.mingpao.com/cfm/dailynews2.cfm?Issue=' + dateStr + '&Category=nalta', 'nal'), (u'\u7d93\u6fdf Finance', 'http://life.mingpao.com/cfm/dailynews2.cfm?Issue=' + dateStr + '&Category=nalea', 'nal'), (u'\u9ad4\u80b2 Sport', 'http://life.mingpao.com/cfm/dailynews2.cfm?Issue=' + dateStr + '&Category=nalsp', 'nal'), (u'\u5f71\u8996 Film/TV', 'http://life.mingpao.com/cfm/dailynews2.cfm?Issue=' + dateStr + '&Category=nalma', 'nal') ]: if __InclPremium__ is True: articles = self.parse_section2_txt(url, keystr) else: articles = self.parse_section2(url, keystr) if articles: feeds.append((title, articles)) # new if __InclPremium__ is True and (self.get_weekday() != 6 or __ParseSelectedMobile__ is False): # if both not on Sunday and not __ParseSelectedMobile__, go ahead # parse column section articles directly from .txt files for title, url, keystr in [(u'\u5c08\u6b04 Columns', 'http://life.mingpao.com/cfm/dailynews2.cfm?Issue=' + dateStr + '&Category=ncolumn', 'ncl') # noqa ]: articles = self.parse_section2_txt(url, keystr) if articles: feeds.append((title, articles)) if self.get_weekday() != 6: for title, url, seckey in [(u'\u526f\u520a Supplement', 'http://news1.mingpao.com/' + dateStr + '/jaindex.htm', 'ja')]: if __ParseTxt__ is False: articles = self.parse_section(url) else: articles = self.parse_section_txt(url, seckey) if articles: 
feeds.append((title, articles)) else: if __InclPremium__ is True and __ParseSelectedMobile__ is True: articles = self.parse_section_mobile( 'http://premium.mingpao.com/pda/palm', 'HotNews1.cfm?cat=ja&token=b218bc260b89c0&online=1') if articles: feeds.append( (u'\u526f\u520a Supplement', articles)) else: for title, url, seckey in [(u'\u526f\u520a Supplement', 'http://news1.mingpao.com/' + dateStr + '/jaindex.htm', 'ja')]: if __ParseTxt__ is False: articles = self.parse_section(url) else: articles = self.parse_section_txt(url, seckey) if articles: feeds.append((title, articles)) for title, url, seckey in [(u'\u82f1\u6587 English', 'http://news1.mingpao.com/' + dateStr + '/emindex.htm', 'em')]: if __ParseTxt__ is False: articles = self.parse_section(url) else: articles = self.parse_section_txt(url, seckey) if articles: feeds.append((title, articles)) # end of new else: for title, url, seckey in [(u'\u8981\u805e Headline', 'http://news1.mingpao.com/' + dateStr + '/gaindex.htm', 'ga'), (u'\u6e2f\u805e Local', 'http://news1.mingpao.com/' + dateStr + '/gbindex.htm', 'gb'), (u'\u6559\u80b2 Education', 'http://news1.mingpao.com/' + dateStr + '/gfindex.htm', 'gf'), (u'\u793e\u8a55/\u7b46\u9663 Editorial', 'http://news1.mingpao.com/' + dateStr + '/mrindex.htm', 'mr')]: if __ParseTxt__ is False: articles = self.parse_section(url) else: articles = self.parse_section_txt(url, seckey) if articles: feeds.append((title, articles)) # special- editorial # ed_articles = self.parse_ed_section('http://life.mingpao.com/cfm/dailynews2.cfm?Issue=' + dateStr +'&Category=nalmr') # if ed_articles: # feeds.append((u'\u793e\u8a55/\u7b46\u9663 Editorial', ed_articles)) for title, url, seckey in [(u'\u8ad6\u58c7 Forum', 'http://news1.mingpao.com/' + dateStr + '/faindex.htm', 'fa'), (u'\u4e2d\u570b China', 'http://news1.mingpao.com/' + dateStr + '/caindex.htm', 'ca'), (u'\u570b\u969b World', 'http://news1.mingpao.com/' + dateStr + '/taindex.htm', 'ta')]: if __ParseTxt__ is False: articles = self.parse_section(url) else: articles = self.parse_section_txt(url, seckey) if articles: feeds.append((title, articles)) # special - finance # fin_articles = self.parse_fin_section('http://www.mpfinance.com/htm/Finance/' + dateStr + '/News/ea,eb,ecindex.htm') # fin_articles = self.parse_fin_section('http://life.mingpao.com/cfm/dailynews2.cfm?Issue=' + dateStr + '&Category=nalea') # if fin_articles: # feeds.append((u'\u7d93\u6fdf Finance', fin_articles)) for title, url, keystr in [(u'\u7d93\u6fdf Finance', 'http://life.mingpao.com/cfm/dailynews2.cfm?Issue=' + dateStr + '&Category=nalea', 'nal')]: articles = self.parse_section2_txt(url, keystr) if articles: feeds.append((title, articles)) # for title, url in [('Tech News', 'http://news1.mingpao.com/' + dateStr + '/naindex.htm'), # (u'\u9ad4\u80b2 Sport', 'http://news1.mingpao.com/' + dateStr + '/spindex.htm')]: # articles = self.parse_section(url) # if articles: # feeds.append((title, articles)) # special - entertainment # ent_articles = self.parse_ent_section('http://ol.mingpao.com/cfm/star1.cfm') # if ent_articles: # feeds.append((u'\u5f71\u8996 Film/TV', ent_articles)) for title, url, keystr in [(u'\u5f71\u8996 Film/TV', 'http://life.mingpao.com/cfm/dailynews2.cfm?Issue=' + dateStr + '&Category=nalma', 'nal') ]: articles = self.parse_section2_txt(url, keystr) if articles: feeds.append((title, articles)) if __InclPremium__ is True and (self.get_weekday() != 6 or __ParseSelectedMobile__ is False): # if both not on Sunday or not __ParseSelectedMobile__, go ahead # parse column section 
articles directly from .txt files for title, url, keystr in [(u'\u5c08\u6b04 Columns', 'http://life.mingpao.com/cfm/dailynews2.cfm?Issue=' + dateStr + '&Category=ncolumn', 'ncl') # noqa ]: articles = self.parse_section2_txt(url, keystr) if articles: feeds.append((title, articles)) if __InclPremium__ is False or self.get_weekday() != 6: for title, url, seckey in [(u'\u526f\u520a Supplement', 'http://news1.mingpao.com/' + dateStr + '/jaindex.htm', 'ja')]: if __ParseTxt__ is False: articles = self.parse_section(url) else: articles = self.parse_section_txt(url, seckey) if articles: feeds.append((title, articles)) else: if __InclPremium__ is True and __ParseSelectedMobile__ is True: articles = self.parse_section_mobile( 'http://premium.mingpao.com/pda/palm', 'HotNews1.cfm?cat=ja&token=b218bc260b89c0&online=1') if articles: feeds.append( (u'\u526f\u520a Supplement', articles)) else: for title, url, seckey in [(u'\u526f\u520a Supplement', 'http://news1.mingpao.com/' + dateStr + '/jaindex.htm', 'ja')]: if __ParseTxt__ is False: articles = self.parse_section(url) else: articles = self.parse_section_txt(url, seckey) if articles: feeds.append((title, articles)) for title, url, seckey in [(u'\u82f1\u6587 English', 'http://news1.mingpao.com/' + dateStr + '/emindex.htm', 'em')]: if __ParseTxt__ is False: articles = self.parse_section(url) else: articles = self.parse_section_txt(url, seckey) if articles: feeds.append((title, articles)) elif __Region__ == 'Vancouver': for title, url in [(u'\u8981\u805e Headline', 'http://www.mingpaovan.com/htm/News/' + dateStr + '/VAindex.htm'), (u'\u52a0\u570b Canada', 'http://www.mingpaovan.com/htm/News/' + dateStr + '/VBindex.htm'), (u'\u793e\u5340 Local', 'http://www.mingpaovan.com/htm/News/' + dateStr + '/VDindex.htm'), (u'\u6e2f\u805e Hong Kong', 'http://www.mingpaovan.com/htm/News/' + dateStr + '/HK-VGindex.htm'), (u'\u570b\u969b World', 'http://www.mingpaovan.com/htm/News/' + dateStr + '/VTindex.htm'), (u'\u4e2d\u570b China', 'http://www.mingpaovan.com/htm/News/' + dateStr + '/VCindex.htm'), (u'\u7d93\u6fdf Economics', 'http://www.mingpaovan.com/htm/News/' + dateStr + '/VEindex.htm'), (u'\u9ad4\u80b2 Sports', 'http://www.mingpaovan.com/htm/News/' + dateStr + '/VSindex.htm'), (u'\u5f71\u8996 Film/TV', 'http://www.mingpaovan.com/htm/News/' + dateStr + '/HK-MAindex.htm'), (u'\u526f\u520a Supplements', 'http://www.mingpaovan.com/htm/News/' + dateStr + '/WWindex.htm'), ]: articles = self.parse_section3( url, 'http://www.mingpaovan.com/') if articles: feeds.append((title, articles)) elif __Region__ == 'Toronto': for title, url in [(u'\u8981\u805e Headline', 'http://www.mingpaotor.com/htm/News/' + dateStr + '/TAindex.htm'), (u'\u52a0\u570b Canada', 'http://www.mingpaotor.com/htm/News/' + dateStr + '/TDindex.htm'), (u'\u793e\u5340 Local', 'http://www.mingpaotor.com/htm/News/' + dateStr + '/TFindex.htm'), (u'\u4e2d\u570b China', 'http://www.mingpaotor.com/htm/News/' + dateStr + '/TCAindex.htm'), (u'\u570b\u969b World', 'http://www.mingpaotor.com/htm/News/' + dateStr + '/TTAindex.htm'), (u'\u6e2f\u805e Hong Kong', 'http://www.mingpaotor.com/htm/News/' + dateStr + '/HK-GAindex.htm'), (u'\u7d93\u6fdf Economics', 'http://www.mingpaotor.com/htm/News/' + dateStr + '/THindex.htm'), (u'\u9ad4\u80b2 Sports', 'http://www.mingpaotor.com/htm/News/' + dateStr + '/TSindex.htm'), (u'\u5f71\u8996 Film/TV', 'http://www.mingpaotor.com/htm/News/' + dateStr + '/HK-MAindex.htm'), (u'\u526f\u520a Supplements', 'http://www.mingpaotor.com/htm/News/' + dateStr + '/WWindex.htm'), ]: articles = 
self.parse_section3( url, 'http://www.mingpaotor.com/') if articles: feeds.append((title, articles)) return feeds # parse from news1.mingpao.com (web html) def parse_section(self, url): dateStr = self.get_fetchdate() soup = self.index_to_soup(url) divs = soup.findAll(attrs={'class': ['bullet', 'bullet_grey']}) current_articles = [] included_urls = [] divs.reverse() for i in divs: a = i.find('a', href=True) title = self.tag_to_string(a) url = a.get('href', False) url = 'http://news1.mingpao.com/' + dateStr + '/' + url # replace the url to the alternative version if __ParsePF__ is True: # printer-friendly option if url.rfind('Redirect') != -1 and __InclPremium__ is True: url = re.sub(dateStr + '.*' + dateStr, dateStr, url) url = re.sub('%2F.*%2F', '/', url) if __InclPremium__ is True: title = title.replace(u'\u6536\u8cbb\u5167\u5bb9', '') url = url.replace('%2Etxt', '_print.htm') url = url.replace('%5F', '_') else: url = url.replace('.htm', '_print.htm') # if url not in included_urls and url.rfind('Redirect') == -1 and # (__InclPremium__ is False or # title.rfind(u'\u6536\u8cbb\u5167\u5bb9') == -1): if url not in included_urls and (__InclPremium__ is True or title.rfind(u'\u6536\u8cbb\u5167\u5bb9') == -1): current_articles.append( {'title': title, 'url': url, 'description': '', 'date': ''}) included_urls.append(url) current_articles.reverse() return current_articles # parse from news1.mingpao.com (txt) def parse_section_txt(self, url, ch): dateStr = self.get_fetchdate() soup = self.index_to_soup(url) divs = soup.findAll(attrs={'class': ['bullet', 'bullet_grey']}) current_articles = [] included_urls = [] divs.reverse() for i in divs: a = i.find('a', href=True) title = self.tag_to_string(a) url = a.get('href', False) # print 'Base url: ', url # replace the url to the alternative version # text version if url.rfind('Redirect') != -1: url = 'http://news1.mingpao.com/' + dateStr + '/' + url # print 'original url: ', url url = re.sub( dateStr + '/../cfm/Redirect.cfm.*NewsFile=', 'ftp/WebNews2/', url) url = re.sub('%2F', '/', url) if __InclPremium__ is True: title = title.replace(u'\u6536\u8cbb\u5167\u5bb9', '') url = url.replace('%2Etxt', '.txt') url = url.replace('%5F', '_') else: # get the first two char in url as ch seckey = url[0:2] url = url.replace('.htm', '.txt') url = 'http://news1.mingpao.com/ftp/WebNews2/' + \ dateStr + '/' + ch + '/' + seckey + '/' + url # print 'updated url: ', url if url not in included_urls and (__InclPremium__ is True or title.rfind(u'\u6536\u8cbb\u5167\u5bb9') == -1): # if url not in included_urls and (url.rfind('Redirect') == -1) # and (__InclPremium__ is False or # title.rfind(u'\u6536\u8cbb\u5167\u5bb9') == -1): current_articles.append( {'title': title, 'url': url, 'description': '', 'date': ''}) included_urls.append(url) current_articles.reverse() return current_articles # parse from life.mingpao.com def parse_section2(self, url, keystr): br = mechanize.Browser() br.set_handle_redirect(False) self.get_fetchdate() soup = self.index_to_soup(url) a = soup.findAll('a', href=True) a.reverse() current_articles = [] included_urls = [] for i in a: title = self.tag_to_string(i) url = 'http://life.mingpao.com/cfm/' + i.get('href', False) if (url not in included_urls) and (not url.rfind('.txt') == -1) and (not url.rfind(keystr) == -1): try: br.open_novisit(url) # use printed version of the article url = url.replace('dailynews3.cfm', 'dailynews3a.cfm') current_articles.append( {'title': title, 'url': url, 'description': ''}) included_urls.append(url) except: print 
'skipping a premium article' current_articles.reverse() return current_articles # parse from text file of life.mingpao.com def parse_section2_txt(self, url, keystr): self.get_fetchdate() soup = self.index_to_soup(url) a = soup.findAll('a', href=True) a.reverse() current_articles = [] included_urls = [] for i in a: title = self.tag_to_string(i) url = 'http://life.mingpao.com/cfm/' + i.get('href', False) if (url not in included_urls) and (not url.rfind('.txt') == -1) and (not url.rfind(keystr) == -1): # use printed version of the article url = url.replace('cfm/dailynews3.cfm?File=', 'ftp/Life3/') current_articles.append( {'title': title, 'url': url, 'description': ''}) included_urls.append(url) current_articles.reverse() return current_articles # parse from mobile version def parse_section_mobile(self, base, page): soup = self.index_to_soup(base + '/' + page) a = soup.findAll('a', href=True) current_articles = [] included_urls = [] for i in a: title = self.tag_to_string(i) url = i.get('href', False) if url not in included_urls and url.rfind('HotNews2.cfm') != -1: current_articles.append( {'title': title, 'url': base + '/' + url, 'description': ''}) included_urls.append(url) return current_articles # parse from www.mingpaovan.com def parse_section3(self, url, baseUrl): self.get_fetchdate() soup = self.index_to_soup(url) divs = soup.findAll(attrs={'class': ['ListContentLargeLink']}) current_articles = [] included_urls = [] divs.reverse() for i in divs: title = self.tag_to_string(i) urlstr = i.get('href', False) urlstr = baseUrl + '/' + urlstr.replace('../../../', '') if urlstr not in included_urls: current_articles.append( {'title': title, 'url': urlstr, 'description': '', 'date': ''}) included_urls.append(urlstr) current_articles.reverse() return current_articles def parse_ed_section(self, url): self.get_fetchdate() soup = self.index_to_soup(url) a = soup.findAll('a', href=True) a.reverse() current_articles = [] included_urls = [] for i in a: title = self.tag_to_string(i) url = 'http://life.mingpao.com/cfm/' + i.get('href', False) if (url not in included_urls) and (not url.rfind('.txt') == -1) and (not url.rfind('nal') == -1): current_articles.append( {'title': title, 'url': url, 'description': ''}) included_urls.append(url) current_articles.reverse() return current_articles def parse_fin_section(self, url): self.get_fetchdate() soup = self.index_to_soup(url) a = soup.findAll('a', href=True) current_articles = [] included_urls = [] for i in a: # url = 'http://www.mpfinance.com/cfm/' + i.get('href', False) url = 'http://life.mingpao.com/cfm/' + i.get('href', False) # if url not in included_urls and not url.rfind(dateStr) == -1 and # url.rfind('index') == -1: if url not in included_urls and (not url.rfind('txt') == -1) and (not url.rfind('nal') == -1): title = self.tag_to_string(i) current_articles.append( {'title': title, 'url': url, 'description': ''}) included_urls.append(url) return current_articles def parse_ent_section(self, url): self.get_fetchdate() soup = self.index_to_soup(url) a = soup.findAll('a', href=True) a.reverse() current_articles = [] included_urls = [] for i in a: title = self.tag_to_string(i) url = 'http://ol.mingpao.com/cfm/' + i.get('href', False) if (url not in included_urls) and (not url.rfind('.txt') == -1) and (not url.rfind('star') == -1): current_articles.append( {'title': title, 'url': url, 'description': ''}) included_urls.append(url) current_articles.reverse() return current_articles def parse_col_section(self, url): self.get_fetchdate() soup = 
        soup = self.index_to_soup(url)
        a = soup.findAll('a', href=True)
        a.reverse()
        current_articles = []
        included_urls = []
        for i in a:
            title = self.tag_to_string(i)
            url = 'http://life.mingpao.com/cfm/' + i.get('href', False)
            if (url not in included_urls) and (not url.rfind('.txt') == -1) and (not url.rfind('ncl') == -1):
                current_articles.append(
                    {'title': title, 'url': url, 'description': ''})
                included_urls.append(url)
        current_articles.reverse()
        return current_articles

    # preprocess those .txt and javascript based files
    def preprocess_raw_html(self, raw_html, url):
        new_html = raw_html
        if url.rfind('ftp') != -1 or url.rfind('_print.htm') != -1:
            if url.rfind('_print.htm') != -1:
                # javascript based file
                splitter = re.compile(r'\n')
                new_raw_html = '<html><head><title>Untitled</title></head>'
                new_raw_html = new_raw_html + '<body>'
                for item in splitter.split(raw_html):
                    if item.startswith('var heading1 ='):
                        heading = item.replace('var heading1 = \'', '')
                        heading = heading.replace('\'', '')
                        heading = heading.replace(';', '')
                        new_raw_html = new_raw_html + '<div class="heading">' + heading
                    if item.startswith('var heading2 ='):
                        heading = item.replace('var heading2 = \'', '')
                        heading = heading.replace('\'', '')
                        heading = heading.replace(';', '')
                        if heading != '':
                            new_raw_html = new_raw_html + '<br>' + heading + '</div>'
                        else:
                            new_raw_html = new_raw_html + '</div>'
                    if item.startswith('var content ='):
                        content = item.replace("var content = ", '')
                        content = content.replace('\'', '')
                        content = content.replace(';', '')
                        new_raw_html = new_raw_html + '<div class="content">' + content + '</div>'
                    if item.startswith('var photocontent ='):
                        photo = item.replace('var photocontent = \'', '')
                        photo = photo.replace('\'', '')
                        photo = photo.replace(';', '')
                        photo = photo.replace('<tr>', '')
                        photo = photo.replace('<td>', '')
                        photo = photo.replace('</tr>', '')
                        photo = photo.replace('</td>', '<br>')
                        photo = photo.replace('class="photo"', '')
                        new_raw_html = new_raw_html + '<div class="images">' + photo + '</div>'
                new_html = new_raw_html + '</body></html>'
            else:
                # .txt based file
                splitter = re.compile(r'\n')  # Match non-digits
                new_raw_html = '<html><head><title>Untitled</title></head><body><div class="images">'
                next_is_img_txt = False
                title_started = False
                title_break_reached = False
                met_article_start_char = False
                for item in splitter.split(raw_html):
                    item = item.strip()
                    # if title already reached but break between title and
                    # content not yet found, record title_break_reached
                    if title_started is True and title_break_reached is False and item == '':
                        title_break_reached = True
                    # if title reached and title_break_reached and met_article_start_char is False and item is not empty
                    # start content
                    elif title_started is True and title_break_reached is True and met_article_start_char is False:
                        if item != '':
                            met_article_start_char = True
                            new_raw_html = new_raw_html + '</div><div class="content"><p>' + item + '<p>\n'
                        # if item.startswith(u'\u3010'):
                        #     met_article_start_char = True
                        #     new_raw_html = new_raw_html + '</div><div class="content"><p>' + item + '<p>\n'
                    else:
                        if next_is_img_txt is False:
                            if item.startswith("=@"):
                                print 'skip movie link'
                            elif item.startswith("=?"):
                                next_is_img_txt = True
                                new_raw_html += '<img src="' + url[0:url.rfind('/') + 1] + item[2:].strip() + '.jpg" /><p>\n'
                            elif item.startswith('=='):
                                next_is_img_txt = True
                                if False:
                                    # TODO: check existence of .gif first
                                    newimg = '_' + item[2:].strip() + '.jpg'
                                    new_raw_html += '<img src="' + url[0:url.rfind('/') + 1] + newimg + '" /><p>\n'
                                else:
                                    new_raw_html += '<img src="' + url[0:url.rfind('/') + 1] + item[2:].strip() + '.jpg" /><p>\n'
                            elif item.startswith('='):
                                next_is_img_txt = True
                                if False:
                                    # TODO: check existence of .gif first
                                    newimg = '_' + item[1:].strip() + '.jpg'
                                    new_raw_html += '<img src="' + url[0:url.rfind('/') + 1] + newimg + '" /><p>\n'
                                else:
                                    new_raw_html += '<img src="' + url[0:url.rfind('/') + 1] + item[1:].strip() + '.jpg" /><p>\n'
                            else:
                                if next_is_img_txt is False and met_article_start_char is False:
                                    if item != '':
                                        if title_started is False:
                                            # print 'Title started at ', item
                                            new_raw_html = new_raw_html + '<div class="heading">' + item + '\n'
                                            title_started = True
                                        else:
                                            new_raw_html = new_raw_html + item + '\n'
                                    else:
                                        new_raw_html = new_raw_html + item + '<br>\n'
                        else:
                            next_is_img_txt = False
                            new_raw_html = new_raw_html + item + '\n'
                new_html = new_raw_html + '</div></body></html>'
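        # The plain-text sources handled above follow a simple convention: lines
        # beginning with '=' reference image files relative to the article URL
        # ('=@' marks a movie link, which is skipped) and are each followed by a
        # caption line; the first other non-empty line becomes the headline, a blank
        # line ends the headline block, and what follows is treated as the article
        # body.  A made-up sample and the rough shape of the HTML built from it
        # (illustrative only):
        #
        #   raw = u"=p01\ncaption text\nHeadline\n\nFirst paragraph ..."
        #   # becomes, roughly:
        #   #   <div class="images"><img src=".../p01.jpg" /><p>
        #   #   caption text
        #   #   <div class="heading">Headline
        #   #   </div><div class="content"><p>First paragraph ...<p>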

        if __HiResImg__ is True:
            # TODO: add a _ in front of an image url
            if url.rfind('news1.mingpao.com') > -1:
                imglist = re.findall('src="?.*?jpg"', new_html)
                br = mechanize.Browser()
                br.set_handle_redirect(False)
                for img in imglist:
                    gifimg = img.replace('jpg"', 'gif"')
                    try:
                        br.open_novisit(
                            url + "/../" + gifimg[5:len(gifimg) - 1])
                        new_html = new_html.replace(img, gifimg)
                    except:
                        if __ParseTxt__ is False:
                            # find the location of the first _
                            pos = img.find('_')
                            if pos > -1:
                                # if found, insert _ after the first _
                                newimg = img[0:pos] + '_' + img[pos:]
                                new_html = new_html.replace(img, newimg)
                            else:
                                # if not found, insert _ after "
                                new_html = new_html.replace(
                                    img[1:], '"_' + img[1:])
                        else:
                            # insert to front
                            # print 'imgstr: ', img
                            pos = img.find('_')
                            new_html = new_html.replace(img[5:], '_' + img[5:])
            elif url.rfind('life.mingpao.com') > -1:
                imglist = re.findall('src=\'?.*?jpg\'', new_html)
                br = mechanize.Browser()
                br.set_handle_redirect(False)
                # print 'Img list: ', imglist, '\n'
                for img in imglist:
                    # print 'Found img: ', img
                    gifimg = img.replace('jpg\'', 'gif\'')
                    try:
                        gifurl = re.sub(r'dailynews.*txt', '', url)
                        br.open_novisit(gifurl + gifimg[5:len(gifimg) - 1])
                        new_html = new_html.replace(img, gifimg)
                    except:
                        pos = img.rfind('/')
                        newimg = img[0:pos + 1] + '_' + img[pos + 1:]
                        new_html = new_html.replace(img, newimg)
                # repeat with src quoted by double quotes, for text parsed from
                # src txt
                imglist = re.findall('src="?.*?jpg"', new_html)
                for img in imglist:
                    # print 'Found img: ', img
                    gifimg = img.replace('jpg"', 'gif"')
                    try:
                        # print 'url', url
                        pos = url.rfind('/')
                        gifurl = url[:pos + 1]
                        # print 'try it:', gifurl + gifimg[5:len(gifimg)-1]
                        br.open_novisit(gifurl + gifimg[5:len(gifimg) - 1])
                        new_html = new_html.replace(img, gifimg)
                    except:
                        pos = img.find('"')
                        newimg = img[0:pos + 1] + '_' + img[pos + 1:]
                        # print 'Use hi-res img', newimg
                        new_html = new_html.replace(img, newimg)
        # test
        # print new_html
        return new_html

    def preprocess_html(self, soup):
        for mobiletitle in soup.findAll('font', attrs={'color': ['navy']}):
            mobiletitle.name = 'h1'
        for item in soup.findAll(style=True):
            del item['style']
        for item in soup.findAll(style=True):
            del item['width']
        for item in soup.findAll(stype=True):
            del item['absmiddle']
        return soup

    def populate_article_metadata(self, article, soup, first):
        if __IncludeThumbnails__ and first and hasattr(self, 'add_toc_thumbnail'):
            img = soup.find('img')
            if img is not None:
                self.add_toc_thumbnail(article, img['src'])
        try:
            if __IncludeSummary__ and len(article.text_summary.strip()) == 0:
                # look for content
                articlebodies = soup.findAll(
                    'div', attrs={'id': 'newscontent'})
                if not articlebodies:
                    articlebodies = soup.findAll(
                        'div', attrs={'id': 'newscontent01'})
                if not articlebodies:
                    articlebodies = soup.findAll(
                        'div', attrs={'class': 'content'})
                if not articlebodies:
                    articlebodies = soup.findAll('div', attrs={'id': 'font'})
                if articlebodies:
                    for articlebody in articlebodies:
                        if articlebody:
                            # the text may or may not be enclosed in <p></p>

# tag paras = articlebody.findAll('p') if not paras: paras = articlebody textFound = False for p in paras: if not textFound: summary_candidate = self.tag_to_string( p).strip() summary_candidate = summary_candidate.replace( u'\u3010\u660e\u5831\u5c08\u8a0a\u3011', '', 1) if len(summary_candidate) > 0: article.summary = article.text_summary = summary_candidate textFound = True else: # display a simple text # article.summary = article.text_summary = u'\u66f4\u591a......' # display word counts counts = 0 articlebodies = soup.findAll( 'div', attrs={'id': 'newscontent'}) if not articlebodies: articlebodies = soup.findAll( 'div', attrs={'id': 'newscontent01'}) if not articlebodies: articlebodies = soup.findAll( 'div', attrs={'class': 'content'}) if not articlebodies: articlebodies = soup.findAll('div', attrs={'id': 'font'}) if articlebodies: for articlebody in articlebodies: # the text may or may not be enclosed in

tag paras = articlebody.findAll('p') if not paras: paras = articlebody for p in paras: summary_candidate = self.tag_to_string(p).strip() counts += len(summary_candidate) article.summary = article.text_summary = u'\uff08' + \ str(counts) + u'\u5b57\uff09' except: self.log("Error creating article descriptions") return # override from the one in version 0.8.31 def create_opf(self, feeds, dir=None): if dir is None: dir = self.output_dir title = self.short_title() # change 1: allow our own flag to tell if a periodical is to be generated # also use customed date instead of current time if __MakePeriodical__ is False or self.output_profile.periodical_date_in_title: title = title + ' ' + self.get_fetchformatteddate() # end of change 1 # change 2: __appname__ replaced by newspaper publisher __appname__ = self.publisher mi = MetaInformation(title, [__appname__]) mi.publisher = __appname__ mi.author_sort = __appname__ # change 3: use __MakePeriodical__ flag to tell if a periodical should # be generated if __MakePeriodical__ is True: mi.publication_type = 'periodical:' + \ self.publication_type + ':' + self.short_title() else: mi.publication_type = self.publication_type + ':' + self.short_title() # change 4: in the following, all the nowf() are changed to adjusted time # This one doesn't matter mi.timestamp = nowf() # change 5: skip listing the articles # article_titles, aseen = [], set() # for f in feeds: # for a in f: # if a.title and a.title not in aseen: # aseen.add(a.title) # article_titles.append(force_unicode(a.title, 'utf-8')) # mi.comments = self.description # if not isinstance(mi.comments, unicode): # mi.comments = mi.comments.decode('utf-8', 'replace') # mi.comments += ('\n\n' + _('Articles in this issue: ') + '\n' + # '\n\n'.join(article_titles)) language = canonicalize_lang(self.language) if language is not None: mi.language = language # This one affects the pub date shown in kindle title # mi.pubdate = nowf() # now appears to need the time field to be > 12.00noon as well mi.pubdate = datetime.datetime(int(self.get_fetchyear()), int( self.get_fetchmonth()), int(self.get_fetchday()), 12, 30, 0) opf_path = os.path.join(dir, 'index.opf') ncx_path = os.path.join(dir, 'index.ncx') opf = OPFCreator(dir, mi) # Add mastheadImage entry to section mp = getattr(self, 'masthead_path', None) if mp is not None and os.access(mp, os.R_OK): from calibre.ebooks.metadata.opf2 import Guide ref = Guide.Reference(os.path.basename( self.masthead_path), os.getcwdu()) ref.type = 'masthead' ref.title = 'Masthead Image' opf.guide.append(ref) manifest = [os.path.join(dir, 'feed_%d' % i) for i in range(len(feeds))] manifest.append(os.path.join(dir, 'index.html')) manifest.append(os.path.join(dir, 'index.ncx')) # Get cover cpath = getattr(self, 'cover_path', None) if cpath is None: pf = open(os.path.join(dir, 'cover.jpg'), 'wb') if self.default_cover(pf): cpath = pf.name if cpath is not None and os.access(cpath, os.R_OK): opf.cover = cpath manifest.append(cpath) # Get masthead mpath = getattr(self, 'masthead_path', None) if mpath is not None and os.access(mpath, os.R_OK): manifest.append(mpath) opf.create_manifest_from_files_in(manifest) for mani in opf.manifest: if mani.path.endswith('.ncx'): mani.id = 'ncx' if mani.path.endswith('mastheadImage.jpg'): mani.id = 'masthead-image' entries = ['index.html'] toc = TOC(base_path=dir) self.play_order_counter = 0 self.play_order_map = {} def feed_index(num, parent): f = feeds[num] for j, a in enumerate(f): if getattr(a, 'downloaded', False): adir = 'feed_%d/article_%d/' % 
(num, j) auth = a.author if not auth: auth = None desc = a.text_summary if not desc: desc = None else: desc = self.description_limiter(desc) tt = a.toc_thumbnail if a.toc_thumbnail else None entries.append('%sindex.html' % adir) po = self.play_order_map.get(entries[-1], None) if po is None: self.play_order_counter += 1 po = self.play_order_counter parent.add_item('%sindex.html' % adir, None, a.title if a.title else _( 'Untitled Article'), play_order=po, author=auth, description=desc, toc_thumbnail=tt) last = os.path.join( self.output_dir, ('%sindex.html' % adir).replace('/', os.sep)) for sp in a.sub_pages: prefix = os.path.commonprefix([opf_path, sp]) relp = sp[len(prefix):] entries.append(relp.replace(os.sep, '/')) last = sp if os.path.exists(last): with open(last, 'rb') as fi: src = fi.read().decode('utf-8') soup = BeautifulSoup(src) body = soup.find('body') if body is not None: prefix = '/'.join('..'for i in range(2 * len(re.findall(r'link\d+', last)))) templ = self.navbar.generate(True, num, j, len(f), not self.has_single_feed, a.orig_url, __appname__, prefix=prefix, center=self.center_navbar) elem = BeautifulSoup(templ.render( doctype='xhtml').decode('utf-8')).find('div') body.insert(len(body.contents), elem) with open(last, 'wb') as fi: fi.write(unicode(soup).encode('utf-8')) if len(feeds) == 0: raise Exception('All feeds are empty, aborting.') if len(feeds) > 1: for i, f in enumerate(feeds): entries.append('feed_%d/index.html' % i) po = self.play_order_map.get(entries[-1], None) if po is None: self.play_order_counter += 1 po = self.play_order_counter auth = getattr(f, 'author', None) if not auth: auth = None desc = getattr(f, 'description', None) if not desc: desc = None feed_index(i, toc.add_item('feed_%d/index.html' % i, None, f.title, play_order=po, description=desc, author=auth)) else: entries.append('feed_%d/index.html' % 0) feed_index(0, toc) for i, p in enumerate(entries): entries[i] = os.path.join(dir, p.replace('/', os.sep)) opf.create_spine(entries) opf.set_toc(toc) with nested(open(opf_path, 'wb'), open(ncx_path, 'wb')) as (opf_file, ncx_file): opf.render(opf_file, ncx_file)
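# A quick way to exercise this recipe outside the calibre GUI is calibre's
# command-line converter; --test limits the run to a couple of articles per feed.
# The file and output names below are just examples:
#
#   ebook-convert ming_pao.recipe output.epub --test -vv
#
# Flags such as __UseLife__, __ParseTxt__ and __InclPremium__ near the top of this
# file select which of the parse_* and preprocess_* code paths are used, so it is
# worth re-running a test download after changing them.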