diff --git a/recipes/ming_pao_toronto.recipe b/recipes/ming_pao_toronto.recipe deleted file mode 100644 index 24792ae76a..0000000000 --- a/recipes/ming_pao_toronto.recipe +++ /dev/null @@ -1,1018 +0,0 @@ -from __future__ import print_function -__license__ = 'GPL v3' -__copyright__ = '2010-2011, Eddie Lau' - -# Region - Hong Kong, Vancouver, Toronto -__Region__ = 'Toronto' -# Users of Kindle 3 with limited system-level CJK support -# please replace the following "True" with "False". (Default: True) -__MakePeriodical__ = True -# Turn below to True if your device supports display of CJK titles -# (Default: False) -__UseChineseTitle__ = False -# Set it to False if you want to skip images (Default: True) -__KeepImages__ = True -# Set it to True if you want to include a summary in Kindle's article view -# (Default: False) -__IncludeSummary__ = False -# Set it to True if you want thumbnail images in Kindle's article view -# (Default: True) -__IncludeThumbnails__ = True -# (HK only) Turn below to True if you wish to use life.mingpao.com as the main article source (Default: True) -__UseLife__ = True -# (HK only) It is to disable premium content (Default: False) -__InclPremium__ = False -# (HK only) Turn below to True if you wish to parse articles in news.mingpao.com with their printer-friendly formats (Default: True) -__ParsePFF__ = True -# (HK only) Turn below to True if you wish hi-res images (Default: False) -__HiResImg__ = False -# Override the date returned by the program if specifying a YYYYMMDD below -__Date__ = '' - - -''' -Change Log: -2011/12/18: update the overridden create_odf(.) routine with the one from Calibre version 0.8.31. Move __UseChineseTitle__ usage away - from create_odf(.). Optional support of text_summary and thumbnail images in Kindle's article view. Start new day - download of Hong Kong Mingpao at 4.30am. Set the actual publication date shown on kindle device. -2011/12/01: take care of situation that in txt source parsing, the article content does start with special character u'\u3010' -2011/10/21: fix a bug that hi-res img is unavailable in pages parsed from source txt -2011/10/19: fix a bug in txt source parsing -2011/10/17: disable fetching of premium content, also improved txt source parsing -2011/10/04: option to get hi-res photos for the articles -2011/09/21: fetching "column" section is made optional. -2011/09/18: parse "column" section stuff from source text file directly. -2011/09/07: disable "column" section as it is no longer offered free. 
-2011/06/26: add fetching Vancouver and Toronto versions of the paper, also provide captions for images using life.mingpao fetch source - provide options to remove all images in the file -2011/05/12: switch the main parse source to life.mingpao.com, which has more photos on the article pages -2011/03/06: add new articles for finance section, also a new section "Columns" -2011/02/28: rearrange the sections - [Disabled until Kindle has better CJK support and can remember last (section,article) read in Sections & Articles - View] make it the same title if generating a periodical, so past issue will be automatically put into "Past Issues" - folder in Kindle 3 -2011/02/20: skip duplicated links in finance section, put photos which may extend a whole page to the back of the articles - clean up the indentation -2010/12/07: add entertainment section, use newspaper front page as ebook cover, suppress date display in section list - (to avoid wrong date display in case the user generates the ebook in a time zone different from HKT) -2010/11/22: add English section, remove eco-news section which is not updated daily, correct - ordering of articles -2010/11/12: add news image and eco-news section -2010/11/08: add parsing of finance section -2010/11/06: temporary work-around for Kindle device having no capability to display unicode - in section/article list. -2010/10/31: skip repeated articles in section pages -''' - -from calibre.utils.date import now as nowf -import os -import datetime -import re -import mechanize -from calibre.web.feeds.recipes import BasicNewsRecipe -from calibre.ebooks.BeautifulSoup import BeautifulSoup -from calibre.ebooks.metadata.opf2 import OPFCreator -from calibre.ebooks.metadata.toc import TOC -from calibre.ebooks.metadata import MetaInformation -from calibre.utils.localization import canonicalize_lang - -# MAIN CLASS - - -class MPRecipe(BasicNewsRecipe): - if __Region__ == 'Hong Kong': - if __UseChineseTitle__ is True: - title = u'\u660e\u5831 (\u9999\u6e2f)' - else: - title = 'Ming Pao - Hong Kong' - description = 'Hong Kong Chinese Newspaper (http://news.mingpao.com)' - category = 'Chinese, News, Hong Kong' - extra_css = 'img {display: block; margin-left: auto; margin-right: auto; margin-top: 10px; margin-bottom: 10px; max-height:90%;} font>b {font-size:200%; font-weight:bold;} div[class=heading] {font-size:200%; font-weight:bold;} div[class=images] {font-size:50%;}' # noqa - masthead_url = 'http://news.mingpao.com/image/portals_top_logo_news.gif' - keep_only_tags = [dict(name='h1'), - # for entertainment page title - dict(name='font', attrs={ - 'style': ['font-size:14pt; line-height:160%;']}), - # for column articles title - dict(name='font', attrs={'color': ['AA0000']}), - # for heading from txt - dict(attrs={'class': ['heading']}), - # entertainment and column page content - dict(attrs={'id': ['newscontent']}), - dict( - attrs={'id': ['newscontent01', 'newscontent02']}), - # for content from txt - dict(attrs={'class': ['content']}), - dict(attrs={'class': ['photo']}), - dict(name='table', attrs={'width': ['100%'], 'border':['0'], 'cellspacing':[ - '5'], 'cellpadding':['0']}), # content in printed version of life.mingpao.com - # images for source from life.mingpao.com - dict(name='img', attrs={ - 'width': ['180'], 'alt':['????']}), - # for images from txt - dict(attrs={'class': ['images']}) - ] - if __KeepImages__: - remove_tags = [dict(name='style'), - # for the finance page from mpfinance.com - dict(attrs={'id': ['newscontent135']}), - # article date in 
life.mingpao.com article - dict(name='font', attrs={ - 'size': ['2'], 'color':['666666']}), - # dict(name='table') # for content fetched from - # life.mingpao.com - ] - else: - remove_tags = [dict(name='style'), - # for the finance page from mpfinance.com - dict(attrs={'id': ['newscontent135']}), - # article date in life.mingpao.com article - dict(name='font', attrs={ - 'size': ['2'], 'color':['666666']}), - dict(name='img'), - # dict(name='table') # for content fetched from - # life.mingpao.com - ] - remove_attributes = ['width'] - preprocess_regexps = [ - (re.compile(r'
<h5>', re.DOTALL | re.IGNORECASE),
-             lambda match: '<h1>'),
-            (re.compile(r'</h5>', re.DOTALL | re.IGNORECASE),
-             lambda match: '</h1>'),
-            (re.compile(r'<p><a href=.+?</a></p>', re.DOTALL | re.IGNORECASE),  # for entertainment page
-             lambda match: ''),
-            # skip <br> after title in life.mingpao.com fetched article
-            (re.compile(r"<div id='newscontent'><br>", re.DOTALL | re.IGNORECASE),
-             lambda match: "<div id='newscontent'>"),
-            (re.compile(r"<br><br></b>
", re.DOTALL | re.IGNORECASE), - lambda match: "") - ] - elif __Region__ == 'Vancouver': - if __UseChineseTitle__ is True: - title = u'\u660e\u5831 (\u6eab\u54e5\u83ef)' - else: - title = 'Ming Pao - Vancouver' - description = 'Vancouver Chinese Newspaper (http://www.mingpaovan.com)' - category = 'Chinese, News, Vancouver' - extra_css = 'img {display: block; margin-left: auto; margin-right: auto; margin-top: 10px; margin-bottom: 10px;} b>font {font-size:200%; font-weight:bold;}' # noqa - masthead_url = 'http://www.mingpaovan.com/image/mainlogo2_VAN2.gif' - keep_only_tags = [dict(name='table', attrs={'width': ['450'], 'border':['0'], 'cellspacing':['0'], 'cellpadding':['1']}), - dict(name='table', attrs={'width': ['450'], 'border':['0'], 'cellspacing':[ - '3'], 'cellpadding':['3'], 'id':['tblContent3']}), - dict(name='table', attrs={'width': ['180'], 'border':['0'], 'cellspacing':[ - '0'], 'cellpadding':['0'], 'bgcolor':['F0F0F0']}), - ] - if __KeepImages__: - # the magnifier icon - remove_tags = [ - dict(name='img', attrs={'src': ['../../../image/magnifier.gif']})] - else: - remove_tags = [dict(name='img')] - remove_attributes = ['width'] - preprocess_regexps = [(re.compile(r' ', re.DOTALL | re.IGNORECASE), - lambda match: ''), - ] - elif __Region__ == 'Toronto': - if __UseChineseTitle__ is True: - title = u'\u660e\u5831 (\u591a\u502b\u591a)' - else: - title = 'Ming Pao - Toronto' - description = 'Toronto Chinese Newspaper (http://www.mingpaotor.com)' - category = 'Chinese, News, Toronto' - extra_css = 'img {display: block; margin-left: auto; margin-right: auto; margin-top: 10px; margin-bottom: 10px;} b>font {font-size:200%; font-weight:bold;}' # noqa - masthead_url = 'http://www.mingpaotor.com/image/mainlogo2_TOR2.gif' - keep_only_tags = [dict(name='table', attrs={'width': ['450'], 'border':['0'], 'cellspacing':['0'], 'cellpadding':['1']}), - dict(name='table', attrs={'width': ['450'], 'border':['0'], 'cellspacing':[ - '3'], 'cellpadding':['3'], 'id':['tblContent3']}), - dict(name='table', attrs={'width': ['180'], 'border':['0'], 'cellspacing':[ - '0'], 'cellpadding':['0'], 'bgcolor':['F0F0F0']}), - ] - if __KeepImages__: - # the magnifier icon - remove_tags = [ - dict(name='img', attrs={'src': ['../../../image/magnifier.gif']})] - else: - remove_tags = [dict(name='img')] - remove_attributes = ['width'] - preprocess_regexps = [(re.compile(r' ', re.DOTALL | re.IGNORECASE), - lambda match: ''), - ] - - oldest_article = 1 - max_articles_per_feed = 100 - __author__ = 'Eddie Lau' - publisher = 'MingPao' - remove_javascript = True - use_embedded_content = False - no_stylesheets = True - language = 'zh' - encoding = 'Big5-HKSCS' - recursions = 0 - conversion_options = {'linearize_tables': True} - timefmt = '' - - def get_dtlocal(self): - dt_utc = datetime.datetime.utcnow() - if __Region__ == 'Hong Kong': - # convert UTC to local hk time - at HKT 4.30am, all news are - # available - dt_local = dt_utc + \ - datetime.timedelta(8.0 / 24) - datetime.timedelta(4.5 / 24) - # dt_local = dt_utc.astimezone(pytz.timezone('Asia/Hong_Kong')) - datetime.timedelta(4.5/24) - elif __Region__ == 'Vancouver': - # convert UTC to local Vancouver time - at PST time 5.30am, all - # news are available - dt_local = dt_utc + \ - datetime.timedelta(-8.0 / 24) - datetime.timedelta(5.5 / 24) - # dt_local = dt_utc.astimezone(pytz.timezone('America/Vancouver')) - datetime.timedelta(5.5/24) - elif __Region__ == 'Toronto': - # convert UTC to local Toronto time - at EST time 8.30am, all news - # are available - dt_local = dt_utc 
+ \ - datetime.timedelta(-5.0 / 24) - datetime.timedelta(8.5 / 24) - # dt_local = dt_utc.astimezone(pytz.timezone('America/Toronto')) - datetime.timedelta(8.5/24) - return dt_local - - def get_fetchdate(self): - if __Date__ != '': - return __Date__ - else: - return self.get_dtlocal().strftime("%Y%m%d") - - def get_fetchformatteddate(self): - if __Date__ != '': - return __Date__[0:4] + '-' + __Date__[4:6] + '-' + __Date__[6:8] - else: - return self.get_dtlocal().strftime("%Y-%m-%d") - - def get_fetchyear(self): - if __Date__ != '': - return __Date__[0:4] - else: - return self.get_dtlocal().strftime("%Y") - - def get_fetchmonth(self): - if __Date__ != '': - return __Date__[4:6] - else: - return self.get_dtlocal().strftime("%m") - - def get_fetchday(self): - if __Date__ != '': - return __Date__[6:8] - else: - return self.get_dtlocal().strftime("%d") - - def get_cover_url(self): - if __Region__ == 'Hong Kong': - cover = 'http://news.mingpao.com/' + self.get_fetchdate() + '/' + self.get_fetchdate() + \ - '_' + self.get_fetchday() + 'gacov.jpg' - elif __Region__ == 'Vancouver': - cover = 'http://www.mingpaovan.com/ftp/News/' + \ - self.get_fetchdate() + '/' + self.get_fetchday() + 'pgva1s.jpg' - elif __Region__ == 'Toronto': - cover = 'http://www.mingpaotor.com/ftp/News/' + \ - self.get_fetchdate() + '/' + self.get_fetchday() + 'pgtas.jpg' - br = BasicNewsRecipe.get_browser(self) - try: - br.open(cover) - except: - cover = None - return cover - - def parse_index(self): - feeds = [] - dateStr = self.get_fetchdate() - - if __Region__ == 'Hong Kong': - if __UseLife__: - for title, url, keystr in [(u'\u8981\u805e Headline', 'http://life.mingpao.com/cfm/dailynews2.cfm?Issue=' + dateStr + '&Category=nalga', 'nal'), - (u'\u6e2f\u805e Local', 'http://life.mingpao.com/cfm/dailynews2.cfm?Issue=' + - dateStr + '&Category=nalgb', 'nal'), - (u'\u6559\u80b2 Education', 'http://life.mingpao.com/cfm/dailynews2.cfm?Issue=' + - dateStr + '&Category=nalgf', 'nal'), - (u'\u793e\u8a55/\u7b46\u9663 Editorial', - 'http://life.mingpao.com/cfm/dailynews2.cfm?Issue=' + dateStr + '&Category=nalmr', 'nal'), - (u'\u8ad6\u58c7 Forum', 'http://life.mingpao.com/cfm/dailynews2.cfm?Issue=' + - dateStr + '&Category=nalfa', 'nal'), - (u'\u4e2d\u570b China', 'http://life.mingpao.com/cfm/dailynews2.cfm?Issue=' + - dateStr + '&Category=nalca', 'nal'), - (u'\u570b\u969b World', 'http://life.mingpao.com/cfm/dailynews2.cfm?Issue=' + - dateStr + '&Category=nalta', 'nal'), - (u'\u7d93\u6fdf Finance', 'http://life.mingpao.com/cfm/dailynews2.cfm?Issue=' + - dateStr + '&Category=nalea', 'nal'), - (u'\u9ad4\u80b2 Sport', 'http://life.mingpao.com/cfm/dailynews2.cfm?Issue=' + - dateStr + '&Category=nalsp', 'nal'), - (u'\u5f71\u8996 Film/TV', 'http://life.mingpao.com/cfm/dailynews2.cfm?Issue=' + - dateStr + '&Category=nalma', 'nal') - ]: - if __InclPremium__ is True: - articles = self.parse_section2_txt(url, keystr) - else: - articles = self.parse_section2(url, keystr) - if articles: - feeds.append((title, articles)) - - if __InclPremium__ is True: - # parse column section articles directly from .txt files - for title, url, keystr in [(u'\u5c08\u6b04 Columns', 'http://life.mingpao.com/cfm/dailynews2.cfm?Issue=' + dateStr + '&Category=ncolumn', 'ncl') # noqa - ]: - articles = self.parse_section2_txt(url, keystr) - if articles: - feeds.append((title, articles)) - - for title, url in [(u'\u526f\u520a Supplement', 'http://news.mingpao.com/' + dateStr + '/jaindex.htm'), - (u'\u82f1\u6587 English', 'http://news.mingpao.com/' + dateStr + 
'/emindex.htm')]: - articles = self.parse_section(url) - if articles: - feeds.append((title, articles)) - else: - for title, url in [(u'\u8981\u805e Headline', 'http://news.mingpao.com/' + dateStr + '/gaindex.htm'), - (u'\u6e2f\u805e Local', 'http://news.mingpao.com/' + - dateStr + '/gbindex.htm'), - (u'\u6559\u80b2 Education', - 'http://news.mingpao.com/' + dateStr + '/gfindex.htm'), - (u'\u793e\u8a55/\u7b46\u9663 Editorial', 'http://news.mingpao.com/' + dateStr + '/mrindex.htm')]: - articles = self.parse_section(url) - if articles: - feeds.append((title, articles)) - - # special- editorial - # ed_articles = self.parse_ed_section('http://life.mingpao.com/cfm/dailynews2.cfm?Issue=' + dateStr +'&Category=nalmr') - # if ed_articles: - # feeds.append((u'\u793e\u8a55/\u7b46\u9663 Editorial', ed_articles)) - - for title, url in [(u'\u8ad6\u58c7 Forum', 'http://news.mingpao.com/' + dateStr + '/faindex.htm'), - (u'\u4e2d\u570b China', 'http://news.mingpao.com/' + - dateStr + '/caindex.htm'), - (u'\u570b\u969b World', 'http://news.mingpao.com/' + dateStr + '/taindex.htm')]: - articles = self.parse_section(url) - if articles: - feeds.append((title, articles)) - - # special - finance - # fin_articles = self.parse_fin_section('http://www.mpfinance.com/htm/Finance/' + dateStr + '/News/ea,eb,ecindex.htm') - # fin_articles = self.parse_fin_section('http://life.mingpao.com/cfm/dailynews2.cfm?Issue=' + dateStr + '&Category=nalea') - # if fin_articles: - # feeds.append((u'\u7d93\u6fdf Finance', fin_articles)) - - for title, url, keystr in [(u'\u7d93\u6fdf Finance', 'http://life.mingpao.com/cfm/dailynews2.cfm?Issue=' + dateStr + '&Category=nalea', 'nal')]: - articles = self.parse_section2_txt(url, keystr) - if articles: - feeds.append((title, articles)) - - # for title, url in [('Tech News', 'http://news.mingpao.com/' + dateStr + '/naindex.htm'), - # (u'\u9ad4\u80b2 Sport', 'http://news.mingpao.com/' + dateStr + '/spindex.htm')]: - # articles = self.parse_section(url) - # if articles: - # feeds.append((title, articles)) - - # special - entertainment - # ent_articles = self.parse_ent_section('http://ol.mingpao.com/cfm/star1.cfm') - # if ent_articles: - # feeds.append((u'\u5f71\u8996 Film/TV', ent_articles)) - - for title, url, keystr in [(u'\u5f71\u8996 Film/TV', 'http://life.mingpao.com/cfm/dailynews2.cfm?Issue=' + dateStr + '&Category=nalma', 'nal') - ]: - articles = self.parse_section2_txt(url, keystr) - if articles: - feeds.append((title, articles)) - - if __InclPremium__ is True: - # parse column section articles directly from .txt files - for title, url, keystr in [(u'\u5c08\u6b04 Columns', 'http://life.mingpao.com/cfm/dailynews2.cfm?Issue=' + dateStr + '&Category=ncolumn', 'ncl') # noqa - ]: - articles = self.parse_section2_txt(url, keystr) - if articles: - feeds.append((title, articles)) - - for title, url in [(u'\u526f\u520a Supplement', 'http://news.mingpao.com/' + dateStr + '/jaindex.htm'), - (u'\u82f1\u6587 English', 'http://news.mingpao.com/' + dateStr + '/emindex.htm')]: - articles = self.parse_section(url) - if articles: - feeds.append((title, articles)) - - elif __Region__ == 'Vancouver': - for title, url in [(u'\u8981\u805e Headline', 'http://www.mingpaovan.com/htm/News/' + dateStr + '/VAindex.htm'), - (u'\u52a0\u570b Canada', 'http://www.mingpaovan.com/htm/News/' + - dateStr + '/VBindex.htm'), - (u'\u793e\u5340 Local', 'http://www.mingpaovan.com/htm/News/' + - dateStr + '/VDindex.htm'), - (u'\u6e2f\u805e Hong Kong', 'http://www.mingpaovan.com/htm/News/' + - dateStr + '/HK-VGindex.htm'), - 
(u'\u570b\u969b World', 'http://www.mingpaovan.com/htm/News/' + - dateStr + '/VTindex.htm'), - (u'\u4e2d\u570b China', 'http://www.mingpaovan.com/htm/News/' + - dateStr + '/VCindex.htm'), - (u'\u7d93\u6fdf Economics', - 'http://www.mingpaovan.com/htm/News/' + dateStr + '/VEindex.htm'), - (u'\u9ad4\u80b2 Sports', 'http://www.mingpaovan.com/htm/News/' + - dateStr + '/VSindex.htm'), - (u'\u5f71\u8996 Film/TV', 'http://www.mingpaovan.com/htm/News/' + - dateStr + '/HK-MAindex.htm'), - (u'\u526f\u520a Supplements', 'http://www.mingpaovan.com/htm/News/' + dateStr + '/WWindex.htm'), ]: - articles = self.parse_section3( - url, 'http://www.mingpaovan.com/') - if articles: - feeds.append((title, articles)) - elif __Region__ == 'Toronto': - for title, url in [(u'\u8981\u805e Headline', 'http://www.mingpaotor.com/htm/News/' + dateStr + '/TAindex.htm'), - (u'\u52a0\u570b Canada', 'http://www.mingpaotor.com/htm/News/' + - dateStr + '/TDindex.htm'), - (u'\u793e\u5340 Local', 'http://www.mingpaotor.com/htm/News/' + - dateStr + '/TFindex.htm'), - (u'\u4e2d\u570b China', 'http://www.mingpaotor.com/htm/News/' + - dateStr + '/TCAindex.htm'), - (u'\u570b\u969b World', 'http://www.mingpaotor.com/htm/News/' + - dateStr + '/TTAindex.htm'), - (u'\u6e2f\u805e Hong Kong', 'http://www.mingpaotor.com/htm/News/' + - dateStr + '/HK-GAindex.htm'), - (u'\u7d93\u6fdf Economics', - 'http://www.mingpaotor.com/htm/News/' + dateStr + '/THindex.htm'), - (u'\u9ad4\u80b2 Sports', 'http://www.mingpaotor.com/htm/News/' + - dateStr + '/TSindex.htm'), - (u'\u5f71\u8996 Film/TV', 'http://www.mingpaotor.com/htm/News/' + - dateStr + '/HK-MAindex.htm'), - (u'\u526f\u520a Supplements', 'http://www.mingpaotor.com/htm/News/' + dateStr + '/WWindex.htm'), ]: - articles = self.parse_section3( - url, 'http://www.mingpaotor.com/') - if articles: - feeds.append((title, articles)) - return feeds - - # parse from news.mingpao.com - def parse_section(self, url): - dateStr = self.get_fetchdate() - soup = self.index_to_soup(url) - divs = soup.findAll(attrs={'class': ['bullet', 'bullet_grey']}) - current_articles = [] - included_urls = [] - divs.reverse() - for i in divs: - a = i.find('a', href=True) - title = self.tag_to_string(a) - url = a.get('href', False) - url = 'http://news.mingpao.com/' + dateStr + '/' + url - # replace the url to the print-friendly version - if __ParsePFF__ is True: - if url.rfind('Redirect') != -1 and __InclPremium__ is True: - url = re.sub(dateStr + '.*' + dateStr, dateStr, url) - url = re.sub('%2F.*%2F', '/', url) - title = title.replace(u'\u6536\u8cbb\u5167\u5bb9', '') - url = url.replace('%2Etxt', '_print.htm') - url = url.replace('%5F', '_') - else: - url = url.replace('.htm', '_print.htm') - if url not in included_urls and url.rfind('Redirect') == -1: - current_articles.append( - {'title': title, 'url': url, 'description': '', 'date': ''}) - included_urls.append(url) - current_articles.reverse() - return current_articles - - # parse from life.mingpao.com - def parse_section2(self, url, keystr): - br = mechanize.Browser() - br.set_handle_redirect(False) - self.get_fetchdate() - soup = self.index_to_soup(url) - a = soup.findAll('a', href=True) - a.reverse() - current_articles = [] - included_urls = [] - for i in a: - title = self.tag_to_string(i) - url = 'http://life.mingpao.com/cfm/' + i.get('href', False) - if (url not in included_urls) and (not url.rfind('.txt') == -1) and (not url.rfind(keystr) == -1): - try: - br.open_novisit(url) - # use printed version of the article - url = url.replace('dailynews3.cfm', 
'dailynews3a.cfm') - current_articles.append( - {'title': title, 'url': url, 'description': ''}) - included_urls.append(url) - except: - print('skipping a premium article') - current_articles.reverse() - return current_articles - - # parse from text file of life.mingpao.com - def parse_section2_txt(self, url, keystr): - self.get_fetchdate() - soup = self.index_to_soup(url) - a = soup.findAll('a', href=True) - a.reverse() - current_articles = [] - included_urls = [] - for i in a: - title = self.tag_to_string(i) - url = 'http://life.mingpao.com/cfm/' + i.get('href', False) - if (url not in included_urls) and (not url.rfind('.txt') == -1) and (not url.rfind(keystr) == -1): - # use printed version of the article - url = url.replace('cfm/dailynews3.cfm?File=', 'ftp/Life3/') - current_articles.append( - {'title': title, 'url': url, 'description': ''}) - included_urls.append(url) - current_articles.reverse() - return current_articles - - # parse from www.mingpaovan.com - def parse_section3(self, url, baseUrl): - self.get_fetchdate() - soup = self.index_to_soup(url) - divs = soup.findAll(attrs={'class': ['ListContentLargeLink']}) - current_articles = [] - included_urls = [] - divs.reverse() - for i in divs: - title = self.tag_to_string(i) - urlstr = i.get('href', False) - urlstr = baseUrl + '/' + urlstr.replace('../../../', '') - if urlstr not in included_urls: - current_articles.append( - {'title': title, 'url': urlstr, 'description': '', 'date': ''}) - included_urls.append(urlstr) - current_articles.reverse() - return current_articles - - def parse_ed_section(self, url): - self.get_fetchdate() - soup = self.index_to_soup(url) - a = soup.findAll('a', href=True) - a.reverse() - current_articles = [] - included_urls = [] - for i in a: - title = self.tag_to_string(i) - url = 'http://life.mingpao.com/cfm/' + i.get('href', False) - if (url not in included_urls) and (not url.rfind('.txt') == -1) and (not url.rfind('nal') == -1): - current_articles.append( - {'title': title, 'url': url, 'description': ''}) - included_urls.append(url) - current_articles.reverse() - return current_articles - - def parse_fin_section(self, url): - self.get_fetchdate() - soup = self.index_to_soup(url) - a = soup.findAll('a', href=True) - current_articles = [] - included_urls = [] - for i in a: - # url = 'http://www.mpfinance.com/cfm/' + i.get('href', False) - url = 'http://life.mingpao.com/cfm/' + i.get('href', False) - # if url not in included_urls and not url.rfind(dateStr) == -1 and - # url.rfind('index') == -1: - if url not in included_urls and (not url.rfind('txt') == -1) and (not url.rfind('nal') == -1): - title = self.tag_to_string(i) - current_articles.append( - {'title': title, 'url': url, 'description': ''}) - included_urls.append(url) - return current_articles - - def parse_ent_section(self, url): - self.get_fetchdate() - soup = self.index_to_soup(url) - a = soup.findAll('a', href=True) - a.reverse() - current_articles = [] - included_urls = [] - for i in a: - title = self.tag_to_string(i) - url = 'http://ol.mingpao.com/cfm/' + i.get('href', False) - if (url not in included_urls) and (not url.rfind('.txt') == -1) and (not url.rfind('star') == -1): - current_articles.append( - {'title': title, 'url': url, 'description': ''}) - included_urls.append(url) - current_articles.reverse() - return current_articles - - def parse_col_section(self, url): - self.get_fetchdate() - soup = self.index_to_soup(url) - a = soup.findAll('a', href=True) - a.reverse() - current_articles = [] - included_urls = [] - for i in a: - title = 
self.tag_to_string(i)
-            url = 'http://life.mingpao.com/cfm/' + i.get('href', False)
-            if (url not in included_urls) and (not url.rfind('.txt') == -1) and (not url.rfind('ncl') == -1):
-                current_articles.append(
-                    {'title': title, 'url': url, 'description': ''})
-                included_urls.append(url)
-        current_articles.reverse()
-        return current_articles
-
-    # preprocess those .txt and javascript based files
-    def preprocess_raw_html(self, raw_html, url):
-        new_html = raw_html
-        if url.rfind('ftp') != -1 or url.rfind('_print.htm') != -1:
-            if url.rfind('_print.htm') != -1:
-                # javascript based file
-                splitter = re.compile(r'\n')
-                new_raw_html = '<html><head><title>Untitled</title></head>'
-                new_raw_html = new_raw_html + '<body>'
-                for item in splitter.split(raw_html):
-                    if item.startswith('var heading1 ='):
-                        heading = item.replace('var heading1 = \'', '')
-                        heading = heading.replace('\'', '')
-                        heading = heading.replace(';', '')
-                        new_raw_html = new_raw_html + '<div class="heading">' + heading
-                    if item.startswith('var heading2 ='):
-                        heading = item.replace('var heading2 = \'', '')
-                        heading = heading.replace('\'', '')
-                        heading = heading.replace(';', '')
-                        if heading != '':
-                            new_raw_html = new_raw_html + '<br>' + heading + '</div>'
-                        else:
-                            new_raw_html = new_raw_html + '</div>'
-                    if item.startswith('var content ='):
-                        content = item.replace("var content = ", '')
-                        content = content.replace('\'', '')
-                        content = content.replace(';', '')
-                        new_raw_html = new_raw_html + '<div class="content">' + content + '</div>'
-                    if item.startswith('var photocontent ='):
-                        photo = item.replace('var photocontent = \'', '')
-                        photo = photo.replace('\'', '')
-                        photo = photo.replace(';', '')
-                        photo = photo.replace('<tr>', '')
-                        photo = photo.replace('<td>', '')
-                        photo = photo.replace('</tr>', '')
-                        photo = photo.replace('</td>', '<br>')
-                        photo = photo.replace('class="photo"', '')
-                        new_raw_html = new_raw_html + '<div class="images">' + photo + '</div>'
-                new_html = new_raw_html + '</body></html>'
-            else:
-                # .txt based file
-                splitter = re.compile(r'\n')  # Match non-digits
-                new_raw_html = '<html><head><title>Untitled</title></head><body><div class="images">'
-                next_is_img_txt = False
-                title_started = False
-                title_break_reached = False
-                met_article_start_char = False
-                for item in splitter.split(raw_html):
-                    item = item.strip()
-                    # if title already reached but break between title and
-                    # content not yet found, record title_break_reached
-                    if title_started is True and title_break_reached is False and item == '':
-                        title_break_reached = True
-                    # if title reached and title_break_reached and met_article_start_char is False and item is not empty
-                    # start content
-                    elif title_started is True and title_break_reached is True and met_article_start_char is False:
-                        if item != '':
-                            met_article_start_char = True
-                            new_raw_html = new_raw_html + '</div><div class="content"><p>' + item + '<p>\n'
-                            # if item.startswith(u'\u3010'):
-                            #     met_article_start_char = True
-                            #     new_raw_html = new_raw_html + '</div><div class="content"><p>' + item + '<p>\n'
-                    else:
-                        if next_is_img_txt is False:
-                            if item.startswith("=@"):
-                                print('skip movie link')
-                            elif item.startswith("=?"):
-                                next_is_img_txt = True
-                                new_raw_html += '<img src="' + item[2:].strip() + '.gif" /><p>\n'
-                            elif item.startswith('=='):
-                                next_is_img_txt = True
-                                if False:
-                                    # TODO: check existence of .gif first
-                                    newimg = '_' + item[2:].strip() + '.jpg'
-                                    new_raw_html += '<img src="' + newimg + '" /><p>\n'
-                                else:
-                                    new_raw_html += '<img src="' + item[2:].strip() + '.jpg" /><p>\n'
-                            elif item.startswith('='):
-                                next_is_img_txt = True
-                                if False:
-                                    # TODO: check existence of .gif first
-                                    newimg = '_' + item[1:].strip() + '.jpg'
-                                    new_raw_html += '<img src="' + newimg + '" /><p>\n'
-                                else:
-                                    new_raw_html += '<img src="' + item[1:].strip() + '.jpg" /><p>\n'
-                            else:
-                                if next_is_img_txt is False and met_article_start_char is False:
-                                    if item != '':
-                                        if title_started is False:
-                                            # print 'Title started at ', item
-                                            new_raw_html = new_raw_html + '<div class="heading">' + item + '\n'
-                                            title_started = True
-                                        else:
-                                            new_raw_html = new_raw_html + item + '\n'
-                                    else:
-                                        new_raw_html = new_raw_html + item + '<br>\n'
-                        else:
-                            next_is_img_txt = False
-                            new_raw_html = new_raw_html + item + '\n'
-                new_html = new_raw_html + '</div></body></html>
' - if __HiResImg__ is True: - # TODO: add a _ in front of an image url - if url.rfind('news.mingpao.com') > -1: - imglist = re.findall('src="?.*?jpg"', new_html) - br = mechanize.Browser() - br.set_handle_redirect(False) - for img in imglist: - gifimg = img.replace('jpg"', 'gif"') - try: - br.open_novisit( - url + "/../" + gifimg[5:len(gifimg) - 1]) - new_html = new_html.replace(img, gifimg) - except: - # find the location of the first _ - pos = img.find('_') - if pos > -1: - # if found, insert _ after the first _ - newimg = img[0:pos] + '_' + img[pos:] - new_html = new_html.replace(img, newimg) - else: - # if not found, insert _ after " - new_html = new_html.replace( - img[1:], '"_' + img[1:]) - elif url.rfind('life.mingpao.com') > -1: - imglist = re.findall('src=\'?.*?jpg\'', new_html) - br = mechanize.Browser() - br.set_handle_redirect(False) - # print 'Img list: ', imglist, '\n' - for img in imglist: - # print 'Found img: ', img - gifimg = img.replace('jpg\'', 'gif\'') - try: - gifurl = re.sub(r'dailynews.*txt', '', url) - br.open_novisit(gifurl + gifimg[5:len(gifimg) - 1]) - new_html = new_html.replace(img, gifimg) - except: - pos = img.rfind('/') - newimg = img[0:pos + 1] + '_' + img[pos + 1:] - new_html = new_html.replace(img, newimg) - # repeat with src quoted by double quotes, for text parsed from - # src txt - imglist = re.findall('src="?.*?jpg"', new_html) - for img in imglist: - # print 'Found img: ', img - gifimg = img.replace('jpg"', 'gif"') - try: - # print 'url', url - pos = url.rfind('/') - gifurl = url[:pos + 1] - # print 'try it:', gifurl + gifimg[5:len(gifimg)-1] - br.open_novisit(gifurl + gifimg[5:len(gifimg) - 1]) - new_html = new_html.replace(img, gifimg) - except: - pos = img.find('"') - newimg = img[0:pos + 1] + '_' + img[pos + 1:] - # print 'Use hi-res img', newimg - new_html = new_html.replace(img, newimg) - return new_html - - def preprocess_html(self, soup): - for item in soup.findAll(style=True): - del item['style'] - for item in soup.findAll(style=True): - del item['width'] - for item in soup.findAll(stype=True): - del item['absmiddle'] - return soup - - def populate_article_metadata(self, article, soup, first): - # thumbnails shouldn't be available if using hi-res images - if __IncludeThumbnails__ and __HiResImg__ is False and first and hasattr(self, 'add_toc_thumbnail'): - img = soup.find('img') - if img is not None: - self.add_toc_thumbnail(article, img['src']) - - try: - if __IncludeSummary__ and len(article.text_summary.strip()) == 0: - # look for content - articlebodies = soup.findAll( - 'div', attrs={'id': 'newscontent'}) - if not articlebodies: - articlebodies = soup.findAll( - 'div', attrs={'id': 'newscontent01'}) - if not articlebodies: - articlebodies = soup.findAll( - 'div', attrs={'class': 'content'}) - if not articlebodies: - articlebodies = soup.findAll('div', attrs={'id': 'font'}) - if articlebodies: - for articlebody in articlebodies: - if articlebody: - # the text may or may not be enclosed in
<p></p>
- # tag - paras = articlebody.findAll('p') - if not paras: - paras = articlebody - textFound = False - for p in paras: - if not textFound: - summary_candidate = self.tag_to_string( - p).strip() - summary_candidate = summary_candidate.replace( - u'\u3010\u660e\u5831\u5c08\u8a0a\u3011', '', 1) - if len(summary_candidate) > 0: - article.summary = article.text_summary = summary_candidate - textFound = True - else: - # display a simple text - # article.summary = article.text_summary = u'\u66f4\u591a......' - # display word counts - counts = 0 - articlebodies = soup.findAll( - 'div', attrs={'id': 'newscontent'}) - if not articlebodies: - articlebodies = soup.findAll( - 'div', attrs={'id': 'newscontent01'}) - if not articlebodies: - articlebodies = soup.findAll( - 'div', attrs={'class': 'content'}) - if not articlebodies: - articlebodies = soup.findAll('div', attrs={'id': 'font'}) - if articlebodies: - for articlebody in articlebodies: - # the text may or may not be enclosed in
<p></p>
tag - paras = articlebody.findAll('p') - if not paras: - paras = articlebody - for p in paras: - summary_candidate = self.tag_to_string(p).strip() - counts += len(summary_candidate) - article.summary = article.text_summary = u'\uff08' + \ - str(counts) + u'\u5b57\uff09' - except: - self.log("Error creating article descriptions") - return - - # override from the one in version 0.8.31 - def create_opf(self, feeds, dir=None): - if dir is None: - dir = self.output_dir - title = self.short_title() - # change 1: allow our own flag to tell if a periodical is to be generated - # also use customed date instead of current time - if __MakePeriodical__ is False or self.output_profile.periodical_date_in_title: - title = title + ' ' + self.get_fetchformatteddate() - # end of change 1 - # change 2: __appname__ replaced by newspaper publisher - __appname__ = self.publisher - mi = MetaInformation(title, [__appname__]) - mi.publisher = __appname__ - mi.author_sort = __appname__ - # change 3: use __MakePeriodical__ flag to tell if a periodical should - # be generated - if __MakePeriodical__ is True: - mi.publication_type = 'periodical:' + \ - self.publication_type + ':' + self.short_title() - else: - mi.publication_type = self.publication_type + ':' + self.short_title() - # mi.publication_type = 'periodical:'+self.publication_type+':'+self.short_title() - # change 4: in the following, all the nowf() are changed to adjusted time - # This one doesn't matter - mi.timestamp = nowf() - # change 5: skip listing the articles - # article_titles, aseen = [], set() - # for f in feeds: - # for a in f: - # if a.title and a.title not in aseen: - # aseen.add(a.title) - # article_titles.append(force_unicode(a.title, 'utf-8')) - - # mi.comments = self.description - # if not isinstance(mi.comments, unicode): - # mi.comments = mi.comments.decode('utf-8', 'replace') - # mi.comments += ('\n\n' + _('Articles in this issue: ') + '\n' + - # '\n\n'.join(article_titles)) - - language = canonicalize_lang(self.language) - if language is not None: - mi.language = language - # This one affects the pub date shown in kindle title - # mi.pubdate = nowf() - # now appears to need the time field to be > 12.00noon as well - mi.pubdate = datetime.datetime(int(self.get_fetchyear()), int( - self.get_fetchmonth()), int(self.get_fetchday()), 12, 30, 0) - opf_path = os.path.join(dir, 'index.opf') - ncx_path = os.path.join(dir, 'index.ncx') - - opf = OPFCreator(dir, mi) - # Add mastheadImage entry to section - mp = getattr(self, 'masthead_path', None) - if mp is not None and os.access(mp, os.R_OK): - from calibre.ebooks.metadata.opf2 import Guide - ref = Guide.Reference(os.path.basename( - self.masthead_path), os.getcwd()) - ref.type = 'masthead' - ref.title = 'Masthead Image' - opf.guide.append(ref) - - manifest = [os.path.join(dir, 'feed_%d' % i) - for i in range(len(feeds))] - manifest.append(os.path.join(dir, 'index.html')) - manifest.append(os.path.join(dir, 'index.ncx')) - - # Get cover - cpath = getattr(self, 'cover_path', None) - if cpath is None: - pf = open(os.path.join(dir, 'cover.jpg'), 'wb') - if self.default_cover(pf): - cpath = pf.name - if cpath is not None and os.access(cpath, os.R_OK): - opf.cover = cpath - manifest.append(cpath) - - # Get masthead - mpath = getattr(self, 'masthead_path', None) - if mpath is not None and os.access(mpath, os.R_OK): - manifest.append(mpath) - - opf.create_manifest_from_files_in(manifest) - for mani in opf.manifest: - if mani.path.endswith('.ncx'): - mani.id = 'ncx' - if 
mani.path.endswith('mastheadImage.jpg'): - mani.id = 'masthead-image' - - entries = ['index.html'] - toc = TOC(base_path=dir) - self.play_order_counter = 0 - self.play_order_map = {} - - def feed_index(num, parent): - f = feeds[num] - for j, a in enumerate(f): - if getattr(a, 'downloaded', False): - adir = 'feed_%d/article_%d/' % (num, j) - auth = a.author - if not auth: - auth = None - desc = a.text_summary - if not desc: - desc = None - else: - desc = self.description_limiter(desc) - tt = a.toc_thumbnail if a.toc_thumbnail else None - entries.append('%sindex.html' % adir) - po = self.play_order_map.get(entries[-1], None) - if po is None: - self.play_order_counter += 1 - po = self.play_order_counter - parent.add_item('%sindex.html' % adir, None, - a.title if a.title else _( - 'Untitled Article'), - play_order=po, author=auth, - description=desc, toc_thumbnail=tt) - last = os.path.join( - self.output_dir, ('%sindex.html' % adir).replace('/', os.sep)) - for sp in a.sub_pages: - prefix = os.path.commonprefix([opf_path, sp]) - relp = sp[len(prefix):] - entries.append(relp.replace(os.sep, '/')) - last = sp - - if os.path.exists(last): - with open(last, 'rb') as fi: - src = fi.read().decode('utf-8') - soup = BeautifulSoup(src) - body = soup.find('body') - if body is not None: - prefix = '/'.join('..'for i in range(2 * - len(re.findall(r'link\d+', last)))) - templ = self.navbar.generate(True, num, j, len(f), - not self.has_single_feed, - a.orig_url, __appname__, prefix=prefix, - center=self.center_navbar) - elem = BeautifulSoup(templ.render( - doctype='xhtml').decode('utf-8')).find('div') - body.insert(len(body.contents), elem) - with open(last, 'wb') as fi: - fi.write(type(u'')(soup).encode('utf-8')) - if len(feeds) == 0: - raise Exception('All feeds are empty, aborting.') - - if len(feeds) > 1: - for i, f in enumerate(feeds): - entries.append('feed_%d/index.html' % i) - po = self.play_order_map.get(entries[-1], None) - if po is None: - self.play_order_counter += 1 - po = self.play_order_counter - auth = getattr(f, 'author', None) - if not auth: - auth = None - desc = getattr(f, 'description', None) - if not desc: - desc = None - feed_index(i, toc.add_item('feed_%d/index.html' % i, None, - f.title, play_order=po, description=desc, author=auth)) - - else: - entries.append('feed_%d/index.html' % 0) - feed_index(0, toc) - - for i, p in enumerate(entries): - entries[i] = os.path.join(dir, p.replace('/', os.sep)) - opf.create_spine(entries) - opf.set_toc(toc) - - with open(opf_path, 'wb') as opf_file, open(ncx_path, 'wb') as ncx_file: - opf.render(opf_file, ncx_file)
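
Not part of the diff itself: below is a minimal standalone sketch of the fixed-offset date logic that the deleted recipe's get_dtlocal()/get_fetchdate() implemented. The per-region offsets are taken directly from the deleted code; the names REGION_OFFSETS and fetch_date are illustrative only and do not appear in the recipe.

import datetime

# Region -> (UTC offset in hours, hour by which the day's edition is assumed complete).
# Values taken from the deleted get_dtlocal() above.
REGION_OFFSETS = {
    'Hong Kong': (8.0, 4.5),
    'Vancouver': (-8.0, 5.5),
    'Toronto': (-5.0, 8.5),
}


def fetch_date(region, dt_utc=None):
    # Shift UTC into the region's local time, then subtract the "edition ready"
    # hour so the fetch date only rolls over once all sections are online.
    if dt_utc is None:
        dt_utc = datetime.datetime.utcnow()
    utc_offset, ready_hour = REGION_OFFSETS[region]
    dt_local = dt_utc + datetime.timedelta(hours=utc_offset) - datetime.timedelta(hours=ready_hour)
    return dt_local.strftime('%Y%m%d')


print(fetch_date('Toronto'))  # prints the YYYYMMDD edition the recipe would fetch right now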