diff --git a/recipes/ming_pao.recipe b/recipes/ming_pao.recipe index 856d7166ff..da7272ca2e 100644 --- a/recipes/ming_pao.recipe +++ b/recipes/ming_pao.recipe @@ -4,26 +4,27 @@ __copyright__ = '2010-2011, Eddie Lau' # Region - Hong Kong, Vancouver, Toronto __Region__ = 'Hong Kong' # Users of Kindle 3 with limited system-level CJK support -# please replace the following "True" with "False". +# please replace the following "True" with "False". (Default: True) __MakePeriodical__ = True -# Turn below to True if your device supports display of CJK titles +# Turn below to True if your device supports display of CJK titles (Default: False) __UseChineseTitle__ = False -# Set it to False if you want to skip images +# Set it to False if you want to skip images (Default: True) __KeepImages__ = True -# (HK only) Turn below to True if you wish to use life.mingpao.com as the main article source +# (HK only) Turn below to True if you wish to use life.mingpao.com as the main article source (Default: True) __UseLife__ = True -# (HK only) It is to disable the column section which is now a premium content -__InclCols__ = False -# (HK only) Turn below to True if you wish to parse articles in news.mingpao.com with their printer-friendly formats -__ParsePFF__ = False -# (HK only) Turn below to True if you wish hi-res images +# (HK only) It is to disable premium content (Default: False) +__InclPremium__ = False +# (HK only) Turn below to True if you wish to parse articles in news.mingpao.com with their printer-friendly formats (Default: True) +__ParsePFF__ = True +# (HK only) Turn below to True if you wish hi-res images (Default: False) __HiResImg__ = False ''' Change Log: +2011/10/17: disable fetching of premium content, also improved txt source parsing 2011/10/04: option to get hi-res photos for the articles -2011/09/21: fetching "column" section is made optional. +2011/09/21: fetching "column" section is made optional. 2011/09/18: parse "column" section stuff from source text file directly. 2011/09/07: disable "column" section as it is no longer offered free. 2011/06/26: add fetching Vancouver and Toronto versions of the paper, also provide captions for images using life.mingpao fetch source @@ -72,7 +73,7 @@ class MPRecipe(BasicNewsRecipe): dict(attrs={'class':['content']}), # for content from txt dict(attrs={'class':['photo']}), dict(name='table', attrs={'width':['100%'], 'border':['0'], 'cellspacing':['5'], 'cellpadding':['0']}), # content in printed version of life.mingpao.com - dict(name='img', attrs={'width':['180'], 'alt':['按圖放大']}), # images for source from life.mingpao.com + dict(name='img', attrs={'width':['180'], 'alt':['????']}), # images for source from life.mingpao.com dict(attrs={'class':['images']}) # for images from txt ] if __KeepImages__: @@ -208,18 +209,21 @@ class MPRecipe(BasicNewsRecipe): (u'\u9ad4\u80b2 Sport', 'http://life.mingpao.com/cfm/dailynews2.cfm?Issue=' + dateStr + '&Category=nalsp', 'nal'), (u'\u5f71\u8996 Film/TV', 'http://life.mingpao.com/cfm/dailynews2.cfm?Issue=' + dateStr + '&Category=nalma', 'nal') ]: - articles = self.parse_section2(url, keystr) + if __InclPremium__ == True: + articles = self.parse_section2_txt(url, keystr) + else: + articles = self.parse_section2(url, keystr) if articles: feeds.append((title, articles)) - if __InclCols__ == True: + if __InclPremium__ == True: # parse column section articles directly from .txt files for title, url, keystr in [(u'\u5c08\u6b04 Columns', 'http://life.mingpao.com/cfm/dailynews2.cfm?Issue=' + dateStr +'&Category=ncolumn', 'ncl') ]: articles = self.parse_section2_txt(url, keystr) if articles: feeds.append((title, articles)) - + for title, url in [(u'\u526f\u520a Supplement', 'http://news.mingpao.com/' + dateStr + '/jaindex.htm'), (u'\u82f1\u6587 English', 'http://news.mingpao.com/' + dateStr + '/emindex.htm')]: articles = self.parse_section(url) @@ -253,10 +257,10 @@ class MPRecipe(BasicNewsRecipe): # feeds.append((u'\u7d93\u6fdf Finance', fin_articles)) for title, url, keystr in [(u'\u7d93\u6fdf Finance', 'http://life.mingpao.com/cfm/dailynews2.cfm?Issue=' + dateStr + '&Category=nalea', 'nal')]: - articles = self.parse_section2(url, keystr) + articles = self.parse_section2_txt(url, keystr) if articles: feeds.append((title, articles)) - + #for title, url in [('Tech News', 'http://news.mingpao.com/' + dateStr + '/naindex.htm'), # (u'\u9ad4\u80b2 Sport', 'http://news.mingpao.com/' + dateStr + '/spindex.htm')]: # articles = self.parse_section(url) @@ -270,18 +274,18 @@ class MPRecipe(BasicNewsRecipe): for title, url, keystr in [(u'\u5f71\u8996 Film/TV', 'http://life.mingpao.com/cfm/dailynews2.cfm?Issue=' + dateStr + '&Category=nalma', 'nal') ]: - articles = self.parse_section2(url, keystr) + articles = self.parse_section2_txt(url, keystr) if articles: feeds.append((title, articles)) - - if __InclCols__ == True: + + if __InclPremium__ == True: # parse column section articles directly from .txt files for title, url, keystr in [(u'\u5c08\u6b04 Columns', 'http://life.mingpao.com/cfm/dailynews2.cfm?Issue=' + dateStr +'&Category=ncolumn', 'ncl') ]: articles = self.parse_section2_txt(url, keystr) if articles: feeds.append((title, articles)) - + for title, url in [(u'\u526f\u520a Supplement', 'http://news.mingpao.com/' + dateStr + '/jaindex.htm'), (u'\u82f1\u6587 English', 'http://news.mingpao.com/' + dateStr + '/emindex.htm')]: articles = self.parse_section(url) @@ -333,7 +337,7 @@ class MPRecipe(BasicNewsRecipe): url = 'http://news.mingpao.com/' + dateStr + '/' +url # replace the url to the print-friendly version if __ParsePFF__ == True: - if url.rfind('Redirect') <> -1: + if url.rfind('Redirect') <> -1 and __InclPremium__ == True: url = re.sub(dateStr + '.*' + dateStr, dateStr, url) url = re.sub('%2F.*%2F', '/', url) title = title.replace(u'\u6536\u8cbb\u5167\u5bb9', '') @@ -349,6 +353,8 @@ class MPRecipe(BasicNewsRecipe): # parse from life.mingpao.com def parse_section2(self, url, keystr): + br = mechanize.Browser() + br.set_handle_redirect(False) self.get_fetchdate() soup = self.index_to_soup(url) a = soup.findAll('a', href=True) @@ -359,9 +365,13 @@ class MPRecipe(BasicNewsRecipe): title = self.tag_to_string(i) url = 'http://life.mingpao.com/cfm/' + i.get('href', False) if (url not in included_urls) and (not url.rfind('.txt') == -1) and (not url.rfind(keystr) == -1): - url = url.replace('dailynews3.cfm', 'dailynews3a.cfm') # use printed version of the article - current_articles.append({'title': title, 'url': url, 'description': ''}) - included_urls.append(url) + try: + br.open_novisit(url) + url = url.replace('dailynews3.cfm', 'dailynews3a.cfm') # use printed version of the article + current_articles.append({'title': title, 'url': url, 'description': ''}) + included_urls.append(url) + except: + print 'skipping a premium article' current_articles.reverse() return current_articles @@ -382,7 +392,7 @@ class MPRecipe(BasicNewsRecipe): included_urls.append(url) current_articles.reverse() return current_articles - + # parse from www.mingpaovan.com def parse_section3(self, url, baseUrl): self.get_fetchdate() @@ -470,23 +480,23 @@ class MPRecipe(BasicNewsRecipe): #raw_html = raw_html.replace(u'
\u3010', u'\u3010') if __HiResImg__ == True: # TODO: add a _ in front of an image url - if url.rfind('news.mingpao.com') > -1: + if url.rfind('news.mingpao.com') > -1: imglist = re.findall('src="?.*?jpg"', raw_html) br = mechanize.Browser() br.set_handle_redirect(False) for img in imglist: gifimg = img.replace('jpg"', 'gif"') - try: + try: br.open_novisit(url + "/../" + gifimg[5:len(gifimg)-1]) raw_html = raw_html.replace(img, gifimg) - except: + except: # find the location of the first _ pos = img.find('_') if pos > -1: # if found, insert _ after the first _ newimg = img[0:pos] + '_' + img[pos:] raw_html = raw_html.replace(img, newimg) - else: + else: # if not found, insert _ after " raw_html = raw_html.replace(img[1:], '"_' + img[1:]) elif url.rfind('life.mingpao.com') > -1: @@ -510,7 +520,7 @@ class MPRecipe(BasicNewsRecipe): pos = img.rfind('/') newimg = img[0:pos+1] + '_' + img[pos+1:] #print 'newimg: ', newimg - raw_html = raw_html.replace(img, newimg) + raw_html = raw_html.replace(img, newimg) if url.rfind('ftp') == -1 and url.rfind('_print.htm') == -1: return raw_html else: @@ -549,10 +559,11 @@ class MPRecipe(BasicNewsRecipe): photo = photo.replace('class="photo"', '') new_raw_html = new_raw_html + '