From 6e4908882c7e7a5eb14a70c56617d54701b3471a Mon Sep 17 00:00:00 2001 From: Kovid Goyal Date: Tue, 20 Sep 2011 16:56:52 -0600 Subject: [PATCH] Updated Ming Pao - HK --- recipes/ming_pao.recipe | 68 +++++++++++++++++++++++++++++++++++++++-- 1 file changed, 65 insertions(+), 3 deletions(-) diff --git a/recipes/ming_pao.recipe b/recipes/ming_pao.recipe index 7060a7cd3e..ef8ad98bb9 100644 --- a/recipes/ming_pao.recipe +++ b/recipes/ming_pao.recipe @@ -16,6 +16,7 @@ __UseLife__ = True ''' Change Log: +2011/09/18: parse "column" section stuff from source text files directly. 2011/09/07: disable "column" section as it is no longer offered free. 2011/06/26: add fetching Vancouver and Toronto versions of the paper, also provide captions for images using life.mingpao fetch source provide options to remove all images in the file @@ -52,16 +53,19 @@ class MPRecipe(BasicNewsRecipe): title = 'Ming Pao - Hong Kong' description = 'Hong Kong Chinese Newspaper (http://news.mingpao.com)' category = 'Chinese, News, Hong Kong' - extra_css = 'img {display: block; margin-left: auto; margin-right: auto; margin-top: 10px; margin-bottom: 10px;} font>b {font-size:200%; font-weight:bold;}' + extra_css = 'img {display: block; margin-left: auto; margin-right: auto; margin-top: 10px; margin-bottom: 10px;} font>b {font-size:200%; font-weight:bold;} div[class=heading] {font-size:200%; font-weight:bold;} div[class=images] {font-size:50%;}' masthead_url = 'http://news.mingpao.com/image/portals_top_logo_news.gif' keep_only_tags = [dict(name='h1'), dict(name='font', attrs={'style':['font-size:14pt; line-height:160%;']}), # for entertainment page title dict(name='font', attrs={'color':['AA0000']}), # for column articles title + dict(attrs={'class':['heading']}), # for heading from txt dict(attrs={'id':['newscontent']}), # entertainment and column page content dict(attrs={'id':['newscontent01','newscontent02']}), + dict(attrs={'class':['content']}), # for content from txt 
dict(attrs={'class':['photo']}), dict(name='table', attrs={'width':['100%'], 'border':['0'], 'cellspacing':['5'], 'cellpadding':['0']}), # content in printed version of life.mingpao.com - dict(name='img', attrs={'width':['180'], 'alt':['按圖放大']}) # images for source from life.mingpao.com + dict(name='img', attrs={'width':['180'], 'alt':['按圖放大']}), # images for source from life.mingpao.com + dict(attrs={'class':['images']}) # for images from txt ] if __KeepImages__: remove_tags = [dict(name='style'), @@ -232,12 +236,18 @@ class MPRecipe(BasicNewsRecipe): (u'\u7d93\u6fdf Finance', 'http://life.mingpao.com/cfm/dailynews2.cfm?Issue=' + dateStr + '&Category=nalea', 'nal'), (u'\u9ad4\u80b2 Sport', 'http://life.mingpao.com/cfm/dailynews2.cfm?Issue=' + dateStr + '&Category=nalsp', 'nal'), (u'\u5f71\u8996 Film/TV', 'http://life.mingpao.com/cfm/dailynews2.cfm?Issue=' + dateStr + '&Category=nalma', 'nal') - #(u'\u5c08\u6b04 Columns', 'http://life.mingpao.com/cfm/dailynews2.cfm?Issue=' + dateStr +'&Category=ncolumn', 'ncl') ]: articles = self.parse_section2(url, keystr) if articles: feeds.append((title, articles)) + # parse column section articles directly from .txt files + for title, url, keystr in [(u'\u5c08\u6b04 Columns', 'http://life.mingpao.com/cfm/dailynews2.cfm?Issue=' + dateStr +'&Category=ncolumn', 'ncl') + ]: + articles = self.parse_section2_col(url, keystr) + if articles: + feeds.append((title, articles)) + for title, url in [(u'\u526f\u520a Supplement', 'http://news.mingpao.com/' + dateStr + '/jaindex.htm'), (u'\u82f1\u6587 English', 'http://news.mingpao.com/' + dateStr + '/emindex.htm')]: articles = self.parse_section(url) @@ -358,6 +368,24 @@ class MPRecipe(BasicNewsRecipe): current_articles.reverse() return current_articles + # parse from life.mingpao.com + def parse_section2_col(self, url, keystr): + self.get_fetchdate() + soup = self.index_to_soup(url) + a = soup.findAll('a', href=True) + a.reverse() + current_articles = [] + included_urls = [] + for i in a: + 
title = self.tag_to_string(i) + url = 'http://life.mingpao.com/cfm/' + i.get('href', False) + if (url not in included_urls) and (not url.rfind('.txt') == -1) and (not url.rfind(keystr) == -1): + url = url.replace('cfm/dailynews3.cfm?File=', 'ftp/Life3/') # use the .txt file under ftp/Life3/ rather than the dailynews3.cfm page + current_articles.append({'title': title, 'url': url, 'description': ''}) + included_urls.append(url) + current_articles.reverse() + return current_articles + # parse from www.mingpaovan.com def parse_section3(self, url, baseUrl): self.get_fetchdate() @@ -440,6 +468,39 @@ class MPRecipe(BasicNewsRecipe): current_articles.reverse() return current_articles + # preprocess those .txt based files + def preprocess_raw_html(self, raw_html, url): + if url.rfind('ftp') == -1: + return raw_html + else: + splitter = re.compile(r'\n') # split the raw text into lines + new_raw_html = 'Untitled
' + next_is_img_txt = False + title_started = False + met_article_start_char = False + for item in splitter.split(raw_html): + if item.startswith(u'\u3010'): + met_article_start_char = True + new_raw_html = new_raw_html + '

' + item + '

\n' + else: + if next_is_img_txt == False: + if item.startswith('='): + next_is_img_txt = True + new_raw_html += '

\n' + else: + if met_article_start_char == False: + if title_started == False: + new_raw_html = new_raw_html + '

' + item + '\n' + title_started = True + else: + new_raw_html = new_raw_html + item + '\n' + else: + new_raw_html = new_raw_html + item + '

\n' + else: + next_is_img_txt = False + new_raw_html = new_raw_html + item + '\n' + return new_raw_html + '

' + def preprocess_html(self, soup): for item in soup.findAll(style=True): del item['style'] @@ -593,3 +654,4 @@ class MPRecipe(BasicNewsRecipe): with nested(open(opf_path, 'wb'), open(ncx_path, 'wb')) as (opf_file, ncx_file): opf.render(opf_file, ncx_file) +