diff --git a/recipes/ming_pao.recipe b/recipes/ming_pao.recipe
index 9408d6c7d0..856d7166ff 100644
--- a/recipes/ming_pao.recipe
+++ b/recipes/ming_pao.recipe
@@ -6,19 +6,24 @@ __Region__ = 'Hong Kong'
 # Users of Kindle 3 with limited system-level CJK support
 # please replace the following "True" with "False".
 __MakePeriodical__ = True
-# Turn below to true if your device supports display of CJK titles
+# Turn below to True if your device supports display of CJK titles
 __UseChineseTitle__ = False
 # Set it to False if you want to skip images
 __KeepImages__ = True
-# (HK only) Turn below to true if you wish to use life.mingpao.com as the main article source
+# (HK only) Turn below to True if you wish to use life.mingpao.com as the main article source
 __UseLife__ = True
-# (HK only) if __UseLife__ is true, turn this on if you want to include the column section
+# (HK only) The column section is now premium content; turn below to True to include it
 __InclCols__ = False
+# (HK only) Turn below to True if you wish to parse articles in news.mingpao.com in their printer-friendly format
+__ParsePFF__ = False
+# (HK only) Turn below to True if you wish to fetch hi-res images
+__HiResImg__ = False
 
 '''
 Change Log:
-2011/09/21: fetching "column" section is made optional. Default is False
+2011/10/04: option to get hi-res photos for the articles
+2011/09/21: fetching "column" section is made optional.
 2011/09/18: parse "column" section stuff from source text file directly.
 2011/09/07: disable "column" section as it is no longer offered free.
 2011/06/26: add fetching Vancouver and Toronto versions of the paper, also provide captions for images using life.mingpao fetch source
@@ -42,7 +47,7 @@ Change Log:
 2010/10/31: skip repeated articles in section pages
 '''
 
-import os, datetime, re
+import os, datetime, re, mechanize
 from calibre.web.feeds.recipes import BasicNewsRecipe
 from contextlib import nested
 from calibre.ebooks.BeautifulSoup import BeautifulSoup
@@ -56,7 +61,7 @@ class MPRecipe(BasicNewsRecipe):
         title = 'Ming Pao - Hong Kong'
         description = 'Hong Kong Chinese Newspaper (http://news.mingpao.com)'
         category = 'Chinese, News, Hong Kong'
-        extra_css = 'img {display: block; margin-left: auto; margin-right: auto; margin-top: 10px; margin-bottom: 10px;} font>b {font-size:200%; font-weight:bold;} div[class=heading] {font-size:200%; font-weight:bold;} div[class=images] {font-size:50%;}'
+        extra_css = 'img {display: block; margin-left: auto; margin-right: auto; margin-top: 10px; margin-bottom: 10px; max-height:90%;} font>b {font-size:200%; font-weight:bold;} div[class=heading] {font-size:200%; font-weight:bold;} div[class=images] {font-size:50%;}'
         masthead_url = 'http://news.mingpao.com/image/portals_top_logo_news.gif'
         keep_only_tags = [dict(name='h1'),
                           dict(name='font', attrs={'style':['font-size:14pt; line-height:160%;']}), # for entertainment page title
@@ -147,43 +152,6 @@ class MPRecipe(BasicNewsRecipe):
     conversion_options = {'linearize_tables':True}
     timefmt = ''
 
-    def image_url_processor(cls, baseurl, url):
-        # trick: break the url at the first occurance of digit, add an additional
-        # '_' at the front
-        # not working, may need to move this to preprocess_html() method
-#        minIdx = 10000
-#        i0 = url.find('0')
-#        if i0 >= 0 and i0 < minIdx:
-#           minIdx = i0
-#        i1 = url.find('1')
-#        if i1 >= 0 and i1 < minIdx:
-#           minIdx = i1
-#        i2 = url.find('2')
-#        if i2 >= 0 and i2 < minIdx:
-#           minIdx = i2
-#        i3 = url.find('3')
-#        if i3 >= 0 and i0 < minIdx:
-#           minIdx = i3
-#        i4 = url.find('4')
-#        if i4 >= 0 and i4 < minIdx:
-#           minIdx = i4
-#        i5 = url.find('5')
-#        if i5 >= 0 and i5 < minIdx:
-#           minIdx = i5
-#        i6 = url.find('6')
-#        if i6 >= 0 and i6 < minIdx:
-#           minIdx = i6
-#        i7 = url.find('7')
-#        if i7 >= 0 and i7 < minIdx:
-#           minIdx = i7
-#        i8 = url.find('8')
-#        if i8 >= 0 and i8 < minIdx:
-#           minIdx = i8
-#        i9 = url.find('9')
-#        if i9 >= 0 and i9 < minIdx:
-#           minIdx = i9
-        return url
-
     def get_dtlocal(self):
         dt_utc = datetime.datetime.utcnow()
         if __Region__ == 'Hong Kong':
@@ -260,15 +228,16 @@ class MPRecipe(BasicNewsRecipe):
         else:
             for title, url in [(u'\u8981\u805e Headline', 'http://news.mingpao.com/' + dateStr + '/gaindex.htm'),
                                (u'\u6e2f\u805e Local', 'http://news.mingpao.com/' + dateStr + '/gbindex.htm'),
-                               (u'\u6559\u80b2 Education', 'http://news.mingpao.com/' + dateStr + '/gfindex.htm')]:
+                               (u'\u6559\u80b2 Education', 'http://news.mingpao.com/' + dateStr + '/gfindex.htm'),
+                               (u'\u793e\u8a55/\u7b46\u9663 Editorial', 'http://news.mingpao.com/' + dateStr + '/mrindex.htm')]:
                 articles = self.parse_section(url)
                 if articles:
                     feeds.append((title, articles))
 
             # special- editorial
-            ed_articles = self.parse_ed_section('http://life.mingpao.com/cfm/dailynews2.cfm?Issue=' + dateStr +'&Category=nalmr')
-            if ed_articles:
-                feeds.append((u'\u793e\u8a55/\u7b46\u9663 Editorial', ed_articles))
+            #ed_articles = self.parse_ed_section('http://life.mingpao.com/cfm/dailynews2.cfm?Issue=' + dateStr +'&Category=nalmr')
+            #if ed_articles:
+            #    feeds.append((u'\u793e\u8a55/\u7b46\u9663 Editorial', ed_articles))
 
             for title, url in [(u'\u8ad6\u58c7 Forum', 'http://news.mingpao.com/' + dateStr + '/faindex.htm'),
                                (u'\u4e2d\u570b China', 'http://news.mingpao.com/' + dateStr + '/caindex.htm'),
@@ -279,20 +248,39 @@ class MPRecipe(BasicNewsRecipe):
 
             # special - finance
             #fin_articles = self.parse_fin_section('http://www.mpfinance.com/htm/Finance/' + dateStr + '/News/ea,eb,ecindex.htm')
-            fin_articles = self.parse_fin_section('http://life.mingpao.com/cfm/dailynews2.cfm?Issue=' + dateStr + '&Category=nalea')
-            if fin_articles:
-                feeds.append((u'\u7d93\u6fdf Finance', fin_articles))
+            #fin_articles = self.parse_fin_section('http://life.mingpao.com/cfm/dailynews2.cfm?Issue=' + dateStr + '&Category=nalea')
+            #if fin_articles:
+            #    feeds.append((u'\u7d93\u6fdf Finance', fin_articles))
 
-            for title, url in [('Tech News', 'http://news.mingpao.com/' + dateStr + '/naindex.htm'),
-                               (u'\u9ad4\u80b2 Sport', 'http://news.mingpao.com/' + dateStr + '/spindex.htm')]:
-                articles = self.parse_section(url)
+            for title, url, keystr in [(u'\u7d93\u6fdf Finance', 'http://life.mingpao.com/cfm/dailynews2.cfm?Issue=' + dateStr + '&Category=nalea', 'nal')]:
+                articles = self.parse_section2(url, keystr)
                 if articles:
                     feeds.append((title, articles))
 
+            #for title, url in [('Tech News', 'http://news.mingpao.com/' + dateStr + '/naindex.htm'),
+            #                   (u'\u9ad4\u80b2 Sport', 'http://news.mingpao.com/' + dateStr + '/spindex.htm')]:
+            #    articles = self.parse_section(url)
+            #    if articles:
+            #        feeds.append((title, articles))
+
             # special - entertainment
-            ent_articles = self.parse_ent_section('http://ol.mingpao.com/cfm/star1.cfm')
-            if ent_articles:
-                feeds.append((u'\u5f71\u8996 Film/TV', ent_articles))
+            #ent_articles = self.parse_ent_section('http://ol.mingpao.com/cfm/star1.cfm')
+            #if ent_articles:
+            #    feeds.append((u'\u5f71\u8996 Film/TV', ent_articles))
+
+            for title, url, keystr in [(u'\u5f71\u8996 Film/TV', 'http://life.mingpao.com/cfm/dailynews2.cfm?Issue=' + dateStr + '&Category=nalma', 'nal')
+                                      ]:
+                articles = self.parse_section2(url, keystr)
+                if articles:
+                    feeds.append((title, articles))
+
+            if __InclCols__ == True:
+                # parse column section articles directly from .txt files
+                for title, url, keystr in [(u'\u5c08\u6b04 Columns', 'http://life.mingpao.com/cfm/dailynews2.cfm?Issue=' + dateStr +'&Category=ncolumn', 'ncl')
+                                          ]:
+                    articles = self.parse_section2_txt(url, keystr)
+                    if articles:
+                        feeds.append((title, articles))
 
             for title, url in [(u'\u526f\u520a Supplement', 'http://news.mingpao.com/' + dateStr + '/jaindex.htm'),
                                (u'\u82f1\u6587 English', 'http://news.mingpao.com/' + dateStr + '/emindex.htm')]:
@@ -300,11 +288,6 @@ class MPRecipe(BasicNewsRecipe):
                 if articles:
                     feeds.append((title, articles))
 
-
-            # special- columns
-            col_articles = self.parse_col_section('http://life.mingpao.com/cfm/dailynews2.cfm?Issue=' + dateStr +'&Category=ncolumn')
-            if col_articles:
-                feeds.append((u'\u5c08\u6b04 Columns', col_articles))
         elif __Region__ == 'Vancouver':
             for title, url in [(u'\u8981\u805e Headline', 'http://www.mingpaovan.com/htm/News/' + dateStr + '/VAindex.htm'),
                                (u'\u52a0\u570b Canada', 'http://www.mingpaovan.com/htm/News/' + dateStr + '/VBindex.htm'),
@@ -348,6 +331,16 @@ class MPRecipe(BasicNewsRecipe):
                 title = self.tag_to_string(a)
                 url = a.get('href', False)
                 url = 'http://news.mingpao.com/' + dateStr + '/' +url
+                # replace the url with the print-friendly version
+                if __ParsePFF__ == True:
+                    if url.rfind('Redirect') != -1:
+                        url = re.sub(dateStr + '.*' + dateStr, dateStr, url)
+                        url = re.sub('%2F.*%2F', '/', url)
+                        title = title.replace(u'\u6536\u8cbb\u5167\u5bb9', '')
+                        url = url.replace('%2Etxt', '_print.htm')
+                        url = url.replace('%5F', '_')
+                    else:
+                        url = url.replace('.htm', '_print.htm')
                 if url not in included_urls and url.rfind('Redirect') == -1:
                     current_articles.append({'title': title, 'url': url, 'description':'', 'date':''})
                     included_urls.append(url)
@@ -472,38 +465,119 @@ class MPRecipe(BasicNewsRecipe):
             current_articles.reverse()
         return current_articles
 
-    # preprocess those .txt based files
+    # preprocess those .txt and javascript based files
     def preprocess_raw_html(self, raw_html, url):
-        if url.rfind('ftp') == -1:
+        #raw_html = raw_html.replace(u'<p>\u3010', u'\u3010')
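+        # hi-res strategy: for each low-res .jpg referenced in the page, first
+        # probe the server for a .gif variant of the same name; if that is
+        # missing, fall back to inserting an extra '_' into the .jpg name,
+        # which appears to be the site's naming scheme for larger images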
+        if __HiResImg__ == True:
+            # TODO: add a _ in front of an image url
+            if url.rfind('news.mingpao.com') > -1:
+                imglist = re.findall('src="?.*?jpg"', raw_html)
+                br = mechanize.Browser()
+                br.set_handle_redirect(False)
+                for img in imglist:
+                    gifimg = img.replace('jpg"', 'gif"')
+                    try:
+                        br.open_novisit(url + "/../" + gifimg[5:len(gifimg)-1])
+                        raw_html = raw_html.replace(img, gifimg)
+                    except:
+                        # find the location of the first _
+                        pos = img.find('_')
+                        if pos > -1:
+                            # if found, insert _ after the first _
+                            newimg = img[0:pos] + '_' + img[pos:]
+                            raw_html = raw_html.replace(img, newimg)
+                        else:
+                            # if not found, insert _ after "
+                            raw_html = raw_html.replace(img[1:], '"_' + img[1:])
+            elif url.rfind('life.mingpao.com') > -1:
+                imglist = re.findall('src=\'?.*?jpg\'', raw_html)
+                br = mechanize.Browser()
+                br.set_handle_redirect(False)
+                #print 'Img list: ', imglist, '\n'
+                for img in imglist:
+                    gifimg = img.replace('jpg\'', 'gif\'')
+                    try:
+                        #print 'Original: ', url
+                        #print 'To append: ', "/../" + gifimg[5:len(gifimg)-1]
+                        gifurl = re.sub(r'dailynews.*txt', '', url)
+                        #print 'newurl: ', gifurl + gifimg[5:len(gifimg)-1]
+                        br.open_novisit(gifurl + gifimg[5:len(gifimg)-1])
+                        #print 'URL: ', url + "/../" + gifimg[5:len(gifimg)-1]
+                        #br.open_novisit(url + "/../" + gifimg[5:len(gifimg)-1])
+                        raw_html = raw_html.replace(img, gifimg)
+                    except:
+                        #print 'GIF not found'
+                        pos = img.rfind('/')
+                        newimg = img[0:pos+1] + '_' + img[pos+1:]
+                        #print 'newimg: ', newimg
+                        raw_html = raw_html.replace(img, newimg)
+        if url.rfind('ftp') == -1 and url.rfind('_print.htm') == -1:
             return raw_html
         else:
-            splitter = re.compile(r'\n')  # Match non-digits
-            new_raw_html = '<html><head><title>Untitled</title></head><body><div class="images">'
-            next_is_img_txt = False
-            title_started = False
-            met_article_start_char = False
-            for item in splitter.split(raw_html):
-                if item.startswith(u'\u3010'):
-                    met_article_start_char = True
-                    new_raw_html = new_raw_html + '</div><div class="content"><p>' + item + '<p>\n'
-                else:
-                    if next_is_img_txt == False:
-                        if item.startswith('='):
-                            next_is_img_txt = True
-                            new_raw_html += '<img src="' + str(item)[1:].strip() + '.jpg" /><p>\n'
+            if url.rfind('_print.htm') != -1:
+                # javascript based file
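+                # the print-friendly page embeds the article in javascript
+                # variables (heading1/heading2, content, photocontent);
+                # rebuild a plain HTML document from those values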
+                splitter = re.compile(r'\n')
+                new_raw_html = '<html><head><title>Untitled</title></head>'
+                new_raw_html = new_raw_html + '<body>'
+                for item in splitter.split(raw_html):
+                    if item.startswith('var heading1 ='):
+                        heading = item.replace('var heading1 = \'', '')
+                        heading = heading.replace('\'', '')
+                        heading = heading.replace(';', '')
+                        new_raw_html = new_raw_html + '<div class="heading">' + heading
+                    if item.startswith('var heading2 ='):
+                        heading = item.replace('var heading2 = \'', '')
+                        heading = heading.replace('\'', '')
+                        heading = heading.replace(';', '')
+                        if heading != '':
+                            new_raw_html = new_raw_html + '<br>' + heading + '</div>'
                         else:
-                            if met_article_start_char == False:
-                                if title_started == False:
-                                    new_raw_html = new_raw_html + '<div class="heading">' + item + '\n'
-                                    title_started = True
-                                else:
-                                    new_raw_html = new_raw_html + item + '\n'
-                            else:
-                                new_raw_html = new_raw_html + item + '<p>\n'
+                            new_raw_html = new_raw_html + '</div>'
+                    if item.startswith('var content ='):
+                        content = item.replace("var content = ", '')
+                        content = content.replace('\'', '')
+                        content = content.replace(';', '')
+                        new_raw_html = new_raw_html + '<div class="content">' + content + '</div>'
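+                    # photocontent arrives as an HTML table fragment; the table
+                    # markup is stripped below so only images and captions remain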
+                    if item.startswith('var photocontent ='):
+                        photo = item.replace('var photocontent = \'', '')
+                        photo = photo.replace('\'', '')
+                        photo = photo.replace(';', '')
+                        photo = photo.replace('<tr>', '')
+                        photo = photo.replace('<td>', '')
+                        photo = photo.replace('</tr>', '')
+                        photo = photo.replace('</td>', '<br>')
+                        photo = photo.replace('class="photo"', '')
+                        new_raw_html = new_raw_html + '<div class="images">' + photo + '</div>'
+                return new_raw_html + '</body></html>'
+            else:
+                # .txt based file
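+                # the .txt source is plain text: leading lines form the heading,
+                # a line starting with '=' names an image file (the next line is
+                # its caption), and u'\u3010' marks the start of the article body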
+                splitter = re.compile(r'\n')  # split on line breaks
+                new_raw_html = '<html><head><title>Untitled</title></head><body><div class="images">'
+                next_is_img_txt = False
+                title_started = False
+                met_article_start_char = False
+                for item in splitter.split(raw_html):
+                    if item.startswith(u'\u3010'):
+                        met_article_start_char = True
+                        new_raw_html = new_raw_html + '</div><div class="content"><p>' + item + '<p>\n'
                     else:
-                        next_is_img_txt = False
-                        new_raw_html = new_raw_html + item + '\n'
-            return new_raw_html + '</div></body></html>'
+                        if next_is_img_txt == False:
+                            if item.startswith('='):
+                                next_is_img_txt = True
+                                new_raw_html += '<img src="' + str(item)[1:].strip() + '.jpg" /><p>\n'
+                            else:
+                                if met_article_start_char == False:
+                                    if title_started == False:
+                                        new_raw_html = new_raw_html + '<div class="heading">' + item + '\n'
+                                        title_started = True
+                                    else:
+                                        new_raw_html = new_raw_html + item + '\n'
+                                else:
+                                    new_raw_html = new_raw_html + item + '<p>\n'
+                        else:
+                            next_is_img_txt = False
+                            new_raw_html = new_raw_html + item + '\n'
+                return new_raw_html + '</div></body></html>'
 
     def preprocess_html(self, soup):
         for item in soup.findAll(style=True):
@@ -604,7 +678,7 @@ class MPRecipe(BasicNewsRecipe):
                     if po is None:
                         self.play_order_counter += 1
                         po = self.play_order_counter
-                    parent.add_item('%sindex.html'%adir, None, a.title if a.title else _('Untitled Article'),
+                    parent.add_item('%sindex.html'%adir, None, a.title if a.title else ('Untitled Article'),
                                     play_order=po, author=auth, description=desc)
                     last = os.path.join(self.output_dir, ('%sindex.html'%adir).replace('/', os.sep))
                     for sp in a.sub_pages: