Updated Ming Pao

This commit is contained in:
Kovid Goyal 2011-03-08 18:54:37 -07:00
parent 14717a5e92
commit a4b50102ff

View File

@ -1,7 +1,20 @@
__license__ = 'GPL v3' __license__ = 'GPL v3'
__copyright__ = '2010-2011, Eddie Lau' __copyright__ = '2010-2011, Eddie Lau'
# Users of Kindle 3 (with limited system-level CJK support)
# please replace the following "True" with "False".
__MakePeriodical__ = True
# Turn it to True if your device supports display of CJK titles
__UseChineseTitle__ = False
''' '''
Change Log: Change Log:
2011/03/06: add new articles for finance section, also a new section "Columns"
2011/02/28: rearrange the sections
[Disabled until Kindle has better CJK support and can remember last (section,article) read in Sections & Articles
View] make it the same title if generating a periodical, so past issue will be automatically put into "Past Issues"
folder in Kindle 3
2011/02/20: skip duplicated links in finance section, put photos which may extend a whole page to the back of the articles 2011/02/20: skip duplicated links in finance section, put photos which may extend a whole page to the back of the articles
clean up the indentation clean up the indentation
2010/12/07: add entertainment section, use newspaper front page as ebook cover, suppress date display in section list 2010/12/07: add entertainment section, use newspaper front page as ebook cover, suppress date display in section list
@ -19,21 +32,17 @@ import os, datetime, re
from calibre.web.feeds.recipes import BasicNewsRecipe from calibre.web.feeds.recipes import BasicNewsRecipe
from contextlib import nested from contextlib import nested
from calibre.ebooks.BeautifulSoup import BeautifulSoup from calibre.ebooks.BeautifulSoup import BeautifulSoup
from calibre.ebooks.metadata.opf2 import OPFCreator from calibre.ebooks.metadata.opf2 import OPFCreator
from calibre.ebooks.metadata.toc import TOC from calibre.ebooks.metadata.toc import TOC
from calibre.ebooks.metadata import MetaInformation from calibre.ebooks.metadata import MetaInformation
class MPHKRecipe(BasicNewsRecipe): class MPHKRecipe(BasicNewsRecipe):
IsCJKWellSupported = True # Set to False to avoid generating periodical in which CJK characters can't be displayed in section/article view
title = 'Ming Pao - Hong Kong' title = 'Ming Pao - Hong Kong'
oldest_article = 1 oldest_article = 1
max_articles_per_feed = 100 max_articles_per_feed = 100
__author__ = 'Eddie Lau' __author__ = 'Eddie Lau'
description = ('Hong Kong Chinese Newspaper (http://news.mingpao.com). If' description = 'Hong Kong Chinese Newspaper (http://news.mingpao.com)'
'you are using a Kindle with firmware < 3.1, customize the'
'recipe')
publisher = 'MingPao' publisher = 'MingPao'
category = 'Chinese, News, Hong Kong' category = 'Chinese, News, Hong Kong'
remove_javascript = True remove_javascript = True
@ -48,12 +57,14 @@ class MPHKRecipe(BasicNewsRecipe):
masthead_url = 'http://news.mingpao.com/image/portals_top_logo_news.gif' masthead_url = 'http://news.mingpao.com/image/portals_top_logo_news.gif'
keep_only_tags = [dict(name='h1'), keep_only_tags = [dict(name='h1'),
dict(name='font', attrs={'style':['font-size:14pt; line-height:160%;']}), # for entertainment page title dict(name='font', attrs={'style':['font-size:14pt; line-height:160%;']}), # for entertainment page title
dict(attrs={'id':['newscontent']}), # entertainment page content dict(name='font', attrs={'color':['AA0000']}), # for column articles title
dict(attrs={'id':['newscontent']}), # entertainment and column page content
dict(attrs={'id':['newscontent01','newscontent02']}), dict(attrs={'id':['newscontent01','newscontent02']}),
dict(attrs={'class':['photo']}) dict(attrs={'class':['photo']})
] ]
remove_tags = [dict(name='style'), remove_tags = [dict(name='style'),
dict(attrs={'id':['newscontent135']})] # for the finance page dict(attrs={'id':['newscontent135']}), # for the finance page
dict(name='table')] # for content fetched from life.mingpao.com
remove_attributes = ['width'] remove_attributes = ['width']
preprocess_regexps = [ preprocess_regexps = [
(re.compile(r'<h5>', re.DOTALL|re.IGNORECASE), (re.compile(r'<h5>', re.DOTALL|re.IGNORECASE),
@ -61,7 +72,12 @@ class MPHKRecipe(BasicNewsRecipe):
(re.compile(r'</h5>', re.DOTALL|re.IGNORECASE), (re.compile(r'</h5>', re.DOTALL|re.IGNORECASE),
lambda match: '</h1>'), lambda match: '</h1>'),
(re.compile(r'<p><a href=.+?</a></p>', re.DOTALL|re.IGNORECASE), # for entertainment page (re.compile(r'<p><a href=.+?</a></p>', re.DOTALL|re.IGNORECASE), # for entertainment page
lambda match: '') lambda match: ''),
# skip <br> after title in life.mingpao.com fetched article
(re.compile(r"<div id='newscontent'><br>", re.DOTALL|re.IGNORECASE),
lambda match: "<div id='newscontent'>"),
(re.compile(r"<br><br></b>", re.DOTALL|re.IGNORECASE),
lambda match: "</b>")
] ]
def image_url_processor(cls, baseurl, url): def image_url_processor(cls, baseurl, url):
@ -129,28 +145,55 @@ class MPHKRecipe(BasicNewsRecipe):
def parse_index(self): def parse_index(self):
feeds = [] feeds = []
dateStr = self.get_fetchdate() dateStr = self.get_fetchdate()
for title, url in [(u'\u8981\u805e Headline', 'http://news.mingpao.com/' + dateStr + '/gaindex.htm'), for title, url in [(u'\u8981\u805e Headline', 'http://news.mingpao.com/' + dateStr + '/gaindex.htm'),
(u'\u6e2f\u805e Local', 'http://news.mingpao.com/' + dateStr + '/gbindex.htm'), (u'\u6e2f\u805e Local', 'http://news.mingpao.com/' + dateStr + '/gbindex.htm'),
(u'\u793e\u8a55/\u7b46\u9663 Editorial', 'http://news.mingpao.com/' + dateStr + '/mrindex.htm'), (u'\u6559\u80b2 Education', 'http://news.mingpao.com/' + dateStr + '/gfindex.htm')]:
(u'\u8ad6\u58c7 Forum', 'http://news.mingpao.com/' + dateStr + '/faindex.htm'),
(u'\u4e2d\u570b China', 'http://news.mingpao.com/' + dateStr + '/caindex.htm'),
(u'\u570b\u969b World', 'http://news.mingpao.com/' + dateStr + '/taindex.htm'),
('Tech News', 'http://news.mingpao.com/' + dateStr + '/naindex.htm'),
(u'\u6559\u80b2 Education', 'http://news.mingpao.com/' + dateStr + '/gfindex.htm'),
(u'\u9ad4\u80b2 Sport', 'http://news.mingpao.com/' + dateStr + '/spindex.htm'),
(u'\u526f\u520a Supplement', 'http://news.mingpao.com/' + dateStr + '/jaindex.htm'),
(u'\u82f1\u6587 English', 'http://news.mingpao.com/' + dateStr + '/emindex.htm')]:
articles = self.parse_section(url) articles = self.parse_section(url)
if articles: if articles:
feeds.append((title, articles)) feeds.append((title, articles))
# special- editorial
ed_articles = self.parse_ed_section('http://life.mingpao.com/cfm/dailynews2.cfm?Issue=' + dateStr +'&Category=nalmr')
if ed_articles:
feeds.append((u'\u793e\u8a55/\u7b46\u9663 Editorial', ed_articles))
for title, url in [(u'\u8ad6\u58c7 Forum', 'http://news.mingpao.com/' + dateStr + '/faindex.htm'),
(u'\u4e2d\u570b China', 'http://news.mingpao.com/' + dateStr + '/caindex.htm'),
(u'\u570b\u969b World', 'http://news.mingpao.com/' + dateStr + '/taindex.htm')]:
articles = self.parse_section(url)
if articles:
feeds.append((title, articles))
# special - finance # special - finance
fin_articles = self.parse_fin_section('http://www.mpfinance.com/htm/Finance/' + dateStr + '/News/ea,eb,ecindex.htm') #fin_articles = self.parse_fin_section('http://www.mpfinance.com/htm/Finance/' + dateStr + '/News/ea,eb,ecindex.htm')
fin_articles = self.parse_fin_section('http://life.mingpao.com/cfm/dailynews2.cfm?Issue=' + dateStr + '&Category=nalea')
if fin_articles: if fin_articles:
feeds.append((u'\u7d93\u6fdf Finance', fin_articles)) feeds.append((u'\u7d93\u6fdf Finance', fin_articles))
for title, url in [('Tech News', 'http://news.mingpao.com/' + dateStr + '/naindex.htm'),
(u'\u9ad4\u80b2 Sport', 'http://news.mingpao.com/' + dateStr + '/spindex.htm')]:
articles = self.parse_section(url)
if articles:
feeds.append((title, articles))
# special - entertainment # special - entertainment
ent_articles = self.parse_ent_section('http://ol.mingpao.com/cfm/star1.cfm') ent_articles = self.parse_ent_section('http://ol.mingpao.com/cfm/star1.cfm')
if ent_articles: if ent_articles:
feeds.append((u'\u5f71\u8996 Film/TV', ent_articles)) feeds.append((u'\u5f71\u8996 Film/TV', ent_articles))
for title, url in [(u'\u526f\u520a Supplement', 'http://news.mingpao.com/' + dateStr + '/jaindex.htm'),
(u'\u82f1\u6587 English', 'http://news.mingpao.com/' + dateStr + '/emindex.htm')]:
articles = self.parse_section(url)
if articles:
feeds.append((title, articles))
# special- columns
col_articles = self.parse_col_section('http://life.mingpao.com/cfm/dailynews2.cfm?Issue=' + dateStr +'&Category=ncolumn')
if col_articles:
feeds.append((u'\u5c08\u6b04 Columns', col_articles))
return feeds return feeds
def parse_section(self, url): def parse_section(self, url):
@ -171,15 +214,33 @@ class MPHKRecipe(BasicNewsRecipe):
current_articles.reverse() current_articles.reverse()
return current_articles return current_articles
def parse_ed_section(self, url):
self.get_fetchdate()
soup = self.index_to_soup(url)
a = soup.findAll('a', href=True)
a.reverse()
current_articles = []
included_urls = []
for i in a:
title = self.tag_to_string(i)
url = 'http://life.mingpao.com/cfm/' + i.get('href', False)
if (url not in included_urls) and (not url.rfind('.txt') == -1) and (not url.rfind('nal') == -1):
current_articles.append({'title': title, 'url': url, 'description': ''})
included_urls.append(url)
current_articles.reverse()
return current_articles
def parse_fin_section(self, url): def parse_fin_section(self, url):
dateStr = self.get_fetchdate() self.get_fetchdate()
soup = self.index_to_soup(url) soup = self.index_to_soup(url)
a = soup.findAll('a', href= True) a = soup.findAll('a', href= True)
current_articles = [] current_articles = []
included_urls = [] included_urls = []
for i in a: for i in a:
url = 'http://www.mpfinance.com/cfm/' + i.get('href', False) #url = 'http://www.mpfinance.com/cfm/' + i.get('href', False)
if url not in included_urls and not url.rfind(dateStr) == -1 and url.rfind('index') == -1: url = 'http://life.mingpao.com/cfm/' + i.get('href', False)
#if url not in included_urls and not url.rfind(dateStr) == -1 and url.rfind('index') == -1:
if url not in included_urls and (not url.rfind('txt') == -1) and (not url.rfind('nal') == -1):
title = self.tag_to_string(i) title = self.tag_to_string(i)
current_articles.append({'title': title, 'url': url, 'description':''}) current_articles.append({'title': title, 'url': url, 'description':''})
included_urls.append(url) included_urls.append(url)
@ -201,6 +262,22 @@ class MPHKRecipe(BasicNewsRecipe):
current_articles.reverse() current_articles.reverse()
return current_articles return current_articles
def parse_col_section(self, url):
self.get_fetchdate()
soup = self.index_to_soup(url)
a = soup.findAll('a', href=True)
a.reverse()
current_articles = []
included_urls = []
for i in a:
title = self.tag_to_string(i)
url = 'http://life.mingpao.com/cfm/' + i.get('href', False)
if (url not in included_urls) and (not url.rfind('.txt') == -1) and (not url.rfind('ncl') == -1):
current_articles.append({'title': title, 'url': url, 'description': ''})
included_urls.append(url)
current_articles.reverse()
return current_articles
def preprocess_html(self, soup): def preprocess_html(self, soup):
for item in soup.findAll(style=True): for item in soup.findAll(style=True):
del item['style'] del item['style']
@ -213,18 +290,18 @@ class MPHKRecipe(BasicNewsRecipe):
def create_opf(self, feeds, dir=None): def create_opf(self, feeds, dir=None):
if dir is None: if dir is None:
dir = self.output_dir dir = self.output_dir
if self.IsCJKWellSupported == True: if __UseChineseTitle__ == True:
# use Chinese title title = u'\u660e\u5831 (\u9999\u6e2f)'
title = u'\u660e\u5831 (\u9999\u6e2f) ' + self.get_fetchformatteddate()
else: else:
# use English title title = self.short_title()
title = self.short_title() + ' ' + self.get_fetchformatteddate() # if not generating a periodical, force date to apply in title
if True: # force date in title if __MakePeriodical__ == False:
# title += strftime(self.timefmt) title = title + ' ' + self.get_fetchformatteddate()
if True:
mi = MetaInformation(title, [self.publisher]) mi = MetaInformation(title, [self.publisher])
mi.publisher = self.publisher mi.publisher = self.publisher
mi.author_sort = self.publisher mi.author_sort = self.publisher
if self.IsCJKWellSupported == True: if __MakePeriodical__ == True:
mi.publication_type = 'periodical:'+self.publication_type+':'+self.short_title() mi.publication_type = 'periodical:'+self.publication_type+':'+self.short_title()
else: else:
mi.publication_type = self.publication_type+':'+self.short_title() mi.publication_type = self.publication_type+':'+self.short_title()