Update Ming Pao

This commit is contained in:
Kovid Goyal 2011-02-20 10:34:41 -07:00
parent dba8af1f37
commit 15d1e591ae

View File

@ -1,7 +1,9 @@
__license__ = 'GPL v3' __license__ = 'GPL v3'
__copyright__ = '2010, Eddie Lau' __copyright__ = '2010-2011, Eddie Lau'
''' '''
Change Log: Change Log:
2011/02/20: skip duplicated links in finance section, put photos which may extend a whole page to the back of the articles
clean up the indentation
2010/12/07: add entertainment section, use newspaper front page as ebook cover, suppress date display in section list 2010/12/07: add entertainment section, use newspaper front page as ebook cover, suppress date display in section list
(to avoid wrong date display in case the user generates the ebook in a time zone different from HKT) (to avoid wrong date display in case the user generates the ebook in a time zone different from HKT)
2010/11/22: add English section, remove eco-news section which is not updated daily, correct 2010/11/22: add English section, remove eco-news section which is not updated daily, correct
@ -18,21 +20,19 @@ from calibre.web.feeds.recipes import BasicNewsRecipe
from contextlib import nested from contextlib import nested
from calibre import __appname__
from calibre.ebooks.BeautifulSoup import BeautifulSoup from calibre.ebooks.BeautifulSoup import BeautifulSoup
from calibre.ebooks.metadata.opf2 import OPFCreator from calibre.ebooks.metadata.opf2 import OPFCreator
from calibre.ebooks.metadata.toc import TOC from calibre.ebooks.metadata.toc import TOC
from calibre.ebooks.metadata import MetaInformation from calibre.ebooks.metadata import MetaInformation
class MPHKRecipe(BasicNewsRecipe): class MPHKRecipe(BasicNewsRecipe):
IsKindleUsed = True # to avoid generating periodical in which CJK characters can't be displayed in section/article view IsCJKWellSupported = True # Set to False to avoid generating periodical in which CJK characters can't be displayed in section/article view
title = 'Ming Pao - Hong Kong' title = 'Ming Pao - Hong Kong'
oldest_article = 1 oldest_article = 1
max_articles_per_feed = 100 max_articles_per_feed = 100
__author__ = 'Eddie Lau' __author__ = 'Eddie Lau'
description = 'Hong Kong Chinese Newspaper' description = 'Hong Kong Chinese Newspaper (http://news.mingpao.com)'
publisher = 'news.mingpao.com' publisher = 'MingPao'
category = 'Chinese, News, Hong Kong' category = 'Chinese, News, Hong Kong'
remove_javascript = True remove_javascript = True
use_embedded_content = False use_embedded_content = False
@ -46,9 +46,10 @@ class MPHKRecipe(BasicNewsRecipe):
masthead_url = 'http://news.mingpao.com/image/portals_top_logo_news.gif' masthead_url = 'http://news.mingpao.com/image/portals_top_logo_news.gif'
keep_only_tags = [dict(name='h1'), keep_only_tags = [dict(name='h1'),
dict(name='font', attrs={'style':['font-size:14pt; line-height:160%;']}), # for entertainment page title dict(name='font', attrs={'style':['font-size:14pt; line-height:160%;']}), # for entertainment page title
dict(attrs={'class':['photo']}),
dict(attrs={'id':['newscontent']}), # entertainment page content dict(attrs={'id':['newscontent']}), # entertainment page content
dict(attrs={'id':['newscontent01','newscontent02']})] dict(attrs={'id':['newscontent01','newscontent02']}),
dict(attrs={'class':['photo']})
]
remove_tags = [dict(name='style'), remove_tags = [dict(name='style'),
dict(attrs={'id':['newscontent135']})] # for the finance page dict(attrs={'id':['newscontent135']})] # for the finance page
remove_attributes = ['width'] remove_attributes = ['width']
@ -107,6 +108,9 @@ class MPHKRecipe(BasicNewsRecipe):
def get_fetchdate(self): def get_fetchdate(self):
return self.get_dtlocal().strftime("%Y%m%d") return self.get_dtlocal().strftime("%Y%m%d")
def get_fetchformatteddate(self):
return self.get_dtlocal().strftime("%Y-%m-%d")
def get_fetchday(self): def get_fetchday(self):
# convert UTC to local hk time - at around HKT 6.00am, all news are available # convert UTC to local hk time - at around HKT 6.00am, all news are available
return self.get_dtlocal().strftime("%d") return self.get_dtlocal().strftime("%d")
@ -124,13 +128,13 @@ class MPHKRecipe(BasicNewsRecipe):
feeds = [] feeds = []
dateStr = self.get_fetchdate() dateStr = self.get_fetchdate()
for title, url in [(u'\u8981\u805e Headline', 'http://news.mingpao.com/' + dateStr + '/gaindex.htm'), for title, url in [(u'\u8981\u805e Headline', 'http://news.mingpao.com/' + dateStr + '/gaindex.htm'),
(u'\u6559\u80b2 Education', 'http://news.mingpao.com/' + dateStr + '/gfindex.htm'),
(u'\u6e2f\u805e Local', 'http://news.mingpao.com/' + dateStr + '/gbindex.htm'), (u'\u6e2f\u805e Local', 'http://news.mingpao.com/' + dateStr + '/gbindex.htm'),
(u'\u793e\u8a55\u2027\u7b46\u9663 Editorial', 'http://news.mingpao.com/' + dateStr + '/mrindex.htm'), (u'\u793e\u8a55/\u7b46\u9663 Editorial', 'http://news.mingpao.com/' + dateStr + '/mrindex.htm'),
(u'\u8ad6\u58c7 Forum', 'http://news.mingpao.com/' + dateStr + '/faindex.htm'), (u'\u8ad6\u58c7 Forum', 'http://news.mingpao.com/' + dateStr + '/faindex.htm'),
(u'\u4e2d\u570b China', 'http://news.mingpao.com/' + dateStr + '/caindex.htm'), (u'\u4e2d\u570b China', 'http://news.mingpao.com/' + dateStr + '/caindex.htm'),
(u'\u570b\u969b World', 'http://news.mingpao.com/' + dateStr + '/taindex.htm'), (u'\u570b\u969b World', 'http://news.mingpao.com/' + dateStr + '/taindex.htm'),
('Tech News', 'http://news.mingpao.com/' + dateStr + '/naindex.htm'), ('Tech News', 'http://news.mingpao.com/' + dateStr + '/naindex.htm'),
(u'\u6559\u80b2 Education', 'http://news.mingpao.com/' + dateStr + '/gfindex.htm'),
(u'\u9ad4\u80b2 Sport', 'http://news.mingpao.com/' + dateStr + '/spindex.htm'), (u'\u9ad4\u80b2 Sport', 'http://news.mingpao.com/' + dateStr + '/spindex.htm'),
(u'\u526f\u520a Supplement', 'http://news.mingpao.com/' + dateStr + '/jaindex.htm'), (u'\u526f\u520a Supplement', 'http://news.mingpao.com/' + dateStr + '/jaindex.htm'),
(u'\u82f1\u6587 English', 'http://news.mingpao.com/' + dateStr + '/emindex.htm')]: (u'\u82f1\u6587 English', 'http://news.mingpao.com/' + dateStr + '/emindex.htm')]:
@ -141,14 +145,10 @@ class MPHKRecipe(BasicNewsRecipe):
fin_articles = self.parse_fin_section('http://www.mpfinance.com/htm/Finance/' + dateStr + '/News/ea,eb,ecindex.htm') fin_articles = self.parse_fin_section('http://www.mpfinance.com/htm/Finance/' + dateStr + '/News/ea,eb,ecindex.htm')
if fin_articles: if fin_articles:
feeds.append((u'\u7d93\u6fdf Finance', fin_articles)) feeds.append((u'\u7d93\u6fdf Finance', fin_articles))
# special - eco-friendly
# eco_articles = self.parse_eco_section('http://tssl.mingpao.com/htm/marketing/eco/cfm/Eco1.cfm')
# if eco_articles:
# feeds.append((u'\u74b0\u4fdd Eco News', eco_articles))
# special - entertainment # special - entertainment
ent_articles = self.parse_ent_section('http://ol.mingpao.com/cfm/star1.cfm') ent_articles = self.parse_ent_section('http://ol.mingpao.com/cfm/star1.cfm')
if ent_articles: if ent_articles:
feeds.append((u'\u5f71\u8996 Entertainment', ent_articles)) feeds.append((u'\u5f71\u8996 Film/TV', ent_articles))
return feeds return feeds
def parse_section(self, url): def parse_section(self, url):
@ -174,31 +174,17 @@ class MPHKRecipe(BasicNewsRecipe):
soup = self.index_to_soup(url) soup = self.index_to_soup(url)
a = soup.findAll('a', href= True) a = soup.findAll('a', href= True)
current_articles = [] current_articles = []
for i in a:
url = i.get('href', False)
if not url.rfind(dateStr) == -1 and url.rfind('index') == -1:
title = self.tag_to_string(i)
url = 'http://www.mpfinance.com/cfm/' +url
current_articles.append({'title': title, 'url': url, 'description':''})
return current_articles
def parse_eco_section(self, url):
dateStr = self.get_fetchdate()
soup = self.index_to_soup(url)
divs = soup.findAll(attrs={'class': ['bullet']})
current_articles = []
included_urls = [] included_urls = []
for i in divs: for i in a:
a = i.find('a', href = True) url = 'http://www.mpfinance.com/cfm/' + i.get('href', False)
title = self.tag_to_string(a) if url not in included_urls and not url.rfind(dateStr) == -1 and url.rfind('index') == -1:
url = a.get('href', False) title = self.tag_to_string(i)
url = 'http://tssl.mingpao.com/htm/marketing/eco/cfm/' +url
if url not in included_urls and url.rfind('Redirect') == -1 and not url.rfind('.txt') == -1 and not url.rfind(dateStr) == -1:
current_articles.append({'title': title, 'url': url, 'description':''}) current_articles.append({'title': title, 'url': url, 'description':''})
included_urls.append(url) included_urls.append(url)
return current_articles return current_articles
def parse_ent_section(self, url): def parse_ent_section(self, url):
self.get_fetchdate()
soup = self.index_to_soup(url) soup = self.index_to_soup(url)
a = soup.findAll('a', href=True) a = soup.findAll('a', href=True)
a.reverse() a.reverse()
@ -223,18 +209,22 @@ class MPHKRecipe(BasicNewsRecipe):
return soup return soup
def create_opf(self, feeds, dir=None): def create_opf(self, feeds, dir=None):
if self.IsKindleUsed == False:
super(MPHKRecipe,self).create_opf(feeds, dir)
return
if dir is None: if dir is None:
dir = self.output_dir dir = self.output_dir
title = self.short_title() if self.IsCJKWellSupported == True:
title += ' ' + self.get_fetchdate() # use Chinese title
#if self.output_profile.periodical_date_in_title: title = u'\u660e\u5831 (\u9999\u6e2f) ' + self.get_fetchformatteddate()
else:
# use English title
title = self.short_title() + ' ' + self.get_fetchformatteddate()
if True: # force date in title
# title += strftime(self.timefmt) # title += strftime(self.timefmt)
mi = MetaInformation(title, [__appname__]) mi = MetaInformation(title, [self.publisher])
mi.publisher = __appname__ mi.publisher = self.publisher
mi.author_sort = __appname__ mi.author_sort = self.publisher
if self.IsCJKWellSupported == True:
mi.publication_type = 'periodical:'+self.publication_type+':'+self.short_title()
else:
mi.publication_type = self.publication_type+':'+self.short_title() mi.publication_type = self.publication_type+':'+self.short_title()
#mi.timestamp = nowf() #mi.timestamp = nowf()
mi.timestamp = self.get_dtlocal() mi.timestamp = self.get_dtlocal()
@ -321,7 +311,7 @@ class MPHKRecipe(BasicNewsRecipe):
prefix = '/'.join('..'for i in range(2*len(re.findall(r'link\d+', last)))) prefix = '/'.join('..'for i in range(2*len(re.findall(r'link\d+', last))))
templ = self.navbar.generate(True, num, j, len(f), templ = self.navbar.generate(True, num, j, len(f),
not self.has_single_feed, not self.has_single_feed,
a.orig_url, __appname__, prefix=prefix, a.orig_url, self.publisher, prefix=prefix,
center=self.center_navbar) center=self.center_navbar)
elem = BeautifulSoup(templ.render(doctype='xhtml').decode('utf-8')).find('div') elem = BeautifulSoup(templ.render(doctype='xhtml').decode('utf-8')).find('div')
body.insert(len(body.contents), elem) body.insert(len(body.contents), elem)
@ -357,4 +347,3 @@ class MPHKRecipe(BasicNewsRecipe):
with nested(open(opf_path, 'wb'), open(ncx_path, 'wb')) as (opf_file, ncx_file): with nested(open(opf_path, 'wb'), open(ncx_path, 'wb')) as (opf_file, ncx_file):
opf.render(opf_file, ncx_file) opf.render(opf_file, ncx_file)