Update Ming Pao

This commit is contained in:
Kovid Goyal 2011-02-20 10:34:41 -07:00
parent dba8af1f37
commit 15d1e591ae

View File

@ -1,7 +1,9 @@
__license__ = 'GPL v3'
__copyright__ = '2010, Eddie Lau'
__copyright__ = '2010-2011, Eddie Lau'
'''
Change Log:
2011/02/20: skip duplicate links in the finance section, move photos (which may fill a whole page) to the end of the articles,
clean up the indentation
2010/12/07: add entertainment section, use newspaper front page as ebook cover, suppress date display in section list
(to avoid wrong date display in case the user generates the ebook in a time zone different from HKT)
2010/11/22: add English section, remove eco-news section which is not updated daily, correct
@ -18,21 +20,19 @@ from calibre.web.feeds.recipes import BasicNewsRecipe
from contextlib import nested
from calibre import __appname__
from calibre.ebooks.BeautifulSoup import BeautifulSoup
from calibre.ebooks.metadata.opf2 import OPFCreator
from calibre.ebooks.metadata.toc import TOC
from calibre.ebooks.metadata import MetaInformation
class MPHKRecipe(BasicNewsRecipe):
IsKindleUsed = True # to avoid generating periodical in which CJK characters can't be displayed in section/article view
IsCJKWellSupported = True # Set to False to avoid generating periodical in which CJK characters can't be displayed in section/article view
title = 'Ming Pao - Hong Kong'
oldest_article = 1
max_articles_per_feed = 100
__author__ = 'Eddie Lau'
description = 'Hong Kong Chinese Newspaper'
publisher = 'news.mingpao.com'
description = 'Hong Kong Chinese Newspaper (http://news.mingpao.com)'
publisher = 'MingPao'
category = 'Chinese, News, Hong Kong'
remove_javascript = True
use_embedded_content = False
@ -46,9 +46,10 @@ class MPHKRecipe(BasicNewsRecipe):
masthead_url = 'http://news.mingpao.com/image/portals_top_logo_news.gif'
keep_only_tags = [dict(name='h1'),
dict(name='font', attrs={'style':['font-size:14pt; line-height:160%;']}), # for entertainment page title
dict(attrs={'class':['photo']}),
dict(attrs={'id':['newscontent']}), # entertainment page content
dict(attrs={'id':['newscontent01','newscontent02']})]
dict(attrs={'id':['newscontent01','newscontent02']}),
dict(attrs={'class':['photo']})
]
remove_tags = [dict(name='style'),
dict(attrs={'id':['newscontent135']})] # for the finance page
remove_attributes = ['width']
@ -107,6 +108,9 @@ class MPHKRecipe(BasicNewsRecipe):
def get_fetchdate(self):
    """Return the date reported by get_dtlocal() as a compact 'YYYYMMDD' string.

    The result is used elsewhere in this recipe to build the dated
    section index URLs.
    """
    local_now = self.get_dtlocal()
    return local_now.strftime("%Y%m%d")
def get_fetchformatteddate(self):
    """Return the date reported by get_dtlocal() as a dashed 'YYYY-MM-DD' string.

    This is the human-readable form that create_opf appends to the
    ebook title.
    """
    local_now = self.get_dtlocal()
    return local_now.strftime("%Y-%m-%d")
def get_fetchday(self):
    """Return the zero-padded day of the month ('01'-'31') from get_dtlocal()."""
    # Per the original note: the time is converted from UTC to local Hong Kong
    # time, and at around 6:00am HKT all news is available.
    # NOTE(review): the conversion itself presumably happens inside
    # get_dtlocal(), which is not visible here - confirm.
    local_now = self.get_dtlocal()
    return local_now.strftime("%d")
@ -124,13 +128,13 @@ class MPHKRecipe(BasicNewsRecipe):
feeds = []
dateStr = self.get_fetchdate()
for title, url in [(u'\u8981\u805e Headline', 'http://news.mingpao.com/' + dateStr + '/gaindex.htm'),
(u'\u6559\u80b2 Education', 'http://news.mingpao.com/' + dateStr + '/gfindex.htm'),
(u'\u6e2f\u805e Local', 'http://news.mingpao.com/' + dateStr + '/gbindex.htm'),
(u'\u793e\u8a55\u2027\u7b46\u9663 Editorial', 'http://news.mingpao.com/' + dateStr + '/mrindex.htm'),
(u'\u793e\u8a55/\u7b46\u9663 Editorial', 'http://news.mingpao.com/' + dateStr + '/mrindex.htm'),
(u'\u8ad6\u58c7 Forum', 'http://news.mingpao.com/' + dateStr + '/faindex.htm'),
(u'\u4e2d\u570b China', 'http://news.mingpao.com/' + dateStr + '/caindex.htm'),
(u'\u570b\u969b World', 'http://news.mingpao.com/' + dateStr + '/taindex.htm'),
('Tech News', 'http://news.mingpao.com/' + dateStr + '/naindex.htm'),
(u'\u6559\u80b2 Education', 'http://news.mingpao.com/' + dateStr + '/gfindex.htm'),
(u'\u9ad4\u80b2 Sport', 'http://news.mingpao.com/' + dateStr + '/spindex.htm'),
(u'\u526f\u520a Supplement', 'http://news.mingpao.com/' + dateStr + '/jaindex.htm'),
(u'\u82f1\u6587 English', 'http://news.mingpao.com/' + dateStr + '/emindex.htm')]:
@ -141,14 +145,10 @@ class MPHKRecipe(BasicNewsRecipe):
fin_articles = self.parse_fin_section('http://www.mpfinance.com/htm/Finance/' + dateStr + '/News/ea,eb,ecindex.htm')
if fin_articles:
feeds.append((u'\u7d93\u6fdf Finance', fin_articles))
# special - eco-friendly
# eco_articles = self.parse_eco_section('http://tssl.mingpao.com/htm/marketing/eco/cfm/Eco1.cfm')
# if eco_articles:
# feeds.append((u'\u74b0\u4fdd Eco News', eco_articles))
# special - entertainment
ent_articles = self.parse_ent_section('http://ol.mingpao.com/cfm/star1.cfm')
if ent_articles:
feeds.append((u'\u5f71\u8996 Entertainment', ent_articles))
feeds.append((u'\u5f71\u8996 Film/TV', ent_articles))
return feeds
def parse_section(self, url):
@ -174,31 +174,17 @@ class MPHKRecipe(BasicNewsRecipe):
soup = self.index_to_soup(url)
a = soup.findAll('a', href= True)
current_articles = []
for i in a:
url = i.get('href', False)
if not url.rfind(dateStr) == -1 and url.rfind('index') == -1:
title = self.tag_to_string(i)
url = 'http://www.mpfinance.com/cfm/' +url
current_articles.append({'title': title, 'url': url, 'description':''})
return current_articles
def parse_eco_section(self, url):
dateStr = self.get_fetchdate()
soup = self.index_to_soup(url)
divs = soup.findAll(attrs={'class': ['bullet']})
current_articles = []
included_urls = []
for i in divs:
a = i.find('a', href = True)
title = self.tag_to_string(a)
url = a.get('href', False)
url = 'http://tssl.mingpao.com/htm/marketing/eco/cfm/' +url
if url not in included_urls and url.rfind('Redirect') == -1 and not url.rfind('.txt') == -1 and not url.rfind(dateStr) == -1:
for i in a:
url = 'http://www.mpfinance.com/cfm/' + i.get('href', False)
if url not in included_urls and not url.rfind(dateStr) == -1 and url.rfind('index') == -1:
title = self.tag_to_string(i)
current_articles.append({'title': title, 'url': url, 'description':''})
included_urls.append(url)
return current_articles
def parse_ent_section(self, url):
self.get_fetchdate()
soup = self.index_to_soup(url)
a = soup.findAll('a', href=True)
a.reverse()
@ -223,18 +209,22 @@ class MPHKRecipe(BasicNewsRecipe):
return soup
def create_opf(self, feeds, dir=None):
if self.IsKindleUsed == False:
super(MPHKRecipe,self).create_opf(feeds, dir)
return
if dir is None:
dir = self.output_dir
title = self.short_title()
title += ' ' + self.get_fetchdate()
#if self.output_profile.periodical_date_in_title:
if self.IsCJKWellSupported == True:
# use Chinese title
title = u'\u660e\u5831 (\u9999\u6e2f) ' + self.get_fetchformatteddate()
else:
# use English title
title = self.short_title() + ' ' + self.get_fetchformatteddate()
if True: # force date in title
# title += strftime(self.timefmt)
mi = MetaInformation(title, [__appname__])
mi.publisher = __appname__
mi.author_sort = __appname__
mi = MetaInformation(title, [self.publisher])
mi.publisher = self.publisher
mi.author_sort = self.publisher
if self.IsCJKWellSupported == True:
mi.publication_type = 'periodical:'+self.publication_type+':'+self.short_title()
else:
mi.publication_type = self.publication_type+':'+self.short_title()
#mi.timestamp = nowf()
mi.timestamp = self.get_dtlocal()
@ -321,7 +311,7 @@ class MPHKRecipe(BasicNewsRecipe):
prefix = '/'.join('..'for i in range(2*len(re.findall(r'link\d+', last))))
templ = self.navbar.generate(True, num, j, len(f),
not self.has_single_feed,
a.orig_url, __appname__, prefix=prefix,
a.orig_url, self.publisher, prefix=prefix,
center=self.center_navbar)
elem = BeautifulSoup(templ.render(doctype='xhtml').decode('utf-8')).find('div')
body.insert(len(body.contents), elem)
@ -357,4 +347,3 @@ class MPHKRecipe(BasicNewsRecipe):
with nested(open(opf_path, 'wb'), open(ncx_path, 'wb')) as (opf_file, ncx_file):
opf.render(opf_file, ncx_file)