Update Ming Pao

Kovid Goyal 2011-02-20 10:34:41 -07:00
parent dba8af1f37
commit 15d1e591ae


@@ -1,7 +1,9 @@
 __license__ = 'GPL v3'
-__copyright__ = '2010, Eddie Lau'
+__copyright__ = '2010-2011, Eddie Lau'
 '''
 Change Log:
+2011/02/20: skip duplicated links in finance section, put photos which may extend a whole page to the back of the articles
+            clean up the indentation
 2010/12/07: add entertainment section, use newspaper front page as ebook cover, suppress date display in section list
             (to avoid wrong date display in case the user generates the ebook in a time zone different from HKT)
 2010/11/22: add English section, remove eco-news section which is not updated daily, correct
@@ -18,21 +20,19 @@ from calibre.web.feeds.recipes import BasicNewsRecipe
 from contextlib import nested
-from calibre import __appname__
 from calibre.ebooks.BeautifulSoup import BeautifulSoup
 from calibre.ebooks.metadata.opf2 import OPFCreator
 from calibre.ebooks.metadata.toc import TOC
 from calibre.ebooks.metadata import MetaInformation

 class MPHKRecipe(BasicNewsRecipe):
-    IsKindleUsed = True  # to avoid generating periodical in which CJK characters can't be displayed in section/article view
+    IsCJKWellSupported = True  # Set to False to avoid generating periodical in which CJK characters can't be displayed in section/article view

     title = 'Ming Pao - Hong Kong'
     oldest_article = 1
     max_articles_per_feed = 100
     __author__ = 'Eddie Lau'
-    description = 'Hong Kong Chinese Newspaper'
-    publisher = 'news.mingpao.com'
+    description = 'Hong Kong Chinese Newspaper (http://news.mingpao.com)'
+    publisher = 'MingPao'
     category = 'Chinese, News, Hong Kong'
     remove_javascript = True
     use_embedded_content = False
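The rename from IsKindleUsed to IsCJKWellSupported turns the flag into a statement about the target device rather than the workflow; its consumers are in create_opf below, where it selects the Chinese vs. English title and periodical vs. plain-book metadata. A minimal sketch of how a user would opt out, assuming the usual copy-and-edit recipe customization (the subclass name is invented):

    # Minimal sketch (subclass name is invented): flip the flag to get an
    # English-titled, non-periodical build for readers whose section view
    # cannot display CJK characters.
    class MPHKRecipeNoCJK(MPHKRecipe):
        IsCJKWellSupported = False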
@@ -46,19 +46,20 @@ class MPHKRecipe(BasicNewsRecipe):
     masthead_url = 'http://news.mingpao.com/image/portals_top_logo_news.gif'
     keep_only_tags = [dict(name='h1'),
                       dict(name='font', attrs={'style':['font-size:14pt; line-height:160%;']}),  # for entertainment page title
-                      dict(attrs={'class':['photo']}),
                       dict(attrs={'id':['newscontent']}),  # entertainment page content
-                      dict(attrs={'id':['newscontent01','newscontent02']})]
+                      dict(attrs={'id':['newscontent01','newscontent02']}),
+                      dict(attrs={'class':['photo']})
+                      ]
     remove_tags = [dict(name='style'),
                    dict(attrs={'id':['newscontent135']})]  # for the finance page
     remove_attributes = ['width']
     preprocess_regexps = [
         (re.compile(r'<h5>', re.DOTALL|re.IGNORECASE),
          lambda match: '<h1>'),
         (re.compile(r'</h5>', re.DOTALL|re.IGNORECASE),
          lambda match: '</h1>'),
         (re.compile(r'<p><a href=.+?</a></p>', re.DOTALL|re.IGNORECASE),  # for entertainment page
          lambda match: '')
     ]

     def image_url_processor(cls, baseurl, url):
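Moving dict(attrs={'class':['photo']}) from the middle of keep_only_tags to its tail is the "photos to the back of the articles" change from the log: kept fragments are assembled in list order, so photo blocks that used to push a whole page of images ahead of the text now trail it. A reduced sketch of that order-preserving assembly (an assumption about, not a copy of, calibre's internal cleanup):

    # Reduced sketch (assumption, not calibre's actual code): fragments are
    # collected per keep_only_tags entry, in list order, so an entry's
    # position in the list controls where its content lands in the article.
    def apply_keep_only(soup, keep_only_tags):
        kept = []
        for spec in keep_only_tags:   # list order == output order
            kept.extend(soup.findAll(**spec))
        return kept                   # photo divs now follow the body text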
@@ -107,6 +108,9 @@ class MPHKRecipe(BasicNewsRecipe):
     def get_fetchdate(self):
         return self.get_dtlocal().strftime("%Y%m%d")

+    def get_fetchformatteddate(self):
+        return self.get_dtlocal().strftime("%Y-%m-%d")
+
     def get_fetchday(self):
         # convert UTC to local hk time - at around HKT 6.00am, all news are available
         return self.get_dtlocal().strftime("%d")
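The new helper is display-only: section URLs keep the compact %Y%m%d form, while the hyphenated form goes into the book title in create_opf below. A quick sketch of both, assuming get_dtlocal() applies the fixed UTC+8 shift the comment above describes (Hong Kong has no DST):

    # Sketch of the two date formats, assuming a fixed UTC+8 shift:
    from datetime import datetime, timedelta

    dt_local = datetime(2011, 2, 20, 2, 30) + timedelta(hours=8)  # 02:30 UTC -> 10:30 HKT
    print dt_local.strftime("%Y%m%d")    # '20110220'   (used in section URLs)
    print dt_local.strftime("%Y-%m-%d")  # '2011-02-20' (used in the book title)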
@@ -121,84 +125,66 @@
         return cover

     def parse_index(self):
         feeds = []
         dateStr = self.get_fetchdate()
         for title, url in [(u'\u8981\u805e Headline', 'http://news.mingpao.com/' + dateStr + '/gaindex.htm'),
-                           (u'\u6559\u80b2 Education', 'http://news.mingpao.com/' + dateStr + '/gfindex.htm'),
                            (u'\u6e2f\u805e Local', 'http://news.mingpao.com/' + dateStr + '/gbindex.htm'),
-                           (u'\u793e\u8a55\u2027\u7b46\u9663 Editorial', 'http://news.mingpao.com/' + dateStr + '/mrindex.htm'),
+                           (u'\u793e\u8a55/\u7b46\u9663 Editorial', 'http://news.mingpao.com/' + dateStr + '/mrindex.htm'),
                            (u'\u8ad6\u58c7 Forum', 'http://news.mingpao.com/' + dateStr + '/faindex.htm'),
                            (u'\u4e2d\u570b China', 'http://news.mingpao.com/' + dateStr + '/caindex.htm'),
                            (u'\u570b\u969b World', 'http://news.mingpao.com/' + dateStr + '/taindex.htm'),
                            ('Tech News', 'http://news.mingpao.com/' + dateStr + '/naindex.htm'),
+                           (u'\u6559\u80b2 Education', 'http://news.mingpao.com/' + dateStr + '/gfindex.htm'),
                            (u'\u9ad4\u80b2 Sport', 'http://news.mingpao.com/' + dateStr + '/spindex.htm'),
                            (u'\u526f\u520a Supplement', 'http://news.mingpao.com/' + dateStr + '/jaindex.htm'),
                            (u'\u82f1\u6587 English', 'http://news.mingpao.com/' + dateStr + '/emindex.htm')]:
             articles = self.parse_section(url)
             if articles:
                 feeds.append((title, articles))
         # special - finance
         fin_articles = self.parse_fin_section('http://www.mpfinance.com/htm/Finance/' + dateStr + '/News/ea,eb,ecindex.htm')
         if fin_articles:
             feeds.append((u'\u7d93\u6fdf Finance', fin_articles))
-        # special - eco-friendly
-        #eco_articles = self.parse_eco_section('http://tssl.mingpao.com/htm/marketing/eco/cfm/Eco1.cfm')
-        #if eco_articles:
-        #    feeds.append((u'\u74b0\u4fdd Eco News', eco_articles))
         # special - entertainment
         ent_articles = self.parse_ent_section('http://ol.mingpao.com/cfm/star1.cfm')
         if ent_articles:
-            feeds.append((u'\u5f71\u8996 Entertainment', ent_articles))
+            feeds.append((u'\u5f71\u8996 Film/TV', ent_articles))
         return feeds
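For reference, parse_index must return a list of (section title, article list) tuples, and the reordered list above is also the section order in the finished book, so Education now follows Tech News. A sketch of the shape this method builds (values are made up; the article fields mirror those set in parse_section below):

    # Illustrative sketch of the return shape (values made up):
    feeds = [
        (u'\u8981\u805e Headline', [
            {'title': u'Example headline', 'description': '', 'date': '',
             'url': 'http://news.mingpao.com/20110220/gaa1.htm'},
        ]),
        (u'\u7d93\u6fdf Finance', []),  # filled by parse_fin_section()
    ]

A quick way to verify the new ordering is calibre's test mode, e.g. saving the file locally as ming_pao.recipe and running ebook-convert ming_pao.recipe .epub --test -vv, which fetches only a couple of articles per feed.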
     def parse_section(self, url):
         dateStr = self.get_fetchdate()
         soup = self.index_to_soup(url)
         divs = soup.findAll(attrs={'class': ['bullet','bullet_grey']})
         current_articles = []
         included_urls = []
         divs.reverse()
         for i in divs:
             a = i.find('a', href=True)
             title = self.tag_to_string(a)
             url = a.get('href', False)
             url = 'http://news.mingpao.com/' + dateStr + '/' + url
             if url not in included_urls and url.rfind('Redirect') == -1:
                 current_articles.append({'title': title, 'url': url, 'description':'', 'date':''})
                 included_urls.append(url)
         current_articles.reverse()
         return current_articles
     def parse_fin_section(self, url):
         dateStr = self.get_fetchdate()
         soup = self.index_to_soup(url)
         a = soup.findAll('a', href=True)
         current_articles = []
+        included_urls = []
         for i in a:
-            url = i.get('href', False)
-            if not url.rfind(dateStr) == -1 and url.rfind('index') == -1:
+            url = 'http://www.mpfinance.com/cfm/' + i.get('href', False)
+            if url not in included_urls and not url.rfind(dateStr) == -1 and url.rfind('index') == -1:
                 title = self.tag_to_string(i)
-                url = 'http://www.mpfinance.com/cfm/' + url
                 current_articles.append({'title': title, 'url': url, 'description':''})
+                included_urls.append(url)
         return current_articles

-    def parse_eco_section(self, url):
-        dateStr = self.get_fetchdate()
-        soup = self.index_to_soup(url)
-        divs = soup.findAll(attrs={'class': ['bullet']})
-        current_articles = []
-        included_urls = []
-        for i in divs:
-            a = i.find('a', href=True)
-            title = self.tag_to_string(a)
-            url = a.get('href', False)
-            url = 'http://tssl.mingpao.com/htm/marketing/eco/cfm/' + url
-            if url not in included_urls and url.rfind('Redirect') == -1 and not url.rfind('.txt') == -1 and not url.rfind(dateStr) == -1:
-                current_articles.append({'title': title, 'url': url, 'description':''})
-                included_urls.append(url)
-        return current_articles
-
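This is the "skip duplicated links in finance section" fix from the change log: the finance index lists the same story under several headings, so the loop now resolves the absolute URL first and records it in included_urls before appending. The same pattern, reduced to a standalone sketch with made-up data:

    # Standalone sketch of the dedup pattern (data is made up):
    links = [('News/ea1.htm', u'Story A'),
             ('News/ea1.htm', u'Story A, listed again'),
             ('News/eb2.htm', u'Story B')]
    included_urls = []
    current_articles = []
    for href, title in links:
        url = 'http://www.mpfinance.com/cfm/' + href
        if url not in included_urls:
            current_articles.append({'title': title, 'url': url, 'description': ''})
            included_urls.append(url)
    # current_articles now holds Story A once, then Story B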
     def parse_ent_section(self, url):
-        self.get_fetchdate()
         soup = self.index_to_soup(url)
         a = soup.findAll('a', href=True)
         a.reverse()
@@ -223,67 +209,71 @@ class MPHKRecipe(BasicNewsRecipe):
         return soup

     def create_opf(self, feeds, dir=None):
-        if self.IsKindleUsed == False:
-            super(MPHKRecipe,self).create_opf(feeds, dir)
-            return
         if dir is None:
             dir = self.output_dir
-        title = self.short_title()
-        title += ' ' + self.get_fetchdate()
-        #if self.output_profile.periodical_date_in_title:
-        #    title += strftime(self.timefmt)
-        mi = MetaInformation(title, [__appname__])
-        mi.publisher = __appname__
-        mi.author_sort = __appname__
-        mi.publication_type = self.publication_type+':'+self.short_title()
+        if self.IsCJKWellSupported == True:
+            # use Chinese title
+            title = u'\u660e\u5831 (\u9999\u6e2f) ' + self.get_fetchformatteddate()
+        else:
+            # use English title
+            title = self.short_title() + ' ' + self.get_fetchformatteddate()
+        #if True: # force date in title
+        #    title += strftime(self.timefmt)
+        mi = MetaInformation(title, [self.publisher])
+        mi.publisher = self.publisher
+        mi.author_sort = self.publisher
+        if self.IsCJKWellSupported == True:
+            mi.publication_type = 'periodical:'+self.publication_type+':'+self.short_title()
+        else:
+            mi.publication_type = self.publication_type+':'+self.short_title()
         #mi.timestamp = nowf()
         mi.timestamp = self.get_dtlocal()
         mi.comments = self.description
         if not isinstance(mi.comments, unicode):
             mi.comments = mi.comments.decode('utf-8', 'replace')
         #mi.pubdate = nowf()
         mi.pubdate = self.get_dtlocal()
         opf_path = os.path.join(dir, 'index.opf')
         ncx_path = os.path.join(dir, 'index.ncx')
         opf = OPFCreator(dir, mi)
         # Add mastheadImage entry to <guide> section
         mp = getattr(self, 'masthead_path', None)
         if mp is not None and os.access(mp, os.R_OK):
             from calibre.ebooks.metadata.opf2 import Guide
             ref = Guide.Reference(os.path.basename(self.masthead_path), os.getcwdu())
             ref.type = 'masthead'
             ref.title = 'Masthead Image'
             opf.guide.append(ref)
         manifest = [os.path.join(dir, 'feed_%d'%i) for i in range(len(feeds))]
         manifest.append(os.path.join(dir, 'index.html'))
         manifest.append(os.path.join(dir, 'index.ncx'))
         # Get cover
         cpath = getattr(self, 'cover_path', None)
         if cpath is None:
             pf = open(os.path.join(dir, 'cover.jpg'), 'wb')
             if self.default_cover(pf):
                 cpath = pf.name
         if cpath is not None and os.access(cpath, os.R_OK):
             opf.cover = cpath
             manifest.append(cpath)
         # Get masthead
         mpath = getattr(self, 'masthead_path', None)
         if mpath is not None and os.access(mpath, os.R_OK):
             manifest.append(mpath)
         opf.create_manifest_from_files_in(manifest)
         for mani in opf.manifest:
             if mani.path.endswith('.ncx'):
                 mani.id = 'ncx'
             if mani.path.endswith('mastheadImage.jpg'):
                 mani.id = 'masthead-image'
         entries = ['index.html']
         toc = TOC(base_path=dir)
         self.play_order_counter = 0
         self.play_order_map = {}

         def feed_index(num, parent):
             f = feeds[num]
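One note on the metadata branch above, hedged because the diff itself does not state it: in calibre, a publication_type beginning with 'periodical:' is what marks a build as a true periodical with the section/article navigation view, which is exactly where missing CJK glyphs were a problem. What the two branches produce, assuming BasicNewsRecipe's default publication_type of 'unknown' and this recipe's short title:

    # Illustrative only, assuming BasicNewsRecipe's default
    # publication_type ('unknown') and this recipe's short title:
    publication_type = 'unknown'
    short_title = 'Ming Pao - Hong Kong'
    print 'periodical:' + publication_type + ':' + short_title
    # periodical:unknown:Ming Pao - Hong Kong  (periodical navigation)
    print publication_type + ':' + short_title
    # unknown:Ming Pao - Hong Kong             (plain ebook)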
@@ -321,7 +311,7 @@ class MPHKRecipe(BasicNewsRecipe):
                     prefix = '/'.join('..'for i in range(2*len(re.findall(r'link\d+', last))))
                     templ = self.navbar.generate(True, num, j, len(f),
                                         not self.has_single_feed,
-                                        a.orig_url, __appname__, prefix=prefix,
+                                        a.orig_url, self.publisher, prefix=prefix,
                                         center=self.center_navbar)
                     elem = BeautifulSoup(templ.render(doctype='xhtml').decode('utf-8')).find('div')
                     body.insert(len(body.contents), elem)
@@ -344,7 +334,7 @@ class MPHKRecipe(BasicNewsRecipe):
                 if not desc:
                     desc = None
                 feed_index(i, toc.add_item('feed_%d/index.html'%i, None,
                     f.title, play_order=po, description=desc, author=auth))
             else:
                 entries.append('feed_%d/index.html'%0)
@@ -357,4 +347,3 @@ class MPHKRecipe(BasicNewsRecipe):
         with nested(open(opf_path, 'wb'), open(ncx_path, 'wb')) as (opf_file, ncx_file):
             opf.render(opf_file, ncx_file)