mirror of
https://github.com/kovidgoyal/calibre.git
synced 2025-06-23 15:30:45 -04:00
Update Ming Pao
This commit is contained in:
parent
dba8af1f37
commit
15d1e591ae
@ -1,7 +1,9 @@
|
|||||||
__license__ = 'GPL v3'
|
__license__ = 'GPL v3'
|
||||||
__copyright__ = '2010, Eddie Lau'
|
__copyright__ = '2010-2011, Eddie Lau'
|
||||||
'''
|
'''
|
||||||
Change Log:
|
Change Log:
|
||||||
|
2011/02/20: skip duplicated links in finance section, put photos which may extend a whole page to the back of the articles
|
||||||
|
clean up the indentation
|
||||||
2010/12/07: add entertainment section, use newspaper front page as ebook cover, suppress date display in section list
|
2010/12/07: add entertainment section, use newspaper front page as ebook cover, suppress date display in section list
|
||||||
(to avoid wrong date display in case the user generates the ebook in a time zone different from HKT)
|
(to avoid wrong date display in case the user generates the ebook in a time zone different from HKT)
|
||||||
2010/11/22: add English section, remove eco-news section which is not updated daily, correct
|
2010/11/22: add English section, remove eco-news section which is not updated daily, correct
|
||||||
@ -18,21 +20,19 @@ from calibre.web.feeds.recipes import BasicNewsRecipe
|
|||||||
from contextlib import nested
|
from contextlib import nested
|
||||||
|
|
||||||
|
|
||||||
from calibre import __appname__
|
|
||||||
from calibre.ebooks.BeautifulSoup import BeautifulSoup
|
from calibre.ebooks.BeautifulSoup import BeautifulSoup
|
||||||
from calibre.ebooks.metadata.opf2 import OPFCreator
|
from calibre.ebooks.metadata.opf2 import OPFCreator
|
||||||
from calibre.ebooks.metadata.toc import TOC
|
from calibre.ebooks.metadata.toc import TOC
|
||||||
from calibre.ebooks.metadata import MetaInformation
|
from calibre.ebooks.metadata import MetaInformation
|
||||||
|
|
||||||
class MPHKRecipe(BasicNewsRecipe):
|
class MPHKRecipe(BasicNewsRecipe):
|
||||||
IsKindleUsed = True # to avoid generating periodical in which CJK characters can't be displayed in section/article view
|
IsCJKWellSupported = True # Set to False to avoid generating periodical in which CJK characters can't be displayed in section/article view
|
||||||
|
|
||||||
title = 'Ming Pao - Hong Kong'
|
title = 'Ming Pao - Hong Kong'
|
||||||
oldest_article = 1
|
oldest_article = 1
|
||||||
max_articles_per_feed = 100
|
max_articles_per_feed = 100
|
||||||
__author__ = 'Eddie Lau'
|
__author__ = 'Eddie Lau'
|
||||||
description = 'Hong Kong Chinese Newspaper'
|
description = 'Hong Kong Chinese Newspaper (http://news.mingpao.com)'
|
||||||
publisher = 'news.mingpao.com'
|
publisher = 'MingPao'
|
||||||
category = 'Chinese, News, Hong Kong'
|
category = 'Chinese, News, Hong Kong'
|
||||||
remove_javascript = True
|
remove_javascript = True
|
||||||
use_embedded_content = False
|
use_embedded_content = False
|
||||||
@ -46,19 +46,20 @@ class MPHKRecipe(BasicNewsRecipe):
|
|||||||
masthead_url = 'http://news.mingpao.com/image/portals_top_logo_news.gif'
|
masthead_url = 'http://news.mingpao.com/image/portals_top_logo_news.gif'
|
||||||
keep_only_tags = [dict(name='h1'),
|
keep_only_tags = [dict(name='h1'),
|
||||||
dict(name='font', attrs={'style':['font-size:14pt; line-height:160%;']}), # for entertainment page title
|
dict(name='font', attrs={'style':['font-size:14pt; line-height:160%;']}), # for entertainment page title
|
||||||
dict(attrs={'class':['photo']}),
|
|
||||||
dict(attrs={'id':['newscontent']}), # entertainment page content
|
dict(attrs={'id':['newscontent']}), # entertainment page content
|
||||||
dict(attrs={'id':['newscontent01','newscontent02']})]
|
dict(attrs={'id':['newscontent01','newscontent02']}),
|
||||||
|
dict(attrs={'class':['photo']})
|
||||||
|
]
|
||||||
remove_tags = [dict(name='style'),
|
remove_tags = [dict(name='style'),
|
||||||
dict(attrs={'id':['newscontent135']})] # for the finance page
|
dict(attrs={'id':['newscontent135']})] # for the finance page
|
||||||
remove_attributes = ['width']
|
remove_attributes = ['width']
|
||||||
preprocess_regexps = [
|
preprocess_regexps = [
|
||||||
(re.compile(r'<h5>', re.DOTALL|re.IGNORECASE),
|
(re.compile(r'<h5>', re.DOTALL|re.IGNORECASE),
|
||||||
lambda match: '<h1>'),
|
lambda match: '<h1>'),
|
||||||
(re.compile(r'</h5>', re.DOTALL|re.IGNORECASE),
|
(re.compile(r'</h5>', re.DOTALL|re.IGNORECASE),
|
||||||
lambda match: '</h1>'),
|
lambda match: '</h1>'),
|
||||||
(re.compile(r'<p><a href=.+?</a></p>', re.DOTALL|re.IGNORECASE), # for entertainment page
|
(re.compile(r'<p><a href=.+?</a></p>', re.DOTALL|re.IGNORECASE), # for entertainment page
|
||||||
lambda match: '')
|
lambda match: '')
|
||||||
]
|
]
|
||||||
|
|
||||||
def image_url_processor(cls, baseurl, url):
|
def image_url_processor(cls, baseurl, url):
|
||||||
@ -107,6 +108,9 @@ class MPHKRecipe(BasicNewsRecipe):
|
|||||||
def get_fetchdate(self):
|
def get_fetchdate(self):
|
||||||
return self.get_dtlocal().strftime("%Y%m%d")
|
return self.get_dtlocal().strftime("%Y%m%d")
|
||||||
|
|
||||||
|
def get_fetchformatteddate(self):
|
||||||
|
return self.get_dtlocal().strftime("%Y-%m-%d")
|
||||||
|
|
||||||
def get_fetchday(self):
|
def get_fetchday(self):
|
||||||
# convert UTC to local hk time - at around HKT 6.00am, all news are available
|
# convert UTC to local hk time - at around HKT 6.00am, all news are available
|
||||||
return self.get_dtlocal().strftime("%d")
|
return self.get_dtlocal().strftime("%d")
|
||||||
@ -121,84 +125,66 @@ class MPHKRecipe(BasicNewsRecipe):
|
|||||||
return cover
|
return cover
|
||||||
|
|
||||||
def parse_index(self):
|
def parse_index(self):
|
||||||
feeds = []
|
feeds = []
|
||||||
dateStr = self.get_fetchdate()
|
dateStr = self.get_fetchdate()
|
||||||
for title, url in [(u'\u8981\u805e Headline', 'http://news.mingpao.com/' + dateStr + '/gaindex.htm'),
|
for title, url in [(u'\u8981\u805e Headline', 'http://news.mingpao.com/' + dateStr + '/gaindex.htm'),
|
||||||
(u'\u6559\u80b2 Education', 'http://news.mingpao.com/' + dateStr + '/gfindex.htm'),
|
(u'\u6e2f\u805e Local', 'http://news.mingpao.com/' + dateStr + '/gbindex.htm'),
|
||||||
(u'\u6e2f\u805e Local', 'http://news.mingpao.com/' + dateStr + '/gbindex.htm'),
|
(u'\u793e\u8a55/\u7b46\u9663 Editorial', 'http://news.mingpao.com/' + dateStr + '/mrindex.htm'),
|
||||||
(u'\u793e\u8a55\u2027\u7b46\u9663 Editorial', 'http://news.mingpao.com/' + dateStr + '/mrindex.htm'),
|
(u'\u8ad6\u58c7 Forum', 'http://news.mingpao.com/' + dateStr + '/faindex.htm'),
|
||||||
(u'\u8ad6\u58c7 Forum', 'http://news.mingpao.com/' + dateStr + '/faindex.htm'),
|
(u'\u4e2d\u570b China', 'http://news.mingpao.com/' + dateStr + '/caindex.htm'),
|
||||||
(u'\u4e2d\u570b China', 'http://news.mingpao.com/' + dateStr + '/caindex.htm'),
|
(u'\u570b\u969b World', 'http://news.mingpao.com/' + dateStr + '/taindex.htm'),
|
||||||
(u'\u570b\u969b World', 'http://news.mingpao.com/' + dateStr + '/taindex.htm'),
|
('Tech News', 'http://news.mingpao.com/' + dateStr + '/naindex.htm'),
|
||||||
('Tech News', 'http://news.mingpao.com/' + dateStr + '/naindex.htm'),
|
(u'\u6559\u80b2 Education', 'http://news.mingpao.com/' + dateStr + '/gfindex.htm'),
|
||||||
(u'\u9ad4\u80b2 Sport', 'http://news.mingpao.com/' + dateStr + '/spindex.htm'),
|
(u'\u9ad4\u80b2 Sport', 'http://news.mingpao.com/' + dateStr + '/spindex.htm'),
|
||||||
(u'\u526f\u520a Supplement', 'http://news.mingpao.com/' + dateStr + '/jaindex.htm'),
|
(u'\u526f\u520a Supplement', 'http://news.mingpao.com/' + dateStr + '/jaindex.htm'),
|
||||||
(u'\u82f1\u6587 English', 'http://news.mingpao.com/' + dateStr + '/emindex.htm')]:
|
(u'\u82f1\u6587 English', 'http://news.mingpao.com/' + dateStr + '/emindex.htm')]:
|
||||||
articles = self.parse_section(url)
|
articles = self.parse_section(url)
|
||||||
if articles:
|
if articles:
|
||||||
feeds.append((title, articles))
|
feeds.append((title, articles))
|
||||||
# special - finance
|
# special - finance
|
||||||
fin_articles = self.parse_fin_section('http://www.mpfinance.com/htm/Finance/' + dateStr + '/News/ea,eb,ecindex.htm')
|
fin_articles = self.parse_fin_section('http://www.mpfinance.com/htm/Finance/' + dateStr + '/News/ea,eb,ecindex.htm')
|
||||||
if fin_articles:
|
if fin_articles:
|
||||||
feeds.append((u'\u7d93\u6fdf Finance', fin_articles))
|
feeds.append((u'\u7d93\u6fdf Finance', fin_articles))
|
||||||
# special - eco-friendly
|
# special - entertainment
|
||||||
# eco_articles = self.parse_eco_section('http://tssl.mingpao.com/htm/marketing/eco/cfm/Eco1.cfm')
|
ent_articles = self.parse_ent_section('http://ol.mingpao.com/cfm/star1.cfm')
|
||||||
# if eco_articles:
|
if ent_articles:
|
||||||
# feeds.append((u'\u74b0\u4fdd Eco News', eco_articles))
|
feeds.append((u'\u5f71\u8996 Film/TV', ent_articles))
|
||||||
# special - entertainment
|
return feeds
|
||||||
ent_articles = self.parse_ent_section('http://ol.mingpao.com/cfm/star1.cfm')
|
|
||||||
if ent_articles:
|
|
||||||
feeds.append((u'\u5f71\u8996 Entertainment', ent_articles))
|
|
||||||
return feeds
|
|
||||||
|
|
||||||
def parse_section(self, url):
|
def parse_section(self, url):
|
||||||
dateStr = self.get_fetchdate()
|
dateStr = self.get_fetchdate()
|
||||||
soup = self.index_to_soup(url)
|
soup = self.index_to_soup(url)
|
||||||
divs = soup.findAll(attrs={'class': ['bullet','bullet_grey']})
|
divs = soup.findAll(attrs={'class': ['bullet','bullet_grey']})
|
||||||
current_articles = []
|
current_articles = []
|
||||||
included_urls = []
|
included_urls = []
|
||||||
divs.reverse()
|
divs.reverse()
|
||||||
for i in divs:
|
for i in divs:
|
||||||
a = i.find('a', href = True)
|
a = i.find('a', href = True)
|
||||||
title = self.tag_to_string(a)
|
title = self.tag_to_string(a)
|
||||||
url = a.get('href', False)
|
url = a.get('href', False)
|
||||||
url = 'http://news.mingpao.com/' + dateStr + '/' +url
|
url = 'http://news.mingpao.com/' + dateStr + '/' +url
|
||||||
if url not in included_urls and url.rfind('Redirect') == -1:
|
if url not in included_urls and url.rfind('Redirect') == -1:
|
||||||
current_articles.append({'title': title, 'url': url, 'description':'', 'date':''})
|
current_articles.append({'title': title, 'url': url, 'description':'', 'date':''})
|
||||||
included_urls.append(url)
|
included_urls.append(url)
|
||||||
current_articles.reverse()
|
current_articles.reverse()
|
||||||
return current_articles
|
return current_articles
|
||||||
|
|
||||||
def parse_fin_section(self, url):
|
def parse_fin_section(self, url):
|
||||||
dateStr = self.get_fetchdate()
|
dateStr = self.get_fetchdate()
|
||||||
soup = self.index_to_soup(url)
|
soup = self.index_to_soup(url)
|
||||||
a = soup.findAll('a', href= True)
|
a = soup.findAll('a', href= True)
|
||||||
current_articles = []
|
current_articles = []
|
||||||
for i in a:
|
|
||||||
url = i.get('href', False)
|
|
||||||
if not url.rfind(dateStr) == -1 and url.rfind('index') == -1:
|
|
||||||
title = self.tag_to_string(i)
|
|
||||||
url = 'http://www.mpfinance.com/cfm/' +url
|
|
||||||
current_articles.append({'title': title, 'url': url, 'description':''})
|
|
||||||
return current_articles
|
|
||||||
|
|
||||||
def parse_eco_section(self, url):
|
|
||||||
dateStr = self.get_fetchdate()
|
|
||||||
soup = self.index_to_soup(url)
|
|
||||||
divs = soup.findAll(attrs={'class': ['bullet']})
|
|
||||||
current_articles = []
|
|
||||||
included_urls = []
|
included_urls = []
|
||||||
for i in divs:
|
for i in a:
|
||||||
a = i.find('a', href = True)
|
url = 'http://www.mpfinance.com/cfm/' + i.get('href', False)
|
||||||
title = self.tag_to_string(a)
|
if url not in included_urls and not url.rfind(dateStr) == -1 and url.rfind('index') == -1:
|
||||||
url = a.get('href', False)
|
title = self.tag_to_string(i)
|
||||||
url = 'http://tssl.mingpao.com/htm/marketing/eco/cfm/' +url
|
|
||||||
if url not in included_urls and url.rfind('Redirect') == -1 and not url.rfind('.txt') == -1 and not url.rfind(dateStr) == -1:
|
|
||||||
current_articles.append({'title': title, 'url': url, 'description':''})
|
current_articles.append({'title': title, 'url': url, 'description':''})
|
||||||
included_urls.append(url)
|
included_urls.append(url)
|
||||||
return current_articles
|
return current_articles
|
||||||
|
|
||||||
def parse_ent_section(self, url):
|
def parse_ent_section(self, url):
|
||||||
|
self.get_fetchdate()
|
||||||
soup = self.index_to_soup(url)
|
soup = self.index_to_soup(url)
|
||||||
a = soup.findAll('a', href=True)
|
a = soup.findAll('a', href=True)
|
||||||
a.reverse()
|
a.reverse()
|
||||||
@ -223,67 +209,71 @@ class MPHKRecipe(BasicNewsRecipe):
|
|||||||
return soup
|
return soup
|
||||||
|
|
||||||
def create_opf(self, feeds, dir=None):
|
def create_opf(self, feeds, dir=None):
|
||||||
if self.IsKindleUsed == False:
|
|
||||||
super(MPHKRecipe,self).create_opf(feeds, dir)
|
|
||||||
return
|
|
||||||
if dir is None:
|
if dir is None:
|
||||||
dir = self.output_dir
|
dir = self.output_dir
|
||||||
title = self.short_title()
|
if self.IsCJKWellSupported == True:
|
||||||
title += ' ' + self.get_fetchdate()
|
# use Chinese title
|
||||||
#if self.output_profile.periodical_date_in_title:
|
title = u'\u660e\u5831 (\u9999\u6e2f) ' + self.get_fetchformatteddate()
|
||||||
# title += strftime(self.timefmt)
|
else:
|
||||||
mi = MetaInformation(title, [__appname__])
|
# use English title
|
||||||
mi.publisher = __appname__
|
title = self.short_title() + ' ' + self.get_fetchformatteddate()
|
||||||
mi.author_sort = __appname__
|
if True: # force date in title
|
||||||
mi.publication_type = self.publication_type+':'+self.short_title()
|
# title += strftime(self.timefmt)
|
||||||
#mi.timestamp = nowf()
|
mi = MetaInformation(title, [self.publisher])
|
||||||
mi.timestamp = self.get_dtlocal()
|
mi.publisher = self.publisher
|
||||||
mi.comments = self.description
|
mi.author_sort = self.publisher
|
||||||
if not isinstance(mi.comments, unicode):
|
if self.IsCJKWellSupported == True:
|
||||||
mi.comments = mi.comments.decode('utf-8', 'replace')
|
mi.publication_type = 'periodical:'+self.publication_type+':'+self.short_title()
|
||||||
#mi.pubdate = nowf()
|
else:
|
||||||
mi.pubdate = self.get_dtlocal()
|
mi.publication_type = self.publication_type+':'+self.short_title()
|
||||||
opf_path = os.path.join(dir, 'index.opf')
|
#mi.timestamp = nowf()
|
||||||
ncx_path = os.path.join(dir, 'index.ncx')
|
mi.timestamp = self.get_dtlocal()
|
||||||
opf = OPFCreator(dir, mi)
|
mi.comments = self.description
|
||||||
# Add mastheadImage entry to <guide> section
|
if not isinstance(mi.comments, unicode):
|
||||||
mp = getattr(self, 'masthead_path', None)
|
mi.comments = mi.comments.decode('utf-8', 'replace')
|
||||||
if mp is not None and os.access(mp, os.R_OK):
|
#mi.pubdate = nowf()
|
||||||
from calibre.ebooks.metadata.opf2 import Guide
|
mi.pubdate = self.get_dtlocal()
|
||||||
ref = Guide.Reference(os.path.basename(self.masthead_path), os.getcwdu())
|
opf_path = os.path.join(dir, 'index.opf')
|
||||||
ref.type = 'masthead'
|
ncx_path = os.path.join(dir, 'index.ncx')
|
||||||
ref.title = 'Masthead Image'
|
opf = OPFCreator(dir, mi)
|
||||||
opf.guide.append(ref)
|
# Add mastheadImage entry to <guide> section
|
||||||
|
mp = getattr(self, 'masthead_path', None)
|
||||||
|
if mp is not None and os.access(mp, os.R_OK):
|
||||||
|
from calibre.ebooks.metadata.opf2 import Guide
|
||||||
|
ref = Guide.Reference(os.path.basename(self.masthead_path), os.getcwdu())
|
||||||
|
ref.type = 'masthead'
|
||||||
|
ref.title = 'Masthead Image'
|
||||||
|
opf.guide.append(ref)
|
||||||
|
|
||||||
manifest = [os.path.join(dir, 'feed_%d'%i) for i in range(len(feeds))]
|
manifest = [os.path.join(dir, 'feed_%d'%i) for i in range(len(feeds))]
|
||||||
manifest.append(os.path.join(dir, 'index.html'))
|
manifest.append(os.path.join(dir, 'index.html'))
|
||||||
manifest.append(os.path.join(dir, 'index.ncx'))
|
manifest.append(os.path.join(dir, 'index.ncx'))
|
||||||
|
|
||||||
# Get cover
|
# Get cover
|
||||||
cpath = getattr(self, 'cover_path', None)
|
cpath = getattr(self, 'cover_path', None)
|
||||||
if cpath is None:
|
if cpath is None:
|
||||||
pf = open(os.path.join(dir, 'cover.jpg'), 'wb')
|
pf = open(os.path.join(dir, 'cover.jpg'), 'wb')
|
||||||
if self.default_cover(pf):
|
if self.default_cover(pf):
|
||||||
cpath = pf.name
|
cpath = pf.name
|
||||||
if cpath is not None and os.access(cpath, os.R_OK):
|
if cpath is not None and os.access(cpath, os.R_OK):
|
||||||
opf.cover = cpath
|
opf.cover = cpath
|
||||||
manifest.append(cpath)
|
manifest.append(cpath)
|
||||||
|
|
||||||
# Get masthead
|
# Get masthead
|
||||||
mpath = getattr(self, 'masthead_path', None)
|
mpath = getattr(self, 'masthead_path', None)
|
||||||
if mpath is not None and os.access(mpath, os.R_OK):
|
if mpath is not None and os.access(mpath, os.R_OK):
|
||||||
manifest.append(mpath)
|
manifest.append(mpath)
|
||||||
|
|
||||||
opf.create_manifest_from_files_in(manifest)
|
opf.create_manifest_from_files_in(manifest)
|
||||||
for mani in opf.manifest:
|
for mani in opf.manifest:
|
||||||
if mani.path.endswith('.ncx'):
|
if mani.path.endswith('.ncx'):
|
||||||
mani.id = 'ncx'
|
mani.id = 'ncx'
|
||||||
if mani.path.endswith('mastheadImage.jpg'):
|
if mani.path.endswith('mastheadImage.jpg'):
|
||||||
mani.id = 'masthead-image'
|
mani.id = 'masthead-image'
|
||||||
entries = ['index.html']
|
entries = ['index.html']
|
||||||
toc = TOC(base_path=dir)
|
toc = TOC(base_path=dir)
|
||||||
self.play_order_counter = 0
|
self.play_order_counter = 0
|
||||||
self.play_order_map = {}
|
self.play_order_map = {}
|
||||||
|
|
||||||
def feed_index(num, parent):
|
def feed_index(num, parent):
|
||||||
f = feeds[num]
|
f = feeds[num]
|
||||||
@ -321,7 +311,7 @@ class MPHKRecipe(BasicNewsRecipe):
|
|||||||
prefix = '/'.join('..'for i in range(2*len(re.findall(r'link\d+', last))))
|
prefix = '/'.join('..'for i in range(2*len(re.findall(r'link\d+', last))))
|
||||||
templ = self.navbar.generate(True, num, j, len(f),
|
templ = self.navbar.generate(True, num, j, len(f),
|
||||||
not self.has_single_feed,
|
not self.has_single_feed,
|
||||||
a.orig_url, __appname__, prefix=prefix,
|
a.orig_url, self.publisher, prefix=prefix,
|
||||||
center=self.center_navbar)
|
center=self.center_navbar)
|
||||||
elem = BeautifulSoup(templ.render(doctype='xhtml').decode('utf-8')).find('div')
|
elem = BeautifulSoup(templ.render(doctype='xhtml').decode('utf-8')).find('div')
|
||||||
body.insert(len(body.contents), elem)
|
body.insert(len(body.contents), elem)
|
||||||
@ -344,7 +334,7 @@ class MPHKRecipe(BasicNewsRecipe):
|
|||||||
if not desc:
|
if not desc:
|
||||||
desc = None
|
desc = None
|
||||||
feed_index(i, toc.add_item('feed_%d/index.html'%i, None,
|
feed_index(i, toc.add_item('feed_%d/index.html'%i, None,
|
||||||
f.title, play_order=po, description=desc, author=auth))
|
f.title, play_order=po, description=desc, author=auth))
|
||||||
|
|
||||||
else:
|
else:
|
||||||
entries.append('feed_%d/index.html'%0)
|
entries.append('feed_%d/index.html'%0)
|
||||||
@ -357,4 +347,3 @@ class MPHKRecipe(BasicNewsRecipe):
|
|||||||
|
|
||||||
with nested(open(opf_path, 'wb'), open(ncx_path, 'wb')) as (opf_file, ncx_file):
|
with nested(open(opf_path, 'wb'), open(ncx_path, 'wb')) as (opf_file, ncx_file):
|
||||||
opf.render(opf_file, ncx_file)
|
opf.render(opf_file, ncx_file)
|
||||||
|
|
||||||
|
Loading…
x
Reference in New Issue
Block a user