mirror of
https://github.com/kovidgoyal/calibre.git
synced 2025-07-09 03:04:10 -04:00
Update Ming Pao
This commit is contained in:
parent
dacb687cc9
commit
1fad05e090
@ -10,30 +10,31 @@ __MakePeriodical__ = True
|
|||||||
__UseChineseTitle__ = False
|
__UseChineseTitle__ = False
|
||||||
# Set it to False if you want to skip images (Default: True)
|
# Set it to False if you want to skip images (Default: True)
|
||||||
__KeepImages__ = True
|
__KeepImages__ = True
|
||||||
# Set it to True if you want to include a summary in Kindle's article view (Default: False)
|
# Set it to True if you want to include a summary in Kindle's article view (Default: True)
|
||||||
__IncludeSummary__ = False
|
__IncludeSummary__ = True
|
||||||
# Set it to True if you want thumbnail images in Kindle's article view (Default: True)
|
# Set it to True if you want thumbnail images in Kindle's article view (Default: True)
|
||||||
__IncludeThumbnails__ = True
|
__IncludeThumbnails__ = True
|
||||||
# (HK only) Turn below to True if you wish to use life.mingpao.com as the main article source (Default: True)
|
# (HK only) Turn below to True if you wish to use life.mingpao.com as the main article source (Default: True)
|
||||||
__UseLife__ = True
|
__UseLife__ = True
|
||||||
# (HK only) Set it to True to include premium (paid) content (Default: False)
|
# (HK only) Set it to True to include premium (paid) content (Default: False)
|
||||||
__InclPremium__ = False
|
__InclPremium__ = False
|
||||||
# (HK only) Turn below to True if you wish to parse articles in news.mingpao.com with their printer-friendly formats (Default: False)
|
# (HK only) Turn below to True if you wish to parse articles in news1.mingpao.com with their printer-friendly formats (Default: False)
|
||||||
__ParsePF__ = False
|
__ParsePF__ = False
|
||||||
# (HK only) Turn below to True if you wish to parse articles in news.mingpao.com with text formats (Default: True) -- override __ParsePF__
|
# (HK only) Turn below to True if you wish to parse articles in news1.mingpao.com with text formats (Default: True) -- override __ParsePF__
|
||||||
__ParseTxt__ = True
|
__ParseTxt__ = True
|
||||||
# (HK only) Use mobile text version for some articles (Default: False)
|
# (HK only) Use mobile text version for some articles (Default: False)
|
||||||
__ParseSelectedMobile__ = False
|
__ParseSelectedMobile__ = False
|
||||||
# (HK only) Turn below to True if you wish hi-res images (Default: False)
|
# (HK only) Turn below to True if you wish hi-res images (Default: True)
|
||||||
__HiResImg__ = False
|
__HiResImg__ = True
|
||||||
# Override the date returned by the program by specifying a YYYYMMDD value below (does not work if __ParseSelectedMobile__ is True and __UseLife__ is False)
|
# Override the date returned by the program by specifying a YYYYMMDD value below (does not work if __ParseSelectedMobile__ is True and __UseLife__ is False)
|
||||||
__Date__ = ''
|
__Date__ = ''
|
||||||
|
|
||||||
|
|
||||||
'''
|
'''
|
||||||
Change Log:
|
Change Log:
|
||||||
|
2014/10/19: update URLs of some web locations and the top logo
|
||||||
2013/09/28: allow thumbnails even with hi-res images
|
2013/09/28: allow thumbnails even with hi-res images
|
||||||
2012/04/24: improved parsing of news.mingpao.com content
|
2012/04/24: improved parsing of news1.mingpao.com content
|
||||||
2011/12/18: update the overridden create_odf(.) routine with the one from Calibre version 0.8.31. Move __UseChineseTitle__ usage away
|
2011/12/18: update the overridden create_odf(.) routine with the one from Calibre version 0.8.31. Move __UseChineseTitle__ usage away
|
||||||
from create_odf(.). Optional support of text_summary and thumbnail images in Kindle's article view. Start new day
|
from create_odf(.). Optional support of text_summary and thumbnail images in Kindle's article view. Start new day
|
||||||
download of Hong Kong Mingpao at 4.30am. Set the actual publication date shown on kindle device.
|
download of Hong Kong Mingpao at 4.30am. Set the actual publication date shown on kindle device.
|
||||||
@ -83,10 +84,10 @@ class MPRecipe(BasicNewsRecipe):
|
|||||||
title = u'\u660e\u5831 (\u9999\u6e2f)'
|
title = u'\u660e\u5831 (\u9999\u6e2f)'
|
||||||
else:
|
else:
|
||||||
title = 'Ming Pao - Hong Kong'
|
title = 'Ming Pao - Hong Kong'
|
||||||
description = 'Hong Kong Chinese Newspaper (http://news.mingpao.com)'
|
description = 'Hong Kong Chinese Newspaper (http://news1.mingpao.com)'
|
||||||
category = 'Chinese, News, Hong Kong'
|
category = 'Chinese, News, Hong Kong'
|
||||||
extra_css = 'img {display: block; margin-left: auto; margin-right: auto; margin-top: 10px; margin-bottom: 10px; max-height:90%;} font>b {font-size:200%; font-weight:bold;} div[class=heading] {font-size:200%; font-weight:bold;} div[class=images] {font-size:50%;}'
|
extra_css = 'img {display: block; margin-left: auto; margin-right: auto; margin-top: 10px; margin-bottom: 10px; max-height:90%;} font>b {font-size:200%; font-weight:bold;} div[class=heading] {font-size:200%; font-weight:bold;} div[class=images] {font-size:50%;}'
|
||||||
masthead_url = 'http://news.mingpao.com/image/portals_top_logo_news.gif'
|
masthead_url = 'http://news.mingpao.com/image/mingpaonews_logo.png'
|
||||||
remove_tags_before = dict(name='font', attrs={'color':['navy']})
|
remove_tags_before = dict(name='font', attrs={'color':['navy']})
|
||||||
keep_only_tags = [dict(name='h1'),
|
keep_only_tags = [dict(name='h1'),
|
||||||
dict(name='font', attrs={'style':['font-size:14pt; line-height:160%;']}), # for entertainment page title
|
dict(name='font', attrs={'style':['font-size:14pt; line-height:160%;']}), # for entertainment page title
|
||||||
@ -248,7 +249,7 @@ class MPRecipe(BasicNewsRecipe):
|
|||||||
|
|
||||||
def get_cover_url(self):
|
def get_cover_url(self):
|
||||||
if __Region__ == 'Hong Kong':
|
if __Region__ == 'Hong Kong':
|
||||||
cover = 'http://news.mingpao.com/' + self.get_fetchdate() + '/' + self.get_fetchdate() + '_' + self.get_fetchday() + 'gacov.jpg'
|
cover = 'http://news1.mingpao.com/' + self.get_fetchdate() + '/' + self.get_fetchdate() + '_' + self.get_fetchday() + 'gacov.jpg'
|
||||||
elif __Region__ == 'Vancouver':
|
elif __Region__ == 'Vancouver':
|
||||||
cover = 'http://www.mingpaovan.com/ftp/News/' + self.get_fetchdate() + '/' + self.get_fetchday() + 'pgva1s.jpg'
|
cover = 'http://www.mingpaovan.com/ftp/News/' + self.get_fetchdate() + '/' + self.get_fetchday() + 'pgva1s.jpg'
|
||||||
elif __Region__ == 'Toronto':
|
elif __Region__ == 'Toronto':
|
||||||
@ -292,8 +293,8 @@ class MPRecipe(BasicNewsRecipe):
|
|||||||
# if articles:
|
# if articles:
|
||||||
# feeds.append((title, articles))
|
# feeds.append((title, articles))
|
||||||
#
|
#
|
||||||
# for title, url in [(u'\u526f\u520a Supplement', 'http://news.mingpao.com/' + dateStr + '/jaindex.htm'),
|
# for title, url in [(u'\u526f\u520a Supplement', 'http://news1.mingpao.com/' + dateStr + '/jaindex.htm'),
|
||||||
# (u'\u82f1\u6587 English', 'http://news.mingpao.com/' + dateStr + '/emindex.htm')]:
|
# (u'\u82f1\u6587 English', 'http://news1.mingpao.com/' + dateStr + '/emindex.htm')]:
|
||||||
# articles = self.parse_section(url)
|
# articles = self.parse_section(url)
|
||||||
# if articles:
|
# if articles:
|
||||||
# feeds.append((title, articles))
|
# feeds.append((title, articles))
|
||||||
@ -308,8 +309,8 @@ class MPRecipe(BasicNewsRecipe):
|
|||||||
if articles:
|
if articles:
|
||||||
feeds.append((title, articles))
|
feeds.append((title, articles))
|
||||||
|
|
||||||
if __InclPremium__ == False or self.get_weekday() <> 6:
|
if self.get_weekday() <> 6:
|
||||||
for title, url, seckey in [(u'\u526f\u520a Supplement', 'http://news.mingpao.com/' + dateStr + '/jaindex.htm', 'ja')]:
|
for title, url, seckey in [(u'\u526f\u520a Supplement', 'http://news1.mingpao.com/' + dateStr + '/jaindex.htm', 'ja')]:
|
||||||
if __ParseTxt__ == False:
|
if __ParseTxt__ == False:
|
||||||
articles = self.parse_section(url)
|
articles = self.parse_section(url)
|
||||||
else:
|
else:
|
||||||
@ -322,15 +323,15 @@ class MPRecipe(BasicNewsRecipe):
|
|||||||
if articles:
|
if articles:
|
||||||
feeds.append((u'\u526f\u520a Supplement', articles))
|
feeds.append((u'\u526f\u520a Supplement', articles))
|
||||||
else:
|
else:
|
||||||
for title, url, seckey in [(u'\u526f\u520a Supplement', 'http://news.mingpao.com/' + dateStr + '/jaindex.htm', 'ja')]:
|
for title, url, seckey in [(u'\u526f\u520a Supplement', 'http://news1.mingpao.com/' + dateStr + '/jaindex.htm', 'ja')]:
|
||||||
if __ParseTxt__ == False:
|
if __ParseTxt__ == False:
|
||||||
articles = self.parse_section(url)
|
articles = self.parse_section(url)
|
||||||
else:
|
else:
|
||||||
articles = self.parse_section_txt(url, seckey)
|
articles = self.parse_section_txt(url, seckey)
|
||||||
if articles:
|
if articles:
|
||||||
feeds.append((title, articles))
|
feeds.append((title, articles))
|
||||||
|
|
||||||
for title, url, seckey in [(u'\u82f1\u6587 English', 'http://news.mingpao.com/' + dateStr + '/emindex.htm', 'em')]:
|
for title, url, seckey in [(u'\u82f1\u6587 English', 'http://news1.mingpao.com/' + dateStr + '/emindex.htm', 'em')]:
|
||||||
if __ParseTxt__ == False:
|
if __ParseTxt__ == False:
|
||||||
articles = self.parse_section(url)
|
articles = self.parse_section(url)
|
||||||
else:
|
else:
|
||||||
@ -339,10 +340,10 @@ class MPRecipe(BasicNewsRecipe):
|
|||||||
feeds.append((title, articles))
|
feeds.append((title, articles))
|
||||||
# end of new
|
# end of new
|
||||||
else:
|
else:
|
||||||
for title, url, seckey in [(u'\u8981\u805e Headline', 'http://news.mingpao.com/' + dateStr + '/gaindex.htm', 'ga'),
|
for title, url, seckey in [(u'\u8981\u805e Headline', 'http://news1.mingpao.com/' + dateStr + '/gaindex.htm', 'ga'),
|
||||||
(u'\u6e2f\u805e Local', 'http://news.mingpao.com/' + dateStr + '/gbindex.htm', 'gb'),
|
(u'\u6e2f\u805e Local', 'http://news1.mingpao.com/' + dateStr + '/gbindex.htm', 'gb'),
|
||||||
(u'\u6559\u80b2 Education', 'http://news.mingpao.com/' + dateStr + '/gfindex.htm', 'gf'),
|
(u'\u6559\u80b2 Education', 'http://news1.mingpao.com/' + dateStr + '/gfindex.htm', 'gf'),
|
||||||
(u'\u793e\u8a55/\u7b46\u9663 Editorial', 'http://news.mingpao.com/' + dateStr + '/mrindex.htm', 'mr')]:
|
(u'\u793e\u8a55/\u7b46\u9663 Editorial', 'http://news1.mingpao.com/' + dateStr + '/mrindex.htm', 'mr')]:
|
||||||
if __ParseTxt__ == False:
|
if __ParseTxt__ == False:
|
||||||
articles = self.parse_section(url)
|
articles = self.parse_section(url)
|
||||||
else:
|
else:
|
||||||
@ -355,9 +356,9 @@ class MPRecipe(BasicNewsRecipe):
|
|||||||
#if ed_articles:
|
#if ed_articles:
|
||||||
# feeds.append((u'\u793e\u8a55/\u7b46\u9663 Editorial', ed_articles))
|
# feeds.append((u'\u793e\u8a55/\u7b46\u9663 Editorial', ed_articles))
|
||||||
|
|
||||||
for title, url, seckey in [(u'\u8ad6\u58c7 Forum', 'http://news.mingpao.com/' + dateStr + '/faindex.htm', 'fa'),
|
for title, url, seckey in [(u'\u8ad6\u58c7 Forum', 'http://news1.mingpao.com/' + dateStr + '/faindex.htm', 'fa'),
|
||||||
(u'\u4e2d\u570b China', 'http://news.mingpao.com/' + dateStr + '/caindex.htm', 'ca'),
|
(u'\u4e2d\u570b China', 'http://news1.mingpao.com/' + dateStr + '/caindex.htm', 'ca'),
|
||||||
(u'\u570b\u969b World', 'http://news.mingpao.com/' + dateStr + '/taindex.htm', 'ta')]:
|
(u'\u570b\u969b World', 'http://news1.mingpao.com/' + dateStr + '/taindex.htm', 'ta')]:
|
||||||
if __ParseTxt__ == False:
|
if __ParseTxt__ == False:
|
||||||
articles = self.parse_section(url)
|
articles = self.parse_section(url)
|
||||||
else:
|
else:
|
||||||
@ -376,8 +377,8 @@ class MPRecipe(BasicNewsRecipe):
|
|||||||
if articles:
|
if articles:
|
||||||
feeds.append((title, articles))
|
feeds.append((title, articles))
|
||||||
|
|
||||||
#for title, url in [('Tech News', 'http://news.mingpao.com/' + dateStr + '/naindex.htm'),
|
#for title, url in [('Tech News', 'http://news1.mingpao.com/' + dateStr + '/naindex.htm'),
|
||||||
# (u'\u9ad4\u80b2 Sport', 'http://news.mingpao.com/' + dateStr + '/spindex.htm')]:
|
# (u'\u9ad4\u80b2 Sport', 'http://news1.mingpao.com/' + dateStr + '/spindex.htm')]:
|
||||||
# articles = self.parse_section(url)
|
# articles = self.parse_section(url)
|
||||||
# if articles:
|
# if articles:
|
||||||
# feeds.append((title, articles))
|
# feeds.append((title, articles))
|
||||||
@ -404,7 +405,7 @@ class MPRecipe(BasicNewsRecipe):
|
|||||||
feeds.append((title, articles))
|
feeds.append((title, articles))
|
||||||
|
|
||||||
if __InclPremium__ == False or self.get_weekday() <> 6:
|
if __InclPremium__ == False or self.get_weekday() <> 6:
|
||||||
for title, url, seckey in [(u'\u526f\u520a Supplement', 'http://news.mingpao.com/' + dateStr + '/jaindex.htm', 'ja')]:
|
for title, url, seckey in [(u'\u526f\u520a Supplement', 'http://news1.mingpao.com/' + dateStr + '/jaindex.htm', 'ja')]:
|
||||||
if __ParseTxt__ == False:
|
if __ParseTxt__ == False:
|
||||||
articles = self.parse_section(url)
|
articles = self.parse_section(url)
|
||||||
else:
|
else:
|
||||||
@ -417,7 +418,7 @@ class MPRecipe(BasicNewsRecipe):
|
|||||||
if articles:
|
if articles:
|
||||||
feeds.append((u'\u526f\u520a Supplement', articles))
|
feeds.append((u'\u526f\u520a Supplement', articles))
|
||||||
else:
|
else:
|
||||||
for title, url, seckey in [(u'\u526f\u520a Supplement', 'http://news.mingpao.com/' + dateStr + '/jaindex.htm', 'ja')]:
|
for title, url, seckey in [(u'\u526f\u520a Supplement', 'http://news1.mingpao.com/' + dateStr + '/jaindex.htm', 'ja')]:
|
||||||
if __ParseTxt__ == False:
|
if __ParseTxt__ == False:
|
||||||
articles = self.parse_section(url)
|
articles = self.parse_section(url)
|
||||||
else:
|
else:
|
||||||
@ -425,7 +426,7 @@ class MPRecipe(BasicNewsRecipe):
|
|||||||
if articles:
|
if articles:
|
||||||
feeds.append((title, articles))
|
feeds.append((title, articles))
|
||||||
|
|
||||||
for title, url, seckey in [(u'\u82f1\u6587 English', 'http://news.mingpao.com/' + dateStr + '/emindex.htm', 'em')]:
|
for title, url, seckey in [(u'\u82f1\u6587 English', 'http://news1.mingpao.com/' + dateStr + '/emindex.htm', 'em')]:
|
||||||
if __ParseTxt__ == False:
|
if __ParseTxt__ == False:
|
||||||
articles = self.parse_section(url)
|
articles = self.parse_section(url)
|
||||||
else:
|
else:
|
||||||
@ -463,7 +464,7 @@ class MPRecipe(BasicNewsRecipe):
|
|||||||
feeds.append((title, articles))
|
feeds.append((title, articles))
|
||||||
return feeds
|
return feeds
|
||||||
|
|
||||||
# parse from news.mingpao.com (web html)
|
# parse from news1.mingpao.com (web html)
|
||||||
def parse_section(self, url):
|
def parse_section(self, url):
|
||||||
dateStr = self.get_fetchdate()
|
dateStr = self.get_fetchdate()
|
||||||
soup = self.index_to_soup(url)
|
soup = self.index_to_soup(url)
|
||||||
@ -475,7 +476,7 @@ class MPRecipe(BasicNewsRecipe):
|
|||||||
a = i.find('a', href = True)
|
a = i.find('a', href = True)
|
||||||
title = self.tag_to_string(a)
|
title = self.tag_to_string(a)
|
||||||
url = a.get('href', False)
|
url = a.get('href', False)
|
||||||
url = 'http://news.mingpao.com/' + dateStr + '/' +url
|
url = 'http://news1.mingpao.com/' + dateStr + '/' +url
|
||||||
# replace the url to the alternative version
|
# replace the url to the alternative version
|
||||||
if __ParsePF__ == True:
|
if __ParsePF__ == True:
|
||||||
# printer-friendly option
|
# printer-friendly option
|
||||||
@ -495,7 +496,7 @@ class MPRecipe(BasicNewsRecipe):
|
|||||||
current_articles.reverse()
|
current_articles.reverse()
|
||||||
return current_articles
|
return current_articles
|
||||||
|
|
||||||
# parse from news.mingpao.com (txt)
|
# parse from news1.mingpao.com (txt)
|
||||||
def parse_section_txt(self, url, ch):
|
def parse_section_txt(self, url, ch):
|
||||||
dateStr = self.get_fetchdate()
|
dateStr = self.get_fetchdate()
|
||||||
soup = self.index_to_soup(url)
|
soup = self.index_to_soup(url)
|
||||||
@ -511,7 +512,7 @@ class MPRecipe(BasicNewsRecipe):
|
|||||||
# replace the url to the alternative version
|
# replace the url to the alternative version
|
||||||
# text version
|
# text version
|
||||||
if url.rfind('Redirect') <> -1:
|
if url.rfind('Redirect') <> -1:
|
||||||
url = 'http://news.mingpao.com/' + dateStr + '/' +url
|
url = 'http://news1.mingpao.com/' + dateStr + '/' +url
|
||||||
#print 'original url: ', url
|
#print 'original url: ', url
|
||||||
url = re.sub(dateStr + '/../cfm/Redirect.cfm.*NewsFile=', 'ftp/WebNews2/', url)
|
url = re.sub(dateStr + '/../cfm/Redirect.cfm.*NewsFile=', 'ftp/WebNews2/', url)
|
||||||
url = re.sub('%2F', '/', url)
|
url = re.sub('%2F', '/', url)
|
||||||
@ -523,8 +524,9 @@ class MPRecipe(BasicNewsRecipe):
|
|||||||
# get the first two char in url as ch
|
# get the first two char in url as ch
|
||||||
seckey = url[0:2]
|
seckey = url[0:2]
|
||||||
url = url.replace('.htm', '.txt')
|
url = url.replace('.htm', '.txt')
|
||||||
url = 'http://news.mingpao.com/ftp/WebNews2/' + dateStr + '/' + ch + '/' + seckey + '/' + url
|
url = 'http://news1.mingpao.com/ftp/WebNews2/' + dateStr + '/' + ch + '/' + seckey + '/' + url
|
||||||
#print 'updated url: ', url
|
#print 'updated url: ', url
|
||||||
|
|
||||||
if url not in included_urls and (__InclPremium__ == True or title.rfind(u'\u6536\u8cbb\u5167\u5bb9') == -1):
|
if url not in included_urls and (__InclPremium__ == True or title.rfind(u'\u6536\u8cbb\u5167\u5bb9') == -1):
|
||||||
#if url not in included_urls and (url.rfind('Redirect') == -1) and (__InclPremium__ == False or title.rfind(u'\u6536\u8cbb\u5167\u5bb9') == -1):
|
#if url not in included_urls and (url.rfind('Redirect') == -1) and (__InclPremium__ == False or title.rfind(u'\u6536\u8cbb\u5167\u5bb9') == -1):
|
||||||
current_articles.append({'title': title, 'url': url, 'description':'', 'date':''})
|
current_articles.append({'title': title, 'url': url, 'description':'', 'date':''})
|
||||||
@ -772,7 +774,7 @@ class MPRecipe(BasicNewsRecipe):
|
|||||||
#raw_html = raw_html.replace(u'<p>\u3010', u'\u3010')
|
#raw_html = raw_html.replace(u'<p>\u3010', u'\u3010')
|
||||||
if __HiResImg__ == True:
|
if __HiResImg__ == True:
|
||||||
# TODO: add a _ in front of an image url
|
# TODO: add a _ in front of an image url
|
||||||
if url.rfind('news.mingpao.com') > -1:
|
if url.rfind('news1.mingpao.com') > -1:
|
||||||
imglist = re.findall('src="?.*?jpg"', new_html)
|
imglist = re.findall('src="?.*?jpg"', new_html)
|
||||||
br = mechanize.Browser()
|
br = mechanize.Browser()
|
||||||
br.set_handle_redirect(False)
|
br.set_handle_redirect(False)
|
||||||
@ -1072,3 +1074,4 @@ class MPRecipe(BasicNewsRecipe):
|
|||||||
|
|
||||||
|
|
||||||
|
|
||||||
|
|
||||||
|
Loading…
x
Reference in New Issue
Block a user