Mirror of https://github.com/kovidgoyal/calibre.git, synced 2025-07-09 03:04:10 -04:00
Ming Pao updated

parent 548be9fd6b
commit ecbdbbb006
@@ -1,8 +1,9 @@
 __license__ = 'GPL v3'
 __copyright__ = '2010, Eddie Lau'
 '''
-modified from Singtao Toronto calibre recipe by rty
 Change Log:
+2010/12/07: add entertainment section, use newspaper front page as ebook cover, suppress date display in section list
+(to avoid wrong date display in case the user generates the ebook in a time zone different from HKT)
 2010/11/22: add English section, remove eco-news section which is not updated daily, correct
 ordering of articles
 2010/11/12: add news image and eco-news section
@@ -17,14 +18,15 @@ from calibre.web.feeds.recipes import BasicNewsRecipe
 from contextlib import nested


-from calibre import __appname__, strftime
+from calibre import __appname__
 from calibre.ebooks.BeautifulSoup import BeautifulSoup
 from calibre.ebooks.metadata.opf2 import OPFCreator
 from calibre.ebooks.metadata.toc import TOC
 from calibre.ebooks.metadata import MetaInformation
-from calibre.utils.date import now as nowf

 class MPHKRecipe(BasicNewsRecipe):
+    IsKindleUsed = True # to avoid generating periodical in which CJK characters can't be displayed in section/article view
+
     title = 'Ming Pao - Hong Kong'
     oldest_article = 1
     max_articles_per_feed = 100
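
Note: the new IsKindleUsed flag is consumed by create_opf in the last hunk below; when it is False the recipe falls through to BasicNewsRecipe's stock periodical OPF, and when it is True the overridden create_opf builds the metadata itself, so the Kindle never shows a periodical section/article view with unreadable CJK names. A hypothetical toggle for non-Kindle targets, reusing only names defined in this recipe:

    # Hypothetical subclass, not part of this commit: restore the stock
    # periodical OPF when the target device is not a Kindle.
    class MPHKRecipeNonKindle(MPHKRecipe):
        IsKindleUsed = False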
@@ -39,13 +41,13 @@ class MPHKRecipe(BasicNewsRecipe):
     encoding = 'Big5-HKSCS'
     recursions = 0
     conversion_options = {'linearize_tables':True}
-    extra_css = 'img {display: block; margin-left: auto; margin-right: auto; margin-top: 10px; margin-bottom: 10px;}'
-    #extra_css = 'img {float:right; margin:4px;}'
+    timefmt = ''
+    extra_css = 'img {display: block; margin-left: auto; margin-right: auto; margin-top: 10px; margin-bottom: 10px;} font>b {font-size:200%; font-weight:bold;}'
     masthead_url = 'http://news.mingpao.com/image/portals_top_logo_news.gif'
     keep_only_tags = [dict(name='h1'),
-                      #dict(name='font', attrs={'style':['font-size:14pt; line-height:160%;']}), # for entertainment page
+                      dict(name='font', attrs={'style':['font-size:14pt; line-height:160%;']}), # for entertainment page title
                       dict(attrs={'class':['photo']}),
-                      dict(attrs={'id':['newscontent']}),
+                      dict(attrs={'id':['newscontent']}), # entertainment page content
                       dict(attrs={'id':['newscontent01','newscontent02']})]
     remove_tags = [dict(name='style'),
                    dict(attrs={'id':['newscontent135']})] # for the finance page
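
Note: with the font filter re-enabled, entertainment-page titles survive the cleanup, and the new font>b rule in extra_css renders them at headline size. Each keep_only_tags entry is a findAll-style filter; a standalone sketch of the matching, where bs4 stands in for calibre's bundled BeautifulSoup and the HTML fragment is made up:

    from bs4 import BeautifulSoup

    html = ('<font style="font-size:14pt; line-height:160%;"><b>Star news</b></font>'
            '<div id="newscontent">story body</div><div id="sidebar">ads</div>')
    soup = BeautifulSoup(html, 'html.parser')
    # the two entertainment-related filters from keep_only_tags above
    for f in ({'name': 'font', 'attrs': {'style': ['font-size:14pt; line-height:160%;']}},
              {'attrs': {'id': ['newscontent']}}):
        for tag in soup.find_all(f.get('name'), attrs=f.get('attrs')):
            print(tag)  # matches the font title and the newscontent div, not the sidebar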
@@ -55,51 +57,68 @@ class MPHKRecipe(BasicNewsRecipe):
                           lambda match: '<h1>'),
                       (re.compile(r'</h5>', re.DOTALL|re.IGNORECASE),
                           lambda match: '</h1>'),
+                      (re.compile(r'<p><a href=.+?</a></p>', re.DOTALL|re.IGNORECASE), # for entertainment page
+                          lambda match: '')
                       ]

     def image_url_processor(cls, baseurl, url):
         # trick: break the url at the first occurance of digit, add an additional
         # '_' at the front
         # not working, may need to move this to preprocess_html() method
-        #minIdx = 10000
-        #i0 = url.find('0')
-        #if i0 >= 0 and i0 < minIdx:
+        # minIdx = 10000
+        # i0 = url.find('0')
+        # if i0 >= 0 and i0 < minIdx:
         # minIdx = i0
-        #i1 = url.find('1')
-        #if i1 >= 0 and i1 < minIdx:
+        # i1 = url.find('1')
+        # if i1 >= 0 and i1 < minIdx:
         # minIdx = i1
-        #i2 = url.find('2')
-        #if i2 >= 0 and i2 < minIdx:
+        # i2 = url.find('2')
+        # if i2 >= 0 and i2 < minIdx:
         # minIdx = i2
-        #i3 = url.find('3')
-        #if i3 >= 0 and i0 < minIdx:
+        # i3 = url.find('3')
+        # if i3 >= 0 and i0 < minIdx:
         # minIdx = i3
-        #i4 = url.find('4')
-        #if i4 >= 0 and i4 < minIdx:
+        # i4 = url.find('4')
+        # if i4 >= 0 and i4 < minIdx:
         # minIdx = i4
-        #i5 = url.find('5')
-        #if i5 >= 0 and i5 < minIdx:
+        # i5 = url.find('5')
+        # if i5 >= 0 and i5 < minIdx:
         # minIdx = i5
-        #i6 = url.find('6')
-        #if i6 >= 0 and i6 < minIdx:
+        # i6 = url.find('6')
+        # if i6 >= 0 and i6 < minIdx:
         # minIdx = i6
-        #i7 = url.find('7')
-        #if i7 >= 0 and i7 < minIdx:
+        # i7 = url.find('7')
+        # if i7 >= 0 and i7 < minIdx:
         # minIdx = i7
-        #i8 = url.find('8')
-        #if i8 >= 0 and i8 < minIdx:
+        # i8 = url.find('8')
+        # if i8 >= 0 and i8 < minIdx:
         # minIdx = i8
-        #i9 = url.find('9')
-        #if i9 >= 0 and i9 < minIdx:
+        # i9 = url.find('9')
+        # if i9 >= 0 and i9 < minIdx:
         # minIdx = i9
-        #return url[0:minIdx] + '_' + url[minIdx+1:]
         return url

-    def get_fetchdate(self):
+    def get_dtlocal(self):
         dt_utc = datetime.datetime.utcnow()
         # convert UTC to local hk time - at around HKT 6.00am, all news are available
         dt_local = dt_utc - datetime.timedelta(-2.0/24)
-        return dt_local.strftime("%Y%m%d")
+        return dt_local

+    def get_fetchdate(self):
+        return self.get_dtlocal().strftime("%Y%m%d")
+
+    def get_fetchday(self):
+        # convert UTC to local hk time - at around HKT 6.00am, all news are available
+        return self.get_dtlocal().strftime("%d")
+
+    def get_cover_url(self):
+        cover = 'http://news.mingpao.com/' + self.get_fetchdate() + '/' + self.get_fetchdate() + '_' + self.get_fetchday() + 'gacov.jpg'
+        br = BasicNewsRecipe.get_browser()
+        try:
+            br.open(cover)
+        except:
+            cover = None
+        return cover
+
     def parse_index(self):
         feeds = []
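
Note: the double negative in get_dtlocal is the heart of the changelog's time-zone fix. Subtracting timedelta(-2.0/24) adds two hours to UTC, so the derived date only rolls over at 22:00 UTC, i.e. 6:00am HKT, once the day's paper is expected to be fully posted. A quick standalone check (the timestamps are made up):

    import datetime

    # 21:59 UTC Dec 6 = 05:59am HKT Dec 7 -> still resolves to the Dec 6 paper
    dt = datetime.datetime(2010, 12, 6, 21, 59) - datetime.timedelta(-2.0/24)
    print dt.strftime("%Y%m%d")  # 20101206
    # 22:01 UTC Dec 6 = 06:01am HKT Dec 7 -> the Dec 7 paper is ready
    dt = datetime.datetime(2010, 12, 6, 22, 1) - datetime.timedelta(-2.0/24)
    print dt.strftime("%Y%m%d")  # 20101207

get_cover_url then simply probes for the front-page scan and returns None (no cover) when the image is absent; following the concatenation above, a fetch date of 20101207 would produce this URL (constructed for illustration, not a verified link):

    http://news.mingpao.com/20101207/20101207_07gacov.jpg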
@@ -127,9 +146,9 @@ class MPHKRecipe(BasicNewsRecipe):
         # if eco_articles:
         # feeds.append((u'\u74b0\u4fdd Eco News', eco_articles))
         # special - entertainment
-        #ent_articles = self.parse_ent_section('http://ol.mingpao.com/cfm/star1.cfm')
-        #if ent_articles:
-        # feeds.append(('Entertainment', ent_articles))
+        ent_articles = self.parse_ent_section('http://ol.mingpao.com/cfm/star1.cfm')
+        if ent_articles:
+            feeds.append((u'\u5f71\u8996 Entertainment', ent_articles))
         return feeds

     def parse_section(self, url):
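
For reference, the escaped feed titles decode to the Chinese section names; a throwaway check, not recipe code:

    # -*- coding: utf-8 -*-
    print u'\u5f71\u8996 Entertainment'  # 影視 Entertainment
    print u'\u74b0\u4fdd Eco News'       # 環保 Eco News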
@@ -164,6 +183,7 @@ class MPHKRecipe(BasicNewsRecipe):
         return current_articles

     def parse_eco_section(self, url):
+        dateStr = self.get_fetchdate()
         soup = self.index_to_soup(url)
         divs = soup.findAll(attrs={'class': ['bullet']})
         current_articles = []
@@ -173,23 +193,25 @@ class MPHKRecipe(BasicNewsRecipe):
             title = self.tag_to_string(a)
             url = a.get('href', False)
             url = 'http://tssl.mingpao.com/htm/marketing/eco/cfm/' +url
-            if url not in included_urls and url.rfind('Redirect') == -1:
+            if url not in included_urls and url.rfind('Redirect') == -1 and not url.rfind('.txt') == -1 and not url.rfind(dateStr) == -1:
                 current_articles.append({'title': title, 'url': url, 'description':''})
                 included_urls.append(url)
         return current_articles

-    #def parse_ent_section(self, url):
-    #    dateStr = self.get_fetchdate()
-    #    soup = self.index_to_soup(url)
-    #    a = soup.findAll('a', href=True)
-    #    current_articles = []
-    #    included_urls = []
-    #    for i in a:
-    #        title = self.tag_to_string(i)
-    #        url = 'http://ol.mingpao.com/cfm/' + i.get('href', False)
-    #        if url not in included_urls and not url.rfind('.txt') == -1 and not url.rfind(dateStr) == -1 and not title == '':
-    #            current_articles.append({'title': title, 'url': url, 'description': ''})
-    #    return current_articles
+    def parse_ent_section(self, url):
+        soup = self.index_to_soup(url)
+        a = soup.findAll('a', href=True)
+        a.reverse()
+        current_articles = []
+        included_urls = []
+        for i in a:
+            title = self.tag_to_string(i)
+            url = 'http://ol.mingpao.com/cfm/' + i.get('href', False)
+            if (url not in included_urls) and (not url.rfind('.txt') == -1) and (not url.rfind('star') == -1):
+                current_articles.append({'title': title, 'url': url, 'description': ''})
+                included_urls.append(url)
+        current_articles.reverse()
+        return current_articles

     def preprocess_html(self, soup):
         for item in soup.findAll(style=True):
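
Note: both tightened filters lean on the `not url.rfind(x) == -1` idiom, which simply means "url contains x", since str.rfind returns -1 only when the substring is absent. A minimal check with made-up URLs:

    for url in ('http://ol.mingpao.com/cfm/star_20101207.txt.cfm',
                'http://ol.mingpao.com/cfm/index.cfm'):
        print (not url.rfind('.txt') == -1) and (not url.rfind('star') == -1)
    # -> True (contains both 'star' and '.txt'), then False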
@@ -201,21 +223,26 @@ class MPHKRecipe(BasicNewsRecipe):
         return soup

     def create_opf(self, feeds, dir=None):
-        #super(MPHKRecipe,self).create_opf(feeds, dir)
+        if self.IsKindleUsed == False:
+            super(MPHKRecipe,self).create_opf(feeds, dir)
+            return
         if dir is None:
             dir = self.output_dir
         title = self.short_title()
-        if self.output_profile.periodical_date_in_title:
-            title += strftime(self.timefmt)
+        title += ' ' + self.get_fetchdate()
+        #if self.output_profile.periodical_date_in_title:
+        #    title += strftime(self.timefmt)
         mi = MetaInformation(title, [__appname__])
         mi.publisher = __appname__
         mi.author_sort = __appname__
         mi.publication_type = self.publication_type+':'+self.short_title()
-        mi.timestamp = nowf()
+        #mi.timestamp = nowf()
+        mi.timestamp = self.get_dtlocal()
         mi.comments = self.description
         if not isinstance(mi.comments, unicode):
             mi.comments = mi.comments.decode('utf-8', 'replace')
-        mi.pubdate = nowf()
+        #mi.pubdate = nowf()
+        mi.pubdate = self.get_dtlocal()
         opf_path = os.path.join(dir, 'index.opf')
         ncx_path = os.path.join(dir, 'index.ncx')
         opf = OPFCreator(dir, mi)
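
Note: with mi.timestamp and mi.pubdate now taken from get_dtlocal() instead of nowf(), timefmt blanked, and the fetch date appended directly to the title, the generated book carries the Hong Kong paper's own date wherever it is built, which is exactly the wrong-date scenario the 2010/12/07 changelog entry set out to fix.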