mirror of
				https://github.com/kovidgoyal/calibre.git
				synced 2025-11-04 03:27:00 -05:00 
			
		
		
		
	Update Ming Pao
This commit is contained in:
		
							parent
							
								
									dba8af1f37
								
							
						
					
					
						commit
						15d1e591ae
					
				@ -1,7 +1,9 @@
 | 
				
			|||||||
__license__   = 'GPL v3'
 | 
					__license__   = 'GPL v3'
 | 
				
			||||||
__copyright__ = '2010, Eddie Lau'
 | 
					__copyright__ = '2010-2011, Eddie Lau'
 | 
				
			||||||
'''
 | 
					'''
 | 
				
			||||||
Change Log:
 | 
					Change Log:
 | 
				
			||||||
 | 
					2011/02/20: skip duplicated links in finance section, put photos which may extend a whole page to the back of the articles
 | 
				
			||||||
 | 
					            clean up the indentation
 | 
				
			||||||
2010/12/07: add entertainment section, use newspaper front page as ebook cover, suppress date display in section list
 | 
					2010/12/07: add entertainment section, use newspaper front page as ebook cover, suppress date display in section list
 | 
				
			||||||
            (to avoid wrong date display in case the user generates the ebook in a time zone different from HKT)
 | 
					            (to avoid wrong date display in case the user generates the ebook in a time zone different from HKT)
 | 
				
			||||||
2010/11/22: add English section, remove eco-news section which is not updated daily, correct
 | 
					2010/11/22: add English section, remove eco-news section which is not updated daily, correct
 | 
				
			||||||
@ -18,21 +20,19 @@ from calibre.web.feeds.recipes import BasicNewsRecipe
 | 
				
			|||||||
from contextlib import nested
 | 
					from contextlib import nested
 | 
				
			||||||
 | 
					
 | 
				
			||||||
 | 
					
 | 
				
			||||||
from calibre import __appname__
 | 
					 | 
				
			||||||
from calibre.ebooks.BeautifulSoup import BeautifulSoup
 | 
					from calibre.ebooks.BeautifulSoup import BeautifulSoup
 | 
				
			||||||
from calibre.ebooks.metadata.opf2 import OPFCreator
 | 
					from calibre.ebooks.metadata.opf2 import OPFCreator
 | 
				
			||||||
from calibre.ebooks.metadata.toc import TOC
 | 
					from calibre.ebooks.metadata.toc import TOC
 | 
				
			||||||
from calibre.ebooks.metadata import MetaInformation
 | 
					from calibre.ebooks.metadata import MetaInformation
 | 
				
			||||||
 | 
					
 | 
				
			||||||
class MPHKRecipe(BasicNewsRecipe):
 | 
					class MPHKRecipe(BasicNewsRecipe):
 | 
				
			||||||
    IsKindleUsed = True  # to avoid generating periodical in which CJK characters can't be displayed in section/article view
 | 
					    IsCJKWellSupported = True  # Set to False to avoid generating periodical in which CJK characters can't be displayed in section/article view
 | 
				
			||||||
 | 
					 | 
				
			||||||
    title          = 'Ming Pao - Hong Kong'
 | 
					    title          = 'Ming Pao - Hong Kong'
 | 
				
			||||||
    oldest_article = 1
 | 
					    oldest_article = 1
 | 
				
			||||||
    max_articles_per_feed = 100
 | 
					    max_articles_per_feed = 100
 | 
				
			||||||
    __author__            = 'Eddie Lau'
 | 
					    __author__            = 'Eddie Lau'
 | 
				
			||||||
    description           = 'Hong Kong Chinese Newspaper'
 | 
					    description           = 'Hong Kong Chinese Newspaper (http://news.mingpao.com)'
 | 
				
			||||||
    publisher             = 'news.mingpao.com'
 | 
					    publisher             = 'MingPao'
 | 
				
			||||||
    category              = 'Chinese, News, Hong Kong'
 | 
					    category              = 'Chinese, News, Hong Kong'
 | 
				
			||||||
    remove_javascript = True
 | 
					    remove_javascript = True
 | 
				
			||||||
    use_embedded_content   = False
 | 
					    use_embedded_content   = False
 | 
				
			||||||
@ -46,9 +46,10 @@ class MPHKRecipe(BasicNewsRecipe):
 | 
				
			|||||||
    masthead_url = 'http://news.mingpao.com/image/portals_top_logo_news.gif'
 | 
					    masthead_url = 'http://news.mingpao.com/image/portals_top_logo_news.gif'
 | 
				
			||||||
    keep_only_tags = [dict(name='h1'),
 | 
					    keep_only_tags = [dict(name='h1'),
 | 
				
			||||||
                      dict(name='font', attrs={'style':['font-size:14pt; line-height:160%;']}), # for entertainment page title
 | 
					                      dict(name='font', attrs={'style':['font-size:14pt; line-height:160%;']}), # for entertainment page title
 | 
				
			||||||
                      dict(attrs={'class':['photo']}),
 | 
					 | 
				
			||||||
                      dict(attrs={'id':['newscontent']}), # entertainment page content
 | 
					                      dict(attrs={'id':['newscontent']}), # entertainment page content
 | 
				
			||||||
                      dict(attrs={'id':['newscontent01','newscontent02']})]
 | 
					                      dict(attrs={'id':['newscontent01','newscontent02']}),
 | 
				
			||||||
 | 
					                      dict(attrs={'class':['photo']})
 | 
				
			||||||
 | 
					                      ]
 | 
				
			||||||
    remove_tags = [dict(name='style'),
 | 
					    remove_tags = [dict(name='style'),
 | 
				
			||||||
                   dict(attrs={'id':['newscontent135']})]  # for the finance page
 | 
					                   dict(attrs={'id':['newscontent135']})]  # for the finance page
 | 
				
			||||||
    remove_attributes = ['width']
 | 
					    remove_attributes = ['width']
 | 
				
			||||||
@ -107,6 +108,9 @@ class MPHKRecipe(BasicNewsRecipe):
 | 
				
			|||||||
    def get_fetchdate(self):
 | 
					    def get_fetchdate(self):
 | 
				
			||||||
        return self.get_dtlocal().strftime("%Y%m%d")
 | 
					        return self.get_dtlocal().strftime("%Y%m%d")
 | 
				
			||||||
 | 
					
 | 
				
			||||||
 | 
					    def get_fetchformatteddate(self):
 | 
				
			||||||
 | 
					        return self.get_dtlocal().strftime("%Y-%m-%d")
 | 
				
			||||||
 | 
					
 | 
				
			||||||
    def get_fetchday(self):
 | 
					    def get_fetchday(self):
 | 
				
			||||||
        # convert UTC to local hk time - at around HKT 6.00am, all news are available
 | 
					        # convert UTC to local hk time - at around HKT 6.00am, all news are available
 | 
				
			||||||
        return self.get_dtlocal().strftime("%d")
 | 
					        return self.get_dtlocal().strftime("%d")
 | 
				
			||||||
@ -124,13 +128,13 @@ class MPHKRecipe(BasicNewsRecipe):
 | 
				
			|||||||
        feeds = []
 | 
					        feeds = []
 | 
				
			||||||
        dateStr = self.get_fetchdate()
 | 
					        dateStr = self.get_fetchdate()
 | 
				
			||||||
        for title, url in [(u'\u8981\u805e Headline', 'http://news.mingpao.com/' + dateStr + '/gaindex.htm'),
 | 
					        for title, url in [(u'\u8981\u805e Headline', 'http://news.mingpao.com/' + dateStr + '/gaindex.htm'),
 | 
				
			||||||
                               (u'\u6559\u80b2 Education', 'http://news.mingpao.com/' + dateStr + '/gfindex.htm'),
 | 
					 | 
				
			||||||
                           (u'\u6e2f\u805e Local', 'http://news.mingpao.com/' + dateStr + '/gbindex.htm'),
 | 
					                           (u'\u6e2f\u805e Local', 'http://news.mingpao.com/' + dateStr + '/gbindex.htm'),
 | 
				
			||||||
                               (u'\u793e\u8a55\u2027\u7b46\u9663 Editorial', 'http://news.mingpao.com/' + dateStr + '/mrindex.htm'),
 | 
					                           (u'\u793e\u8a55/\u7b46\u9663 Editorial', 'http://news.mingpao.com/' + dateStr + '/mrindex.htm'),
 | 
				
			||||||
                           (u'\u8ad6\u58c7 Forum', 'http://news.mingpao.com/' + dateStr + '/faindex.htm'),
 | 
					                           (u'\u8ad6\u58c7 Forum', 'http://news.mingpao.com/' + dateStr + '/faindex.htm'),
 | 
				
			||||||
                           (u'\u4e2d\u570b China', 'http://news.mingpao.com/' + dateStr + '/caindex.htm'),
 | 
					                           (u'\u4e2d\u570b China', 'http://news.mingpao.com/' + dateStr + '/caindex.htm'),
 | 
				
			||||||
                           (u'\u570b\u969b World', 'http://news.mingpao.com/' + dateStr + '/taindex.htm'),
 | 
					                           (u'\u570b\u969b World', 'http://news.mingpao.com/' + dateStr + '/taindex.htm'),
 | 
				
			||||||
                           ('Tech News', 'http://news.mingpao.com/' + dateStr + '/naindex.htm'),
 | 
					                           ('Tech News', 'http://news.mingpao.com/' + dateStr + '/naindex.htm'),
 | 
				
			||||||
 | 
					                           (u'\u6559\u80b2 Education', 'http://news.mingpao.com/' + dateStr + '/gfindex.htm'),
 | 
				
			||||||
                           (u'\u9ad4\u80b2 Sport', 'http://news.mingpao.com/' + dateStr + '/spindex.htm'),
 | 
					                           (u'\u9ad4\u80b2 Sport', 'http://news.mingpao.com/' + dateStr + '/spindex.htm'),
 | 
				
			||||||
                           (u'\u526f\u520a Supplement', 'http://news.mingpao.com/' + dateStr + '/jaindex.htm'),
 | 
					                           (u'\u526f\u520a Supplement', 'http://news.mingpao.com/' + dateStr + '/jaindex.htm'),
 | 
				
			||||||
                           (u'\u82f1\u6587 English', 'http://news.mingpao.com/' + dateStr + '/emindex.htm')]:
 | 
					                           (u'\u82f1\u6587 English', 'http://news.mingpao.com/' + dateStr + '/emindex.htm')]:
 | 
				
			||||||
@ -141,14 +145,10 @@ class MPHKRecipe(BasicNewsRecipe):
 | 
				
			|||||||
        fin_articles = self.parse_fin_section('http://www.mpfinance.com/htm/Finance/' + dateStr + '/News/ea,eb,ecindex.htm')
 | 
					        fin_articles = self.parse_fin_section('http://www.mpfinance.com/htm/Finance/' + dateStr + '/News/ea,eb,ecindex.htm')
 | 
				
			||||||
        if fin_articles:
 | 
					        if fin_articles:
 | 
				
			||||||
            feeds.append((u'\u7d93\u6fdf Finance', fin_articles))
 | 
					            feeds.append((u'\u7d93\u6fdf Finance', fin_articles))
 | 
				
			||||||
            # special - eco-friendly
 | 
					 | 
				
			||||||
            # eco_articles = self.parse_eco_section('http://tssl.mingpao.com/htm/marketing/eco/cfm/Eco1.cfm')
 | 
					 | 
				
			||||||
            # if eco_articles:
 | 
					 | 
				
			||||||
            #   feeds.append((u'\u74b0\u4fdd Eco News', eco_articles))
 | 
					 | 
				
			||||||
        # special - entertainment
 | 
					        # special - entertainment
 | 
				
			||||||
        ent_articles = self.parse_ent_section('http://ol.mingpao.com/cfm/star1.cfm')
 | 
					        ent_articles = self.parse_ent_section('http://ol.mingpao.com/cfm/star1.cfm')
 | 
				
			||||||
        if ent_articles:
 | 
					        if ent_articles:
 | 
				
			||||||
                feeds.append((u'\u5f71\u8996 Entertainment', ent_articles))
 | 
					            feeds.append((u'\u5f71\u8996 Film/TV', ent_articles))
 | 
				
			||||||
        return feeds
 | 
					        return feeds
 | 
				
			||||||
 | 
					
 | 
				
			||||||
    def parse_section(self, url):
 | 
					    def parse_section(self, url):
 | 
				
			||||||
@ -174,31 +174,17 @@ class MPHKRecipe(BasicNewsRecipe):
 | 
				
			|||||||
        soup = self.index_to_soup(url)
 | 
					        soup = self.index_to_soup(url)
 | 
				
			||||||
        a = soup.findAll('a', href= True)
 | 
					        a = soup.findAll('a', href= True)
 | 
				
			||||||
        current_articles = []
 | 
					        current_articles = []
 | 
				
			||||||
        for i in a:
 | 
					 | 
				
			||||||
            url = i.get('href', False)
 | 
					 | 
				
			||||||
            if not url.rfind(dateStr) == -1 and url.rfind('index') == -1:
 | 
					 | 
				
			||||||
                title = self.tag_to_string(i)
 | 
					 | 
				
			||||||
                url = 'http://www.mpfinance.com/cfm/' +url
 | 
					 | 
				
			||||||
                current_articles.append({'title': title, 'url': url, 'description':''})
 | 
					 | 
				
			||||||
        return current_articles
 | 
					 | 
				
			||||||
 | 
					 | 
				
			||||||
    def parse_eco_section(self, url):
 | 
					 | 
				
			||||||
        dateStr = self.get_fetchdate()
 | 
					 | 
				
			||||||
        soup = self.index_to_soup(url)
 | 
					 | 
				
			||||||
        divs = soup.findAll(attrs={'class': ['bullet']})
 | 
					 | 
				
			||||||
        current_articles = []
 | 
					 | 
				
			||||||
        included_urls = []
 | 
					        included_urls = []
 | 
				
			||||||
        for i in divs:
 | 
					        for i in a:
 | 
				
			||||||
            a = i.find('a', href = True)
 | 
					            url = 'http://www.mpfinance.com/cfm/' + i.get('href', False)
 | 
				
			||||||
            title = self.tag_to_string(a)
 | 
					            if url not in included_urls and not url.rfind(dateStr) == -1 and url.rfind('index') == -1:
 | 
				
			||||||
            url = a.get('href', False)
 | 
					                title = self.tag_to_string(i)
 | 
				
			||||||
            url = 'http://tssl.mingpao.com/htm/marketing/eco/cfm/' +url
 | 
					 | 
				
			||||||
            if url not in included_urls and url.rfind('Redirect') == -1 and not url.rfind('.txt') == -1 and not url.rfind(dateStr) == -1:
 | 
					 | 
				
			||||||
                current_articles.append({'title': title, 'url': url, 'description':''})
 | 
					                current_articles.append({'title': title, 'url': url, 'description':''})
 | 
				
			||||||
                included_urls.append(url)
 | 
					                included_urls.append(url)
 | 
				
			||||||
        return current_articles
 | 
					        return current_articles
 | 
				
			||||||
 | 
					
 | 
				
			||||||
    def parse_ent_section(self, url):
 | 
					    def parse_ent_section(self, url):
 | 
				
			||||||
 | 
					        self.get_fetchdate()
 | 
				
			||||||
        soup = self.index_to_soup(url)
 | 
					        soup = self.index_to_soup(url)
 | 
				
			||||||
        a = soup.findAll('a', href=True)
 | 
					        a = soup.findAll('a', href=True)
 | 
				
			||||||
        a.reverse()
 | 
					        a.reverse()
 | 
				
			||||||
@ -223,18 +209,22 @@ class MPHKRecipe(BasicNewsRecipe):
 | 
				
			|||||||
        return soup
 | 
					        return soup
 | 
				
			||||||
 | 
					
 | 
				
			||||||
    def create_opf(self, feeds, dir=None):
 | 
					    def create_opf(self, feeds, dir=None):
 | 
				
			||||||
        if self.IsKindleUsed == False:
 | 
					 | 
				
			||||||
            super(MPHKRecipe,self).create_opf(feeds, dir)
 | 
					 | 
				
			||||||
            return
 | 
					 | 
				
			||||||
        if dir is None:
 | 
					        if dir is None:
 | 
				
			||||||
            dir = self.output_dir
 | 
					            dir = self.output_dir
 | 
				
			||||||
        title = self.short_title()
 | 
					        if self.IsCJKWellSupported == True:
 | 
				
			||||||
        title += ' ' + self.get_fetchdate()
 | 
					            # use Chinese title
 | 
				
			||||||
        #if self.output_profile.periodical_date_in_title:
 | 
					            title = u'\u660e\u5831 (\u9999\u6e2f) ' + self.get_fetchformatteddate()
 | 
				
			||||||
 | 
					        else:
 | 
				
			||||||
 | 
					            # use English title
 | 
				
			||||||
 | 
					            title = self.short_title() + ' ' + self.get_fetchformatteddate()
 | 
				
			||||||
 | 
					        if True:  # force date in title
 | 
				
			||||||
            #    title += strftime(self.timefmt)
 | 
					            #    title += strftime(self.timefmt)
 | 
				
			||||||
        mi = MetaInformation(title, [__appname__])
 | 
					            mi = MetaInformation(title, [self.publisher])
 | 
				
			||||||
        mi.publisher = __appname__
 | 
					            mi.publisher = self.publisher
 | 
				
			||||||
        mi.author_sort = __appname__
 | 
					            mi.author_sort = self.publisher
 | 
				
			||||||
 | 
					            if self.IsCJKWellSupported == True:
 | 
				
			||||||
 | 
					                mi.publication_type = 'periodical:'+self.publication_type+':'+self.short_title()
 | 
				
			||||||
 | 
					            else:
 | 
				
			||||||
                mi.publication_type = self.publication_type+':'+self.short_title()
 | 
					                mi.publication_type = self.publication_type+':'+self.short_title()
 | 
				
			||||||
            #mi.timestamp = nowf()
 | 
					            #mi.timestamp = nowf()
 | 
				
			||||||
            mi.timestamp = self.get_dtlocal()
 | 
					            mi.timestamp = self.get_dtlocal()
 | 
				
			||||||
@ -321,7 +311,7 @@ class MPHKRecipe(BasicNewsRecipe):
 | 
				
			|||||||
                            prefix = '/'.join('..'for i in range(2*len(re.findall(r'link\d+', last))))
 | 
					                            prefix = '/'.join('..'for i in range(2*len(re.findall(r'link\d+', last))))
 | 
				
			||||||
                            templ = self.navbar.generate(True, num, j, len(f),
 | 
					                            templ = self.navbar.generate(True, num, j, len(f),
 | 
				
			||||||
                                            not self.has_single_feed,
 | 
					                                            not self.has_single_feed,
 | 
				
			||||||
                                            a.orig_url, __appname__, prefix=prefix,
 | 
					                                            a.orig_url, self.publisher, prefix=prefix,
 | 
				
			||||||
                                            center=self.center_navbar)
 | 
					                                            center=self.center_navbar)
 | 
				
			||||||
                            elem = BeautifulSoup(templ.render(doctype='xhtml').decode('utf-8')).find('div')
 | 
					                            elem = BeautifulSoup(templ.render(doctype='xhtml').decode('utf-8')).find('div')
 | 
				
			||||||
                            body.insert(len(body.contents), elem)
 | 
					                            body.insert(len(body.contents), elem)
 | 
				
			||||||
@ -357,4 +347,3 @@ class MPHKRecipe(BasicNewsRecipe):
 | 
				
			|||||||
 | 
					
 | 
				
			||||||
        with nested(open(opf_path, 'wb'), open(ncx_path, 'wb')) as (opf_file, ncx_file):
 | 
					        with nested(open(opf_path, 'wb'), open(ncx_path, 'wb')) as (opf_file, ncx_file):
 | 
				
			||||||
            opf.render(opf_file, ncx_file)
 | 
					            opf.render(opf_file, ncx_file)
 | 
				
			||||||
 | 
					 | 
				
			||||||
 | 
				
			|||||||
		Loading…
	
	
			
			x
			
			
		
	
		Reference in New Issue
	
	Block a user