Update Ming Pao

2025-08-11 09:13:57 -04:00 · 2011-02-20 10:34:41 -07:00 · 2011-02-20 10:34:41 -07:00 · 15d1e591ae
commit 15d1e591ae
parent dba8af1f37
1 changed file with 125 additions and 136 deletions
--- a/resources/recipes/ming_pao.recipe
+++ b/resources/recipes/ming_pao.recipe
@@ -1,7 +1,9 @@
 __license__   = 'GPL v3'
-__copyright__ = '2010, Eddie Lau'
+__copyright__ = '2010-2011, Eddie Lau'
 '''
 Change Log:
+2011/02/20: skip duplicated links in finance section, put photos which may extend a whole page to the back of the articles
+            clean up the indentation
 2010/12/07: add entertainment section, use newspaper front page as ebook cover, suppress date display in section list
            (to avoid wrong date display in case the user generates the ebook in a time zone different from HKT)
 2010/11/22: add English section, remove eco-news section which is not updated daily, correct
@@ -18,21 +20,19 @@ from calibre.web.feeds.recipes import BasicNewsRecipe
 from contextlib import nested


-from calibre import __appname__
 from calibre.ebooks.BeautifulSoup import BeautifulSoup
 from calibre.ebooks.metadata.opf2 import OPFCreator
 from calibre.ebooks.metadata.toc import TOC
 from calibre.ebooks.metadata import MetaInformation

 class MPHKRecipe(BasicNewsRecipe):
-    IsKindleUsed = True  # to avoid generating periodical in which CJK characters can't be displayed in section/article view
-
+    IsCJKWellSupported = True  # Set to False to avoid generating periodical in which CJK characters can't be displayed in section/article view
    title          = 'Ming Pao - Hong Kong'
    oldest_article = 1
    max_articles_per_feed = 100
    __author__            = 'Eddie Lau'
-    description           = 'Hong Kong Chinese Newspaper'
-    publisher             = 'news.mingpao.com'
+    description           = 'Hong Kong Chinese Newspaper (http://news.mingpao.com)'
+    publisher             = 'MingPao'
    category              = 'Chinese, News, Hong Kong'
    remove_javascript = True
    use_embedded_content   = False
@@ -46,9 +46,10 @@ class MPHKRecipe(BasicNewsRecipe):
    masthead_url = 'http://news.mingpao.com/image/portals_top_logo_news.gif'
    keep_only_tags = [dict(name='h1'),
                      dict(name='font', attrs={'style':['font-size:14pt; line-height:160%;']}), # for entertainment page title
-                      dict(attrs={'class':['photo']}),
                      dict(attrs={'id':['newscontent']}), # entertainment page content
-                      dict(attrs={'id':['newscontent01','newscontent02']})]
+                      dict(attrs={'id':['newscontent01','newscontent02']}),
+                      dict(attrs={'class':['photo']})
+                      ]
    remove_tags = [dict(name='style'),
                   dict(attrs={'id':['newscontent135']})]  # for the finance page
    remove_attributes = ['width']
@@ -107,6 +108,9 @@ class MPHKRecipe(BasicNewsRecipe):
    def get_fetchdate(self):
        return self.get_dtlocal().strftime("%Y%m%d")

+    def get_fetchformatteddate(self):
+        return self.get_dtlocal().strftime("%Y-%m-%d")
+
    def get_fetchday(self):
        # convert UTC to local hk time - at around HKT 6.00am, all news are available
        return self.get_dtlocal().strftime("%d")
@@ -124,13 +128,13 @@ class MPHKRecipe(BasicNewsRecipe):
        feeds = []
        dateStr = self.get_fetchdate()
        for title, url in [(u'\u8981\u805e Headline', 'http://news.mingpao.com/' + dateStr + '/gaindex.htm'),
-                               (u'\u6559\u80b2 Education', 'http://news.mingpao.com/' + dateStr + '/gfindex.htm'),
                           (u'\u6e2f\u805e Local', 'http://news.mingpao.com/' + dateStr + '/gbindex.htm'),
-                               (u'\u793e\u8a55\u2027\u7b46\u9663 Editorial', 'http://news.mingpao.com/' + dateStr + '/mrindex.htm'),
+                           (u'\u793e\u8a55/\u7b46\u9663 Editorial', 'http://news.mingpao.com/' + dateStr + '/mrindex.htm'),
                           (u'\u8ad6\u58c7 Forum', 'http://news.mingpao.com/' + dateStr + '/faindex.htm'),
                           (u'\u4e2d\u570b China', 'http://news.mingpao.com/' + dateStr + '/caindex.htm'),
                           (u'\u570b\u969b World', 'http://news.mingpao.com/' + dateStr + '/taindex.htm'),
                           ('Tech News', 'http://news.mingpao.com/' + dateStr + '/naindex.htm'),
+                           (u'\u6559\u80b2 Education', 'http://news.mingpao.com/' + dateStr + '/gfindex.htm'),
                           (u'\u9ad4\u80b2 Sport', 'http://news.mingpao.com/' + dateStr + '/spindex.htm'),
                           (u'\u526f\u520a Supplement', 'http://news.mingpao.com/' + dateStr + '/jaindex.htm'),
                           (u'\u82f1\u6587 English', 'http://news.mingpao.com/' + dateStr + '/emindex.htm')]:
@@ -141,14 +145,10 @@ class MPHKRecipe(BasicNewsRecipe):
        fin_articles = self.parse_fin_section('http://www.mpfinance.com/htm/Finance/' + dateStr + '/News/ea,eb,ecindex.htm')
        if fin_articles:
            feeds.append((u'\u7d93\u6fdf Finance', fin_articles))
-            # special - eco-friendly
-            # eco_articles = self.parse_eco_section('http://tssl.mingpao.com/htm/marketing/eco/cfm/Eco1.cfm')
-            # if eco_articles:
-            #   feeds.append((u'\u74b0\u4fdd Eco News', eco_articles))
        # special - entertainment
        ent_articles = self.parse_ent_section('http://ol.mingpao.com/cfm/star1.cfm')
        if ent_articles:
-                feeds.append((u'\u5f71\u8996 Entertainment', ent_articles))
+            feeds.append((u'\u5f71\u8996 Film/TV', ent_articles))
        return feeds

    def parse_section(self, url):
@@ -174,31 +174,17 @@ class MPHKRecipe(BasicNewsRecipe):
        soup = self.index_to_soup(url)
        a = soup.findAll('a', href= True)
        current_articles = []
-        for i in a:
-            url = i.get('href', False)
-            if not url.rfind(dateStr) == -1 and url.rfind('index') == -1:
-                title = self.tag_to_string(i)
-                url = 'http://www.mpfinance.com/cfm/' +url
-                current_articles.append({'title': title, 'url': url, 'description':''})
-        return current_articles
-
-    def parse_eco_section(self, url):
-        dateStr = self.get_fetchdate()
-        soup = self.index_to_soup(url)
-        divs = soup.findAll(attrs={'class': ['bullet']})
-        current_articles = []
        included_urls = []
-        for i in divs:
-            a = i.find('a', href = True)
-            title = self.tag_to_string(a)
-            url = a.get('href', False)
-            url = 'http://tssl.mingpao.com/htm/marketing/eco/cfm/' +url
-            if url not in included_urls and url.rfind('Redirect') == -1 and not url.rfind('.txt') == -1 and not url.rfind(dateStr) == -1:
+        for i in a:
+            url = 'http://www.mpfinance.com/cfm/' + i.get('href', False)
+            if url not in included_urls and not url.rfind(dateStr) == -1 and url.rfind('index') == -1:
+                title = self.tag_to_string(i)
                current_articles.append({'title': title, 'url': url, 'description':''})
                included_urls.append(url)
        return current_articles

    def parse_ent_section(self, url):
+        self.get_fetchdate()
        soup = self.index_to_soup(url)
        a = soup.findAll('a', href=True)
        a.reverse()
@@ -223,18 +209,22 @@ class MPHKRecipe(BasicNewsRecipe):
        return soup

    def create_opf(self, feeds, dir=None):
-        if self.IsKindleUsed == False:
-            super(MPHKRecipe,self).create_opf(feeds, dir)
-            return
        if dir is None:
            dir = self.output_dir
-        title = self.short_title()
-        title += ' ' + self.get_fetchdate()
-        #if self.output_profile.periodical_date_in_title:
+        if self.IsCJKWellSupported == True:
+            # use Chinese title
+            title = u'\u660e\u5831 (\u9999\u6e2f) ' + self.get_fetchformatteddate()
+        else:
+            # use English title
+            title = self.short_title() + ' ' + self.get_fetchformatteddate()
+        if True:  # force date in title
            #    title += strftime(self.timefmt)
-        mi = MetaInformation(title, [__appname__])
-        mi.publisher = __appname__
-        mi.author_sort = __appname__
+            mi = MetaInformation(title, [self.publisher])
+            mi.publisher = self.publisher
+            mi.author_sort = self.publisher
+            if self.IsCJKWellSupported == True:
+                mi.publication_type = 'periodical:'+self.publication_type+':'+self.short_title()
+            else:
                mi.publication_type = self.publication_type+':'+self.short_title()
            #mi.timestamp = nowf()
            mi.timestamp = self.get_dtlocal()
@@ -321,7 +311,7 @@ class MPHKRecipe(BasicNewsRecipe):
                            prefix = '/'.join('..'for i in range(2*len(re.findall(r'link\d+', last))))
                            templ = self.navbar.generate(True, num, j, len(f),
                                            not self.has_single_feed,
-                                            a.orig_url, __appname__, prefix=prefix,
+                                            a.orig_url, self.publisher, prefix=prefix,
                                            center=self.center_navbar)
                            elem = BeautifulSoup(templ.render(doctype='xhtml').decode('utf-8')).find('div')
                            body.insert(len(body.contents), elem)
@@ -357,4 +347,3 @@ class MPHKRecipe(BasicNewsRecipe):

        with nested(open(opf_path, 'wb'), open(ncx_path, 'wb')) as (opf_file, ncx_file):
            opf.render(opf_file, ncx_file)
-