Update Ming Pao

2025-12-23 13:27:20 -05:00 · 2011-02-20 10:34:41 -07:00 · 2011-02-20 10:34:41 -07:00 · 15d1e591ae
commit 15d1e591ae
parent dba8af1f37
1 changed files with 125 additions and 136 deletions
--- a/resources/recipes/ming_pao.recipe
+++ b/resources/recipes/ming_pao.recipe
@@ -1,7 +1,9 @@
 __license__   = 'GPL v3'
-__copyright__ = '2010, Eddie Lau'
+__copyright__ = '2010-2011, Eddie Lau'
 '''
 Change Log:
+2011/02/20: skip duplicated links in finance section, put photos which may extend a whole page to the back of the articles
+            clean up the indentation
 2010/12/07: add entertainment section, use newspaper front page as ebook cover, suppress date display in section list
            (to avoid wrong date display in case the user generates the ebook in a time zone different from HKT)
 2010/11/22: add English section, remove eco-news section which is not updated daily, correct
@@ -18,21 +20,19 @@ from calibre.web.feeds.recipes import BasicNewsRecipe
 from contextlib import nested


-from calibre import __appname__
 from calibre.ebooks.BeautifulSoup import BeautifulSoup
 from calibre.ebooks.metadata.opf2 import OPFCreator
 from calibre.ebooks.metadata.toc import TOC
 from calibre.ebooks.metadata import MetaInformation

 class MPHKRecipe(BasicNewsRecipe):
-    IsKindleUsed = True  # to avoid generating periodical in which CJK characters can't be displayed in section/article view
-
+    IsCJKWellSupported = True  # Set to False to avoid generating periodical in which CJK characters can't be displayed in section/article view
    title          = 'Ming Pao - Hong Kong'
    oldest_article = 1
    max_articles_per_feed = 100
    __author__            = 'Eddie Lau'
-    description           = 'Hong Kong Chinese Newspaper'
-    publisher             = 'news.mingpao.com'
+    description           = 'Hong Kong Chinese Newspaper (http://news.mingpao.com)'
+    publisher             = 'MingPao'
    category              = 'Chinese, News, Hong Kong'
    remove_javascript = True
    use_embedded_content   = False
@@ -46,19 +46,20 @@ class MPHKRecipe(BasicNewsRecipe):
    masthead_url = 'http://news.mingpao.com/image/portals_top_logo_news.gif'
    keep_only_tags = [dict(name='h1'),
                      dict(name='font', attrs={'style':['font-size:14pt; line-height:160%;']}), # for entertainment page title
-                      dict(attrs={'class':['photo']}),
                      dict(attrs={'id':['newscontent']}), # entertainment page content
-                      dict(attrs={'id':['newscontent01','newscontent02']})]
+                      dict(attrs={'id':['newscontent01','newscontent02']}),
+                      dict(attrs={'class':['photo']})
+                      ]
    remove_tags = [dict(name='style'),
                   dict(attrs={'id':['newscontent135']})]  # for the finance page
    remove_attributes = ['width']
    preprocess_regexps = [
-                            (re.compile(r'<h5>', re.DOTALL|re.IGNORECASE),
-                            lambda match: '<h1>'),
-                            (re.compile(r'</h5>', re.DOTALL|re.IGNORECASE),
-                            lambda match: '</h1>'),
-                            (re.compile(r'<p><a href=.+?</a></p>', re.DOTALL|re.IGNORECASE), # for entertainment page
-                            lambda match: '')
+                          (re.compile(r'<h5>', re.DOTALL|re.IGNORECASE),
+                          lambda match: '<h1>'),
+                          (re.compile(r'</h5>', re.DOTALL|re.IGNORECASE),
+                          lambda match: '</h1>'),
+                          (re.compile(r'<p><a href=.+?</a></p>', re.DOTALL|re.IGNORECASE), # for entertainment page
+                          lambda match: '')
                         ]

    def image_url_processor(cls, baseurl, url):
@@ -107,6 +108,9 @@ class MPHKRecipe(BasicNewsRecipe):
    def get_fetchdate(self):
        return self.get_dtlocal().strftime("%Y%m%d")

+    def get_fetchformatteddate(self):
+        return self.get_dtlocal().strftime("%Y-%m-%d")
+
    def get_fetchday(self):
        # convert UTC to local hk time - at around HKT 6.00am, all news are available
        return self.get_dtlocal().strftime("%d")
@@ -121,84 +125,66 @@ class MPHKRecipe(BasicNewsRecipe):
        return cover

    def parse_index(self):
-            feeds = []
-            dateStr = self.get_fetchdate()
-            for title, url in [(u'\u8981\u805e Headline', 'http://news.mingpao.com/' + dateStr + '/gaindex.htm'),
-                               (u'\u6559\u80b2 Education', 'http://news.mingpao.com/' + dateStr + '/gfindex.htm'),
-                               (u'\u6e2f\u805e Local', 'http://news.mingpao.com/' + dateStr + '/gbindex.htm'),
-                               (u'\u793e\u8a55\u2027\u7b46\u9663 Editorial', 'http://news.mingpao.com/' + dateStr + '/mrindex.htm'),
-                               (u'\u8ad6\u58c7 Forum', 'http://news.mingpao.com/' + dateStr + '/faindex.htm'),
-                               (u'\u4e2d\u570b China', 'http://news.mingpao.com/' + dateStr + '/caindex.htm'),
-                               (u'\u570b\u969b World', 'http://news.mingpao.com/' + dateStr + '/taindex.htm'),
-                               ('Tech News', 'http://news.mingpao.com/' + dateStr + '/naindex.htm'),
-                               (u'\u9ad4\u80b2 Sport', 'http://news.mingpao.com/' + dateStr + '/spindex.htm'),
-                               (u'\u526f\u520a Supplement', 'http://news.mingpao.com/' + dateStr + '/jaindex.htm'),
-                               (u'\u82f1\u6587 English', 'http://news.mingpao.com/' + dateStr + '/emindex.htm')]:
-                articles = self.parse_section(url)
-                if articles:
-                    feeds.append((title, articles))
-            # special - finance
-            fin_articles = self.parse_fin_section('http://www.mpfinance.com/htm/Finance/' + dateStr + '/News/ea,eb,ecindex.htm')
-            if fin_articles:
-                feeds.append((u'\u7d93\u6fdf Finance', fin_articles))
-            # special - eco-friendly
-            # eco_articles = self.parse_eco_section('http://tssl.mingpao.com/htm/marketing/eco/cfm/Eco1.cfm')
-            # if eco_articles:
-            #   feeds.append((u'\u74b0\u4fdd Eco News', eco_articles))
-            # special - entertainment
-            ent_articles = self.parse_ent_section('http://ol.mingpao.com/cfm/star1.cfm')
-            if ent_articles:
-                feeds.append((u'\u5f71\u8996 Entertainment', ent_articles))
-            return feeds
+        feeds = []
+        dateStr = self.get_fetchdate()
+        for title, url in [(u'\u8981\u805e Headline', 'http://news.mingpao.com/' + dateStr + '/gaindex.htm'),
+                           (u'\u6e2f\u805e Local', 'http://news.mingpao.com/' + dateStr + '/gbindex.htm'),
+                           (u'\u793e\u8a55/\u7b46\u9663 Editorial', 'http://news.mingpao.com/' + dateStr + '/mrindex.htm'),
+                           (u'\u8ad6\u58c7 Forum', 'http://news.mingpao.com/' + dateStr + '/faindex.htm'),
+                           (u'\u4e2d\u570b China', 'http://news.mingpao.com/' + dateStr + '/caindex.htm'),
+                           (u'\u570b\u969b World', 'http://news.mingpao.com/' + dateStr + '/taindex.htm'),
+                           ('Tech News', 'http://news.mingpao.com/' + dateStr + '/naindex.htm'),
+                           (u'\u6559\u80b2 Education', 'http://news.mingpao.com/' + dateStr + '/gfindex.htm'),
+                           (u'\u9ad4\u80b2 Sport', 'http://news.mingpao.com/' + dateStr + '/spindex.htm'),
+                           (u'\u526f\u520a Supplement', 'http://news.mingpao.com/' + dateStr + '/jaindex.htm'),
+                           (u'\u82f1\u6587 English', 'http://news.mingpao.com/' + dateStr + '/emindex.htm')]:
+            articles = self.parse_section(url)
+            if articles:
+                feeds.append((title, articles))
+        # special - finance
+        fin_articles = self.parse_fin_section('http://www.mpfinance.com/htm/Finance/' + dateStr + '/News/ea,eb,ecindex.htm')
+        if fin_articles:
+            feeds.append((u'\u7d93\u6fdf Finance', fin_articles))
+        # special - entertainment
+        ent_articles = self.parse_ent_section('http://ol.mingpao.com/cfm/star1.cfm')
+        if ent_articles:
+            feeds.append((u'\u5f71\u8996 Film/TV', ent_articles))
+        return feeds

    def parse_section(self, url):
-            dateStr = self.get_fetchdate()
-            soup = self.index_to_soup(url)
-            divs = soup.findAll(attrs={'class': ['bullet','bullet_grey']})
-            current_articles = []
-            included_urls = []
-            divs.reverse()
-            for i in divs:
-                a = i.find('a', href = True)
-                title = self.tag_to_string(a)
-                url = a.get('href', False)
-                url = 'http://news.mingpao.com/' + dateStr + '/' +url
-                if url not in included_urls and url.rfind('Redirect') == -1:
-                    current_articles.append({'title': title, 'url': url, 'description':'', 'date':''})
-                    included_urls.append(url)
-            current_articles.reverse()
-            return current_articles
+        dateStr = self.get_fetchdate()
+        soup = self.index_to_soup(url)
+        divs = soup.findAll(attrs={'class': ['bullet','bullet_grey']})
+        current_articles = []
+        included_urls = []
+        divs.reverse()
+        for i in divs:
+            a = i.find('a', href = True)
+            title = self.tag_to_string(a)
+            url = a.get('href', False)
+            url = 'http://news.mingpao.com/' + dateStr + '/' +url
+            if url not in included_urls and url.rfind('Redirect') == -1:
+                current_articles.append({'title': title, 'url': url, 'description':'', 'date':''})
+                included_urls.append(url)
+        current_articles.reverse()
+        return current_articles

    def parse_fin_section(self, url):
        dateStr = self.get_fetchdate()
        soup = self.index_to_soup(url)
        a = soup.findAll('a', href= True)
        current_articles = []
-        for i in a:
-            url = i.get('href', False)
-            if not url.rfind(dateStr) == -1 and url.rfind('index') == -1:
-                title = self.tag_to_string(i)
-                url = 'http://www.mpfinance.com/cfm/' +url
-                current_articles.append({'title': title, 'url': url, 'description':''})
-        return current_articles
-
-    def parse_eco_section(self, url):
-        dateStr = self.get_fetchdate()
-        soup = self.index_to_soup(url)
-        divs = soup.findAll(attrs={'class': ['bullet']})
-        current_articles = []
        included_urls = []
-        for i in divs:
-            a = i.find('a', href = True)
-            title = self.tag_to_string(a)
-            url = a.get('href', False)
-            url = 'http://tssl.mingpao.com/htm/marketing/eco/cfm/' +url
-            if url not in included_urls and url.rfind('Redirect') == -1 and not url.rfind('.txt') == -1 and not url.rfind(dateStr) == -1:
+        for i in a:
+            url = 'http://www.mpfinance.com/cfm/' + i.get('href', False)
+            if url not in included_urls and not url.rfind(dateStr) == -1 and url.rfind('index') == -1:
+                title = self.tag_to_string(i)
                current_articles.append({'title': title, 'url': url, 'description':''})
                included_urls.append(url)
        return current_articles

    def parse_ent_section(self, url):
+        self.get_fetchdate()
        soup = self.index_to_soup(url)
        a = soup.findAll('a', href=True)
        a.reverse()
@@ -223,67 +209,71 @@ class MPHKRecipe(BasicNewsRecipe):
        return soup

    def create_opf(self, feeds, dir=None):
-        if self.IsKindleUsed == False:
-            super(MPHKRecipe,self).create_opf(feeds, dir)
-            return
        if dir is None:
            dir = self.output_dir
-        title = self.short_title()
-        title += ' ' + self.get_fetchdate()
-        #if self.output_profile.periodical_date_in_title:
-        #    title += strftime(self.timefmt)
-        mi = MetaInformation(title, [__appname__])
-        mi.publisher = __appname__
-        mi.author_sort = __appname__
-        mi.publication_type = self.publication_type+':'+self.short_title()
-        #mi.timestamp = nowf()
-        mi.timestamp = self.get_dtlocal()
-        mi.comments = self.description
-        if not isinstance(mi.comments, unicode):
-            mi.comments = mi.comments.decode('utf-8', 'replace')
-        #mi.pubdate = nowf()
-        mi.pubdate = self.get_dtlocal()
-        opf_path = os.path.join(dir, 'index.opf')
-        ncx_path = os.path.join(dir, 'index.ncx')
-        opf = OPFCreator(dir, mi)
-        # Add mastheadImage entry to <guide> section
-        mp = getattr(self, 'masthead_path', None)
-        if mp is not None and os.access(mp, os.R_OK):
-            from calibre.ebooks.metadata.opf2 import Guide
-            ref = Guide.Reference(os.path.basename(self.masthead_path), os.getcwdu())
-            ref.type = 'masthead'
-            ref.title = 'Masthead Image'
-            opf.guide.append(ref)
+        if self.IsCJKWellSupported == True:
+            # use Chinese title
+            title = u'\u660e\u5831 (\u9999\u6e2f) ' + self.get_fetchformatteddate()
+        else:
+            # use English title
+            title = self.short_title() + ' ' + self.get_fetchformatteddate()
+        if True:  # force date in title
+            #    title += strftime(self.timefmt)
+            mi = MetaInformation(title, [self.publisher])
+            mi.publisher = self.publisher
+            mi.author_sort = self.publisher
+            if self.IsCJKWellSupported == True:
+                mi.publication_type = 'periodical:'+self.publication_type+':'+self.short_title()
+            else:
+                mi.publication_type = self.publication_type+':'+self.short_title()
+            #mi.timestamp = nowf()
+            mi.timestamp = self.get_dtlocal()
+            mi.comments = self.description
+            if not isinstance(mi.comments, unicode):
+                mi.comments = mi.comments.decode('utf-8', 'replace')
+            #mi.pubdate = nowf()
+            mi.pubdate = self.get_dtlocal()
+            opf_path = os.path.join(dir, 'index.opf')
+            ncx_path = os.path.join(dir, 'index.ncx')
+            opf = OPFCreator(dir, mi)
+            # Add mastheadImage entry to <guide> section
+            mp = getattr(self, 'masthead_path', None)
+            if mp is not None and os.access(mp, os.R_OK):
+                from calibre.ebooks.metadata.opf2 import Guide
+                ref = Guide.Reference(os.path.basename(self.masthead_path), os.getcwdu())
+                ref.type = 'masthead'
+                ref.title = 'Masthead Image'
+                opf.guide.append(ref)

-        manifest = [os.path.join(dir, 'feed_%d'%i) for i in range(len(feeds))]
-        manifest.append(os.path.join(dir, 'index.html'))
-        manifest.append(os.path.join(dir, 'index.ncx'))
+            manifest = [os.path.join(dir, 'feed_%d'%i) for i in range(len(feeds))]
+            manifest.append(os.path.join(dir, 'index.html'))
+            manifest.append(os.path.join(dir, 'index.ncx'))

-        # Get cover
-        cpath = getattr(self, 'cover_path', None)
-        if cpath is None:
-            pf = open(os.path.join(dir, 'cover.jpg'), 'wb')
-            if self.default_cover(pf):
-                cpath =  pf.name
-        if cpath is not None and os.access(cpath, os.R_OK):
-            opf.cover = cpath
-            manifest.append(cpath)
+            # Get cover
+            cpath = getattr(self, 'cover_path', None)
+            if cpath is None:
+                pf = open(os.path.join(dir, 'cover.jpg'), 'wb')
+                if self.default_cover(pf):
+                    cpath =  pf.name
+            if cpath is not None and os.access(cpath, os.R_OK):
+                opf.cover = cpath
+                manifest.append(cpath)

-        # Get masthead
-        mpath = getattr(self, 'masthead_path', None)
-        if mpath is not None and os.access(mpath, os.R_OK):
-            manifest.append(mpath)
+            # Get masthead
+            mpath = getattr(self, 'masthead_path', None)
+            if mpath is not None and os.access(mpath, os.R_OK):
+                manifest.append(mpath)

-        opf.create_manifest_from_files_in(manifest)
-        for mani in opf.manifest:
-            if mani.path.endswith('.ncx'):
-                mani.id = 'ncx'
-            if mani.path.endswith('mastheadImage.jpg'):
-                mani.id = 'masthead-image'
-        entries = ['index.html']
-        toc = TOC(base_path=dir)
-        self.play_order_counter = 0
-        self.play_order_map = {}
+            opf.create_manifest_from_files_in(manifest)
+            for mani in opf.manifest:
+                if mani.path.endswith('.ncx'):
+                    mani.id = 'ncx'
+                if mani.path.endswith('mastheadImage.jpg'):
+                    mani.id = 'masthead-image'
+            entries = ['index.html']
+            toc = TOC(base_path=dir)
+            self.play_order_counter = 0
+            self.play_order_map = {}

        def feed_index(num, parent):
            f = feeds[num]
@@ -321,7 +311,7 @@ class MPHKRecipe(BasicNewsRecipe):
                            prefix = '/'.join('..'for i in range(2*len(re.findall(r'link\d+', last))))
                            templ = self.navbar.generate(True, num, j, len(f),
                                            not self.has_single_feed,
-                                            a.orig_url, __appname__, prefix=prefix,
+                                            a.orig_url, self.publisher, prefix=prefix,
                                            center=self.center_navbar)
                            elem = BeautifulSoup(templ.render(doctype='xhtml').decode('utf-8')).find('div')
                            body.insert(len(body.contents), elem)
@@ -344,7 +334,7 @@ class MPHKRecipe(BasicNewsRecipe):
                if not desc:
                    desc = None
                feed_index(i, toc.add_item('feed_%d/index.html'%i, None,
-                    f.title, play_order=po, description=desc, author=auth))
+                           f.title, play_order=po, description=desc, author=auth))

        else:
            entries.append('feed_%d/index.html'%0)
@@ -357,4 +347,3 @@ class MPHKRecipe(BasicNewsRecipe):

        with nested(open(opf_path, 'wb'), open(ncx_path, 'wb')) as (opf_file, ncx_file):
            opf.render(opf_file, ncx_file)
-