Update Mingpao

2026-03-21 00:47:52 -04:00 · 2010-11-23 08:49:19 -07:00 · 2010-11-23 08:49:19 -07:00 · eaeee277f0
commit eaeee277f0
parent 3118a5dac3
1 changed files with 284 additions and 12 deletions
--- a/resources/recipes/ming_pao.recipe
+++ b/resources/recipes/ming_pao.recipe
@ -3,13 +3,28 @@ __copyright__ = '2010, Eddie Lau'
 '''
 modified from Singtao Toronto calibre recipe by rty
 Change Log:
+2010/11/22: add English section, remove eco-news section which is not updated daily, correct
+            ordering of articles
+2010/11/12: add news image and eco-news section
+2010/11/08: add parsing of finance section
+2010/11/06: temporary work-around for Kindle device having no capability to display unicode
+            in section/article list.
 2010/10/31: skip repeated articles in section pages
 '''

-import datetime
+import os, datetime, re
 from calibre.web.feeds.recipes import BasicNewsRecipe
+from contextlib import nested

-class AdvancedUserRecipe1278063072(BasicNewsRecipe):
+
+from calibre import __appname__, strftime
+from calibre.ebooks.BeautifulSoup import BeautifulSoup
+from calibre.ebooks.metadata.opf2 import OPFCreator
+from calibre.ebooks.metadata.toc import TOC
+from calibre.ebooks.metadata import MetaInformation
+from calibre.utils.date import now as nowf
+
+class MPHKRecipe(BasicNewsRecipe):
    title          = 'Ming Pao - Hong Kong'
    oldest_article = 1
    max_articles_per_feed = 100
@ -24,27 +39,131 @@ class AdvancedUserRecipe1278063072(BasicNewsRecipe):
    encoding = 'Big5-HKSCS'
    recursions = 0
    conversion_options = {'linearize_tables':True}
+    extra_css = 'img {display: block; margin-left: auto; margin-right: auto; margin-top: 10px; margin-bottom: 10px;}'
+    #extra_css = 'img {float:right; margin:4px;}'
    masthead_url = 'http://news.mingpao.com/image/portals_top_logo_news.gif'
    keep_only_tags = [dict(name='h1'),
+                      #dict(name='font', attrs={'style':['font-size:14pt; line-height:160%;']}), # for entertainment page
+                      dict(attrs={'class':['photo']}),
+                      dict(attrs={'id':['newscontent']}),
                      dict(attrs={'id':['newscontent01','newscontent02']})]
+    remove_tags = [dict(name='style'),
+                   dict(attrs={'id':['newscontent135']})]  # for the finance page
+    remove_attributes = ['width']
+    preprocess_regexps = [
+                            (re.compile(r'<h5>', re.DOTALL|re.IGNORECASE),
+                            lambda match: '<h1>'),
+                            (re.compile(r'</h5>', re.DOTALL|re.IGNORECASE),
+                            lambda match: '</h1>'),
+                         ]
+
+    def image_url_processor(cls, baseurl, url):
+        # trick: break the url at the first occurance of digit, add an additional
+        # '_' at the front
+        # not working, may need to move this to preprocess_html() method
+        #minIdx = 10000
+        #i0 = url.find('0')
+        #if i0 >= 0 and i0 < minIdx:
+        #   minIdx = i0
+        #i1 = url.find('1')
+        #if i1 >= 0 and i1 < minIdx:
+        #   minIdx = i1
+        #i2 = url.find('2')
+        #if i2 >= 0 and i2 < minIdx:
+        #   minIdx = i2
+        #i3 = url.find('3')
+        #if i3 >= 0 and i0 < minIdx:
+        #   minIdx = i3
+        #i4 = url.find('4')
+        #if i4 >= 0 and i4 < minIdx:
+        #   minIdx = i4
+        #i5 = url.find('5')
+        #if i5 >= 0 and i5 < minIdx:
+        #   minIdx = i5
+        #i6 = url.find('6')
+        #if i6 >= 0 and i6 < minIdx:
+        #   minIdx = i6
+        #i7 = url.find('7')
+        #if i7 >= 0 and i7 < minIdx:
+        #   minIdx = i7
+        #i8 = url.find('8')
+        #if i8 >= 0 and i8 < minIdx:
+        #   minIdx = i8
+        #i9 = url.find('9')
+        #if i9 >= 0 and i9 < minIdx:
+        #   minIdx = i9
+        #return url[0:minIdx] + '_' + url[minIdx+1:]
+        return url

    def get_fetchdate(self):
        dt_utc = datetime.datetime.utcnow()
-        # convert UTC to local hk time - at around HKT 5.30am, all news are available
-        dt_local = dt_utc - datetime.timedelta(-2.5/24)
+        # convert UTC to local hk time - at around HKT 6.00am, all news are available
+        dt_local = dt_utc - datetime.timedelta(-2.0/24)
        return dt_local.strftime("%Y%m%d")

    def parse_index(self):
-        feeds = []
-        dateStr = self.get_fetchdate()
-        for title, url in [(u'\u8981\u805e Headline', 'http://news.mingpao.com/' + dateStr + '/gaindex.htm'), (u'\u6559\u80b2 Education', 'http://news.mingpao.com/' + dateStr + '/gfindex.htm'), (u'\u6e2f\u805e Local', 'http://news.mingpao.com/' + dateStr + '/gbindex.htm'), (u'\u793e\u8a55\u2027\u7b46\u9663 Editorial', 'http://news.mingpao.com/' + dateStr + '/mrindex.htm'), (u'\u8ad6\u58c7 Forum', 'http://news.mingpao.com/' + dateStr + '/faindex.htm'), (u'\u4e2d\u570b China', 'http://news.mingpao.com/' + dateStr + '/caindex.htm'), (u'\u570b\u969b World', 'http://news.mingpao.com/' + dateStr + '/taindex.htm'), ('Tech News', 'http://news.mingpao.com/' + dateStr + '/naindex.htm'), (u'\u9ad4\u80b2 Sport', 'http://news.mingpao.com/' + dateStr + '/spindex.htm'), (u'\u526f\u520a Supplement', 'http://news.mingpao.com/' + dateStr + '/jaindex.htm'),]:
-            articles = self.parse_section(url)
-            if articles:
-                feeds.append((title, articles))
+            feeds = []
+            dateStr = self.get_fetchdate()
+            for title, url in [(u'\u8981\u805e Headline', 'http://news.mingpao.com/' + dateStr + '/gaindex.htm'),
+                               (u'\u6559\u80b2 Education', 'http://news.mingpao.com/' + dateStr + '/gfindex.htm'),
+                               (u'\u6e2f\u805e Local', 'http://news.mingpao.com/' + dateStr + '/gbindex.htm'),
+                               (u'\u793e\u8a55\u2027\u7b46\u9663 Editorial', 'http://news.mingpao.com/' + dateStr + '/mrindex.htm'),
+                               (u'\u8ad6\u58c7 Forum', 'http://news.mingpao.com/' + dateStr + '/faindex.htm'),
+                               (u'\u4e2d\u570b China', 'http://news.mingpao.com/' + dateStr + '/caindex.htm'),
+                               (u'\u570b\u969b World', 'http://news.mingpao.com/' + dateStr + '/taindex.htm'),
+                               ('Tech News', 'http://news.mingpao.com/' + dateStr + '/naindex.htm'),
+                               (u'\u9ad4\u80b2 Sport', 'http://news.mingpao.com/' + dateStr + '/spindex.htm'),
+                               (u'\u526f\u520a Supplement', 'http://news.mingpao.com/' + dateStr + '/jaindex.htm'),
+                               (u'\u82f1\u6587 English', 'http://news.mingpao.com/' + dateStr + '/emindex.htm')]:
+                articles = self.parse_section(url)
+                if articles:
+                    feeds.append((title, articles))
+            # special - finance
+            fin_articles = self.parse_fin_section('http://www.mpfinance.com/htm/Finance/' + dateStr + '/News/ea,eb,ecindex.htm')
+            if fin_articles:
+                feeds.append((u'\u7d93\u6fdf Finance', fin_articles))
+            # special - eco-friendly
+            # eco_articles = self.parse_eco_section('http://tssl.mingpao.com/htm/marketing/eco/cfm/Eco1.cfm')
+            # if eco_articles:
+            #   feeds.append((u'\u74b0\u4fdd Eco News', eco_articles))
+            # special - entertainment
+            #ent_articles = self.parse_ent_section('http://ol.mingpao.com/cfm/star1.cfm')
+            #if ent_articles:
+            #   feeds.append(('Entertainment', ent_articles))
            return feeds

    def parse_section(self, url):
+            dateStr = self.get_fetchdate()
+            soup = self.index_to_soup(url)
+            divs = soup.findAll(attrs={'class': ['bullet','bullet_grey']})
+            current_articles = []
+            included_urls = []
+            divs.reverse()
+            for i in divs:
+                a = i.find('a', href = True)
+                title = self.tag_to_string(a)
+                url = a.get('href', False)
+                url = 'http://news.mingpao.com/' + dateStr + '/' +url
+                if url not in included_urls and url.rfind('Redirect') == -1:
+                    current_articles.append({'title': title, 'url': url, 'description':'', 'date':''})
+                    included_urls.append(url)
+            current_articles.reverse()
+            return current_articles
+
+    def parse_fin_section(self, url):
        dateStr = self.get_fetchdate()
+        soup = self.index_to_soup(url)
+        a = soup.findAll('a', href= True)
+        current_articles = []
+        for i in a:
+            url = i.get('href', False)
+            if not url.rfind(dateStr) == -1 and url.rfind('index') == -1:
+                title = self.tag_to_string(i)
+                url = 'http://www.mpfinance.com/cfm/' +url
+                current_articles.append({'title': title, 'url': url, 'description':''})
+        return current_articles
+
+    def parse_eco_section(self, url):
        soup = self.index_to_soup(url)
        divs = soup.findAll(attrs={'class': ['bullet']})
        current_articles = []
@ -53,9 +172,162 @@ class AdvancedUserRecipe1278063072(BasicNewsRecipe):
            a = i.find('a', href = True)
            title = self.tag_to_string(a)
            url = a.get('href', False)
-            url = 'http://news.mingpao.com/' + dateStr + '/' +url
-            if url not in included_urls:
+            url = 'http://tssl.mingpao.com/htm/marketing/eco/cfm/' +url
+            if url not in included_urls and url.rfind('Redirect') == -1:
                current_articles.append({'title': title, 'url': url, 'description':''})
                included_urls.append(url)
        return current_articles

+    #def parse_ent_section(self, url):
+    #   dateStr = self.get_fetchdate()
+    #   soup = self.index_to_soup(url)
+    #   a = soup.findAll('a', href=True)
+    #   current_articles = []
+    #   included_urls = []
+    #   for i in a:
+    #       title = self.tag_to_string(i)
+    #       url = 'http://ol.mingpao.com/cfm/' + i.get('href', False)
+    #       if url not in included_urls and not url.rfind('.txt') == -1 and not url.rfind(dateStr) == -1 and not title == '':
+    #           current_articles.append({'title': title, 'url': url, 'description': ''})
+    #   return current_articles
+
+    def preprocess_html(self, soup):
+        for item in soup.findAll(style=True):
+            del item['style']
+        for item in soup.findAll(style=True):
+            del item['width']
+        for item in soup.findAll(stype=True):
+            del item['absmiddle']
+        return soup
+
+    def create_opf(self, feeds, dir=None):
+        #super(MPHKRecipe,self).create_opf(feeds, dir)
+        if dir is None:
+            dir = self.output_dir
+        title = self.short_title()
+        if self.output_profile.periodical_date_in_title:
+            title += strftime(self.timefmt)
+        mi = MetaInformation(title, [__appname__])
+        mi.publisher = __appname__
+        mi.author_sort = __appname__
+        mi.publication_type = self.publication_type+':'+self.short_title()
+        mi.timestamp = nowf()
+        mi.comments = self.description
+        if not isinstance(mi.comments, unicode):
+            mi.comments = mi.comments.decode('utf-8', 'replace')
+        mi.pubdate = nowf()
+        opf_path = os.path.join(dir, 'index.opf')
+        ncx_path = os.path.join(dir, 'index.ncx')
+        opf = OPFCreator(dir, mi)
+        # Add mastheadImage entry to <guide> section
+        mp = getattr(self, 'masthead_path', None)
+        if mp is not None and os.access(mp, os.R_OK):
+            from calibre.ebooks.metadata.opf2 import Guide
+            ref = Guide.Reference(os.path.basename(self.masthead_path), os.getcwdu())
+            ref.type = 'masthead'
+            ref.title = 'Masthead Image'
+            opf.guide.append(ref)
+
+        manifest = [os.path.join(dir, 'feed_%d'%i) for i in range(len(feeds))]
+        manifest.append(os.path.join(dir, 'index.html'))
+        manifest.append(os.path.join(dir, 'index.ncx'))
+
+        # Get cover
+        cpath = getattr(self, 'cover_path', None)
+        if cpath is None:
+            pf = open(os.path.join(dir, 'cover.jpg'), 'wb')
+            if self.default_cover(pf):
+                cpath =  pf.name
+        if cpath is not None and os.access(cpath, os.R_OK):
+            opf.cover = cpath
+            manifest.append(cpath)
+
+        # Get masthead
+        mpath = getattr(self, 'masthead_path', None)
+        if mpath is not None and os.access(mpath, os.R_OK):
+            manifest.append(mpath)
+
+        opf.create_manifest_from_files_in(manifest)
+        for mani in opf.manifest:
+            if mani.path.endswith('.ncx'):
+                mani.id = 'ncx'
+            if mani.path.endswith('mastheadImage.jpg'):
+                mani.id = 'masthead-image'
+        entries = ['index.html']
+        toc = TOC(base_path=dir)
+        self.play_order_counter = 0
+        self.play_order_map = {}
+
+        def feed_index(num, parent):
+            f = feeds[num]
+            for j, a in enumerate(f):
+                if getattr(a, 'downloaded', False):
+                    adir = 'feed_%d/article_%d/'%(num, j)
+                    auth = a.author
+                    if not auth:
+                        auth = None
+                    desc = a.text_summary
+                    if not desc:
+                        desc = None
+                    else:
+                        desc = self.description_limiter(desc)
+                    entries.append('%sindex.html'%adir)
+                    po = self.play_order_map.get(entries[-1], None)
+                    if po is None:
+                        self.play_order_counter += 1
+                        po = self.play_order_counter
+                    parent.add_item('%sindex.html'%adir, None, a.title if a.title else _('Untitled Article'),
+                                    play_order=po, author=auth, description=desc)
+                    last = os.path.join(self.output_dir, ('%sindex.html'%adir).replace('/', os.sep))
+                    for sp in a.sub_pages:
+                        prefix = os.path.commonprefix([opf_path, sp])
+                        relp = sp[len(prefix):]
+                        entries.append(relp.replace(os.sep, '/'))
+                        last = sp
+
+                    if os.path.exists(last):
+                        with open(last, 'rb') as fi:
+                            src = fi.read().decode('utf-8')
+                        soup = BeautifulSoup(src)
+                        body = soup.find('body')
+                        if body is not None:
+                            prefix = '/'.join('..'for i in range(2*len(re.findall(r'link\d+', last))))
+                            templ = self.navbar.generate(True, num, j, len(f),
+                                            not self.has_single_feed,
+                                            a.orig_url, __appname__, prefix=prefix,
+                                            center=self.center_navbar)
+                            elem = BeautifulSoup(templ.render(doctype='xhtml').decode('utf-8')).find('div')
+                            body.insert(len(body.contents), elem)
+                            with open(last, 'wb') as fi:
+                                fi.write(unicode(soup).encode('utf-8'))
+        if len(feeds) == 0:
+            raise Exception('All feeds are empty, aborting.')
+
+        if len(feeds) > 1:
+            for i, f in enumerate(feeds):
+                entries.append('feed_%d/index.html'%i)
+                po = self.play_order_map.get(entries[-1], None)
+                if po is None:
+                    self.play_order_counter += 1
+                    po = self.play_order_counter
+                auth = getattr(f, 'author', None)
+                if not auth:
+                    auth = None
+                desc = getattr(f, 'description', None)
+                if not desc:
+                    desc = None
+                feed_index(i, toc.add_item('feed_%d/index.html'%i, None,
+                    f.title, play_order=po, description=desc, author=auth))
+
+        else:
+            entries.append('feed_%d/index.html'%0)
+            feed_index(0, toc)
+
+        for i, p in enumerate(entries):
+            entries[i] = os.path.join(dir, p.replace('/', os.sep))
+        opf.create_spine(entries)
+        opf.set_toc(toc)
+
+        with nested(open(opf_path, 'wb'), open(ncx_path, 'wb')) as (opf_file, ncx_file):
+            opf.render(opf_file, ncx_file)
+