Update Ming Pao

Kovid Goyal 2011-10-19 06:06:36 +05:30
parent f31f109c23
commit 68a29c213d


@ -4,24 +4,25 @@ __copyright__ = '2010-2011, Eddie Lau'
# Region - Hong Kong, Vancouver, Toronto
__Region__ = 'Hong Kong'
# Users of Kindle 3 with limited system-level CJK support
# please replace the following "True" with "False".
# please replace the following "True" with "False". (Default: True)
__MakePeriodical__ = True
# Turn below to True if your device supports display of CJK titles
# Turn below to True if your device supports display of CJK titles (Default: False)
__UseChineseTitle__ = False
# Set it to False if you want to skip images
# Set it to False if you want to skip images (Default: True)
__KeepImages__ = True
# (HK only) Turn below to True if you wish to use life.mingpao.com as the main article source
# (HK only) Turn below to True if you wish to use life.mingpao.com as the main article source (Default: True)
__UseLife__ = True
# (HK only) Set to True to include the column section, which is now premium content
__InclCols__ = False
# (HK only) Turn below to True if you wish to parse articles in news.mingpao.com with their printer-friendly formats
__ParsePFF__ = False
# (HK only) Turn below to True if you wish to fetch hi-res images
# (HK only) Set to True to include premium content (Default: False)
__InclPremium__ = False
# (HK only) Turn below to True if you wish to parse articles in news.mingpao.com with their printer-friendly formats (Default: True)
__ParsePFF__ = True
# (HK only) Turn below to True if you wish to fetch hi-res images (Default: False)
__HiResImg__ = False
'''
Change Log:
2011/10/17: disabled fetching of premium content; improved txt source parsing
2011/10/04: option to get hi-res photos for the articles
2011/09/21: fetching "column" section is made optional.
2011/09/18: parse "column" section stuff from source text file directly.
@ -72,7 +73,7 @@ class MPRecipe(BasicNewsRecipe):
dict(attrs={'class':['content']}), # for content from txt
dict(attrs={'class':['photo']}),
dict(name='table', attrs={'width':['100%'], 'border':['0'], 'cellspacing':['5'], 'cellpadding':['0']}), # content in printed version of life.mingpao.com
dict(name='img', attrs={'width':['180'], 'alt':['按圖放大']}), # images for source from life.mingpao.com
dict(name='img', attrs={'width':['180'], 'alt':['按圖放大']}), # images for source from life.mingpao.com
dict(attrs={'class':['images']}) # for images from txt
]
if __KeepImages__:
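Each dict above is a calibre tag filter: it keeps tags whose name and attribute values match. Purely as an illustration (this is not calibre's internal matching code, and the sample HTML is made up), the img entry behaves roughly like this BeautifulSoup query:

# sketch only: an equivalent BeautifulSoup lookup for the img filter above
from calibre.ebooks.BeautifulSoup import BeautifulSoup  # the import style calibre recipes of this era use

html = u'<div><img width="180" alt="按圖放大" src="a.jpg"/><img width="60" src="b.jpg"/></div>'
soup = BeautifulSoup(html)
# a list value means "match any of the listed strings" for that attribute
hits = soup.findAll('img', attrs={'width': ['180'], 'alt': [u'按圖放大']})
# hits contains only the first <img>, the full-size photo the recipe keeps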
@ -208,11 +209,14 @@ class MPRecipe(BasicNewsRecipe):
(u'\u9ad4\u80b2 Sport', 'http://life.mingpao.com/cfm/dailynews2.cfm?Issue=' + dateStr + '&Category=nalsp', 'nal'),
(u'\u5f71\u8996 Film/TV', 'http://life.mingpao.com/cfm/dailynews2.cfm?Issue=' + dateStr + '&Category=nalma', 'nal')
]:
articles = self.parse_section2(url, keystr)
if __InclPremium__ == True:
articles = self.parse_section2_txt(url, keystr)
else:
articles = self.parse_section2(url, keystr)
if articles:
feeds.append((title, articles))
if __InclCols__ == True:
if __InclPremium__ == True:
# parse column section articles directly from .txt files
for title, url, keystr in [(u'\u5c08\u6b04 Columns', 'http://life.mingpao.com/cfm/dailynews2.cfm?Issue=' + dateStr +'&Category=ncolumn', 'ncl')
]:
@ -253,7 +257,7 @@ class MPRecipe(BasicNewsRecipe):
# feeds.append((u'\u7d93\u6fdf Finance', fin_articles))
for title, url, keystr in [(u'\u7d93\u6fdf Finance', 'http://life.mingpao.com/cfm/dailynews2.cfm?Issue=' + dateStr + '&Category=nalea', 'nal')]:
articles = self.parse_section2(url, keystr)
articles = self.parse_section2_txt(url, keystr)
if articles:
feeds.append((title, articles))
@ -270,11 +274,11 @@ class MPRecipe(BasicNewsRecipe):
for title, url, keystr in [(u'\u5f71\u8996 Film/TV', 'http://life.mingpao.com/cfm/dailynews2.cfm?Issue=' + dateStr + '&Category=nalma', 'nal')
]:
articles = self.parse_section2(url, keystr)
articles = self.parse_section2_txt(url, keystr)
if articles:
feeds.append((title, articles))
if __InclCols__ == True:
if __InclPremium__ == True:
# parse column section articles directly from .txt files
for title, url, keystr in [(u'\u5c08\u6b04 Columns', 'http://life.mingpao.com/cfm/dailynews2.cfm?Issue=' + dateStr +'&Category=ncolumn', 'ncl')
]:
@ -333,7 +337,7 @@ class MPRecipe(BasicNewsRecipe):
url = 'http://news.mingpao.com/' + dateStr + '/' +url
# replace the url to the print-friendly version
if __ParsePFF__ == True:
if url.rfind('Redirect') <> -1:
if url.rfind('Redirect') <> -1 and __InclPremium__ == True:
url = re.sub(dateStr + '.*' + dateStr, dateStr, url)
url = re.sub('%2F.*%2F', '/', url)
title = title.replace(u'\u6536\u8cbb\u5167\u5bb9', '')
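For orientation, the two re.sub calls above turn a Redirect-style link into the direct printer-friendly path: the first drops everything between the duplicated date, and the second collapses the percent-encoded '%2F' separators back to '/'. A minimal sketch, using a made-up URL purely to exercise both substitutions (real Redirect links on news.mingpao.com may be shaped differently):

import re

dateStr = '20111019'
# hypothetical redirect-style URL, invented only to show both substitutions
url = 'SpecialRedirect.cfm?date=' + dateStr + '&file=' + dateStr + '%2Fga%2Fgaa1.htm'
url = re.sub(dateStr + '.*' + dateStr, dateStr, url)  # drop everything between the duplicated dates
url = re.sub('%2F.*%2F', '/', url)                    # collapse the percent-encoded path back to '/'
# -> 'SpecialRedirect.cfm?date=20111019/gaa1.htm' for this made-up input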
@ -349,6 +353,8 @@ class MPRecipe(BasicNewsRecipe):
# parse from life.mingpao.com
def parse_section2(self, url, keystr):
br = mechanize.Browser()
br.set_handle_redirect(False)
self.get_fetchdate()
soup = self.index_to_soup(url)
a = soup.findAll('a', href=True)
@ -359,9 +365,13 @@ class MPRecipe(BasicNewsRecipe):
title = self.tag_to_string(i)
url = 'http://life.mingpao.com/cfm/' + i.get('href', False)
if (url not in included_urls) and (not url.rfind('.txt') == -1) and (not url.rfind(keystr) == -1):
url = url.replace('dailynews3.cfm', 'dailynews3a.cfm') # use printed version of the article
current_articles.append({'title': title, 'url': url, 'description': ''})
included_urls.append(url)
try:
br.open_novisit(url)
url = url.replace('dailynews3.cfm', 'dailynews3a.cfm') # use printed version of the article
current_articles.append({'title': title, 'url': url, 'description': ''})
included_urls.append(url)
except:
print 'skipping a premium article'
current_articles.reverse()
return current_articles
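The try/except added above works because redirects are no longer followed: br.set_handle_redirect(False) makes a premium article's 3xx answer surface as an exception from open_novisit, so the article is skipped instead of fetched. The same probe as a standalone sketch (the function name and usage are illustrative, not part of the recipe):

import mechanize

def is_directly_fetchable(url):
    # True when the page answers with content, False when it redirects
    # (which is how premium articles behave according to the recipe above)
    br = mechanize.Browser()
    br.set_handle_redirect(False)   # a 3xx response now raises instead of being followed
    try:
        br.open_novisit(url)        # probe without adding the page to browser history
        return True
    except Exception:
        return False

# usage sketch: keep only the articles that can be fetched directly
# urls = [u for u in candidate_urls if is_directly_fetchable(u)]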
@ -553,6 +563,7 @@ class MPRecipe(BasicNewsRecipe):
# .txt based file
splitter = re.compile(r'\n') # split the text source on newlines
new_raw_html = '<html><head><title>Untitled</title></head><body><div class="images">'
next_is_mov_link = False
next_is_img_txt = False
title_started = False
met_article_start_char = False
@ -561,22 +572,33 @@ class MPRecipe(BasicNewsRecipe):
met_article_start_char = True
new_raw_html = new_raw_html + '</div><div class="content"><p>' + item + '<p>\n'
else:
if next_is_img_txt == False:
if item.startswith('='):
if next_is_img_txt == False and next_is_mov_link == False:
item = item.strip()
if item.startswith("=@"):
next_is_mov_link = True
elif item.startswith("=?"):
next_is_img_txt = True
new_raw_html += '<img src="' + str(item)[2:].strip() + '.gif" /><p>\n'
elif item.startswith('='):
next_is_img_txt = True
new_raw_html += '<img src="' + str(item)[1:].strip() + '.jpg" /><p>\n'
else:
if met_article_start_char == False:
if title_started == False:
new_raw_html = new_raw_html + '</div><div class="heading">' + item + '\n'
title_started = True
if item <> '':
if next_is_img_txt == False and met_article_start_char == False:
if title_started == False:
#print 'Title started at ', item
new_raw_html = new_raw_html + '</div><div class="heading">' + item + '\n'
title_started = True
else:
new_raw_html = new_raw_html + item + '\n'
else:
new_raw_html = new_raw_html + item + '\n'
else:
new_raw_html = new_raw_html + item + '<p>\n'
new_raw_html = new_raw_html + item + '<p>\n'
else:
next_is_img_txt = False
new_raw_html = new_raw_html + item + '\n'
if next_is_mov_link == True:
next_is_mov_link = False
else:
next_is_img_txt = False
new_raw_html = new_raw_html + item + '\n'
return new_raw_html + '</div></body></html>'
def preprocess_html(self, soup):
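The reworked loop above dispatches on the leading marker of each line in the .txt source: '=@' introduces a movie link (the marker and the following line are dropped), '=?' names a .gif image, a bare '=' names a .jpg image, and the line after an image marker is its caption. A simplified sketch of just that marker handling (the heading and article-start bookkeeping of the real method is omitted; the line layout is assumed to match the recipe's source files):

def render_txt(lines):
    html = []
    next_is_mov_link = False   # the line after '=@' is dropped
    next_is_img_txt = False    # the line after '='/'=?' is an image caption
    for item in lines:
        item = item.strip()
        if next_is_mov_link:
            next_is_mov_link = False            # movie link: skip this line entirely
        elif next_is_img_txt:
            next_is_img_txt = False
            html.append(item)                   # caption text under the image
        elif item.startswith('=@'):
            next_is_mov_link = True
        elif item.startswith('=?'):
            next_is_img_txt = True
            html.append('<img src="' + item[2:].strip() + '.gif" /><p>')
        elif item.startswith('='):
            next_is_img_txt = True
            html.append('<img src="' + item[1:].strip() + '.jpg" /><p>')
        else:
            html.append(item + '<p>')           # ordinary body text line
    return '\n'.join(html)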
@ -678,7 +700,7 @@ class MPRecipe(BasicNewsRecipe):
if po is None:
self.play_order_counter += 1
po = self.play_order_counter
parent.add_item('%sindex.html'%adir, None, a.title if a.title else ('Untitled Article'),
parent.add_item('%sindex.html'%adir, None, a.title if a.title else _('Untitled Article'),
play_order=po, author=auth, description=desc)
last = os.path.join(self.output_dir, ('%sindex.html'%adir).replace('/', os.sep))
for sp in a.sub_pages: