Add Ming Pao Vancouver and Toronto by Eddie Lau

2025-08-30 23:00:21 -04:00 · 2011-06-26 14:42:16 -06:00 · 2011-06-26 14:42:16 -06:00 · eb0358025d
commit eb0358025d
parent 6747188703
3 changed files with 1414 additions and 108 deletions
--- a/recipes/ming_pao.recipe
+++ b/recipes/ming_pao.recipe
@ -1,17 +1,23 @@
-# -*- coding: utf-8 -*-
 __license__   = 'GPL v3'
 __copyright__ = '2010-2011, Eddie Lau'

+# Region - set to one of 'Hong Kong', 'Vancouver' or 'Toronto'
+__Region__ = 'Hong Kong'
 # Users of Kindle 3 with limited system-level CJK support
 # please replace the following "True" with "False".
 __MakePeriodical__ = True
 # Turn below to true if your device supports display of CJK titles
 __UseChineseTitle__ = False
-# Trun below to true if you wish to use life.mingpao.com as the main article source
+# Set it to False if you want to skip images
+__KeepImages__ = True
+# (HK only) Turn below to true if you wish to use life.mingpao.com as the main article source
 __UseLife__ = True

+
 '''
 Change Log:
+2011/06/26: add fetching Vancouver and Toronto versions of the paper, also provide captions for images using life.mingpao fetch source
+            provide options to remove all images in the file
 2011/05/12: switch the main parse source to life.mingpao.com, which has more photos on the article pages
 2011/03/06: add new articles for finance section, also a new section "Columns"
 2011/02/28: rearrange the sections
@ -34,21 +40,96 @@ Change Log:
 import os, datetime, re
 from calibre.web.feeds.recipes import BasicNewsRecipe
 from contextlib import nested
-
-
 from calibre.ebooks.BeautifulSoup import BeautifulSoup
 from calibre.ebooks.metadata.opf2 import OPFCreator
 from calibre.ebooks.metadata.toc import TOC
 from calibre.ebooks.metadata import MetaInformation

-class MPHKRecipe(BasicNewsRecipe):
-    title          = 'Ming Pao - Hong Kong'
+# MAIN CLASS
+class MPRecipe(BasicNewsRecipe):
+    if __Region__ == 'Hong Kong':
+        title       = 'Ming Pao - Hong Kong'
+        description = 'Hong Kong Chinese Newspaper (http://news.mingpao.com)'
+        category    = 'Chinese, News, Hong Kong'
+        extra_css = 'img {display: block; margin-left: auto; margin-right: auto; margin-top: 10px; margin-bottom: 10px;} font>b {font-size:200%; font-weight:bold;}'
+        masthead_url = 'http://news.mingpao.com/image/portals_top_logo_news.gif'
+        keep_only_tags = [dict(name='h1'),
+                          dict(name='font', attrs={'style':['font-size:14pt; line-height:160%;']}), # for entertainment page title
+                          dict(name='font', attrs={'color':['AA0000']}), # for column articles title
+                          dict(attrs={'id':['newscontent']}), # entertainment and column page content
+                          dict(attrs={'id':['newscontent01','newscontent02']}),
+                          dict(attrs={'class':['photo']}),
+                          dict(name='table', attrs={'width':['100%'], 'border':['0'], 'cellspacing':['5'], 'cellpadding':['0']}),  # content in printed version of life.mingpao.com
+                          dict(name='img', attrs={'width':['180'], 'alt':['按圖放大']}) # images for source from life.mingpao.com
+                          ]
+        if __KeepImages__:
+            remove_tags = [dict(name='style'),
+                           dict(attrs={'id':['newscontent135']}),  # for the finance page from mpfinance.com
+                           dict(name='font', attrs={'size':['2'], 'color':['666666']}), # article date in life.mingpao.com article
+                           #dict(name='table')  # for content fetched from life.mingpao.com
+                          ]
+        else:
+            remove_tags = [dict(name='style'),
+                           dict(attrs={'id':['newscontent135']}),  # for the finance page from mpfinance.com
+                           dict(name='font', attrs={'size':['2'], 'color':['666666']}), # article date in life.mingpao.com article
+                           dict(name='img'),
+                           #dict(name='table')  # for content fetched from life.mingpao.com
+                          ]
+        remove_attributes = ['width']
+        preprocess_regexps = [
+                              (re.compile(r'<h5>', re.DOTALL|re.IGNORECASE),
+                              lambda match: '<h1>'),
+                              (re.compile(r'</h5>', re.DOTALL|re.IGNORECASE),
+                              lambda match: '</h1>'),
+                              (re.compile(r'<p><a href=.+?</a></p>', re.DOTALL|re.IGNORECASE), # for entertainment page
+                              lambda match: ''),
+                              # skip <br> after title in life.mingpao.com fetched article
+                              (re.compile(r"<div id='newscontent'><br>", re.DOTALL|re.IGNORECASE),
+                              lambda match: "<div id='newscontent'>"),
+                              (re.compile(r"<br><br></b>", re.DOTALL|re.IGNORECASE),
+                              lambda match: "</b>")
+                             ]
+    elif __Region__ == 'Vancouver':
+        title       = 'Ming Pao - Vancouver'
+        description = 'Vancouver Chinese Newspaper (http://www.mingpaovan.com)'
+        category    = 'Chinese, News, Vancouver'
+        extra_css   = 'img {display: block; margin-left: auto; margin-right: auto; margin-top: 10px; margin-bottom: 10px;} b>font {font-size:200%; font-weight:bold;}'
+        masthead_url = 'http://www.mingpaovan.com/image/mainlogo2_VAN2.gif'
+        keep_only_tags = [dict(name='table', attrs={'width':['450'], 'border':['0'], 'cellspacing':['0'], 'cellpadding':['1']}),
+                          dict(name='table', attrs={'width':['450'], 'border':['0'], 'cellspacing':['3'], 'cellpadding':['3'], 'id':['tblContent3']}),
+                          dict(name='table', attrs={'width':['180'], 'border':['0'], 'cellspacing':['0'], 'cellpadding':['0'], 'bgcolor':['F0F0F0']}),
+                          ]
+        if __KeepImages__:
+            remove_tags = [dict(name='img', attrs={'src':['../../../image/magnifier.gif']})]  # the magnifier icon
+        else:
+            remove_tags = [dict(name='img')]
+        remove_attributes = ['width']
+        preprocess_regexps = [(re.compile(r'&nbsp;', re.DOTALL|re.IGNORECASE),
+                              lambda match: ''),
+                             ]
+    elif __Region__ == 'Toronto':
+        title       = 'Ming Pao - Toronto'
+        description = 'Toronto Chinese Newspaper (http://www.mingpaotor.com)'
+        category    = 'Chinese, News, Toronto'
+        extra_css   = 'img {display: block; margin-left: auto; margin-right: auto; margin-top: 10px; margin-bottom: 10px;} b>font {font-size:200%; font-weight:bold;}'
+        masthead_url = 'http://www.mingpaotor.com/image/mainlogo2_TOR2.gif'
+        keep_only_tags = [dict(name='table', attrs={'width':['450'], 'border':['0'], 'cellspacing':['0'], 'cellpadding':['1']}),
+                          dict(name='table', attrs={'width':['450'], 'border':['0'], 'cellspacing':['3'], 'cellpadding':['3'], 'id':['tblContent3']}),
+                          dict(name='table', attrs={'width':['180'], 'border':['0'], 'cellspacing':['0'], 'cellpadding':['0'], 'bgcolor':['F0F0F0']}),
+                          ]
+        if __KeepImages__:
+            remove_tags = [dict(name='img', attrs={'src':['../../../image/magnifier.gif']})]  # the magnifier icon
+        else:
+            remove_tags = [dict(name='img')]
+        remove_attributes = ['width']
+        preprocess_regexps = [(re.compile(r'&nbsp;', re.DOTALL|re.IGNORECASE),
+                              lambda match: ''),
+                             ]
+
    oldest_article = 1
    max_articles_per_feed = 100
    __author__            = 'Eddie Lau'
-    description           = 'Hong Kong Chinese Newspaper (http://news.mingpao.com)'
    publisher             = 'MingPao'
-    category              = 'Chinese, News, Hong Kong'
    remove_javascript = True
    use_embedded_content   = False
    no_stylesheets = True
@ -57,33 +138,6 @@ class MPHKRecipe(BasicNewsRecipe):
    recursions = 0
    conversion_options = {'linearize_tables':True}
    timefmt = ''
-    extra_css = 'img {display: block; margin-left: auto; margin-right: auto; margin-top: 10px; margin-bottom: 10px;} font>b {font-size:200%; font-weight:bold;}'
-    masthead_url = 'http://news.mingpao.com/image/portals_top_logo_news.gif'
-    keep_only_tags = [dict(name='h1'),
-                      dict(name='font', attrs={'style':['font-size:14pt; line-height:160%;']}), # for entertainment page title
-                      dict(name='font', attrs={'color':['AA0000']}), # for column articles title
-                      dict(attrs={'id':['newscontent']}), # entertainment and column page content
-                      dict(attrs={'id':['newscontent01','newscontent02']}),
-                      dict(attrs={'class':['photo']}),
-                      dict(name='img', attrs={'width':['180'], 'alt':['按圖放大']}) # images for source from life.mingpao.com
-                      ]
-    remove_tags = [dict(name='style'),
-                   dict(attrs={'id':['newscontent135']}),  # for the finance page from mpfinance.com
-                   dict(name='table')]  # for content fetched from life.mingpao.com
-    remove_attributes = ['width']
-    preprocess_regexps = [
-                          (re.compile(r'<h5>', re.DOTALL|re.IGNORECASE),
-                          lambda match: '<h1>'),
-                          (re.compile(r'</h5>', re.DOTALL|re.IGNORECASE),
-                          lambda match: '</h1>'),
-                          (re.compile(r'<p><a href=.+?</a></p>', re.DOTALL|re.IGNORECASE), # for entertainment page
-                          lambda match: ''),
-                          # skip <br> after title in life.mingpao.com fetched article
-                          (re.compile(r"<div id='newscontent'><br>", re.DOTALL|re.IGNORECASE),
-                          lambda match: "<div id='newscontent'>"),
-                          (re.compile(r"<br><br></b>", re.DOTALL|re.IGNORECASE),
-                          lambda match: "</b>")
-                         ]

    def image_url_processor(cls, baseurl, url):
        # trick: break the url at the first occurrence of a digit, add an additional
@ -124,8 +178,18 @@ class MPHKRecipe(BasicNewsRecipe):

    def get_dtlocal(self):
        dt_utc = datetime.datetime.utcnow()
-        # convert UTC to local hk time - at around HKT 6.00am, all news are available
-        dt_local = dt_utc - datetime.timedelta(-2.0/24)
+        if __Region__ == 'Hong Kong':
+            # shift UTC to HKT (UTC+8), then back 4.5 hours, so the issue date rolls over at 4:30am HKT, when all news is available
+            dt_local = dt_utc + datetime.timedelta(8.0/24) - datetime.timedelta(4.5/24)
+            # dt_local = dt_utc.astimezone(pytz.timezone('Asia/Hong_Kong')) - datetime.timedelta(4.5/24)
+        elif __Region__ == 'Vancouver':
+            # shift UTC to Vancouver time (fixed UTC-8 offset, i.e. PST), then back 4.5 hours, so the issue date rolls over at 4:30am PST, when all news is available
+            dt_local = dt_utc + datetime.timedelta(-8.0/24) - datetime.timedelta(4.5/24)
+            #dt_local = dt_utc.astimezone(pytz.timezone('America/Vancouver')) - datetime.timedelta(4.5/24)
+        elif __Region__ == 'Toronto':
+            # shift UTC to Toronto time (fixed UTC-5 offset, i.e. EST), then back 4.5 hours, so the issue date rolls over at 4:30am EST, when all news is available
+            dt_local = dt_utc + datetime.timedelta(-5.0/24) - datetime.timedelta(4.5/24)
+            #dt_local = dt_utc.astimezone(pytz.timezone('America/Toronto')) - datetime.timedelta(4.5/24)
        return dt_local

    def get_fetchdate(self):
@ -135,13 +199,15 @@ class MPHKRecipe(BasicNewsRecipe):
        return self.get_dtlocal().strftime("%Y-%m-%d")

    def get_fetchday(self):
-        # dt_utc = datetime.datetime.utcnow()
-        # convert UTC to local hk time - at around HKT 6.00am, all news are available
-        # dt_local = dt_utc - datetime.timedelta(-2.0/24)
        return self.get_dtlocal().strftime("%d")

    def get_cover_url(self):
-        cover = 'http://news.mingpao.com/' + self.get_fetchdate() + '/' + self.get_fetchdate() + '_' + self.get_fetchday() + 'gacov.jpg'
+        if __Region__ == 'Hong Kong':
+            cover = 'http://news.mingpao.com/' + self.get_fetchdate() + '/' + self.get_fetchdate() + '_' + self.get_fetchday() + 'gacov.jpg'
+        elif __Region__ == 'Vancouver':
+            cover = 'http://www.mingpaovan.com/ftp/News/' + self.get_fetchdate() + '/' + self.get_fetchday() + 'pgva1s.jpg'
+        elif __Region__ == 'Toronto':
+            cover = 'http://www.mingpaotor.com/ftp/News/' + self.get_fetchdate() + '/' + self.get_fetchday() + 'pgtas.jpg'
        br = BasicNewsRecipe.get_browser()
        try:
            br.open(cover)
@ -153,76 +219,104 @@ class MPHKRecipe(BasicNewsRecipe):
        feeds = []
        dateStr = self.get_fetchdate()

-        if __UseLife__:
-            for title, url, keystr in [(u'\u8981\u805e Headline', 'http://life.mingpao.com/cfm/dailynews2.cfm?Issue=' + dateStr + '&Category=nalga', 'nal'),
-                                       (u'\u6e2f\u805e Local', 'http://life.mingpao.com/cfm/dailynews2.cfm?Issue=' + dateStr + '&Category=nalgb', 'nal'),
-                                       (u'\u6559\u80b2 Education', 'http://life.mingpao.com/cfm/dailynews2.cfm?Issue=' + dateStr + '&Category=nalgf', 'nal'),
-                                       (u'\u793e\u8a55/\u7b46\u9663 Editorial', 'http://life.mingpao.com/cfm/dailynews2.cfm?Issue=' + dateStr +'&Category=nalmr', 'nal'),
-                                       (u'\u8ad6\u58c7 Forum', 'http://life.mingpao.com/cfm/dailynews2.cfm?Issue=' + dateStr +'&Category=nalfa', 'nal'),
-                                       (u'\u4e2d\u570b China', 'http://life.mingpao.com/cfm/dailynews2.cfm?Issue=' + dateStr +'&Category=nalca', 'nal'),
-                                       (u'\u570b\u969b World', 'http://life.mingpao.com/cfm/dailynews2.cfm?Issue=' + dateStr +'&Category=nalta', 'nal'),
-                                       (u'\u7d93\u6fdf Finance', 'http://life.mingpao.com/cfm/dailynews2.cfm?Issue=' + dateStr + '&Category=nalea', 'nal'),
-                                       (u'\u9ad4\u80b2 Sport', 'http://life.mingpao.com/cfm/dailynews2.cfm?Issue=' + dateStr + '&Category=nalsp', 'nal'),
-                                       (u'\u5f71\u8996 Film/TV', 'http://life.mingpao.com/cfm/dailynews2.cfm?Issue=' + dateStr + '&Category=nalma', 'nal'),
-                                       (u'\u5c08\u6b04 Columns', 'http://life.mingpao.com/cfm/dailynews2.cfm?Issue=' + dateStr +'&Category=ncolumn', 'ncl')]:
-                articles = self.parse_section2(url, keystr)
+        if __Region__ == 'Hong Kong':
+            if __UseLife__:
+                for title, url, keystr in [(u'\u8981\u805e Headline', 'http://life.mingpao.com/cfm/dailynews2.cfm?Issue=' + dateStr + '&Category=nalga', 'nal'),
+                                           (u'\u6e2f\u805e Local', 'http://life.mingpao.com/cfm/dailynews2.cfm?Issue=' + dateStr + '&Category=nalgb', 'nal'),
+                                           (u'\u6559\u80b2 Education', 'http://life.mingpao.com/cfm/dailynews2.cfm?Issue=' + dateStr + '&Category=nalgf', 'nal'),
+                                           (u'\u793e\u8a55/\u7b46\u9663 Editorial', 'http://life.mingpao.com/cfm/dailynews2.cfm?Issue=' + dateStr +'&Category=nalmr', 'nal'),
+                                           (u'\u8ad6\u58c7 Forum', 'http://life.mingpao.com/cfm/dailynews2.cfm?Issue=' + dateStr +'&Category=nalfa', 'nal'),
+                                           (u'\u4e2d\u570b China', 'http://life.mingpao.com/cfm/dailynews2.cfm?Issue=' + dateStr +'&Category=nalca', 'nal'),
+                                           (u'\u570b\u969b World', 'http://life.mingpao.com/cfm/dailynews2.cfm?Issue=' + dateStr +'&Category=nalta', 'nal'),
+                                           (u'\u7d93\u6fdf Finance', 'http://life.mingpao.com/cfm/dailynews2.cfm?Issue=' + dateStr + '&Category=nalea', 'nal'),
+                                           (u'\u9ad4\u80b2 Sport', 'http://life.mingpao.com/cfm/dailynews2.cfm?Issue=' + dateStr + '&Category=nalsp', 'nal'),
+                                           (u'\u5f71\u8996 Film/TV', 'http://life.mingpao.com/cfm/dailynews2.cfm?Issue=' + dateStr + '&Category=nalma', 'nal'),
+                                           (u'\u5c08\u6b04 Columns', 'http://life.mingpao.com/cfm/dailynews2.cfm?Issue=' + dateStr +'&Category=ncolumn', 'ncl')]:
+                    articles = self.parse_section2(url, keystr)
+                    if articles:
+                        feeds.append((title, articles))
+
+                for title, url in [(u'\u526f\u520a Supplement', 'http://news.mingpao.com/' + dateStr + '/jaindex.htm'),
+                                   (u'\u82f1\u6587 English', 'http://news.mingpao.com/' + dateStr + '/emindex.htm')]:
+                    articles = self.parse_section(url)
+                    if articles:
+                        feeds.append((title, articles))
+            else:
+                for title, url in [(u'\u8981\u805e Headline', 'http://news.mingpao.com/' + dateStr + '/gaindex.htm'),
+                                   (u'\u6e2f\u805e Local', 'http://news.mingpao.com/' + dateStr + '/gbindex.htm'),
+                                   (u'\u6559\u80b2 Education', 'http://news.mingpao.com/' + dateStr + '/gfindex.htm')]:
+                    articles = self.parse_section(url)
+                    if articles:
+                        feeds.append((title, articles))
+
+                # special- editorial
+                ed_articles = self.parse_ed_section('http://life.mingpao.com/cfm/dailynews2.cfm?Issue=' + dateStr +'&Category=nalmr')
+                if ed_articles:
+                    feeds.append((u'\u793e\u8a55/\u7b46\u9663 Editorial', ed_articles))
+
+                for title, url in [(u'\u8ad6\u58c7 Forum', 'http://news.mingpao.com/' + dateStr + '/faindex.htm'),
+                                   (u'\u4e2d\u570b China', 'http://news.mingpao.com/' + dateStr + '/caindex.htm'),
+                                   (u'\u570b\u969b World', 'http://news.mingpao.com/' + dateStr + '/taindex.htm')]:
+                    articles = self.parse_section(url)
+                    if articles:
+                        feeds.append((title, articles))
+
+                # special - finance
+                #fin_articles = self.parse_fin_section('http://www.mpfinance.com/htm/Finance/' + dateStr + '/News/ea,eb,ecindex.htm')
+                fin_articles = self.parse_fin_section('http://life.mingpao.com/cfm/dailynews2.cfm?Issue=' + dateStr + '&Category=nalea')
+                if fin_articles:
+                    feeds.append((u'\u7d93\u6fdf Finance', fin_articles))
+
+                for title, url in [('Tech News', 'http://news.mingpao.com/' + dateStr + '/naindex.htm'),
+                                   (u'\u9ad4\u80b2 Sport', 'http://news.mingpao.com/' + dateStr + '/spindex.htm')]:
+                    articles = self.parse_section(url)
+                    if articles:
+                        feeds.append((title, articles))
+
+                # special - entertainment
+                ent_articles = self.parse_ent_section('http://ol.mingpao.com/cfm/star1.cfm')
+                if ent_articles:
+                    feeds.append((u'\u5f71\u8996 Film/TV', ent_articles))
+
+                for title, url in [(u'\u526f\u520a Supplement', 'http://news.mingpao.com/' + dateStr + '/jaindex.htm'),
+                                   (u'\u82f1\u6587 English', 'http://news.mingpao.com/' + dateStr + '/emindex.htm')]:
+                    articles = self.parse_section(url)
+                    if articles:
+                        feeds.append((title, articles))
+
+
+                # special- columns
+                col_articles = self.parse_col_section('http://life.mingpao.com/cfm/dailynews2.cfm?Issue=' + dateStr +'&Category=ncolumn')
+                if col_articles:
+                    feeds.append((u'\u5c08\u6b04 Columns', col_articles))
+        elif __Region__ == 'Vancouver':
+            for title, url in [(u'\u8981\u805e Headline', 'http://www.mingpaovan.com/htm/News/' + dateStr + '/VAindex.htm'),
+                               (u'\u52a0\u570b Canada', 'http://www.mingpaovan.com/htm/News/' + dateStr + '/VBindex.htm'),
+                               (u'\u793e\u5340 Local', 'http://www.mingpaovan.com/htm/News/' + dateStr + '/VDindex.htm'),
+                               (u'\u6e2f\u805e Hong Kong', 'http://www.mingpaovan.com/htm/News/' + dateStr + '/HK-VGindex.htm'),
+                               (u'\u570b\u969b World', 'http://www.mingpaovan.com/htm/News/' + dateStr + '/VTindex.htm'),
+                               (u'\u4e2d\u570b China', 'http://www.mingpaovan.com/htm/News/' + dateStr + '/VCindex.htm'),
+                               (u'\u7d93\u6fdf Economics', 'http://www.mingpaovan.com/htm/News/' + dateStr + '/VEindex.htm'),
+                               (u'\u9ad4\u80b2 Sports', 'http://www.mingpaovan.com/htm/News/' + dateStr + '/VSindex.htm'),
+                               (u'\u5f71\u8996 Film/TV', 'http://www.mingpaovan.com/htm/News/' + dateStr + '/HK-MAindex.htm'),
+                               (u'\u526f\u520a Supplements', 'http://www.mingpaovan.com/htm/News/' + dateStr + '/WWindex.htm'),]:
+                articles = self.parse_section3(url, 'http://www.mingpaovan.com/')
                if articles:
                    feeds.append((title, articles))
-
-            for title, url in [(u'\u526f\u520a Supplement', 'http://news.mingpao.com/' + dateStr + '/jaindex.htm'),
-                               (u'\u82f1\u6587 English', 'http://news.mingpao.com/' + dateStr + '/emindex.htm')]:
-                articles = self.parse_section(url)
+        elif __Region__ == 'Toronto':
+            for title, url in [(u'\u8981\u805e Headline', 'http://www.mingpaotor.com/htm/News/' + dateStr + '/TAindex.htm'),
+                               (u'\u52a0\u570b Canada', 'http://www.mingpaotor.com/htm/News/' + dateStr + '/TDindex.htm'),
+                               (u'\u793e\u5340 Local', 'http://www.mingpaotor.com/htm/News/' + dateStr + '/TFindex.htm'),
+                               (u'\u4e2d\u570b China', 'http://www.mingpaotor.com/htm/News/' + dateStr + '/TCAindex.htm'),
+                               (u'\u570b\u969b World', 'http://www.mingpaotor.com/htm/News/' + dateStr + '/TTAindex.htm'),
+                               (u'\u6e2f\u805e Hong Kong', 'http://www.mingpaotor.com/htm/News/' + dateStr + '/HK-GAindex.htm'),
+                               (u'\u7d93\u6fdf Economics', 'http://www.mingpaotor.com/htm/News/' + dateStr + '/THindex.htm'),
+                               (u'\u9ad4\u80b2 Sports', 'http://www.mingpaotor.com/htm/News/' + dateStr + '/TSindex.htm'),
+                               (u'\u5f71\u8996 Film/TV', 'http://www.mingpaotor.com/htm/News/' + dateStr + '/HK-MAindex.htm'),
+                               (u'\u526f\u520a Supplements', 'http://www.mingpaotor.com/htm/News/' + dateStr + '/WWindex.htm'),]:
+                articles = self.parse_section3(url, 'http://www.mingpaotor.com/')
                if articles:
                    feeds.append((title, articles))
-        else:
-            for title, url in [(u'\u8981\u805e Headline', 'http://news.mingpao.com/' + dateStr + '/gaindex.htm'),
-                               (u'\u6e2f\u805e Local', 'http://news.mingpao.com/' + dateStr + '/gbindex.htm'),
-                               (u'\u6559\u80b2 Education', 'http://news.mingpao.com/' + dateStr + '/gfindex.htm')]:
-                articles = self.parse_section(url)
-                if articles:
-                    feeds.append((title, articles))
-
-            # special- editorial
-            ed_articles = self.parse_ed_section('http://life.mingpao.com/cfm/dailynews2.cfm?Issue=' + dateStr +'&Category=nalmr')
-            if ed_articles:
-                feeds.append((u'\u793e\u8a55/\u7b46\u9663 Editorial', ed_articles))
-
-            for title, url in [(u'\u8ad6\u58c7 Forum', 'http://news.mingpao.com/' + dateStr + '/faindex.htm'),
-                               (u'\u4e2d\u570b China', 'http://news.mingpao.com/' + dateStr + '/caindex.htm'),
-                               (u'\u570b\u969b World', 'http://news.mingpao.com/' + dateStr + '/taindex.htm')]:
-                articles = self.parse_section(url)
-                if articles:
-                    feeds.append((title, articles))
-
-            # special - finance
-            #fin_articles = self.parse_fin_section('http://www.mpfinance.com/htm/Finance/' + dateStr + '/News/ea,eb,ecindex.htm')
-            fin_articles = self.parse_fin_section('http://life.mingpao.com/cfm/dailynews2.cfm?Issue=' + dateStr + '&Category=nalea')
-            if fin_articles:
-                feeds.append((u'\u7d93\u6fdf Finance', fin_articles))
-
-            for title, url in [('Tech News', 'http://news.mingpao.com/' + dateStr + '/naindex.htm'),
-                               (u'\u9ad4\u80b2 Sport', 'http://news.mingpao.com/' + dateStr + '/spindex.htm')]:
-                articles = self.parse_section(url)
-                if articles:
-                    feeds.append((title, articles))
-
-            # special - entertainment
-            ent_articles = self.parse_ent_section('http://ol.mingpao.com/cfm/star1.cfm')
-            if ent_articles:
-                feeds.append((u'\u5f71\u8996 Film/TV', ent_articles))
-
-            for title, url in [(u'\u526f\u520a Supplement', 'http://news.mingpao.com/' + dateStr + '/jaindex.htm'),
-                               (u'\u82f1\u6587 English', 'http://news.mingpao.com/' + dateStr + '/emindex.htm')]:
-                articles = self.parse_section(url)
-                if articles:
-                    feeds.append((title, articles))
-
-
-            # special- columns
-            col_articles = self.parse_col_section('http://life.mingpao.com/cfm/dailynews2.cfm?Issue=' + dateStr +'&Category=ncolumn')
-            if col_articles:
-                feeds.append((u'\u5c08\u6b04 Columns', col_articles))
-
        return feeds

    # parse from news.mingpao.com
@ -256,11 +350,30 @@ class MPHKRecipe(BasicNewsRecipe):
            title = self.tag_to_string(i)
            url = 'http://life.mingpao.com/cfm/' + i.get('href', False)
            if (url not in included_urls) and (not url.rfind('.txt') == -1) and (not url.rfind(keystr) == -1):
+                url = url.replace('dailynews3.cfm', 'dailynews3a.cfm')  # use printed version of the article
                current_articles.append({'title': title, 'url': url, 'description': ''})
                included_urls.append(url)
        current_articles.reverse()
        return current_articles

+    # parse from www.mingpaovan.com or www.mingpaotor.com (the site is selected by baseUrl)
+    def parse_section3(self, url, baseUrl):
+        self.get_fetchdate()
+        soup = self.index_to_soup(url)
+        divs = soup.findAll(attrs={'class': ['ListContentLargeLink']})
+        current_articles = []
+        included_urls = []
+        divs.reverse()
+        for i in divs:
+            title = self.tag_to_string(i)
+            urlstr = i.get('href', False)
+            urlstr = baseUrl + '/' + urlstr.replace('../../../', '')
+            if urlstr not in included_urls:
+                current_articles.append({'title': title, 'url': urlstr, 'description': '', 'date': ''})
+                included_urls.append(urlstr)
+        current_articles.reverse()
+        return current_articles
+
    def parse_ed_section(self, url):
        self.get_fetchdate()
        soup = self.index_to_soup(url)
@ -338,7 +451,12 @@ class MPHKRecipe(BasicNewsRecipe):
        if dir is None:
            dir = self.output_dir
        if __UseChineseTitle__ == True:
-            title = u'\u660e\u5831 (\u9999\u6e2f)'
+            if __Region__ == 'Hong Kong':
+                title = u'\u660e\u5831 (\u9999\u6e2f)'
+            elif __Region__ == 'Vancouver':
+                title = u'\u660e\u5831 (\u6eab\u54e5\u83ef)'
+            elif __Region__ == 'Toronto':
+                title = u'\u660e\u5831 (\u591a\u502b\u591a)'
        else:
            title = self.short_title()
        # if not generating a periodical, force date to apply in title
--- a/recipes/ming_pao_toronto.recipe
+++ b/recipes/ming_pao_toronto.recipe
@ -0,0 +1,594 @@
+__license__   = 'GPL v3'
+__copyright__ = '2010-2011, Eddie Lau'
+
+# Edition to fetch; must be exactly one of 'Hong Kong', 'Vancouver' or 'Toronto'
+__Region__ = 'Toronto'
+# Users of Kindle 3 with limited system-level CJK support
+# please replace the following "True" with "False".
+# (when False, the fetch date is appended to the book title and the
+# publication type is no longer prefixed with 'periodical:')
+__MakePeriodical__ = True
+# Set to True if your device supports display of CJK titles
+__UseChineseTitle__ = False
+# Set to False if you want to skip images
+__KeepImages__ = True
+# (HK only) Set to True if you wish to use life.mingpao.com as the main article source
+__UseLife__ = True
+
+
+'''
+Change Log:
+2011/06/26: add fetching Vancouver and Toronto versions of the paper, also provide captions for images using life.mingpao fetch source
+            provide options to remove all images in the file
+2011/05/12: switch the main parse source to life.mingpao.com, which has more photos on the article pages
+2011/03/06: add new articles for finance section, also a new section "Columns"
+2011/02/28: rearrange the sections
+            [Disabled until Kindle has better CJK support and can remember last (section,article) read in Sections & Articles
+            View] make it the same title if generating a periodical, so past issue will be automatically put into "Past Issues"
+            folder in Kindle 3
+2011/02/20: skip duplicated links in finance section, put photos which may extend a whole page to the back of the articles
+            clean up the indentation
+2010/12/07: add entertainment section, use newspaper front page as ebook cover, suppress date display in section list
+            (to avoid wrong date display in case the user generates the ebook in a time zone different from HKT)
+2010/11/22: add English section, remove eco-news section which is not updated daily, correct
+            ordering of articles
+2010/11/12: add news image and eco-news section
+2010/11/08: add parsing of finance section
+2010/11/06: temporary work-around for Kindle device having no capability to display unicode
+            in section/article list.
+2010/10/31: skip repeated articles in section pages
+'''
+
+import os, datetime, re
+from calibre.web.feeds.recipes import BasicNewsRecipe
+from contextlib import nested
+from calibre.ebooks.BeautifulSoup import BeautifulSoup
+from calibre.ebooks.metadata.opf2 import OPFCreator
+from calibre.ebooks.metadata.toc import TOC
+from calibre.ebooks.metadata import MetaInformation
+
+# MAIN CLASS
+class MPRecipe(BasicNewsRecipe):
+    # Recipe for the Ming Pao newspaper.  The edition-specific settings
+    # below are chosen once, when the class body runs, from the
+    # module-level __Region__ value.
+    if __Region__ == 'Hong Kong':
+        title       = 'Ming Pao - Hong Kong'
+        description = 'Hong Kong Chinese Newspaper (http://news.mingpao.com)'
+        category    = 'Chinese, News, Hong Kong'
+        extra_css = 'img {display: block; margin-left: auto; margin-right: auto; margin-top: 10px; margin-bottom: 10px;} font>b {font-size:200%; font-weight:bold;}'
+        masthead_url = 'http://news.mingpao.com/image/portals_top_logo_news.gif'
+        keep_only_tags = [dict(name='h1'),
+                          dict(name='font', attrs={'style':['font-size:14pt; line-height:160%;']}), # for entertainment page title
+                          dict(name='font', attrs={'color':['AA0000']}), # for column articles title
+                          dict(attrs={'id':['newscontent']}), # entertainment and column page content
+                          dict(attrs={'id':['newscontent01','newscontent02']}),
+                          dict(attrs={'class':['photo']}),
+                          dict(name='table', attrs={'width':['100%'], 'border':['0'], 'cellspacing':['5'], 'cellpadding':['0']}),  # content in printed version of life.mingpao.com
+                          dict(name='img', attrs={'width':['180'], 'alt':['按圖放大']}) # images for source from life.mingpao.com
+                          ]
+        # the two remove_tags lists differ only by the extra dict(name='img')
+        # entry used when images are not wanted
+        if __KeepImages__:
+            remove_tags = [dict(name='style'),
+                           dict(attrs={'id':['newscontent135']}),  # for the finance page from mpfinance.com
+                           dict(name='font', attrs={'size':['2'], 'color':['666666']}), # article date in life.mingpao.com article
+                           #dict(name='table')  # for content fetched from life.mingpao.com
+                          ]
+        else:
+            remove_tags = [dict(name='style'),
+                           dict(attrs={'id':['newscontent135']}),  # for the finance page from mpfinance.com
+                           dict(name='font', attrs={'size':['2'], 'color':['666666']}), # article date in life.mingpao.com article
+                           dict(name='img'),
+                           #dict(name='table')  # for content fetched from life.mingpao.com
+                          ]
+        remove_attributes = ['width']
+        preprocess_regexps = [
+                              # turn <h5> headings into <h1> (h1 is listed in keep_only_tags)
+                              (re.compile(r'<h5>', re.DOTALL|re.IGNORECASE),
+                              lambda match: '<h1>'),
+                              (re.compile(r'</h5>', re.DOTALL|re.IGNORECASE),
+                              lambda match: '</h1>'),
+                              (re.compile(r'<p><a href=.+?</a></p>', re.DOTALL|re.IGNORECASE), # for entertainment page
+                              lambda match: ''),
+                              # skip <br> after title in life.mingpao.com fetched article
+                              (re.compile(r"<div id='newscontent'><br>", re.DOTALL|re.IGNORECASE),
+                              lambda match: "<div id='newscontent'>"),
+                              (re.compile(r"<br><br></b>", re.DOTALL|re.IGNORECASE),
+                              lambda match: "</b>")
+                             ]
+    elif __Region__ == 'Vancouver':
+        title       = 'Ming Pao - Vancouver'
+        description = 'Vancouver Chinese Newspaper (http://www.mingpaovan.com)'
+        category    = 'Chinese, News, Vancouver'
+        extra_css   = 'img {display: block; margin-left: auto; margin-right: auto; margin-top: 10px; margin-bottom: 10px;} b>font {font-size:200%; font-weight:bold;}'
+        masthead_url = 'http://www.mingpaovan.com/image/mainlogo2_VAN2.gif'
+        # article pages are selected by the layout attributes of their tables
+        keep_only_tags = [dict(name='table', attrs={'width':['450'], 'border':['0'], 'cellspacing':['0'], 'cellpadding':['1']}),
+                          dict(name='table', attrs={'width':['450'], 'border':['0'], 'cellspacing':['3'], 'cellpadding':['3'], 'id':['tblContent3']}),
+                          dict(name='table', attrs={'width':['180'], 'border':['0'], 'cellspacing':['0'], 'cellpadding':['0'], 'bgcolor':['F0F0F0']}),
+                          ]
+        if __KeepImages__:
+            remove_tags = [dict(name='img', attrs={'src':['../../../image/magnifier.gif']})]  # the magnifier icon
+        else:
+            remove_tags = [dict(name='img')]
+        remove_attributes = ['width']
+        # drop every &nbsp; entity from the page source
+        preprocess_regexps = [(re.compile(r'&nbsp;', re.DOTALL|re.IGNORECASE),
+                              lambda match: ''),
+                             ]
+    elif __Region__ == 'Toronto':
+        # same tag selection and clean-up rules as the Vancouver branch;
+        # only title, description, category and masthead differ
+        title       = 'Ming Pao - Toronto'
+        description = 'Toronto Chinese Newspaper (http://www.mingpaotor.com)'
+        category    = 'Chinese, News, Toronto'
+        extra_css   = 'img {display: block; margin-left: auto; margin-right: auto; margin-top: 10px; margin-bottom: 10px;} b>font {font-size:200%; font-weight:bold;}'
+        masthead_url = 'http://www.mingpaotor.com/image/mainlogo2_TOR2.gif'
+        keep_only_tags = [dict(name='table', attrs={'width':['450'], 'border':['0'], 'cellspacing':['0'], 'cellpadding':['1']}),
+                          dict(name='table', attrs={'width':['450'], 'border':['0'], 'cellspacing':['3'], 'cellpadding':['3'], 'id':['tblContent3']}),
+                          dict(name='table', attrs={'width':['180'], 'border':['0'], 'cellspacing':['0'], 'cellpadding':['0'], 'bgcolor':['F0F0F0']}),
+                          ]
+        if __KeepImages__:
+            remove_tags = [dict(name='img', attrs={'src':['../../../image/magnifier.gif']})]  # the magnifier icon
+        else:
+            remove_tags = [dict(name='img')]
+        remove_attributes = ['width']
+        preprocess_regexps = [(re.compile(r'&nbsp;', re.DOTALL|re.IGNORECASE),
+                              lambda match: ''),
+                             ]
+
+    # settings shared by all editions
+    oldest_article = 1
+    max_articles_per_feed = 100
+    __author__            = 'Eddie Lau'
+    publisher             = 'MingPao'
+    remove_javascript = True
+    use_embedded_content   = False
+    no_stylesheets = True
+    language = 'zh'
+    encoding = 'Big5-HKSCS'
+    recursions = 0
+    conversion_options = {'linearize_tables':True}
+    # NOTE(review): presumably left empty to suppress the date display
+    # mentioned in the 2010/12/07 change log entry - confirm
+    timefmt = ''
+
+    def image_url_processor(cls, baseurl, url):
+        """Return the image URL unchanged.
+
+        NOTE: the idea once tried here was to break the url at the first
+        occurrence of a digit and add an additional '_' at the front.  That
+        did not work from this hook and may need to be moved to the
+        preprocess_html() method.
+        """
+        return url
+
+    def get_dtlocal(self):
+        """Return the naive datetime that decides which issue is fetched.
+
+        The current UTC time is moved by a fixed per-region offset and then
+        pushed back by 4.5 hours, so the date only changes once the clock
+        given by that offset passes 4.30am, when all news are available.
+        """
+        now_utc = datetime.datetime.utcnow()
+        # fixed offsets from UTC in hours; daylight saving time is not
+        # taken into account
+        if __Region__ == 'Hong Kong':
+            offset_hours = 8.0     # HKT
+        elif __Region__ == 'Vancouver':
+            offset_hours = -8.0    # PST
+        elif __Region__ == 'Toronto':
+            offset_hours = -5.0    # EST
+        region_clock = now_utc + datetime.timedelta(hours=offset_hours)
+        return region_clock - datetime.timedelta(hours=4.5)
+
+    def get_fetchdate(self):
+        """Return the issue date as 'YYYYMMDD', the form used in the site URLs."""
+        issue_dt = self.get_dtlocal()
+        return issue_dt.strftime("%Y%m%d")
+
+    def get_fetchformatteddate(self):
+        """Return the issue date as 'YYYY-MM-DD', the form appended to the title."""
+        issue_dt = self.get_dtlocal()
+        return issue_dt.strftime("%Y-%m-%d")
+
+    def get_fetchday(self):
+        """Return the two-digit day of month of the issue, as used in cover file names."""
+        issue_dt = self.get_dtlocal()
+        return issue_dt.strftime("%d")
+
+    def get_cover_url(self):
+        """Return the URL of the front page image of the issue, or None.
+
+        The image URL is built from the issue date and opened once to check
+        that it can be fetched; None is returned when that request fails.
+        """
+        # read the date once so that every part of the URL refers to the same
+        # issue, even when the call happens right at the date roll-over
+        dateStr = self.get_fetchdate()
+        dayStr = dateStr[-2:]   # 'YYYYMMDD' -> 'DD', same value as get_fetchday()
+        if __Region__ == 'Hong Kong':
+            cover = 'http://news.mingpao.com/' + dateStr + '/' + dateStr + '_' + dayStr + 'gacov.jpg'
+        elif __Region__ == 'Vancouver':
+            cover = 'http://www.mingpaovan.com/ftp/News/' + dateStr + '/' + dayStr + 'pgva1s.jpg'
+        elif __Region__ == 'Toronto':
+            cover = 'http://www.mingpaotor.com/ftp/News/' + dateStr + '/' + dayStr + 'pgtas.jpg'
+        br = BasicNewsRecipe.get_browser()
+        try:
+            br.open(cover)
+        except Exception:
+            # best effort: a failed request only means there is no cover;
+            # KeyboardInterrupt / SystemExit are deliberately not swallowed
+            cover = None
+        return cover
+
+    def parse_index(self):
+        """Build the list of (section title, articles) for the chosen region.
+
+        Every section index page of the issue date is parsed with the parser
+        that matches its site; sections without any article are left out.
+        """
+        feeds = []
+        dateStr = self.get_fetchdate()
+
+        def add_feed(title, articles):
+            # only sections that actually have articles are kept
+            if articles:
+                feeds.append((title, articles))
+
+        if __Region__ == 'Hong Kong':
+            # common prefixes of the index pages of this issue
+            life = 'http://life.mingpao.com/cfm/dailynews2.cfm?Issue=' + dateStr + '&Category='
+            news = 'http://news.mingpao.com/' + dateStr + '/'
+            if __UseLife__:
+                # (title, life.mingpao.com category, key the article links must contain)
+                life_sections = [(u'\u8981\u805e Headline', 'nalga', 'nal'),
+                                 (u'\u6e2f\u805e Local', 'nalgb', 'nal'),
+                                 (u'\u6559\u80b2 Education', 'nalgf', 'nal'),
+                                 (u'\u793e\u8a55/\u7b46\u9663 Editorial', 'nalmr', 'nal'),
+                                 (u'\u8ad6\u58c7 Forum', 'nalfa', 'nal'),
+                                 (u'\u4e2d\u570b China', 'nalca', 'nal'),
+                                 (u'\u570b\u969b World', 'nalta', 'nal'),
+                                 (u'\u7d93\u6fdf Finance', 'nalea', 'nal'),
+                                 (u'\u9ad4\u80b2 Sport', 'nalsp', 'nal'),
+                                 (u'\u5f71\u8996 Film/TV', 'nalma', 'nal'),
+                                 (u'\u5c08\u6b04 Columns', 'ncolumn', 'ncl')]
+                for title, category, keystr in life_sections:
+                    add_feed(title, self.parse_section2(life + category, keystr))
+
+                # these two sections are read from news.mingpao.com
+                for title, page in [(u'\u526f\u520a Supplement', 'jaindex.htm'),
+                                    (u'\u82f1\u6587 English', 'emindex.htm')]:
+                    add_feed(title, self.parse_section(news + page))
+            else:
+                for title, page in [(u'\u8981\u805e Headline', 'gaindex.htm'),
+                                    (u'\u6e2f\u805e Local', 'gbindex.htm'),
+                                    (u'\u6559\u80b2 Education', 'gfindex.htm')]:
+                    add_feed(title, self.parse_section(news + page))
+
+                # special - editorial
+                add_feed(u'\u793e\u8a55/\u7b46\u9663 Editorial', self.parse_ed_section(life + 'nalmr'))
+
+                for title, page in [(u'\u8ad6\u58c7 Forum', 'faindex.htm'),
+                                    (u'\u4e2d\u570b China', 'caindex.htm'),
+                                    (u'\u570b\u969b World', 'taindex.htm')]:
+                    add_feed(title, self.parse_section(news + page))
+
+                # special - finance (read from life.mingpao.com; the
+                # www.mpfinance.com index was used here before)
+                add_feed(u'\u7d93\u6fdf Finance', self.parse_fin_section(life + 'nalea'))
+
+                for title, page in [('Tech News', 'naindex.htm'),
+                                    (u'\u9ad4\u80b2 Sport', 'spindex.htm')]:
+                    add_feed(title, self.parse_section(news + page))
+
+                # special - entertainment
+                add_feed(u'\u5f71\u8996 Film/TV', self.parse_ent_section('http://ol.mingpao.com/cfm/star1.cfm'))
+
+                for title, page in [(u'\u526f\u520a Supplement', 'jaindex.htm'),
+                                    (u'\u82f1\u6587 English', 'emindex.htm')]:
+                    add_feed(title, self.parse_section(news + page))
+
+                # special - columns
+                add_feed(u'\u5c08\u6b04 Columns', self.parse_col_section(life + 'ncolumn'))
+        elif __Region__ == 'Vancouver':
+            site = 'http://www.mingpaovan.com/'
+            section_root = site + 'htm/News/' + dateStr + '/'
+            for title, page in [(u'\u8981\u805e Headline', 'VAindex.htm'),
+                                (u'\u52a0\u570b Canada', 'VBindex.htm'),
+                                (u'\u793e\u5340 Local', 'VDindex.htm'),
+                                (u'\u6e2f\u805e Hong Kong', 'HK-VGindex.htm'),
+                                (u'\u570b\u969b World', 'VTindex.htm'),
+                                (u'\u4e2d\u570b China', 'VCindex.htm'),
+                                (u'\u7d93\u6fdf Economics', 'VEindex.htm'),
+                                (u'\u9ad4\u80b2 Sports', 'VSindex.htm'),
+                                (u'\u5f71\u8996 Film/TV', 'HK-MAindex.htm'),
+                                (u'\u526f\u520a Supplements', 'WWindex.htm')]:
+                add_feed(title, self.parse_section3(section_root + page, site))
+        elif __Region__ == 'Toronto':
+            site = 'http://www.mingpaotor.com/'
+            section_root = site + 'htm/News/' + dateStr + '/'
+            for title, page in [(u'\u8981\u805e Headline', 'TAindex.htm'),
+                                (u'\u52a0\u570b Canada', 'TDindex.htm'),
+                                (u'\u793e\u5340 Local', 'TFindex.htm'),
+                                (u'\u4e2d\u570b China', 'TCAindex.htm'),
+                                (u'\u570b\u969b World', 'TTAindex.htm'),
+                                (u'\u6e2f\u805e Hong Kong', 'HK-GAindex.htm'),
+                                (u'\u7d93\u6fdf Economics', 'THindex.htm'),
+                                (u'\u9ad4\u80b2 Sports', 'TSindex.htm'),
+                                (u'\u5f71\u8996 Film/TV', 'HK-MAindex.htm'),
+                                (u'\u526f\u520a Supplements', 'WWindex.htm')]:
+                add_feed(title, self.parse_section3(section_root + page, site))
+        return feeds
+
+    # parse from news.mingpao.com
+    def parse_section(self, url):
+        """Collect the article links of one news.mingpao.com section index.
+
+        The links are taken from the elements with class 'bullet' or
+        'bullet_grey' and are joined to the folder of the issue date.
+        Entries without a link and links containing 'Redirect' are skipped.
+
+        Returns a list of article dicts in page order.  When a link is
+        listed more than once only its last occurrence is kept.
+        """
+        dateStr = self.get_fetchdate()
+        soup = self.index_to_soup(url)
+        divs = soup.findAll(attrs={'class': ['bullet','bullet_grey']})
+        current_articles = []
+        included_urls = set()
+        # walk bottom-up so that the last occurrence of a duplicated link wins
+        for div in reversed(divs):
+            a = div.find('a', href=True)
+            if a is None:
+                # a bullet without a link has nothing to fetch
+                continue
+            link = 'http://news.mingpao.com/' + dateStr + '/' + a.get('href', '')
+            if link in included_urls or 'Redirect' in link:
+                continue
+            included_urls.add(link)
+            current_articles.append({'title': self.tag_to_string(a), 'url': link, 'description':'', 'date':''})
+        current_articles.reverse()
+        return current_articles
+
+    # parse from life.mingpao.com
+    def parse_section2(self, url, keystr):
+        """Collect the article links of one life.mingpao.com section index.
+
+        Only links whose URL contains both '.txt' and keystr are kept, and
+        each one is pointed at the printed version of the article.
+
+        Returns a list of article dicts in page order.  When a link is
+        listed more than once only its last occurrence is kept.
+        """
+        soup = self.index_to_soup(url)
+        current_articles = []
+        included_urls = set()
+        # walk bottom-up so that the last occurrence of a duplicated link wins
+        for a in reversed(soup.findAll('a', href=True)):
+            link = 'http://life.mingpao.com/cfm/' + a.get('href', '')
+            if '.txt' not in link or keystr not in link:
+                continue
+            # use printed version of the article.  This has to happen before
+            # the duplicate check: the rewritten URL is what gets recorded,
+            # so checking the raw URL would never find a duplicate.
+            link = link.replace('dailynews3.cfm', 'dailynews3a.cfm')
+            if link in included_urls:
+                continue
+            included_urls.add(link)
+            current_articles.append({'title': self.tag_to_string(a), 'url': link, 'description': ''})
+        current_articles.reverse()
+        return current_articles
+
+    # parse from www.mingpaovan.com and www.mingpaotor.com
+    def parse_section3(self, url, baseUrl):
+        """Collect the article links of one section index page.
+
+        url is the section index page to read.  baseUrl is the site root
+        that the hrefs found on the page are joined to; it may be given
+        with or without a trailing '/'.
+
+        Returns a list of article dicts in page order.  When a link is
+        listed more than once only its last occurrence is kept.
+        """
+        soup = self.index_to_soup(url)
+        links = soup.findAll(attrs={'class': ['ListContentLargeLink']})
+        # callers pass the root with a trailing slash; normalise it so the
+        # joined URL does not contain '//'
+        root = baseUrl.rstrip('/')
+        current_articles = []
+        included_urls = set()
+        # walk bottom-up so that the last occurrence of a duplicated link wins
+        for link in reversed(links):
+            href = link.get('href', None)
+            if not href:
+                # nothing to fetch without a target
+                continue
+            # drop the '../../../' prefix before joining to the site root
+            urlstr = root + '/' + href.replace('../../../', '').lstrip('/')
+            if urlstr in included_urls:
+                continue
+            included_urls.add(urlstr)
+            current_articles.append({'title': self.tag_to_string(link), 'url': urlstr, 'description': '', 'date': ''})
+        current_articles.reverse()
+        return current_articles
+
+    def parse_ed_section(self, url):
+        """Collect the editorial articles linked from a life.mingpao.com index.
+
+        Only links whose URL contains both '.txt' and 'nal' are kept.
+
+        Returns a list of article dicts in page order.  When a link is
+        listed more than once only its last occurrence is kept.
+        """
+        soup = self.index_to_soup(url)
+        current_articles = []
+        included_urls = set()
+        # walk bottom-up so that the last occurrence of a duplicated link wins
+        for a in reversed(soup.findAll('a', href=True)):
+            link = 'http://life.mingpao.com/cfm/' + a.get('href', '')
+            if link in included_urls or '.txt' not in link or 'nal' not in link:
+                continue
+            included_urls.add(link)
+            current_articles.append({'title': self.tag_to_string(a), 'url': link, 'description': ''})
+        current_articles.reverse()
+        return current_articles
+
+    def parse_fin_section(self, url):
+        """Collect the finance articles linked from a life.mingpao.com index.
+
+        Only links whose URL contains both 'txt' and 'nal' are kept.
+
+        Returns a list of article dicts in page order.  Unlike the other
+        section parsers, a link listed more than once is kept at its first
+        position.
+        """
+        soup = self.index_to_soup(url)
+        current_articles = []
+        included_urls = set()
+        for a in soup.findAll('a', href= True):
+            link = 'http://life.mingpao.com/cfm/' + a.get('href', '')
+            if link in included_urls or 'txt' not in link or 'nal' not in link:
+                continue
+            included_urls.add(link)
+            current_articles.append({'title': self.tag_to_string(a), 'url': link, 'description':''})
+        return current_articles
+
+    def parse_ent_section(self, url):
+        """Collect the entertainment articles linked from an ol.mingpao.com index.
+
+        Only links whose URL contains both '.txt' and 'star' are kept.
+
+        Returns a list of article dicts in page order.  When a link is
+        listed more than once only its last occurrence is kept.
+        """
+        soup = self.index_to_soup(url)
+        current_articles = []
+        included_urls = set()
+        # walk bottom-up so that the last occurrence of a duplicated link wins
+        for a in reversed(soup.findAll('a', href=True)):
+            link = 'http://ol.mingpao.com/cfm/' + a.get('href', '')
+            if link in included_urls or '.txt' not in link or 'star' not in link:
+                continue
+            included_urls.add(link)
+            current_articles.append({'title': self.tag_to_string(a), 'url': link, 'description': ''})
+        current_articles.reverse()
+        return current_articles
+
+    def parse_col_section(self, url):
+        """Collect the column articles linked from a life.mingpao.com index.
+
+        Only links whose URL contains both '.txt' and 'ncl' are kept.
+
+        Returns a list of article dicts in page order.  When a link is
+        listed more than once only its last occurrence is kept.
+        """
+        soup = self.index_to_soup(url)
+        current_articles = []
+        included_urls = set()
+        # walk bottom-up so that the last occurrence of a duplicated link wins
+        for a in reversed(soup.findAll('a', href=True)):
+            link = 'http://life.mingpao.com/cfm/' + a.get('href', '')
+            if link in included_urls or '.txt' not in link or 'ncl' not in link:
+                continue
+            included_urls.add(link)
+            current_articles.append({'title': self.tag_to_string(a), 'url': link, 'description': ''})
+        current_articles.reverse()
+        return current_articles
+
+    def preprocess_html(self, soup):
+        """Strip presentational attributes from a downloaded page.
+
+        Removes every 'style' and 'width' attribute as well as
+        align="absmiddle", then returns the same soup object.
+        """
+        for item in soup.findAll(style=True):
+            del item['style']
+        # this loop used to search for style=True again, which can never
+        # match after the loop above, so 'width' was never removed here
+        for item in soup.findAll(width=True):
+            del item['width']
+        # 'absmiddle' is a value of the align attribute, not an attribute
+        # name; the former search for a 'stype' attribute matched nothing
+        for item in soup.findAll(align='absmiddle'):
+            del item['align']
+        return soup
+
+    def create_opf(self, feeds, dir=None):
+        """Write the index.opf and index.ncx files of the generated book.
+
+        feeds is the sequence of downloaded feeds; dir is the folder the
+        files are written to (self.output_dir when None).
+
+        The book title is the Chinese title of the region when
+        __UseChineseTitle__ is set, otherwise self.short_title(); the fetch
+        date is appended when __MakePeriodical__ is False.
+
+        Raises Exception when feeds is empty.
+        """
+        if dir is None:
+            dir = self.output_dir
+        if __UseChineseTitle__ == True:
+            if __Region__ == 'Hong Kong':
+                title = u'\u660e\u5831 (\u9999\u6e2f)'
+            elif __Region__ == 'Vancouver':
+                title = u'\u660e\u5831 (\u6eab\u54e5\u83ef)'
+            elif __Region__ == 'Toronto':
+                title = u'\u660e\u5831 (\u591a\u502b\u591a)'
+        else:
+            title = self.short_title()
+        # if not generating a periodical, force date to apply in title
+        if __MakePeriodical__ == False:
+            title = title + ' ' + self.get_fetchformatteddate()
+        if True:
+            # book metadata: the publisher doubles as the author
+            mi = MetaInformation(title, [self.publisher])
+            mi.publisher = self.publisher
+            mi.author_sort = self.publisher
+            if __MakePeriodical__ == True:
+                mi.publication_type = 'periodical:'+self.publication_type+':'+self.short_title()
+            else:
+                mi.publication_type = self.publication_type+':'+self.short_title()
+            #mi.timestamp = nowf()
+            # use the region-adjusted fetch time instead of the current time
+            mi.timestamp = self.get_dtlocal()
+            mi.comments = self.description
+            if not isinstance(mi.comments, unicode):
+                mi.comments = mi.comments.decode('utf-8', 'replace')
+            #mi.pubdate = nowf()
+            mi.pubdate = self.get_dtlocal()
+            opf_path = os.path.join(dir, 'index.opf')
+            ncx_path = os.path.join(dir, 'index.ncx')
+            opf = OPFCreator(dir, mi)
+            # Add mastheadImage entry to <guide> section
+            mp = getattr(self, 'masthead_path', None)
+            if mp is not None and os.access(mp, os.R_OK):
+                from calibre.ebooks.metadata.opf2 import Guide
+                ref = Guide.Reference(os.path.basename(self.masthead_path), os.getcwdu())
+                ref.type = 'masthead'
+                ref.title = 'Masthead Image'
+                opf.guide.append(ref)
+
+            # manifest: one folder per feed plus the two index files
+            manifest = [os.path.join(dir, 'feed_%d'%i) for i in range(len(feeds))]
+            manifest.append(os.path.join(dir, 'index.html'))
+            manifest.append(os.path.join(dir, 'index.ncx'))
+
+            # Get cover
+            cpath = getattr(self, 'cover_path', None)
+            if cpath is None:
+                # no cover available: let default_cover() write one to cover.jpg
+                # NOTE(review): pf is not closed in this method
+                pf = open(os.path.join(dir, 'cover.jpg'), 'wb')
+                if self.default_cover(pf):
+                    cpath =  pf.name
+            if cpath is not None and os.access(cpath, os.R_OK):
+                opf.cover = cpath
+                manifest.append(cpath)
+
+            # Get masthead
+            mpath = getattr(self, 'masthead_path', None)
+            if mpath is not None and os.access(mpath, os.R_OK):
+                manifest.append(mpath)
+
+            opf.create_manifest_from_files_in(manifest)
+            # give the ncx and the masthead image fixed manifest ids
+            for mani in opf.manifest:
+                if mani.path.endswith('.ncx'):
+                    mani.id = 'ncx'
+                if mani.path.endswith('mastheadImage.jpg'):
+                    mani.id = 'masthead-image'
+            # entries collects the spine items as paths relative to dir
+            entries = ['index.html']
+            toc = TOC(base_path=dir)
+            self.play_order_counter = 0
+            self.play_order_map = {}
+
+        def feed_index(num, parent):
+            """Add the downloaded articles of feed number num below the TOC node parent.
+
+            Each article (and its sub pages) is appended to entries, and a
+            navigation bar is appended to the body of its last page.
+            """
+            f = feeds[num]
+            for j, a in enumerate(f):
+                if getattr(a, 'downloaded', False):
+                    adir = 'feed_%d/article_%d/'%(num, j)
+                    auth = a.author
+                    if not auth:
+                        auth = None
+                    desc = a.text_summary
+                    if not desc:
+                        desc = None
+                    else:
+                        desc = self.description_limiter(desc)
+                    entries.append('%sindex.html'%adir)
+                    po = self.play_order_map.get(entries[-1], None)
+                    if po is None:
+                        self.play_order_counter += 1
+                        po = self.play_order_counter
+                    parent.add_item('%sindex.html'%adir, None, a.title if a.title else _('Untitled Article'),
+                                    play_order=po, author=auth, description=desc)
+                    # 'last' ends up as the last page of the article: its
+                    # index page, or the last sub page when there are any
+                    last = os.path.join(self.output_dir, ('%sindex.html'%adir).replace('/', os.sep))
+                    for sp in a.sub_pages:
+                        prefix = os.path.commonprefix([opf_path, sp])
+                        relp = sp[len(prefix):]
+                        entries.append(relp.replace(os.sep, '/'))
+                        last = sp
+
+                    if os.path.exists(last):
+                        with open(last, 'rb') as fi:
+                            src = fi.read().decode('utf-8')
+                        soup = BeautifulSoup(src)
+                        body = soup.find('body')
+                        if body is not None:
+                            # two '..' steps for every 'link<N>' component found in the path
+                            prefix = '/'.join('..'for i in range(2*len(re.findall(r'link\d+', last))))
+                            templ = self.navbar.generate(True, num, j, len(f),
+                                            not self.has_single_feed,
+                                            a.orig_url, self.publisher, prefix=prefix,
+                                            center=self.center_navbar)
+                            elem = BeautifulSoup(templ.render(doctype='xhtml').decode('utf-8')).find('div')
+                            # append the navigation bar at the end of the body and save the page
+                            body.insert(len(body.contents), elem)
+                            with open(last, 'wb') as fi:
+                                fi.write(unicode(soup).encode('utf-8'))
+        if len(feeds) == 0:
+            raise Exception('All feeds are empty, aborting.')
+
+        if len(feeds) > 1:
+            # several feeds: one TOC node per feed, with its articles below it
+            for i, f in enumerate(feeds):
+                entries.append('feed_%d/index.html'%i)
+                po = self.play_order_map.get(entries[-1], None)
+                if po is None:
+                    self.play_order_counter += 1
+                    po = self.play_order_counter
+                auth = getattr(f, 'author', None)
+                if not auth:
+                    auth = None
+                desc = getattr(f, 'description', None)
+                if not desc:
+                    desc = None
+                feed_index(i, toc.add_item('feed_%d/index.html'%i, None,
+                           f.title, play_order=po, description=desc, author=auth))
+
+        else:
+            # single feed: its articles go directly below the TOC root
+            entries.append('feed_%d/index.html'%0)
+            feed_index(0, toc)
+
+        # turn the relative spine entries into paths below dir
+        for i, p in enumerate(entries):
+            entries[i] = os.path.join(dir, p.replace('/', os.sep))
+        opf.create_spine(entries)
+        opf.set_toc(toc)
+
+        with nested(open(opf_path, 'wb'), open(ncx_path, 'wb')) as (opf_file, ncx_file):
+            opf.render(opf_file, ncx_file)
+
--- a/recipes/ming_pao_vancouver.recipe
+++ b/recipes/ming_pao_vancouver.recipe
@ -0,0 +1,594 @@
+__license__   = 'GPL v3'
+__copyright__ = '2010-2011, Eddie Lau'
+
+# Edition to fetch. Must be exactly one of 'Hong Kong', 'Vancouver' or
+# 'Toronto'; the recipe class compares against these literal strings.
+__Region__ = 'Vancouver'
+# Users of Kindle 3 with limited system-level CJK support
+# please replace the following "True" with "False".
+# When False, the fetch date is appended to the title and the publication
+# type is not marked as a periodical.
+__MakePeriodical__ = True
+# Set to True if your device supports display of CJK titles
+__UseChineseTitle__ = False
+# Set it to False if you want to skip images
+__KeepImages__ = True
+# (HK only) Set to True if you wish to use life.mingpao.com as the main article source
+__UseLife__ = True
+
+
+'''
+Change Log:
+2011/06/26: add fetching Vancouver and Toronto versions of the paper, also provide captions for images using life.mingpao fetch source
+            provide options to remove all images in the file
+2011/05/12: switch the main parse source to life.mingpao.com, which has more photos on the article pages
+2011/03/06: add new articles for finance section, also a new section "Columns"
+2011/02/28: rearrange the sections
+            [Disabled until Kindle has better CJK support and can remember last (section,article) read in Sections & Articles
+            View] make it the same title if generating a periodical, so past issue will be automatically put into "Past Issues"
+            folder in Kindle 3
+2011/02/20: skip duplicated links in finance section, put photos which may extend a whole page to the back of the articles
+            clean up the indentation
+2010/12/07: add entertainment section, use newspaper front page as ebook cover, suppress date display in section list
+            (to avoid wrong date display in case the user generates the ebook in a time zone different from HKT)
+2010/11/22: add English section, remove eco-news section which is not updated daily, correct
+            ordering of articles
+2010/11/12: add news image and eco-news section
+2010/11/08: add parsing of finance section
+2010/11/06: temporary work-around for Kindle device having no capability to display unicode
+            in section/article list.
+2010/10/31: skip repeated articles in section pages
+'''
+
+import os, datetime, re
+from calibre.web.feeds.recipes import BasicNewsRecipe
+from contextlib import nested
+from calibre.ebooks.BeautifulSoup import BeautifulSoup
+from calibre.ebooks.metadata.opf2 import OPFCreator
+from calibre.ebooks.metadata.toc import TOC
+from calibre.ebooks.metadata import MetaInformation
+
+# MAIN CLASS
+class MPRecipe(BasicNewsRecipe):
+    # Recipe for the Ming Pao newspaper. The region-specific class attributes
+    # below (title, description, tag filters, regexps, ...) are chosen once,
+    # when the class body is executed, from the module-level __Region__ and
+    # __KeepImages__ switches.
+    if __Region__ == 'Hong Kong':
+        title       = 'Ming Pao - Hong Kong'
+        description = 'Hong Kong Chinese Newspaper (http://news.mingpao.com)'
+        category    = 'Chinese, News, Hong Kong'
+        extra_css = 'img {display: block; margin-left: auto; margin-right: auto; margin-top: 10px; margin-bottom: 10px;} font>b {font-size:200%; font-weight:bold;}'
+        masthead_url = 'http://news.mingpao.com/image/portals_top_logo_news.gif'
+        keep_only_tags = [dict(name='h1'),
+                          dict(name='font', attrs={'style':['font-size:14pt; line-height:160%;']}), # for entertainment page title
+                          dict(name='font', attrs={'color':['AA0000']}), # for column articles title
+                          dict(attrs={'id':['newscontent']}), # entertainment and column page content
+                          dict(attrs={'id':['newscontent01','newscontent02']}),
+                          dict(attrs={'class':['photo']}),
+                          dict(name='table', attrs={'width':['100%'], 'border':['0'], 'cellspacing':['5'], 'cellpadding':['0']}),  # content in printed version of life.mingpao.com
+                          dict(name='img', attrs={'width':['180'], 'alt':['按圖放大']}) # images for source from life.mingpao.com
+                          ]
+        # The two remove_tags variants differ only in the extra dict(name='img')
+        # entry that drops every image when __KeepImages__ is False.
+        if __KeepImages__:
+            remove_tags = [dict(name='style'),
+                           dict(attrs={'id':['newscontent135']}),  # for the finance page from mpfinance.com
+                           dict(name='font', attrs={'size':['2'], 'color':['666666']}), # article date in life.mingpao.com article
+                           #dict(name='table')  # for content fetched from life.mingpao.com
+                          ]
+        else:
+            remove_tags = [dict(name='style'),
+                           dict(attrs={'id':['newscontent135']}),  # for the finance page from mpfinance.com
+                           dict(name='font', attrs={'size':['2'], 'color':['666666']}), # article date in life.mingpao.com article
+                           dict(name='img'),
+                           #dict(name='table')  # for content fetched from life.mingpao.com
+                          ]
+        remove_attributes = ['width']
+        preprocess_regexps = [
+                              # turn <h5> headings into <h1>
+                              (re.compile(r'<h5>', re.DOTALL|re.IGNORECASE),
+                              lambda match: '<h1>'),
+                              (re.compile(r'</h5>', re.DOTALL|re.IGNORECASE),
+                              lambda match: '</h1>'),
+                              (re.compile(r'<p><a href=.+?</a></p>', re.DOTALL|re.IGNORECASE), # for entertainment page
+                              lambda match: ''),
+                              # skip <br> after title in life.mingpao.com fetched article
+                              (re.compile(r"<div id='newscontent'><br>", re.DOTALL|re.IGNORECASE),
+                              lambda match: "<div id='newscontent'>"),
+                              (re.compile(r"<br><br></b>", re.DOTALL|re.IGNORECASE),
+                              lambda match: "</b>")
+                             ]
+    elif __Region__ == 'Vancouver':
+        title       = 'Ming Pao - Vancouver'
+        description = 'Vancouver Chinese Newspaper (http://www.mingpaovan.com)'
+        category    = 'Chinese, News, Vancouver'
+        extra_css   = 'img {display: block; margin-left: auto; margin-right: auto; margin-top: 10px; margin-bottom: 10px;} b>font {font-size:200%; font-weight:bold;}'
+        masthead_url = 'http://www.mingpaovan.com/image/mainlogo2_VAN2.gif'
+        keep_only_tags = [dict(name='table', attrs={'width':['450'], 'border':['0'], 'cellspacing':['0'], 'cellpadding':['1']}),
+                          dict(name='table', attrs={'width':['450'], 'border':['0'], 'cellspacing':['3'], 'cellpadding':['3'], 'id':['tblContent3']}),
+                          dict(name='table', attrs={'width':['180'], 'border':['0'], 'cellspacing':['0'], 'cellpadding':['0'], 'bgcolor':['F0F0F0']}),
+                          ]
+        if __KeepImages__:
+            remove_tags = [dict(name='img', attrs={'src':['../../../image/magnifier.gif']})]  # the magnifier icon
+        else:
+            remove_tags = [dict(name='img')]
+        remove_attributes = ['width']
+        # drop every &nbsp; entity from the page source
+        preprocess_regexps = [(re.compile(r'&nbsp;', re.DOTALL|re.IGNORECASE),
+                              lambda match: ''),
+                             ]
+    elif __Region__ == 'Toronto':
+        # Same settings as the Vancouver branch apart from the title,
+        # description, category and masthead URL.
+        title       = 'Ming Pao - Toronto'
+        description = 'Toronto Chinese Newspaper (http://www.mingpaotor.com)'
+        category    = 'Chinese, News, Toronto'
+        extra_css   = 'img {display: block; margin-left: auto; margin-right: auto; margin-top: 10px; margin-bottom: 10px;} b>font {font-size:200%; font-weight:bold;}'
+        masthead_url = 'http://www.mingpaotor.com/image/mainlogo2_TOR2.gif'
+        keep_only_tags = [dict(name='table', attrs={'width':['450'], 'border':['0'], 'cellspacing':['0'], 'cellpadding':['1']}),
+                          dict(name='table', attrs={'width':['450'], 'border':['0'], 'cellspacing':['3'], 'cellpadding':['3'], 'id':['tblContent3']}),
+                          dict(name='table', attrs={'width':['180'], 'border':['0'], 'cellspacing':['0'], 'cellpadding':['0'], 'bgcolor':['F0F0F0']}),
+                          ]
+        if __KeepImages__:
+            remove_tags = [dict(name='img', attrs={'src':['../../../image/magnifier.gif']})]  # the magnifier icon
+        else:
+            remove_tags = [dict(name='img')]
+        remove_attributes = ['width']
+        preprocess_regexps = [(re.compile(r'&nbsp;', re.DOTALL|re.IGNORECASE),
+                              lambda match: ''),
+                             ]
+
+    # Settings shared by all regions.
+    oldest_article = 1
+    max_articles_per_feed = 100
+    __author__            = 'Eddie Lau'
+    publisher             = 'MingPao'
+    remove_javascript = True
+    use_embedded_content   = False
+    no_stylesheets = True
+    language = 'zh'
+    encoding = 'Big5-HKSCS'
+    recursions = 0
+    conversion_options = {'linearize_tables':True}
+    timefmt = ''
+
+    def image_url_processor(cls, baseurl, url):
+        """Return the image URL unchanged.
+
+        baseurl is not used. The intended URL rewrite (see the disabled code
+        below) is currently switched off, so this is an identity function.
+        NOTE(review): the first parameter is named cls but no @classmethod
+        decorator is applied here - confirm how the framework calls it.
+        """
+        # trick: break the url at the first occurrence of digit, add an additional
+        # '_' at the front
+        # not working, may need to move this to preprocess_html() method
+        # NOTE(review): in the disabled code below the test for i3 compares
+        # i0 (not i3) against minIdx; fix that if the code is ever revived.
+#        minIdx = 10000
+#        i0 = url.find('0')
+#        if i0 >= 0 and i0 < minIdx:
+#           minIdx = i0
+#        i1 = url.find('1')
+#        if i1 >= 0 and i1 < minIdx:
+#           minIdx = i1
+#        i2 = url.find('2')
+#        if i2 >= 0 and i2 < minIdx:
+#           minIdx = i2
+#        i3 = url.find('3')
+#        if i3 >= 0 and i0 < minIdx:
+#           minIdx = i3
+#        i4 = url.find('4')
+#        if i4 >= 0 and i4 < minIdx:
+#           minIdx = i4
+#        i5 = url.find('5')
+#        if i5 >= 0 and i5 < minIdx:
+#           minIdx = i5
+#        i6 = url.find('6')
+#        if i6 >= 0 and i6 < minIdx:
+#           minIdx = i6
+#        i7 = url.find('7')
+#        if i7 >= 0 and i7 < minIdx:
+#           minIdx = i7
+#        i8 = url.find('8')
+#        if i8 >= 0 and i8 < minIdx:
+#           minIdx = i8
+#        i9 = url.find('9')
+#        if i9 >= 0 and i9 < minIdx:
+#           minIdx = i9
+        return url
+
+    def get_dtlocal(self):
+        dt_utc = datetime.datetime.utcnow()
+        if __Region__ == 'Hong Kong':
+            # convert UTC to local hk time - at HKT 4.30am, all news are available
+            dt_local = dt_utc + datetime.timedelta(8.0/24) - datetime.timedelta(4.5/24)
+            # dt_local = dt_utc.astimezone(pytz.timezone('Asia/Hong_Kong')) - datetime.timedelta(4.5/24)
+        elif __Region__ == 'Vancouver':
+            # convert UTC to local Vancouver time - at PST time 4.30am, all news are available
+            dt_local = dt_utc + datetime.timedelta(-8.0/24) - datetime.timedelta(4.5/24)
+            #dt_local = dt_utc.astimezone(pytz.timezone('America/Vancouver')) - datetime.timedelta(4.5/24)
+        elif __Region__ == 'Toronto':
+            # convert UTC to local Toronto time - at EST time 4.30am, all news are available
+            dt_local = dt_utc + datetime.timedelta(-5.0/24) - datetime.timedelta(4.5/24)
+            #dt_local = dt_utc.astimezone(pytz.timezone('America/Toronto')) - datetime.timedelta(4.5/24)
+        return dt_local
+
+    def get_fetchdate(self):
+        return self.get_dtlocal().strftime("%Y%m%d")
+
+    def get_fetchformatteddate(self):
+        return self.get_dtlocal().strftime("%Y-%m-%d")
+
+    def get_fetchday(self):
+        return self.get_dtlocal().strftime("%d")
+
+    def get_cover_url(self):
+        if __Region__ == 'Hong Kong':
+            cover = 'http://news.mingpao.com/' + self.get_fetchdate() + '/' + self.get_fetchdate() + '_' + self.get_fetchday() + 'gacov.jpg'
+        elif __Region__ == 'Vancouver':
+            cover = 'http://www.mingpaovan.com/ftp/News/' + self.get_fetchdate() + '/' + self.get_fetchday() + 'pgva1s.jpg'
+        elif __Region__ == 'Toronto':
+            cover = 'http://www.mingpaotor.com/ftp/News/' + self.get_fetchdate() + '/' + self.get_fetchday() + 'pgtas.jpg'
+        br = BasicNewsRecipe.get_browser()
+        try:
+            br.open(cover)
+        except:
+            cover = None
+        return cover
+
+    def parse_index(self):
+        feeds = []
+        dateStr = self.get_fetchdate()
+
+        if __Region__ == 'Hong Kong':
+            if __UseLife__:
+                for title, url, keystr in [(u'\u8981\u805e Headline', 'http://life.mingpao.com/cfm/dailynews2.cfm?Issue=' + dateStr + '&Category=nalga', 'nal'),
+                                           (u'\u6e2f\u805e Local', 'http://life.mingpao.com/cfm/dailynews2.cfm?Issue=' + dateStr + '&Category=nalgb', 'nal'),
+                                           (u'\u6559\u80b2 Education', 'http://life.mingpao.com/cfm/dailynews2.cfm?Issue=' + dateStr + '&Category=nalgf', 'nal'),
+                                           (u'\u793e\u8a55/\u7b46\u9663 Editorial', 'http://life.mingpao.com/cfm/dailynews2.cfm?Issue=' + dateStr +'&Category=nalmr', 'nal'),
+                                           (u'\u8ad6\u58c7 Forum', 'http://life.mingpao.com/cfm/dailynews2.cfm?Issue=' + dateStr +'&Category=nalfa', 'nal'),
+                                           (u'\u4e2d\u570b China', 'http://life.mingpao.com/cfm/dailynews2.cfm?Issue=' + dateStr +'&Category=nalca', 'nal'),
+                                           (u'\u570b\u969b World', 'http://life.mingpao.com/cfm/dailynews2.cfm?Issue=' + dateStr +'&Category=nalta', 'nal'),
+                                           (u'\u7d93\u6fdf Finance', 'http://life.mingpao.com/cfm/dailynews2.cfm?Issue=' + dateStr + '&Category=nalea', 'nal'),
+                                           (u'\u9ad4\u80b2 Sport', 'http://life.mingpao.com/cfm/dailynews2.cfm?Issue=' + dateStr + '&Category=nalsp', 'nal'),
+                                           (u'\u5f71\u8996 Film/TV', 'http://life.mingpao.com/cfm/dailynews2.cfm?Issue=' + dateStr + '&Category=nalma', 'nal'),
+                                           (u'\u5c08\u6b04 Columns', 'http://life.mingpao.com/cfm/dailynews2.cfm?Issue=' + dateStr +'&Category=ncolumn', 'ncl')]:
+                    articles = self.parse_section2(url, keystr)
+                    if articles:
+                        feeds.append((title, articles))
+
+                for title, url in [(u'\u526f\u520a Supplement', 'http://news.mingpao.com/' + dateStr + '/jaindex.htm'),
+                                   (u'\u82f1\u6587 English', 'http://news.mingpao.com/' + dateStr + '/emindex.htm')]:
+                    articles = self.parse_section(url)
+                    if articles:
+                        feeds.append((title, articles))
+            else:
+                for title, url in [(u'\u8981\u805e Headline', 'http://news.mingpao.com/' + dateStr + '/gaindex.htm'),
+                                   (u'\u6e2f\u805e Local', 'http://news.mingpao.com/' + dateStr + '/gbindex.htm'),
+                                   (u'\u6559\u80b2 Education', 'http://news.mingpao.com/' + dateStr + '/gfindex.htm')]:
+                    articles = self.parse_section(url)
+                    if articles:
+                        feeds.append((title, articles))
+
+                # special- editorial
+                ed_articles = self.parse_ed_section('http://life.mingpao.com/cfm/dailynews2.cfm?Issue=' + dateStr +'&Category=nalmr')
+                if ed_articles:
+                    feeds.append((u'\u793e\u8a55/\u7b46\u9663 Editorial', ed_articles))
+
+                for title, url in [(u'\u8ad6\u58c7 Forum', 'http://news.mingpao.com/' + dateStr + '/faindex.htm'),
+                                   (u'\u4e2d\u570b China', 'http://news.mingpao.com/' + dateStr + '/caindex.htm'),
+                                   (u'\u570b\u969b World', 'http://news.mingpao.com/' + dateStr + '/taindex.htm')]:
+                    articles = self.parse_section(url)
+                    if articles:
+                        feeds.append((title, articles))
+
+                # special - finance
+                #fin_articles = self.parse_fin_section('http://www.mpfinance.com/htm/Finance/' + dateStr + '/News/ea,eb,ecindex.htm')
+                fin_articles = self.parse_fin_section('http://life.mingpao.com/cfm/dailynews2.cfm?Issue=' + dateStr + '&Category=nalea')
+                if fin_articles:
+                    feeds.append((u'\u7d93\u6fdf Finance', fin_articles))
+
+                for title, url in [('Tech News', 'http://news.mingpao.com/' + dateStr + '/naindex.htm'),
+                                   (u'\u9ad4\u80b2 Sport', 'http://news.mingpao.com/' + dateStr + '/spindex.htm')]:
+                    articles = self.parse_section(url)
+                    if articles:
+                        feeds.append((title, articles))
+
+                # special - entertainment
+                ent_articles = self.parse_ent_section('http://ol.mingpao.com/cfm/star1.cfm')
+                if ent_articles:
+                    feeds.append((u'\u5f71\u8996 Film/TV', ent_articles))
+
+                for title, url in [(u'\u526f\u520a Supplement', 'http://news.mingpao.com/' + dateStr + '/jaindex.htm'),
+                                   (u'\u82f1\u6587 English', 'http://news.mingpao.com/' + dateStr + '/emindex.htm')]:
+                    articles = self.parse_section(url)
+                    if articles:
+                        feeds.append((title, articles))
+
+
+                # special- columns
+                col_articles = self.parse_col_section('http://life.mingpao.com/cfm/dailynews2.cfm?Issue=' + dateStr +'&Category=ncolumn')
+                if col_articles:
+                    feeds.append((u'\u5c08\u6b04 Columns', col_articles))
+        elif __Region__ == 'Vancouver':
+            for title, url in [(u'\u8981\u805e Headline', 'http://www.mingpaovan.com/htm/News/' + dateStr + '/VAindex.htm'),
+                               (u'\u52a0\u570b Canada', 'http://www.mingpaovan.com/htm/News/' + dateStr + '/VBindex.htm'),
+                               (u'\u793e\u5340 Local', 'http://www.mingpaovan.com/htm/News/' + dateStr + '/VDindex.htm'),
+                               (u'\u6e2f\u805e Hong Kong', 'http://www.mingpaovan.com/htm/News/' + dateStr + '/HK-VGindex.htm'),
+                               (u'\u570b\u969b World', 'http://www.mingpaovan.com/htm/News/' + dateStr + '/VTindex.htm'),
+                               (u'\u4e2d\u570b China', 'http://www.mingpaovan.com/htm/News/' + dateStr + '/VCindex.htm'),
+                               (u'\u7d93\u6fdf Economics', 'http://www.mingpaovan.com/htm/News/' + dateStr + '/VEindex.htm'),
+                               (u'\u9ad4\u80b2 Sports', 'http://www.mingpaovan.com/htm/News/' + dateStr + '/VSindex.htm'),
+                               (u'\u5f71\u8996 Film/TV', 'http://www.mingpaovan.com/htm/News/' + dateStr + '/HK-MAindex.htm'),
+                               (u'\u526f\u520a Supplements', 'http://www.mingpaovan.com/htm/News/' + dateStr + '/WWindex.htm'),]:
+                articles = self.parse_section3(url, 'http://www.mingpaovan.com/')
+                if articles:
+                    feeds.append((title, articles))
+        elif __Region__ == 'Toronto':
+            for title, url in [(u'\u8981\u805e Headline', 'http://www.mingpaotor.com/htm/News/' + dateStr + '/TAindex.htm'),
+                               (u'\u52a0\u570b Canada', 'http://www.mingpaotor.com/htm/News/' + dateStr + '/TDindex.htm'),
+                               (u'\u793e\u5340 Local', 'http://www.mingpaotor.com/htm/News/' + dateStr + '/TFindex.htm'),
+                               (u'\u4e2d\u570b China', 'http://www.mingpaotor.com/htm/News/' + dateStr + '/TCAindex.htm'),
+                               (u'\u570b\u969b World', 'http://www.mingpaotor.com/htm/News/' + dateStr + '/TTAindex.htm'),
+                               (u'\u6e2f\u805e Hong Kong', 'http://www.mingpaotor.com/htm/News/' + dateStr + '/HK-GAindex.htm'),
+                               (u'\u7d93\u6fdf Economics', 'http://www.mingpaotor.com/htm/News/' + dateStr + '/THindex.htm'),
+                               (u'\u9ad4\u80b2 Sports', 'http://www.mingpaotor.com/htm/News/' + dateStr + '/TSindex.htm'),
+                               (u'\u5f71\u8996 Film/TV', 'http://www.mingpaotor.com/htm/News/' + dateStr + '/HK-MAindex.htm'),
+                               (u'\u526f\u520a Supplements', 'http://www.mingpaotor.com/htm/News/' + dateStr + '/WWindex.htm'),]:
+                articles = self.parse_section3(url, 'http://www.mingpaotor.com/')
+                if articles:
+                    feeds.append((title, articles))
+        return feeds
+
+    # parse from news.mingpao.com
+    def parse_section(self, url):
+        dateStr = self.get_fetchdate()
+        soup = self.index_to_soup(url)
+        divs = soup.findAll(attrs={'class': ['bullet','bullet_grey']})
+        current_articles = []
+        included_urls = []
+        divs.reverse()
+        for i in divs:
+            a = i.find('a', href = True)
+            title = self.tag_to_string(a)
+            url = a.get('href', False)
+            url = 'http://news.mingpao.com/' + dateStr + '/' +url
+            if url not in included_urls and url.rfind('Redirect') == -1:
+                current_articles.append({'title': title, 'url': url, 'description':'', 'date':''})
+                included_urls.append(url)
+        current_articles.reverse()
+        return current_articles
+
+    # parse from life.mingpao.com
+    def parse_section2(self, url, keystr):
+        self.get_fetchdate()
+        soup = self.index_to_soup(url)
+        a = soup.findAll('a', href=True)
+        a.reverse()
+        current_articles = []
+        included_urls = []
+        for i in a:
+            title = self.tag_to_string(i)
+            url = 'http://life.mingpao.com/cfm/' + i.get('href', False)
+            if (url not in included_urls) and (not url.rfind('.txt') == -1) and (not url.rfind(keystr) == -1):
+                url = url.replace('dailynews3.cfm', 'dailynews3a.cfm')  # use printed version of the article
+                current_articles.append({'title': title, 'url': url, 'description': ''})
+                included_urls.append(url)
+        current_articles.reverse()
+        return current_articles
+
+    # parse from www.mingpaovan.com and www.mingpaotor.com (Vancouver / Toronto editions)
+    def parse_section3(self, url, baseUrl):
+        self.get_fetchdate()
+        soup = self.index_to_soup(url)
+        divs = soup.findAll(attrs={'class': ['ListContentLargeLink']})
+        current_articles = []
+        included_urls = []
+        divs.reverse()
+        for i in divs:
+            title = self.tag_to_string(i)
+            urlstr = i.get('href', False)
+            urlstr = baseUrl + '/' + urlstr.replace('../../../', '')
+            if urlstr not in included_urls:
+                current_articles.append({'title': title, 'url': urlstr, 'description': '', 'date': ''})
+                included_urls.append(urlstr)
+        current_articles.reverse()
+        return current_articles
+
+    def parse_ed_section(self, url):
+        self.get_fetchdate()
+        soup = self.index_to_soup(url)
+        a = soup.findAll('a', href=True)
+        a.reverse()
+        current_articles = []
+        included_urls = []
+        for i in a:
+            title = self.tag_to_string(i)
+            url = 'http://life.mingpao.com/cfm/' + i.get('href', False)
+            if (url not in included_urls) and (not url.rfind('.txt') == -1) and (not url.rfind('nal') == -1):
+                current_articles.append({'title': title, 'url': url, 'description': ''})
+                included_urls.append(url)
+        current_articles.reverse()
+        return current_articles
+
+    def parse_fin_section(self, url):
+        self.get_fetchdate()
+        soup = self.index_to_soup(url)
+        a = soup.findAll('a', href= True)
+        current_articles = []
+        included_urls = []
+        for i in a:
+            #url = 'http://www.mpfinance.com/cfm/' + i.get('href', False)
+            url = 'http://life.mingpao.com/cfm/' + i.get('href', False)
+            #if url not in included_urls and not url.rfind(dateStr) == -1 and url.rfind('index') == -1:
+            if url not in included_urls and (not url.rfind('txt') == -1) and (not url.rfind('nal') == -1):
+                title = self.tag_to_string(i)
+                current_articles.append({'title': title, 'url': url, 'description':''})
+                included_urls.append(url)
+        return current_articles
+
+    def parse_ent_section(self, url):
+        self.get_fetchdate()
+        soup = self.index_to_soup(url)
+        a = soup.findAll('a', href=True)
+        a.reverse()
+        current_articles = []
+        included_urls = []
+        for i in a:
+            title = self.tag_to_string(i)
+            url = 'http://ol.mingpao.com/cfm/' + i.get('href', False)
+            if (url not in included_urls) and (not url.rfind('.txt') == -1) and (not url.rfind('star') == -1):
+                current_articles.append({'title': title, 'url': url, 'description': ''})
+                included_urls.append(url)
+        current_articles.reverse()
+        return current_articles
+
+    def parse_col_section(self, url):
+        self.get_fetchdate()
+        soup = self.index_to_soup(url)
+        a = soup.findAll('a', href=True)
+        a.reverse()
+        current_articles = []
+        included_urls = []
+        for i in a:
+            title = self.tag_to_string(i)
+            url = 'http://life.mingpao.com/cfm/' + i.get('href', False)
+            if (url not in included_urls) and (not url.rfind('.txt') == -1) and (not url.rfind('ncl') == -1):
+                current_articles.append({'title': title, 'url': url, 'description': ''})
+                included_urls.append(url)
+        current_articles.reverse()
+        return current_articles
+
+    def preprocess_html(self, soup):
+        for item in soup.findAll(style=True):
+            del item['style']
+        for item in soup.findAll(style=True):
+            del item['width']
+        for item in soup.findAll(stype=True):
+            del item['absmiddle']
+        return soup
+
+    def create_opf(self, feeds, dir=None):
+        """Write index.opf and index.ncx for the downloaded feeds.
+
+        feeds: sequence of feed objects; each is iterated for its articles and
+               read for title/author/description.
+        dir:   output directory; defaults to self.output_dir.
+
+        The book title is the Chinese title when __UseChineseTitle__ is set,
+        otherwise self.short_title(); when __MakePeriodical__ is False the
+        fetch date is appended to it.
+
+        Raises Exception when feeds is empty.
+        """
+        if dir is None:
+            dir = self.output_dir
+        if __UseChineseTitle__ == True:
+            if __Region__ == 'Hong Kong':
+                title = u'\u660e\u5831 (\u9999\u6e2f)'
+            elif __Region__ == 'Vancouver':
+                title = u'\u660e\u5831 (\u6eab\u54e5\u83ef)'
+            elif __Region__ == 'Toronto':
+                title = u'\u660e\u5831 (\u591a\u502b\u591a)'
+        else:
+            title = self.short_title()
+        # if not generating a periodical, force date to apply in title
+        if __MakePeriodical__ == False:
+            title = title + ' ' + self.get_fetchformatteddate()
+        if True:
+            # book metadata: the publisher doubles as the author
+            mi = MetaInformation(title, [self.publisher])
+            mi.publisher = self.publisher
+            mi.author_sort = self.publisher
+            if __MakePeriodical__ == True:
+                mi.publication_type = 'periodical:'+self.publication_type+':'+self.short_title()
+            else:
+                mi.publication_type = self.publication_type+':'+self.short_title()
+            #mi.timestamp = nowf()
+            # use the edition time of the region rather than the current time
+            mi.timestamp = self.get_dtlocal()
+            mi.comments = self.description
+            if not isinstance(mi.comments, unicode):
+                mi.comments = mi.comments.decode('utf-8', 'replace')
+            #mi.pubdate = nowf()
+            mi.pubdate = self.get_dtlocal()
+            opf_path = os.path.join(dir, 'index.opf')
+            ncx_path = os.path.join(dir, 'index.ncx')
+            opf = OPFCreator(dir, mi)
+            # Add mastheadImage entry to <guide> section
+            mp = getattr(self, 'masthead_path', None)
+            if mp is not None and os.access(mp, os.R_OK):
+                from calibre.ebooks.metadata.opf2 import Guide
+                ref = Guide.Reference(os.path.basename(self.masthead_path), os.getcwdu())
+                ref.type = 'masthead'
+                ref.title = 'Masthead Image'
+                opf.guide.append(ref)
+
+            # manifest: one directory per feed plus the index files
+            manifest = [os.path.join(dir, 'feed_%d'%i) for i in range(len(feeds))]
+            manifest.append(os.path.join(dir, 'index.html'))
+            manifest.append(os.path.join(dir, 'index.ncx'))
+
+            # Get cover
+            cpath = getattr(self, 'cover_path', None)
+            if cpath is None:
+                # no cover set: ask default_cover() to write one to cover.jpg
+                # NOTE(review): pf is never closed explicitly here.
+                pf = open(os.path.join(dir, 'cover.jpg'), 'wb')
+                if self.default_cover(pf):
+                    cpath =  pf.name
+            if cpath is not None and os.access(cpath, os.R_OK):
+                opf.cover = cpath
+                manifest.append(cpath)
+
+            # Get masthead
+            mpath = getattr(self, 'masthead_path', None)
+            if mpath is not None and os.access(mpath, os.R_OK):
+                manifest.append(mpath)
+
+            opf.create_manifest_from_files_in(manifest)
+            # give the NCX and the masthead image fixed manifest ids
+            for mani in opf.manifest:
+                if mani.path.endswith('.ncx'):
+                    mani.id = 'ncx'
+                if mani.path.endswith('mastheadImage.jpg'):
+                    mani.id = 'masthead-image'
+            # spine entries (relative paths, in reading order) and the TOC root
+            entries = ['index.html']
+            toc = TOC(base_path=dir)
+            self.play_order_counter = 0
+            self.play_order_map = {}
+
+        def feed_index(num, parent):
+            # Add every downloaded article of feed number `num` to the spine
+            # and, as a child of `parent`, to the TOC; then append a
+            # navigation bar to the last page of each article.
+            f = feeds[num]
+            for j, a in enumerate(f):
+                if getattr(a, 'downloaded', False):
+                    adir = 'feed_%d/article_%d/'%(num, j)
+                    auth = a.author
+                    if not auth:
+                        auth = None
+                    desc = a.text_summary
+                    if not desc:
+                        desc = None
+                    else:
+                        desc = self.description_limiter(desc)
+                    entries.append('%sindex.html'%adir)
+                    # reuse a recorded play order, otherwise take the next number
+                    po = self.play_order_map.get(entries[-1], None)
+                    if po is None:
+                        self.play_order_counter += 1
+                        po = self.play_order_counter
+                    parent.add_item('%sindex.html'%adir, None, a.title if a.title else _('Untitled Article'),
+                                    play_order=po, author=auth, description=desc)
+                    # `last` ends up as the final page of the article: its
+                    # index.html, or the last sub page when there are any
+                    last = os.path.join(self.output_dir, ('%sindex.html'%adir).replace('/', os.sep))
+                    for sp in a.sub_pages:
+                        prefix = os.path.commonprefix([opf_path, sp])
+                        relp = sp[len(prefix):]
+                        entries.append(relp.replace(os.sep, '/'))
+                        last = sp
+
+                    if os.path.exists(last):
+                        with open(last, 'rb') as fi:
+                            src = fi.read().decode('utf-8')
+                        soup = BeautifulSoup(src)
+                        body = soup.find('body')
+                        if body is not None:
+                            # two '..' path components per 'link<N>' found in the path of the page
+                            prefix = '/'.join('..'for i in range(2*len(re.findall(r'link\d+', last))))
+                            templ = self.navbar.generate(True, num, j, len(f),
+                                            not self.has_single_feed,
+                                            a.orig_url, self.publisher, prefix=prefix,
+                                            center=self.center_navbar)
+                            elem = BeautifulSoup(templ.render(doctype='xhtml').decode('utf-8')).find('div')
+                            # append the navbar at the end of <body> and rewrite the page
+                            body.insert(len(body.contents), elem)
+                            with open(last, 'wb') as fi:
+                                fi.write(unicode(soup).encode('utf-8'))
+        if len(feeds) == 0:
+            raise Exception('All feeds are empty, aborting.')
+
+        if len(feeds) > 1:
+            # several feeds: each feed gets its own TOC node with its
+            # articles nested below it
+            for i, f in enumerate(feeds):
+                entries.append('feed_%d/index.html'%i)
+                po = self.play_order_map.get(entries[-1], None)
+                if po is None:
+                    self.play_order_counter += 1
+                    po = self.play_order_counter
+                auth = getattr(f, 'author', None)
+                if not auth:
+                    auth = None
+                desc = getattr(f, 'description', None)
+                if not desc:
+                    desc = None
+                feed_index(i, toc.add_item('feed_%d/index.html'%i, None,
+                           f.title, play_order=po, description=desc, author=auth))
+
+        else:
+            # single feed: its articles hang directly off the TOC root
+            entries.append('feed_%d/index.html'%0)
+            feed_index(0, toc)
+
+        # turn the relative spine entries into paths below the output directory
+        for i, p in enumerate(entries):
+            entries[i] = os.path.join(dir, p.replace('/', os.sep))
+        opf.create_spine(entries)
+        opf.set_toc(toc)
+
+        with nested(open(opf_path, 'wb'), open(ncx_path, 'wb')) as (opf_file, ncx_file):
+            opf.render(opf_file, ncx_file)
+