Updated Ming Pao

This commit is contained in:
Kovid Goyal 2011-12-20 09:19:38 +05:30
parent cef64ff0e7
commit ff6dd9c16a
3 changed files with 1106 additions and 359 deletions

View File

@ -10,6 +10,10 @@ __MakePeriodical__ = True
__UseChineseTitle__ = False __UseChineseTitle__ = False
# Set it to False if you want to skip images (Default: True) # Set it to False if you want to skip images (Default: True)
__KeepImages__ = True __KeepImages__ = True
# Set it to True if you want to include a summary in Kindle's article view (Default: False)
__IncludeSummary__ = False
# Set it to True if you want thumbnail images in Kindle's article view (Default: True)
__IncludeThumbnails__ = True
# (HK only) Turn below to True if you wish to use life.mingpao.com as the main article source (Default: True) # (HK only) Turn below to True if you wish to use life.mingpao.com as the main article source (Default: True)
__UseLife__ = True __UseLife__ = True
# (HK only) It is to disable premium content (Default: False) # (HK only) It is to disable premium content (Default: False)
@ -24,6 +28,9 @@ __Date__ = ''
''' '''
Change Log: Change Log:
2011/12/18: update the overridden create_odf(.) routine with the one from Calibre version 0.8.31. Move __UseChineseTitle__ usage away
from create_odf(.). Optional support of text_summary and thumbnail images in Kindle's article view. Start new day
download of Hong Kong Mingpao at 4.30am. Set the actual publication date shown on kindle device.
2011/12/01: take care of situation that in txt source parsing, the article content does start with special character u'\u3010' 2011/12/01: take care of situation that in txt source parsing, the article content does start with special character u'\u3010'
2011/10/21: fix a bug that hi-res img is unavailable in pages parsed from source txt 2011/10/21: fix a bug that hi-res img is unavailable in pages parsed from source txt
2011/10/19: fix a bug in txt source parsing 2011/10/19: fix a bug in txt source parsing
@ -53,6 +60,8 @@ Change Log:
2010/10/31: skip repeated articles in section pages 2010/10/31: skip repeated articles in section pages
''' '''
from calibre import (browser, iswindows, __appname__, force_unicode, preferred_encoding, as_unicode)
from calibre.utils.date import now as nowf
import os, datetime, re, mechanize import os, datetime, re, mechanize
from calibre.web.feeds.recipes import BasicNewsRecipe from calibre.web.feeds.recipes import BasicNewsRecipe
from contextlib import nested from contextlib import nested
@ -60,11 +69,15 @@ from calibre.ebooks.BeautifulSoup import BeautifulSoup
from calibre.ebooks.metadata.opf2 import OPFCreator from calibre.ebooks.metadata.opf2 import OPFCreator
from calibre.ebooks.metadata.toc import TOC from calibre.ebooks.metadata.toc import TOC
from calibre.ebooks.metadata import MetaInformation from calibre.ebooks.metadata import MetaInformation
from calibre.utils.localization import canonicalize_lang
# MAIN CLASS # MAIN CLASS
class MPRecipe(BasicNewsRecipe): class MPRecipe(BasicNewsRecipe):
if __Region__ == 'Hong Kong': if __Region__ == 'Hong Kong':
title = 'Ming Pao - Hong Kong' if __UseChineseTitle__ == True:
title = u'\u660e\u5831 (\u9999\u6e2f)'
else:
title = 'Ming Pao - Hong Kong'
description = 'Hong Kong Chinese Newspaper (http://news.mingpao.com)' description = 'Hong Kong Chinese Newspaper (http://news.mingpao.com)'
category = 'Chinese, News, Hong Kong' category = 'Chinese, News, Hong Kong'
extra_css = 'img {display: block; margin-left: auto; margin-right: auto; margin-top: 10px; margin-bottom: 10px; max-height:90%;} font>b {font-size:200%; font-weight:bold;} div[class=heading] {font-size:200%; font-weight:bold;} div[class=images] {font-size:50%;}' extra_css = 'img {display: block; margin-left: auto; margin-right: auto; margin-top: 10px; margin-bottom: 10px; max-height:90%;} font>b {font-size:200%; font-weight:bold;} div[class=heading] {font-size:200%; font-weight:bold;} div[class=images] {font-size:50%;}'
@ -109,7 +122,10 @@ class MPRecipe(BasicNewsRecipe):
lambda match: "</b>") lambda match: "</b>")
] ]
elif __Region__ == 'Vancouver': elif __Region__ == 'Vancouver':
title = 'Ming Pao - Vancouver' if __UseChineseTitle__ == True:
title = u'\u660e\u5831 (\u6eab\u54e5\u83ef)'
else:
title = 'Ming Pao - Vancouver'
description = 'Vancouver Chinese Newspaper (http://www.mingpaovan.com)' description = 'Vancouver Chinese Newspaper (http://www.mingpaovan.com)'
category = 'Chinese, News, Vancouver' category = 'Chinese, News, Vancouver'
extra_css = 'img {display: block; margin-left: auto; margin-right: auto; margin-top: 10px; margin-bottom: 10px;} b>font {font-size:200%; font-weight:bold;}' extra_css = 'img {display: block; margin-left: auto; margin-right: auto; margin-top: 10px; margin-bottom: 10px;} b>font {font-size:200%; font-weight:bold;}'
@ -127,7 +143,10 @@ class MPRecipe(BasicNewsRecipe):
lambda match: ''), lambda match: ''),
] ]
elif __Region__ == 'Toronto': elif __Region__ == 'Toronto':
title = 'Ming Pao - Toronto' if __UseChineseTitle__ == True:
title = u'\u660e\u5831 (\u591a\u502b\u591a)'
else:
title = 'Ming Pao - Toronto'
description = 'Toronto Chinese Newspaper (http://www.mingpaotor.com)' description = 'Toronto Chinese Newspaper (http://www.mingpaotor.com)'
category = 'Chinese, News, Toronto' category = 'Chinese, News, Toronto'
extra_css = 'img {display: block; margin-left: auto; margin-right: auto; margin-top: 10px; margin-bottom: 10px;} b>font {font-size:200%; font-weight:bold;}' extra_css = 'img {display: block; margin-left: auto; margin-right: auto; margin-top: 10px; margin-bottom: 10px;} b>font {font-size:200%; font-weight:bold;}'
@ -161,9 +180,9 @@ class MPRecipe(BasicNewsRecipe):
def get_dtlocal(self): def get_dtlocal(self):
dt_utc = datetime.datetime.utcnow() dt_utc = datetime.datetime.utcnow()
if __Region__ == 'Hong Kong': if __Region__ == 'Hong Kong':
# convert UTC to local hk time - at HKT 5.30am, all news are available # convert UTC to local hk time - at HKT 4.30am, all news are available
dt_local = dt_utc + datetime.timedelta(8.0/24) - datetime.timedelta(5.5/24) dt_local = dt_utc + datetime.timedelta(8.0/24) - datetime.timedelta(4.5/24)
# dt_local = dt_utc.astimezone(pytz.timezone('Asia/Hong_Kong')) - datetime.timedelta(5.5/24) # dt_local = dt_utc.astimezone(pytz.timezone('Asia/Hong_Kong')) - datetime.timedelta(4.5/24)
elif __Region__ == 'Vancouver': elif __Region__ == 'Vancouver':
# convert UTC to local Vancouver time - at PST time 5.30am, all news are available # convert UTC to local Vancouver time - at PST time 5.30am, all news are available
dt_local = dt_utc + datetime.timedelta(-8.0/24) - datetime.timedelta(5.5/24) dt_local = dt_utc + datetime.timedelta(-8.0/24) - datetime.timedelta(5.5/24)
@ -185,6 +204,18 @@ class MPRecipe(BasicNewsRecipe):
return __Date__[0:4]+'-'+__Date__[4:6]+'-'+__Date__[6:8] return __Date__[0:4]+'-'+__Date__[4:6]+'-'+__Date__[6:8]
else: else:
return self.get_dtlocal().strftime("%Y-%m-%d") return self.get_dtlocal().strftime("%Y-%m-%d")
def get_fetchyear(self):
if __Date__ <> '':
return __Date__[0:4]
else:
return self.get_dtlocal().strftime("%Y")
def get_fetchmonth(self):
if __Date__ <> '':
return __Date__[4:6]
else:
return self.get_dtlocal().strftime("%m")
def get_fetchday(self): def get_fetchday(self):
if __Date__ <> '': if __Date__ <> '':
@ -654,77 +685,153 @@ class MPRecipe(BasicNewsRecipe):
del item['absmiddle'] del item['absmiddle']
return soup return soup
def populate_article_metadata(self, article, soup, first):
# thumbnails shouldn't be available if using hi-res images
if __IncludeThumbnails__ and __HiResImg__ == False and first and hasattr(self, 'add_toc_thumbnail'):
img = soup.find('img')
if img is not None:
self.add_toc_thumbnail(article, img['src'])
try:
if __IncludeSummary__ and len(article.text_summary.strip()) == 0:
# look for content
articlebodies = soup.findAll('div',attrs={'id':'newscontent'})
if not articlebodies:
articlebodies = soup.findAll('div',attrs={'id':'newscontent01'})
if not articlebodies:
articlebodies = soup.findAll('div',attrs={'class':'content'})
if not articlebodies:
articlebodies = soup.findAll('div', attrs={'id':'font'})
if articlebodies:
for articlebody in articlebodies:
if articlebody:
# the text may or may not be enclosed in <p></p> tag
paras = articlebody.findAll('p')
if not paras:
paras = articlebody
textFound = False
for p in paras:
if not textFound:
summary_candidate = self.tag_to_string(p).strip()
summary_candidate = summary_candidate.replace(u'\u3010\u660e\u5831\u5c08\u8a0a\u3011', '', 1)
if len(summary_candidate) > 0:
article.summary = article.text_summary = summary_candidate
textFound = True
else:
# display a simple text
#article.summary = article.text_summary = u'\u66f4\u591a......'
# display word counts
counts = 0
articlebodies = soup.findAll('div',attrs={'id':'newscontent'})
if not articlebodies:
articlebodies = soup.findAll('div',attrs={'id':'newscontent01'})
if not articlebodies:
articlebodies = soup.findAll('div',attrs={'class':'content'})
if not articlebodies:
articlebodies = soup.findAll('div', attrs={'id':'font'})
if articlebodies:
for articlebody in articlebodies:
# the text may or may not be enclosed in <p></p> tag
paras = articlebody.findAll('p')
if not paras:
paras = articlebody
for p in paras:
summary_candidate = self.tag_to_string(p).strip()
counts += len(summary_candidate)
article.summary = article.text_summary = u'\uff08' + str(counts) + u'\u5b57\uff09'
except:
self.log("Error creating article descriptions")
return
# override from the one in version 0.8.31
def create_opf(self, feeds, dir=None): def create_opf(self, feeds, dir=None):
if dir is None: if dir is None:
dir = self.output_dir dir = self.output_dir
if __UseChineseTitle__ == True: title = self.short_title()
if __Region__ == 'Hong Kong': # change 1: allow our own flag to tell if a periodical is to be generated
title = u'\u660e\u5831 (\u9999\u6e2f)' # also use customed date instead of current time
elif __Region__ == 'Vancouver': if __MakePeriodical__ == False or self.output_profile.periodical_date_in_title:
title = u'\u660e\u5831 (\u6eab\u54e5\u83ef)'
elif __Region__ == 'Toronto':
title = u'\u660e\u5831 (\u591a\u502b\u591a)'
else:
title = self.short_title()
# if not generating a periodical, force date to apply in title
if __MakePeriodical__ == False:
title = title + ' ' + self.get_fetchformatteddate() title = title + ' ' + self.get_fetchformatteddate()
if True: # end of change 1
mi = MetaInformation(title, [self.publisher]) # change 2: __appname__ replaced by newspaper publisher
mi.publisher = self.publisher __appname__ = self.publisher
mi.author_sort = self.publisher mi = MetaInformation(title, [__appname__])
if __MakePeriodical__ == True: mi.publisher = __appname__
mi.publication_type = 'periodical:'+self.publication_type+':'+self.short_title() mi.author_sort = __appname__
else: # change 3: use __MakePeriodical__ flag to tell if a periodical should be generated
mi.publication_type = self.publication_type+':'+self.short_title() if __MakePeriodical__ == True:
#mi.timestamp = nowf() mi.publication_type = 'periodical:'+self.publication_type+':'+self.short_title()
mi.timestamp = self.get_dtlocal() else:
mi.comments = self.description mi.publication_type = self.publication_type+':'+self.short_title()
if not isinstance(mi.comments, unicode): #mi.publication_type = 'periodical:'+self.publication_type+':'+self.short_title()
mi.comments = mi.comments.decode('utf-8', 'replace') # change 4: in the following, all the nowf() are changed to adjusted time
#mi.pubdate = nowf() # This one doesn't matter
mi.pubdate = self.get_dtlocal() mi.timestamp = nowf()
opf_path = os.path.join(dir, 'index.opf') # change 5: skip listing the articles
ncx_path = os.path.join(dir, 'index.ncx') #article_titles, aseen = [], set()
opf = OPFCreator(dir, mi) #for f in feeds:
# Add mastheadImage entry to <guide> section # for a in f:
mp = getattr(self, 'masthead_path', None) # if a.title and a.title not in aseen:
if mp is not None and os.access(mp, os.R_OK): # aseen.add(a.title)
from calibre.ebooks.metadata.opf2 import Guide # article_titles.append(force_unicode(a.title, 'utf-8'))
ref = Guide.Reference(os.path.basename(self.masthead_path), os.getcwdu())
ref.type = 'masthead'
ref.title = 'Masthead Image'
opf.guide.append(ref)
manifest = [os.path.join(dir, 'feed_%d'%i) for i in range(len(feeds))] #mi.comments = self.description
manifest.append(os.path.join(dir, 'index.html')) #if not isinstance(mi.comments, unicode):
manifest.append(os.path.join(dir, 'index.ncx')) # mi.comments = mi.comments.decode('utf-8', 'replace')
#mi.comments += ('\n\n' + _('Articles in this issue: ') + '\n' +
# '\n\n'.join(article_titles))
# Get cover language = canonicalize_lang(self.language)
cpath = getattr(self, 'cover_path', None) if language is not None:
if cpath is None: mi.language = language
pf = open(os.path.join(dir, 'cover.jpg'), 'wb') # This one affects the pub date shown in kindle title
if self.default_cover(pf): #mi.pubdate = nowf()
cpath = pf.name # now appears to need the time field to be > 12.00noon as well
if cpath is not None and os.access(cpath, os.R_OK): mi.pubdate = datetime.datetime(int(self.get_fetchyear()), int(self.get_fetchmonth()), int(self.get_fetchday()), 12, 30, 0)
opf.cover = cpath opf_path = os.path.join(dir, 'index.opf')
manifest.append(cpath) ncx_path = os.path.join(dir, 'index.ncx')
# Get masthead opf = OPFCreator(dir, mi)
mpath = getattr(self, 'masthead_path', None) # Add mastheadImage entry to <guide> section
if mpath is not None and os.access(mpath, os.R_OK): mp = getattr(self, 'masthead_path', None)
manifest.append(mpath) if mp is not None and os.access(mp, os.R_OK):
from calibre.ebooks.metadata.opf2 import Guide
ref = Guide.Reference(os.path.basename(self.masthead_path), os.getcwdu())
ref.type = 'masthead'
ref.title = 'Masthead Image'
opf.guide.append(ref)
manifest = [os.path.join(dir, 'feed_%d'%i) for i in range(len(feeds))]
manifest.append(os.path.join(dir, 'index.html'))
manifest.append(os.path.join(dir, 'index.ncx'))
# Get cover
cpath = getattr(self, 'cover_path', None)
if cpath is None:
pf = open(os.path.join(dir, 'cover.jpg'), 'wb')
if self.default_cover(pf):
cpath = pf.name
if cpath is not None and os.access(cpath, os.R_OK):
opf.cover = cpath
manifest.append(cpath)
# Get masthead
mpath = getattr(self, 'masthead_path', None)
if mpath is not None and os.access(mpath, os.R_OK):
manifest.append(mpath)
opf.create_manifest_from_files_in(manifest)
for mani in opf.manifest:
if mani.path.endswith('.ncx'):
mani.id = 'ncx'
if mani.path.endswith('mastheadImage.jpg'):
mani.id = 'masthead-image'
entries = ['index.html']
toc = TOC(base_path=dir)
self.play_order_counter = 0
self.play_order_map = {}
opf.create_manifest_from_files_in(manifest)
for mani in opf.manifest:
if mani.path.endswith('.ncx'):
mani.id = 'ncx'
if mani.path.endswith('mastheadImage.jpg'):
mani.id = 'masthead-image'
entries = ['index.html']
toc = TOC(base_path=dir)
self.play_order_counter = 0
self.play_order_map = {}
def feed_index(num, parent): def feed_index(num, parent):
f = feeds[num] f = feeds[num]
@ -739,13 +846,16 @@ class MPRecipe(BasicNewsRecipe):
desc = None desc = None
else: else:
desc = self.description_limiter(desc) desc = self.description_limiter(desc)
tt = a.toc_thumbnail if a.toc_thumbnail else None
entries.append('%sindex.html'%adir) entries.append('%sindex.html'%adir)
po = self.play_order_map.get(entries[-1], None) po = self.play_order_map.get(entries[-1], None)
if po is None: if po is None:
self.play_order_counter += 1 self.play_order_counter += 1
po = self.play_order_counter po = self.play_order_counter
parent.add_item('%sindex.html'%adir, None, a.title if a.title else _('Untitled Article'), parent.add_item('%sindex.html'%adir, None,
play_order=po, author=auth, description=desc) a.title if a.title else _('Untitled Article'),
play_order=po, author=auth,
description=desc, toc_thumbnail=tt)
last = os.path.join(self.output_dir, ('%sindex.html'%adir).replace('/', os.sep)) last = os.path.join(self.output_dir, ('%sindex.html'%adir).replace('/', os.sep))
for sp in a.sub_pages: for sp in a.sub_pages:
prefix = os.path.commonprefix([opf_path, sp]) prefix = os.path.commonprefix([opf_path, sp])
@ -762,7 +872,7 @@ class MPRecipe(BasicNewsRecipe):
prefix = '/'.join('..'for i in range(2*len(re.findall(r'link\d+', last)))) prefix = '/'.join('..'for i in range(2*len(re.findall(r'link\d+', last))))
templ = self.navbar.generate(True, num, j, len(f), templ = self.navbar.generate(True, num, j, len(f),
not self.has_single_feed, not self.has_single_feed,
a.orig_url, self.publisher, prefix=prefix, a.orig_url, __appname__, prefix=prefix,
center=self.center_navbar) center=self.center_navbar)
elem = BeautifulSoup(templ.render(doctype='xhtml').decode('utf-8')).find('div') elem = BeautifulSoup(templ.render(doctype='xhtml').decode('utf-8')).find('div')
body.insert(len(body.contents), elem) body.insert(len(body.contents), elem)
@ -785,7 +895,7 @@ class MPRecipe(BasicNewsRecipe):
if not desc: if not desc:
desc = None desc = None
feed_index(i, toc.add_item('feed_%d/index.html'%i, None, feed_index(i, toc.add_item('feed_%d/index.html'%i, None,
f.title, play_order=po, description=desc, author=auth)) f.title, play_order=po, description=desc, author=auth))
else: else:
entries.append('feed_%d/index.html'%0) entries.append('feed_%d/index.html'%0)
@ -798,4 +908,5 @@ class MPRecipe(BasicNewsRecipe):
with nested(open(opf_path, 'wb'), open(ncx_path, 'wb')) as (opf_file, ncx_file): with nested(open(opf_path, 'wb'), open(ncx_path, 'wb')) as (opf_file, ncx_file):
opf.render(opf_file, ncx_file) opf.render(opf_file, ncx_file)

View File

@ -4,18 +4,41 @@ __copyright__ = '2010-2011, Eddie Lau'
# Region - Hong Kong, Vancouver, Toronto # Region - Hong Kong, Vancouver, Toronto
__Region__ = 'Toronto' __Region__ = 'Toronto'
# Users of Kindle 3 with limited system-level CJK support # Users of Kindle 3 with limited system-level CJK support
# please replace the following "True" with "False". # please replace the following "True" with "False". (Default: True)
__MakePeriodical__ = True __MakePeriodical__ = True
# Turn below to true if your device supports display of CJK titles # Turn below to True if your device supports display of CJK titles (Default: False)
__UseChineseTitle__ = False __UseChineseTitle__ = False
# Set it to False if you want to skip images # Set it to False if you want to skip images (Default: True)
__KeepImages__ = True __KeepImages__ = True
# (HK only) Turn below to true if you wish to use life.mingpao.com as the main article source # Set it to True if you want to include a summary in Kindle's article view (Default: False)
__IncludeSummary__ = False
# Set it to True if you want thumbnail images in Kindle's article view (Default: True)
__IncludeThumbnails__ = True
# (HK only) Turn below to True if you wish to use life.mingpao.com as the main article source (Default: True)
__UseLife__ = True __UseLife__ = True
# (HK only) It is to disable premium content (Default: False)
__InclPremium__ = False
# (HK only) Turn below to True if you wish to parse articles in news.mingpao.com with their printer-friendly formats (Default: True)
__ParsePFF__ = True
# (HK only) Turn below to True if you wish hi-res images (Default: False)
__HiResImg__ = False
# Override the date returned by the program if specifying a YYYYMMDD below
__Date__ = ''
''' '''
Change Log: Change Log:
2011/12/18: update the overridden create_odf(.) routine with the one from Calibre version 0.8.31. Move __UseChineseTitle__ usage away
from create_odf(.). Optional support of text_summary and thumbnail images in Kindle's article view. Start new day
download of Hong Kong Mingpao at 4.30am. Set the actual publication date shown on kindle device.
2011/12/01: take care of situation that in txt source parsing, the article content does start with special character u'\u3010'
2011/10/21: fix a bug that hi-res img is unavailable in pages parsed from source txt
2011/10/19: fix a bug in txt source parsing
2011/10/17: disable fetching of premium content, also improved txt source parsing
2011/10/04: option to get hi-res photos for the articles
2011/09/21: fetching "column" section is made optional.
2011/09/18: parse "column" section stuff from source text file directly.
2011/09/07: disable "column" section as it is no longer offered free.
2011/06/26: add fetching Vancouver and Toronto versions of the paper, also provide captions for images using life.mingpao fetch source 2011/06/26: add fetching Vancouver and Toronto versions of the paper, also provide captions for images using life.mingpao fetch source
provide options to remove all images in the file provide options to remove all images in the file
2011/05/12: switch the main parse source to life.mingpao.com, which has more photos on the article pages 2011/05/12: switch the main parse source to life.mingpao.com, which has more photos on the article pages
@ -37,30 +60,39 @@ Change Log:
2010/10/31: skip repeated articles in section pages 2010/10/31: skip repeated articles in section pages
''' '''
import os, datetime, re from calibre import (browser, iswindows, __appname__, force_unicode, preferred_encoding, as_unicode)
from calibre.utils.date import now as nowf
import os, datetime, re, mechanize
from calibre.web.feeds.recipes import BasicNewsRecipe from calibre.web.feeds.recipes import BasicNewsRecipe
from contextlib import nested from contextlib import nested
from calibre.ebooks.BeautifulSoup import BeautifulSoup from calibre.ebooks.BeautifulSoup import BeautifulSoup
from calibre.ebooks.metadata.opf2 import OPFCreator from calibre.ebooks.metadata.opf2 import OPFCreator
from calibre.ebooks.metadata.toc import TOC from calibre.ebooks.metadata.toc import TOC
from calibre.ebooks.metadata import MetaInformation from calibre.ebooks.metadata import MetaInformation
from calibre.utils.localization import canonicalize_lang
# MAIN CLASS # MAIN CLASS
class MPRecipe(BasicNewsRecipe): class MPRecipe(BasicNewsRecipe):
if __Region__ == 'Hong Kong': if __Region__ == 'Hong Kong':
title = 'Ming Pao - Hong Kong' if __UseChineseTitle__ == True:
title = u'\u660e\u5831 (\u9999\u6e2f)'
else:
title = 'Ming Pao - Hong Kong'
description = 'Hong Kong Chinese Newspaper (http://news.mingpao.com)' description = 'Hong Kong Chinese Newspaper (http://news.mingpao.com)'
category = 'Chinese, News, Hong Kong' category = 'Chinese, News, Hong Kong'
extra_css = 'img {display: block; margin-left: auto; margin-right: auto; margin-top: 10px; margin-bottom: 10px;} font>b {font-size:200%; font-weight:bold;}' extra_css = 'img {display: block; margin-left: auto; margin-right: auto; margin-top: 10px; margin-bottom: 10px; max-height:90%;} font>b {font-size:200%; font-weight:bold;} div[class=heading] {font-size:200%; font-weight:bold;} div[class=images] {font-size:50%;}'
masthead_url = 'http://news.mingpao.com/image/portals_top_logo_news.gif' masthead_url = 'http://news.mingpao.com/image/portals_top_logo_news.gif'
keep_only_tags = [dict(name='h1'), keep_only_tags = [dict(name='h1'),
dict(name='font', attrs={'style':['font-size:14pt; line-height:160%;']}), # for entertainment page title dict(name='font', attrs={'style':['font-size:14pt; line-height:160%;']}), # for entertainment page title
dict(name='font', attrs={'color':['AA0000']}), # for column articles title dict(name='font', attrs={'color':['AA0000']}), # for column articles title
dict(attrs={'class':['heading']}), # for heading from txt
dict(attrs={'id':['newscontent']}), # entertainment and column page content dict(attrs={'id':['newscontent']}), # entertainment and column page content
dict(attrs={'id':['newscontent01','newscontent02']}), dict(attrs={'id':['newscontent01','newscontent02']}),
dict(attrs={'class':['content']}), # for content from txt
dict(attrs={'class':['photo']}), dict(attrs={'class':['photo']}),
dict(name='table', attrs={'width':['100%'], 'border':['0'], 'cellspacing':['5'], 'cellpadding':['0']}), # content in printed version of life.mingpao.com dict(name='table', attrs={'width':['100%'], 'border':['0'], 'cellspacing':['5'], 'cellpadding':['0']}), # content in printed version of life.mingpao.com
dict(name='img', attrs={'width':['180'], 'alt':['按圖放大']}) # images for source from life.mingpao.com dict(name='img', attrs={'width':['180'], 'alt':['????']}), # images for source from life.mingpao.com
dict(attrs={'class':['images']}) # for images from txt
] ]
if __KeepImages__: if __KeepImages__:
remove_tags = [dict(name='style'), remove_tags = [dict(name='style'),
@ -90,7 +122,10 @@ class MPRecipe(BasicNewsRecipe):
lambda match: "</b>") lambda match: "</b>")
] ]
elif __Region__ == 'Vancouver': elif __Region__ == 'Vancouver':
title = 'Ming Pao - Vancouver' if __UseChineseTitle__ == True:
title = u'\u660e\u5831 (\u6eab\u54e5\u83ef)'
else:
title = 'Ming Pao - Vancouver'
description = 'Vancouver Chinese Newspaper (http://www.mingpaovan.com)' description = 'Vancouver Chinese Newspaper (http://www.mingpaovan.com)'
category = 'Chinese, News, Vancouver' category = 'Chinese, News, Vancouver'
extra_css = 'img {display: block; margin-left: auto; margin-right: auto; margin-top: 10px; margin-bottom: 10px;} b>font {font-size:200%; font-weight:bold;}' extra_css = 'img {display: block; margin-left: auto; margin-right: auto; margin-top: 10px; margin-bottom: 10px;} b>font {font-size:200%; font-weight:bold;}'
@ -108,7 +143,10 @@ class MPRecipe(BasicNewsRecipe):
lambda match: ''), lambda match: ''),
] ]
elif __Region__ == 'Toronto': elif __Region__ == 'Toronto':
title = 'Ming Pao - Toronto' if __UseChineseTitle__ == True:
title = u'\u660e\u5831 (\u591a\u502b\u591a)'
else:
title = 'Ming Pao - Toronto'
description = 'Toronto Chinese Newspaper (http://www.mingpaotor.com)' description = 'Toronto Chinese Newspaper (http://www.mingpaotor.com)'
category = 'Chinese, News, Toronto' category = 'Chinese, News, Toronto'
extra_css = 'img {display: block; margin-left: auto; margin-right: auto; margin-top: 10px; margin-bottom: 10px;} b>font {font-size:200%; font-weight:bold;}' extra_css = 'img {display: block; margin-left: auto; margin-right: auto; margin-top: 10px; margin-bottom: 10px;} b>font {font-size:200%; font-weight:bold;}'
@ -139,49 +177,12 @@ class MPRecipe(BasicNewsRecipe):
conversion_options = {'linearize_tables':True} conversion_options = {'linearize_tables':True}
timefmt = '' timefmt = ''
def image_url_processor(cls, baseurl, url):
# trick: break the url at the first occurance of digit, add an additional
# '_' at the front
# not working, may need to move this to preprocess_html() method
# minIdx = 10000
# i0 = url.find('0')
# if i0 >= 0 and i0 < minIdx:
# minIdx = i0
# i1 = url.find('1')
# if i1 >= 0 and i1 < minIdx:
# minIdx = i1
# i2 = url.find('2')
# if i2 >= 0 and i2 < minIdx:
# minIdx = i2
# i3 = url.find('3')
# if i3 >= 0 and i0 < minIdx:
# minIdx = i3
# i4 = url.find('4')
# if i4 >= 0 and i4 < minIdx:
# minIdx = i4
# i5 = url.find('5')
# if i5 >= 0 and i5 < minIdx:
# minIdx = i5
# i6 = url.find('6')
# if i6 >= 0 and i6 < minIdx:
# minIdx = i6
# i7 = url.find('7')
# if i7 >= 0 and i7 < minIdx:
# minIdx = i7
# i8 = url.find('8')
# if i8 >= 0 and i8 < minIdx:
# minIdx = i8
# i9 = url.find('9')
# if i9 >= 0 and i9 < minIdx:
# minIdx = i9
return url
def get_dtlocal(self): def get_dtlocal(self):
dt_utc = datetime.datetime.utcnow() dt_utc = datetime.datetime.utcnow()
if __Region__ == 'Hong Kong': if __Region__ == 'Hong Kong':
# convert UTC to local hk time - at HKT 5.30am, all news are available # convert UTC to local hk time - at HKT 4.30am, all news are available
dt_local = dt_utc + datetime.timedelta(8.0/24) - datetime.timedelta(5.5/24) dt_local = dt_utc + datetime.timedelta(8.0/24) - datetime.timedelta(4.5/24)
# dt_local = dt_utc.astimezone(pytz.timezone('Asia/Hong_Kong')) - datetime.timedelta(5.5/24) # dt_local = dt_utc.astimezone(pytz.timezone('Asia/Hong_Kong')) - datetime.timedelta(4.5/24)
elif __Region__ == 'Vancouver': elif __Region__ == 'Vancouver':
# convert UTC to local Vancouver time - at PST time 5.30am, all news are available # convert UTC to local Vancouver time - at PST time 5.30am, all news are available
dt_local = dt_utc + datetime.timedelta(-8.0/24) - datetime.timedelta(5.5/24) dt_local = dt_utc + datetime.timedelta(-8.0/24) - datetime.timedelta(5.5/24)
@ -193,13 +194,34 @@ class MPRecipe(BasicNewsRecipe):
return dt_local return dt_local
def get_fetchdate(self): def get_fetchdate(self):
return self.get_dtlocal().strftime("%Y%m%d") if __Date__ <> '':
return __Date__
else:
return self.get_dtlocal().strftime("%Y%m%d")
def get_fetchformatteddate(self): def get_fetchformatteddate(self):
return self.get_dtlocal().strftime("%Y-%m-%d") if __Date__ <> '':
return __Date__[0:4]+'-'+__Date__[4:6]+'-'+__Date__[6:8]
else:
return self.get_dtlocal().strftime("%Y-%m-%d")
def get_fetchyear(self):
if __Date__ <> '':
return __Date__[0:4]
else:
return self.get_dtlocal().strftime("%Y")
def get_fetchmonth(self):
if __Date__ <> '':
return __Date__[4:6]
else:
return self.get_dtlocal().strftime("%m")
def get_fetchday(self): def get_fetchday(self):
return self.get_dtlocal().strftime("%d") if __Date__ <> '':
return __Date__[6:8]
else:
return self.get_dtlocal().strftime("%d")
def get_cover_url(self): def get_cover_url(self):
if __Region__ == 'Hong Kong': if __Region__ == 'Hong Kong':
@ -230,12 +252,23 @@ class MPRecipe(BasicNewsRecipe):
(u'\u570b\u969b World', 'http://life.mingpao.com/cfm/dailynews2.cfm?Issue=' + dateStr +'&Category=nalta', 'nal'), (u'\u570b\u969b World', 'http://life.mingpao.com/cfm/dailynews2.cfm?Issue=' + dateStr +'&Category=nalta', 'nal'),
(u'\u7d93\u6fdf Finance', 'http://life.mingpao.com/cfm/dailynews2.cfm?Issue=' + dateStr + '&Category=nalea', 'nal'), (u'\u7d93\u6fdf Finance', 'http://life.mingpao.com/cfm/dailynews2.cfm?Issue=' + dateStr + '&Category=nalea', 'nal'),
(u'\u9ad4\u80b2 Sport', 'http://life.mingpao.com/cfm/dailynews2.cfm?Issue=' + dateStr + '&Category=nalsp', 'nal'), (u'\u9ad4\u80b2 Sport', 'http://life.mingpao.com/cfm/dailynews2.cfm?Issue=' + dateStr + '&Category=nalsp', 'nal'),
(u'\u5f71\u8996 Film/TV', 'http://life.mingpao.com/cfm/dailynews2.cfm?Issue=' + dateStr + '&Category=nalma', 'nal'), (u'\u5f71\u8996 Film/TV', 'http://life.mingpao.com/cfm/dailynews2.cfm?Issue=' + dateStr + '&Category=nalma', 'nal')
(u'\u5c08\u6b04 Columns', 'http://life.mingpao.com/cfm/dailynews2.cfm?Issue=' + dateStr +'&Category=ncolumn', 'ncl')]: ]:
articles = self.parse_section2(url, keystr) if __InclPremium__ == True:
articles = self.parse_section2_txt(url, keystr)
else:
articles = self.parse_section2(url, keystr)
if articles: if articles:
feeds.append((title, articles)) feeds.append((title, articles))
if __InclPremium__ == True:
# parse column section articles directly from .txt files
for title, url, keystr in [(u'\u5c08\u6b04 Columns', 'http://life.mingpao.com/cfm/dailynews2.cfm?Issue=' + dateStr +'&Category=ncolumn', 'ncl')
]:
articles = self.parse_section2_txt(url, keystr)
if articles:
feeds.append((title, articles))
for title, url in [(u'\u526f\u520a Supplement', 'http://news.mingpao.com/' + dateStr + '/jaindex.htm'), for title, url in [(u'\u526f\u520a Supplement', 'http://news.mingpao.com/' + dateStr + '/jaindex.htm'),
(u'\u82f1\u6587 English', 'http://news.mingpao.com/' + dateStr + '/emindex.htm')]: (u'\u82f1\u6587 English', 'http://news.mingpao.com/' + dateStr + '/emindex.htm')]:
articles = self.parse_section(url) articles = self.parse_section(url)
@ -244,15 +277,16 @@ class MPRecipe(BasicNewsRecipe):
else: else:
for title, url in [(u'\u8981\u805e Headline', 'http://news.mingpao.com/' + dateStr + '/gaindex.htm'), for title, url in [(u'\u8981\u805e Headline', 'http://news.mingpao.com/' + dateStr + '/gaindex.htm'),
(u'\u6e2f\u805e Local', 'http://news.mingpao.com/' + dateStr + '/gbindex.htm'), (u'\u6e2f\u805e Local', 'http://news.mingpao.com/' + dateStr + '/gbindex.htm'),
(u'\u6559\u80b2 Education', 'http://news.mingpao.com/' + dateStr + '/gfindex.htm')]: (u'\u6559\u80b2 Education', 'http://news.mingpao.com/' + dateStr + '/gfindex.htm'),
(u'\u793e\u8a55/\u7b46\u9663 Editorial', 'http://news.mingpao.com/' + dateStr + '/mrindex.htm')]:
articles = self.parse_section(url) articles = self.parse_section(url)
if articles: if articles:
feeds.append((title, articles)) feeds.append((title, articles))
# special- editorial # special- editorial
ed_articles = self.parse_ed_section('http://life.mingpao.com/cfm/dailynews2.cfm?Issue=' + dateStr +'&Category=nalmr') #ed_articles = self.parse_ed_section('http://life.mingpao.com/cfm/dailynews2.cfm?Issue=' + dateStr +'&Category=nalmr')
if ed_articles: #if ed_articles:
feeds.append((u'\u793e\u8a55/\u7b46\u9663 Editorial', ed_articles)) # feeds.append((u'\u793e\u8a55/\u7b46\u9663 Editorial', ed_articles))
for title, url in [(u'\u8ad6\u58c7 Forum', 'http://news.mingpao.com/' + dateStr + '/faindex.htm'), for title, url in [(u'\u8ad6\u58c7 Forum', 'http://news.mingpao.com/' + dateStr + '/faindex.htm'),
(u'\u4e2d\u570b China', 'http://news.mingpao.com/' + dateStr + '/caindex.htm'), (u'\u4e2d\u570b China', 'http://news.mingpao.com/' + dateStr + '/caindex.htm'),
@ -263,32 +297,46 @@ class MPRecipe(BasicNewsRecipe):
# special - finance # special - finance
#fin_articles = self.parse_fin_section('http://www.mpfinance.com/htm/Finance/' + dateStr + '/News/ea,eb,ecindex.htm') #fin_articles = self.parse_fin_section('http://www.mpfinance.com/htm/Finance/' + dateStr + '/News/ea,eb,ecindex.htm')
fin_articles = self.parse_fin_section('http://life.mingpao.com/cfm/dailynews2.cfm?Issue=' + dateStr + '&Category=nalea') #fin_articles = self.parse_fin_section('http://life.mingpao.com/cfm/dailynews2.cfm?Issue=' + dateStr + '&Category=nalea')
if fin_articles: #if fin_articles:
feeds.append((u'\u7d93\u6fdf Finance', fin_articles)) # feeds.append((u'\u7d93\u6fdf Finance', fin_articles))
for title, url in [('Tech News', 'http://news.mingpao.com/' + dateStr + '/naindex.htm'), for title, url, keystr in [(u'\u7d93\u6fdf Finance', 'http://life.mingpao.com/cfm/dailynews2.cfm?Issue=' + dateStr + '&Category=nalea', 'nal')]:
(u'\u9ad4\u80b2 Sport', 'http://news.mingpao.com/' + dateStr + '/spindex.htm')]: articles = self.parse_section2_txt(url, keystr)
articles = self.parse_section(url)
if articles: if articles:
feeds.append((title, articles)) feeds.append((title, articles))
#for title, url in [('Tech News', 'http://news.mingpao.com/' + dateStr + '/naindex.htm'),
# (u'\u9ad4\u80b2 Sport', 'http://news.mingpao.com/' + dateStr + '/spindex.htm')]:
# articles = self.parse_section(url)
# if articles:
# feeds.append((title, articles))
# special - entertainment # special - entertainment
ent_articles = self.parse_ent_section('http://ol.mingpao.com/cfm/star1.cfm') #ent_articles = self.parse_ent_section('http://ol.mingpao.com/cfm/star1.cfm')
if ent_articles: #if ent_articles:
feeds.append((u'\u5f71\u8996 Film/TV', ent_articles)) # feeds.append((u'\u5f71\u8996 Film/TV', ent_articles))
for title, url, keystr in [(u'\u5f71\u8996 Film/TV', 'http://life.mingpao.com/cfm/dailynews2.cfm?Issue=' + dateStr + '&Category=nalma', 'nal')
]:
articles = self.parse_section2_txt(url, keystr)
if articles:
feeds.append((title, articles))
if __InclPremium__ == True:
# parse column section articles directly from .txt files
for title, url, keystr in [(u'\u5c08\u6b04 Columns', 'http://life.mingpao.com/cfm/dailynews2.cfm?Issue=' + dateStr +'&Category=ncolumn', 'ncl')
]:
articles = self.parse_section2_txt(url, keystr)
if articles:
feeds.append((title, articles))
for title, url in [(u'\u526f\u520a Supplement', 'http://news.mingpao.com/' + dateStr + '/jaindex.htm'), for title, url in [(u'\u526f\u520a Supplement', 'http://news.mingpao.com/' + dateStr + '/jaindex.htm'),
(u'\u82f1\u6587 English', 'http://news.mingpao.com/' + dateStr + '/emindex.htm')]: (u'\u82f1\u6587 English', 'http://news.mingpao.com/' + dateStr + '/emindex.htm')]:
articles = self.parse_section(url) articles = self.parse_section(url)
if articles: if articles:
feeds.append((title, articles)) feeds.append((title, articles))
# special- columns
col_articles = self.parse_col_section('http://life.mingpao.com/cfm/dailynews2.cfm?Issue=' + dateStr +'&Category=ncolumn')
if col_articles:
feeds.append((u'\u5c08\u6b04 Columns', col_articles))
elif __Region__ == 'Vancouver': elif __Region__ == 'Vancouver':
for title, url in [(u'\u8981\u805e Headline', 'http://www.mingpaovan.com/htm/News/' + dateStr + '/VAindex.htm'), for title, url in [(u'\u8981\u805e Headline', 'http://www.mingpaovan.com/htm/News/' + dateStr + '/VAindex.htm'),
(u'\u52a0\u570b Canada', 'http://www.mingpaovan.com/htm/News/' + dateStr + '/VBindex.htm'), (u'\u52a0\u570b Canada', 'http://www.mingpaovan.com/htm/News/' + dateStr + '/VBindex.htm'),
@ -332,6 +380,16 @@ class MPRecipe(BasicNewsRecipe):
title = self.tag_to_string(a) title = self.tag_to_string(a)
url = a.get('href', False) url = a.get('href', False)
url = 'http://news.mingpao.com/' + dateStr + '/' +url url = 'http://news.mingpao.com/' + dateStr + '/' +url
# replace the url to the print-friendly version
if __ParsePFF__ == True:
if url.rfind('Redirect') <> -1 and __InclPremium__ == True:
url = re.sub(dateStr + '.*' + dateStr, dateStr, url)
url = re.sub('%2F.*%2F', '/', url)
title = title.replace(u'\u6536\u8cbb\u5167\u5bb9', '')
url = url.replace('%2Etxt', '_print.htm')
url = url.replace('%5F', '_')
else:
url = url.replace('.htm', '_print.htm')
if url not in included_urls and url.rfind('Redirect') == -1: if url not in included_urls and url.rfind('Redirect') == -1:
current_articles.append({'title': title, 'url': url, 'description':'', 'date':''}) current_articles.append({'title': title, 'url': url, 'description':'', 'date':''})
included_urls.append(url) included_urls.append(url)
@ -340,6 +398,8 @@ class MPRecipe(BasicNewsRecipe):
# parse from life.mingpao.com # parse from life.mingpao.com
def parse_section2(self, url, keystr): def parse_section2(self, url, keystr):
br = mechanize.Browser()
br.set_handle_redirect(False)
self.get_fetchdate() self.get_fetchdate()
soup = self.index_to_soup(url) soup = self.index_to_soup(url)
a = soup.findAll('a', href=True) a = soup.findAll('a', href=True)
@ -350,12 +410,34 @@ class MPRecipe(BasicNewsRecipe):
title = self.tag_to_string(i) title = self.tag_to_string(i)
url = 'http://life.mingpao.com/cfm/' + i.get('href', False) url = 'http://life.mingpao.com/cfm/' + i.get('href', False)
if (url not in included_urls) and (not url.rfind('.txt') == -1) and (not url.rfind(keystr) == -1): if (url not in included_urls) and (not url.rfind('.txt') == -1) and (not url.rfind(keystr) == -1):
url = url.replace('dailynews3.cfm', 'dailynews3a.cfm') # use printed version of the article try:
br.open_novisit(url)
url = url.replace('dailynews3.cfm', 'dailynews3a.cfm') # use printed version of the article
current_articles.append({'title': title, 'url': url, 'description': ''})
included_urls.append(url)
except:
print 'skipping a premium article'
current_articles.reverse()
return current_articles
# parse from text file of life.mingpao.com
    def parse_section2_txt(self, url, keystr):
        """Build the article list for one section of life.mingpao.com,
        rewriting each article link to its raw .txt source.

        url:    the section index page URL.
        keystr: substring an article link must contain to belong to this
                section (e.g. 'ncl' for the columns section).
        Returns a list of {'title', 'url', 'description'} dicts in page order.
        """
        # NOTE(review): return value unused — presumably called for its
        # side effect of establishing the fetch date; confirm.
        self.get_fetchdate()
        soup = self.index_to_soup(url)
        a = soup.findAll('a', href=True)
        # Scan the links back-to-front and reverse the result again at the
        # end: page order is preserved, but a URL that appears twice is
        # kept at its *later* position on the page.
        a.reverse()
        current_articles = []
        included_urls = []  # guards against duplicate links on the index page
        for i in a:
            title = self.tag_to_string(i)
            url = 'http://life.mingpao.com/cfm/' + i.get('href', False)
            # keep only .txt article links that belong to this section, once each
            if (url not in included_urls) and (not url.rfind('.txt') == -1) and (not url.rfind(keystr) == -1):
                url = url.replace('cfm/dailynews3.cfm?File=', 'ftp/Life3/') # use printed version of the article
                current_articles.append({'title': title, 'url': url, 'description': ''})
                included_urls.append(url)
        current_articles.reverse()
        return current_articles
# parse from www.mingpaovan.com # parse from www.mingpaovan.com
def parse_section3(self, url, baseUrl): def parse_section3(self, url, baseUrl):
self.get_fetchdate() self.get_fetchdate()
@ -438,6 +520,162 @@ class MPRecipe(BasicNewsRecipe):
current_articles.reverse() current_articles.reverse()
return current_articles return current_articles
# preprocess those .txt and javascript based files
    def preprocess_raw_html(self, raw_html, url):
        """Convert Ming Pao's non-HTML article sources into plain HTML.

        Two special source formats are rebuilt, selected by the URL:
        * '*_print.htm' pages — article data embedded in javascript
          variable assignments (heading1/heading2/content/photocontent);
        * 'ftp' .txt files — line-oriented text where '='-prefixed lines
          name image files and the first other non-empty line is the title.
        When __HiResImg__ is True, image references are additionally probed
        on the server and rewritten to .gif or '_'-prefixed hi-res variants.
        Returns the (possibly rebuilt) HTML string.
        """
        new_html = raw_html
        if url.rfind('ftp') <> -1 or url.rfind('_print.htm') <> -1:
            if url.rfind('_print.htm') <> -1:
                # javascript based file
                splitter = re.compile(r'\n')
                new_raw_html = '<html><head><title>Untitled</title></head>'
                new_raw_html = new_raw_html + '<body>'
                # Each interesting line is a javascript assignment; strip the
                # assignment prefix, quotes and trailing ';' to get the value.
                for item in splitter.split(raw_html):
                    if item.startswith('var heading1 ='):
                        heading = item.replace('var heading1 = \'', '')
                        heading = heading.replace('\'', '')
                        heading = heading.replace(';', '')
                        # div stays open: heading2 (or its absence) closes it below
                        new_raw_html = new_raw_html + '<div class="heading">' + heading
                    if item.startswith('var heading2 ='):
                        heading = item.replace('var heading2 = \'', '')
                        heading = heading.replace('\'', '')
                        heading = heading.replace(';', '')
                        if heading <> '':
                            new_raw_html = new_raw_html + '<br>' + heading + '</div>'
                        else:
                            new_raw_html = new_raw_html + '</div>'
                    if item.startswith('var content ='):
                        content = item.replace("var content = ", '')
                        content = content.replace('\'', '')
                        content = content.replace(';', '')
                        new_raw_html = new_raw_html + '<div class="content">' + content + '</div>'
                    if item.startswith('var photocontent ='):
                        photo = item.replace('var photocontent = \'', '')
                        photo = photo.replace('\'', '')
                        photo = photo.replace(';', '')
                        # flatten the table markup around the photos into line breaks
                        photo = photo.replace('<tr>', '')
                        photo = photo.replace('<td>', '')
                        photo = photo.replace('</tr>', '')
                        photo = photo.replace('</td>', '<br>')
                        photo = photo.replace('class="photo"', '')
                        new_raw_html = new_raw_html + '<div class="images">' + photo + '</div>'
                new_html = new_raw_html + '</body></html>'
            else:
                # .txt based file
                splitter = re.compile(r'\n') # Match non-digits
                new_raw_html = '<html><head><title>Untitled</title></head><body><div class="images">'
                # next_is_img_txt: the line following an '='-prefixed image line
                #                  (presumably its caption — confirm)
                # title_started/title_break_reached/met_article_start_char:
                #   state machine tracking title -> blank separator -> body
                next_is_img_txt = False
                title_started = False
                title_break_reached = False
                met_article_start_char = False
                for item in splitter.split(raw_html):
                    item = item.strip()
                    # if title already reached but break between title and content not yet found, record title_break_reached
                    if title_started == True and title_break_reached == False and item == '':
                        title_break_reached = True
                    # if title reached and title_break_reached and met_article_start_char == False and item is not empty
                    # start content
                    elif title_started == True and title_break_reached == True and met_article_start_char == False:
                        if item <> '':
                            met_article_start_char = True
                            new_raw_html = new_raw_html + '</div><div class="content"><p>' + item + '<p>\n'
                        #if item.startswith(u'\u3010'):
                        #    met_article_start_char = True
                        #    new_raw_html = new_raw_html + '</div><div class="content"><p>' + item + '<p>\n'
                    else:
                        if next_is_img_txt == False:
                            # '='-prefixed lines reference media files:
                            # '=@' movie link (skipped), '=?' .gif image,
                            # '==' and '=' .jpg images
                            if item.startswith("=@"):
                                print 'skip movie link'
                            elif item.startswith("=?"):
                                next_is_img_txt = True
                                new_raw_html += '<img src="' + str(item)[2:].strip() + '.gif" /><p>\n'
                            elif item.startswith('=='):
                                next_is_img_txt = True
                                if False:
                                    # TODO: check existence of .gif first
                                    newimg = '_' + item[2:].strip() + '.jpg'
                                    new_raw_html += '<img src="' + newimg + '" /><p>\n'
                                else:
                                    new_raw_html += '<img src="' + str(item)[2:].strip() + '.jpg" /><p>\n'
                            elif item.startswith('='):
                                next_is_img_txt = True
                                if False:
                                    # TODO: check existence of .gif first
                                    newimg = '_' + item[1:].strip() + '.jpg'
                                    new_raw_html += '<img src="' + newimg + '" /><p>\n'
                                else:
                                    new_raw_html += '<img src="' + str(item)[1:].strip() + '.jpg" /><p>\n'
                            else:
                                if next_is_img_txt == False and met_article_start_char == False:
                                    if item <> '':
                                        if title_started == False:
                                            #print 'Title started at ', item
                                            new_raw_html = new_raw_html + '</div><div class="heading">' + item + '\n'
                                            title_started = True
                                        else:
                                            new_raw_html = new_raw_html + item + '\n'
                                else:
                                    new_raw_html = new_raw_html + item + '<p>\n'
                        else:
                            # line after an image reference: append as-is
                            next_is_img_txt = False
                            new_raw_html = new_raw_html + item + '\n'
                new_html = new_raw_html + '</div></body></html>'
        #raw_html = raw_html.replace(u'<p>\u3010', u'\u3010')
        if __HiResImg__ == True:
            # TODO: add a _ in front of an image url
            if url.rfind('news.mingpao.com') > -1:
                imglist = re.findall('src="?.*?jpg"', new_html)
                br = mechanize.Browser()
                # redirects disabled so that a missing .gif raises instead of
                # landing on an error page
                br.set_handle_redirect(False)
                for img in imglist:
                    gifimg = img.replace('jpg"', 'gif"')
                    try:
                        # probe the .gif variant on the server; use it if present
                        br.open_novisit(url + "/../" + gifimg[5:len(gifimg)-1])
                        new_html = new_html.replace(img, gifimg)
                    except:
                        # no .gif: fall back to the '_'-prefixed hi-res jpg name
                        # find the location of the first _
                        pos = img.find('_')
                        if pos > -1:
                            # if found, insert _ after the first _
                            newimg = img[0:pos] + '_' + img[pos:]
                            new_html = new_html.replace(img, newimg)
                        else:
                            # if not found, insert _ after "
                            new_html = new_html.replace(img[1:], '"_' + img[1:])
            elif url.rfind('life.mingpao.com') > -1:
                # same probe-and-rewrite, for src quoted with single quotes
                imglist = re.findall('src=\'?.*?jpg\'', new_html)
                br = mechanize.Browser()
                br.set_handle_redirect(False)
                #print 'Img list: ', imglist, '\n'
                for img in imglist:
                    #print 'Found img: ', img
                    gifimg = img.replace('jpg\'', 'gif\'')
                    try:
                        gifurl = re.sub(r'dailynews.*txt', '', url)
                        br.open_novisit(gifurl + gifimg[5:len(gifimg)-1])
                        new_html = new_html.replace(img, gifimg)
                    except:
                        pos = img.rfind('/')
                        newimg = img[0:pos+1] + '_' + img[pos+1:]
                        new_html = new_html.replace(img, newimg)
                # repeat with src quoted by double quotes, for text parsed from src txt
                imglist = re.findall('src="?.*?jpg"', new_html)
                for img in imglist:
                    #print 'Found img: ', img
                    gifimg = img.replace('jpg"', 'gif"')
                    try:
                        #print 'url', url
                        pos = url.rfind('/')
                        gifurl = url[:pos+1]
                        #print 'try it:', gifurl + gifimg[5:len(gifimg)-1]
                        br.open_novisit(gifurl + gifimg[5:len(gifimg)-1])
                        new_html = new_html.replace(img, gifimg)
                    except:
                        pos = img.find('"')
                        newimg = img[0:pos+1] + '_' + img[pos+1:]
                        #print 'Use hi-res img', newimg
                        new_html = new_html.replace(img, newimg)
        return new_html
def preprocess_html(self, soup): def preprocess_html(self, soup):
for item in soup.findAll(style=True): for item in soup.findAll(style=True):
del item['style'] del item['style']
@ -446,78 +684,154 @@ class MPRecipe(BasicNewsRecipe):
for item in soup.findAll(stype=True): for item in soup.findAll(stype=True):
del item['absmiddle'] del item['absmiddle']
return soup return soup
def populate_article_metadata(self, article, soup, first):
# thumbnails shouldn't be available if using hi-res images
if __IncludeThumbnails__ and __HiResImg__ == False and first and hasattr(self, 'add_toc_thumbnail'):
img = soup.find('img')
if img is not None:
self.add_toc_thumbnail(article, img['src'])
try:
if __IncludeSummary__ and len(article.text_summary.strip()) == 0:
# look for content
articlebodies = soup.findAll('div',attrs={'id':'newscontent'})
if not articlebodies:
articlebodies = soup.findAll('div',attrs={'id':'newscontent01'})
if not articlebodies:
articlebodies = soup.findAll('div',attrs={'class':'content'})
if not articlebodies:
articlebodies = soup.findAll('div', attrs={'id':'font'})
if articlebodies:
for articlebody in articlebodies:
if articlebody:
# the text may or may not be enclosed in <p></p> tag
paras = articlebody.findAll('p')
if not paras:
paras = articlebody
textFound = False
for p in paras:
if not textFound:
summary_candidate = self.tag_to_string(p).strip()
summary_candidate = summary_candidate.replace(u'\u3010\u660e\u5831\u5c08\u8a0a\u3011', '', 1)
if len(summary_candidate) > 0:
article.summary = article.text_summary = summary_candidate
textFound = True
else:
# display a simple text
#article.summary = article.text_summary = u'\u66f4\u591a......'
# display word counts
counts = 0
articlebodies = soup.findAll('div',attrs={'id':'newscontent'})
if not articlebodies:
articlebodies = soup.findAll('div',attrs={'id':'newscontent01'})
if not articlebodies:
articlebodies = soup.findAll('div',attrs={'class':'content'})
if not articlebodies:
articlebodies = soup.findAll('div', attrs={'id':'font'})
if articlebodies:
for articlebody in articlebodies:
# the text may or may not be enclosed in <p></p> tag
paras = articlebody.findAll('p')
if not paras:
paras = articlebody
for p in paras:
summary_candidate = self.tag_to_string(p).strip()
counts += len(summary_candidate)
article.summary = article.text_summary = u'\uff08' + str(counts) + u'\u5b57\uff09'
except:
self.log("Error creating article descriptions")
return
# override from the one in version 0.8.31
def create_opf(self, feeds, dir=None): def create_opf(self, feeds, dir=None):
if dir is None: if dir is None:
dir = self.output_dir dir = self.output_dir
if __UseChineseTitle__ == True: title = self.short_title()
if __Region__ == 'Hong Kong': # change 1: allow our own flag to tell if a periodical is to be generated
title = u'\u660e\u5831 (\u9999\u6e2f)' # also use customed date instead of current time
elif __Region__ == 'Vancouver': if __MakePeriodical__ == False or self.output_profile.periodical_date_in_title:
title = u'\u660e\u5831 (\u6eab\u54e5\u83ef)'
elif __Region__ == 'Toronto':
title = u'\u660e\u5831 (\u591a\u502b\u591a)'
else:
title = self.short_title()
# if not generating a periodical, force date to apply in title
if __MakePeriodical__ == False:
title = title + ' ' + self.get_fetchformatteddate() title = title + ' ' + self.get_fetchformatteddate()
if True: # end of change 1
mi = MetaInformation(title, [self.publisher]) # change 2: __appname__ replaced by newspaper publisher
mi.publisher = self.publisher __appname__ = self.publisher
mi.author_sort = self.publisher mi = MetaInformation(title, [__appname__])
if __MakePeriodical__ == True: mi.publisher = __appname__
mi.publication_type = 'periodical:'+self.publication_type+':'+self.short_title() mi.author_sort = __appname__
else: # change 3: use __MakePeriodical__ flag to tell if a periodical should be generated
mi.publication_type = self.publication_type+':'+self.short_title() if __MakePeriodical__ == True:
#mi.timestamp = nowf() mi.publication_type = 'periodical:'+self.publication_type+':'+self.short_title()
mi.timestamp = self.get_dtlocal() else:
mi.comments = self.description mi.publication_type = self.publication_type+':'+self.short_title()
if not isinstance(mi.comments, unicode): #mi.publication_type = 'periodical:'+self.publication_type+':'+self.short_title()
mi.comments = mi.comments.decode('utf-8', 'replace') # change 4: in the following, all the nowf() are changed to adjusted time
#mi.pubdate = nowf() # This one doesn't matter
mi.pubdate = self.get_dtlocal() mi.timestamp = nowf()
opf_path = os.path.join(dir, 'index.opf') # change 5: skip listing the articles
ncx_path = os.path.join(dir, 'index.ncx') #article_titles, aseen = [], set()
opf = OPFCreator(dir, mi) #for f in feeds:
# Add mastheadImage entry to <guide> section # for a in f:
mp = getattr(self, 'masthead_path', None) # if a.title and a.title not in aseen:
if mp is not None and os.access(mp, os.R_OK): # aseen.add(a.title)
from calibre.ebooks.metadata.opf2 import Guide # article_titles.append(force_unicode(a.title, 'utf-8'))
ref = Guide.Reference(os.path.basename(self.masthead_path), os.getcwdu())
ref.type = 'masthead'
ref.title = 'Masthead Image'
opf.guide.append(ref)
manifest = [os.path.join(dir, 'feed_%d'%i) for i in range(len(feeds))] #mi.comments = self.description
manifest.append(os.path.join(dir, 'index.html')) #if not isinstance(mi.comments, unicode):
manifest.append(os.path.join(dir, 'index.ncx')) # mi.comments = mi.comments.decode('utf-8', 'replace')
#mi.comments += ('\n\n' + _('Articles in this issue: ') + '\n' +
# '\n\n'.join(article_titles))
# Get cover language = canonicalize_lang(self.language)
cpath = getattr(self, 'cover_path', None) if language is not None:
if cpath is None: mi.language = language
pf = open(os.path.join(dir, 'cover.jpg'), 'wb') # This one affects the pub date shown in kindle title
if self.default_cover(pf): #mi.pubdate = nowf()
cpath = pf.name # now appears to need the time field to be > 12.00noon as well
if cpath is not None and os.access(cpath, os.R_OK): mi.pubdate = datetime.datetime(int(self.get_fetchyear()), int(self.get_fetchmonth()), int(self.get_fetchday()), 12, 30, 0)
opf.cover = cpath opf_path = os.path.join(dir, 'index.opf')
manifest.append(cpath) ncx_path = os.path.join(dir, 'index.ncx')
# Get masthead opf = OPFCreator(dir, mi)
mpath = getattr(self, 'masthead_path', None) # Add mastheadImage entry to <guide> section
if mpath is not None and os.access(mpath, os.R_OK): mp = getattr(self, 'masthead_path', None)
manifest.append(mpath) if mp is not None and os.access(mp, os.R_OK):
from calibre.ebooks.metadata.opf2 import Guide
ref = Guide.Reference(os.path.basename(self.masthead_path), os.getcwdu())
ref.type = 'masthead'
ref.title = 'Masthead Image'
opf.guide.append(ref)
manifest = [os.path.join(dir, 'feed_%d'%i) for i in range(len(feeds))]
manifest.append(os.path.join(dir, 'index.html'))
manifest.append(os.path.join(dir, 'index.ncx'))
# Get cover
cpath = getattr(self, 'cover_path', None)
if cpath is None:
pf = open(os.path.join(dir, 'cover.jpg'), 'wb')
if self.default_cover(pf):
cpath = pf.name
if cpath is not None and os.access(cpath, os.R_OK):
opf.cover = cpath
manifest.append(cpath)
# Get masthead
mpath = getattr(self, 'masthead_path', None)
if mpath is not None and os.access(mpath, os.R_OK):
manifest.append(mpath)
opf.create_manifest_from_files_in(manifest)
for mani in opf.manifest:
if mani.path.endswith('.ncx'):
mani.id = 'ncx'
if mani.path.endswith('mastheadImage.jpg'):
mani.id = 'masthead-image'
entries = ['index.html']
toc = TOC(base_path=dir)
self.play_order_counter = 0
self.play_order_map = {}
opf.create_manifest_from_files_in(manifest)
for mani in opf.manifest:
if mani.path.endswith('.ncx'):
mani.id = 'ncx'
if mani.path.endswith('mastheadImage.jpg'):
mani.id = 'masthead-image'
entries = ['index.html']
toc = TOC(base_path=dir)
self.play_order_counter = 0
self.play_order_map = {}
def feed_index(num, parent): def feed_index(num, parent):
f = feeds[num] f = feeds[num]
@ -532,13 +846,16 @@ class MPRecipe(BasicNewsRecipe):
desc = None desc = None
else: else:
desc = self.description_limiter(desc) desc = self.description_limiter(desc)
tt = a.toc_thumbnail if a.toc_thumbnail else None
entries.append('%sindex.html'%adir) entries.append('%sindex.html'%adir)
po = self.play_order_map.get(entries[-1], None) po = self.play_order_map.get(entries[-1], None)
if po is None: if po is None:
self.play_order_counter += 1 self.play_order_counter += 1
po = self.play_order_counter po = self.play_order_counter
parent.add_item('%sindex.html'%adir, None, a.title if a.title else _('Untitled Article'), parent.add_item('%sindex.html'%adir, None,
play_order=po, author=auth, description=desc) a.title if a.title else _('Untitled Article'),
play_order=po, author=auth,
description=desc, toc_thumbnail=tt)
last = os.path.join(self.output_dir, ('%sindex.html'%adir).replace('/', os.sep)) last = os.path.join(self.output_dir, ('%sindex.html'%adir).replace('/', os.sep))
for sp in a.sub_pages: for sp in a.sub_pages:
prefix = os.path.commonprefix([opf_path, sp]) prefix = os.path.commonprefix([opf_path, sp])
@ -555,7 +872,7 @@ class MPRecipe(BasicNewsRecipe):
prefix = '/'.join('..'for i in range(2*len(re.findall(r'link\d+', last)))) prefix = '/'.join('..'for i in range(2*len(re.findall(r'link\d+', last))))
templ = self.navbar.generate(True, num, j, len(f), templ = self.navbar.generate(True, num, j, len(f),
not self.has_single_feed, not self.has_single_feed,
a.orig_url, self.publisher, prefix=prefix, a.orig_url, __appname__, prefix=prefix,
center=self.center_navbar) center=self.center_navbar)
elem = BeautifulSoup(templ.render(doctype='xhtml').decode('utf-8')).find('div') elem = BeautifulSoup(templ.render(doctype='xhtml').decode('utf-8')).find('div')
body.insert(len(body.contents), elem) body.insert(len(body.contents), elem)
@ -578,7 +895,7 @@ class MPRecipe(BasicNewsRecipe):
if not desc: if not desc:
desc = None desc = None
feed_index(i, toc.add_item('feed_%d/index.html'%i, None, feed_index(i, toc.add_item('feed_%d/index.html'%i, None,
f.title, play_order=po, description=desc, author=auth)) f.title, play_order=po, description=desc, author=auth))
else: else:
entries.append('feed_%d/index.html'%0) entries.append('feed_%d/index.html'%0)
@ -591,4 +908,5 @@ class MPRecipe(BasicNewsRecipe):
with nested(open(opf_path, 'wb'), open(ncx_path, 'wb')) as (opf_file, ncx_file): with nested(open(opf_path, 'wb'), open(ncx_path, 'wb')) as (opf_file, ncx_file):
opf.render(opf_file, ncx_file) opf.render(opf_file, ncx_file)

View File

@ -4,18 +4,41 @@ __copyright__ = '2010-2011, Eddie Lau'
# Region - Hong Kong, Vancouver, Toronto # Region - Hong Kong, Vancouver, Toronto
__Region__ = 'Vancouver' __Region__ = 'Vancouver'
# Users of Kindle 3 with limited system-level CJK support # Users of Kindle 3 with limited system-level CJK support
# please replace the following "True" with "False". # please replace the following "True" with "False". (Default: True)
__MakePeriodical__ = True __MakePeriodical__ = True
# Turn below to true if your device supports display of CJK titles # Turn below to True if your device supports display of CJK titles (Default: False)
__UseChineseTitle__ = False __UseChineseTitle__ = False
# Set it to False if you want to skip images # Set it to False if you want to skip images (Default: True)
__KeepImages__ = True __KeepImages__ = True
# (HK only) Turn below to true if you wish to use life.mingpao.com as the main article source # Set it to True if you want to include a summary in Kindle's article view (Default: False)
__IncludeSummary__ = False
# Set it to True if you want thumbnail images in Kindle's article view (Default: True)
__IncludeThumbnails__ = True
# (HK only) Turn below to True if you wish to use life.mingpao.com as the main article source (Default: True)
__UseLife__ = True __UseLife__ = True
# (HK only) It is to disable premium content (Default: False)
__InclPremium__ = False
# (HK only) Turn below to True if you wish to parse articles in news.mingpao.com with their printer-friendly formats (Default: True)
__ParsePFF__ = True
# (HK only) Turn below to True if you wish hi-res images (Default: False)
__HiResImg__ = False
# Override the date returned by the program if specifying a YYYYMMDD below
__Date__ = ''
''' '''
Change Log: Change Log:
2011/12/18: update the overridden create_odf(.) routine with the one from Calibre version 0.8.31. Move __UseChineseTitle__ usage away
from create_odf(.). Optional support of text_summary and thumbnail images in Kindle's article view. Start new day
download of Hong Kong Mingpao at 4.30am. Set the actual publication date shown on kindle device.
2011/12/01: take care of situation that in txt source parsing, the article content does start with special character u'\u3010'
2011/10/21: fix a bug that hi-res img is unavailable in pages parsed from source txt
2011/10/19: fix a bug in txt source parsing
2011/10/17: disable fetching of premium content, also improved txt source parsing
2011/10/04: option to get hi-res photos for the articles
2011/09/21: fetching "column" section is made optional.
2011/09/18: parse "column" section stuff from source text file directly.
2011/09/07: disable "column" section as it is no longer offered free.
2011/06/26: add fetching Vancouver and Toronto versions of the paper, also provide captions for images using life.mingpao fetch source 2011/06/26: add fetching Vancouver and Toronto versions of the paper, also provide captions for images using life.mingpao fetch source
provide options to remove all images in the file provide options to remove all images in the file
2011/05/12: switch the main parse source to life.mingpao.com, which has more photos on the article pages 2011/05/12: switch the main parse source to life.mingpao.com, which has more photos on the article pages
@ -37,30 +60,39 @@ Change Log:
2010/10/31: skip repeated articles in section pages 2010/10/31: skip repeated articles in section pages
''' '''
import os, datetime, re from calibre import (browser, iswindows, __appname__, force_unicode, preferred_encoding, as_unicode)
from calibre.utils.date import now as nowf
import os, datetime, re, mechanize
from calibre.web.feeds.recipes import BasicNewsRecipe from calibre.web.feeds.recipes import BasicNewsRecipe
from contextlib import nested from contextlib import nested
from calibre.ebooks.BeautifulSoup import BeautifulSoup from calibre.ebooks.BeautifulSoup import BeautifulSoup
from calibre.ebooks.metadata.opf2 import OPFCreator from calibre.ebooks.metadata.opf2 import OPFCreator
from calibre.ebooks.metadata.toc import TOC from calibre.ebooks.metadata.toc import TOC
from calibre.ebooks.metadata import MetaInformation from calibre.ebooks.metadata import MetaInformation
from calibre.utils.localization import canonicalize_lang
# MAIN CLASS # MAIN CLASS
class MPRecipe(BasicNewsRecipe): class MPRecipe(BasicNewsRecipe):
if __Region__ == 'Hong Kong': if __Region__ == 'Hong Kong':
title = 'Ming Pao - Hong Kong' if __UseChineseTitle__ == True:
title = u'\u660e\u5831 (\u9999\u6e2f)'
else:
title = 'Ming Pao - Hong Kong'
description = 'Hong Kong Chinese Newspaper (http://news.mingpao.com)' description = 'Hong Kong Chinese Newspaper (http://news.mingpao.com)'
category = 'Chinese, News, Hong Kong' category = 'Chinese, News, Hong Kong'
extra_css = 'img {display: block; margin-left: auto; margin-right: auto; margin-top: 10px; margin-bottom: 10px;} font>b {font-size:200%; font-weight:bold;}' extra_css = 'img {display: block; margin-left: auto; margin-right: auto; margin-top: 10px; margin-bottom: 10px; max-height:90%;} font>b {font-size:200%; font-weight:bold;} div[class=heading] {font-size:200%; font-weight:bold;} div[class=images] {font-size:50%;}'
masthead_url = 'http://news.mingpao.com/image/portals_top_logo_news.gif' masthead_url = 'http://news.mingpao.com/image/portals_top_logo_news.gif'
keep_only_tags = [dict(name='h1'), keep_only_tags = [dict(name='h1'),
dict(name='font', attrs={'style':['font-size:14pt; line-height:160%;']}), # for entertainment page title dict(name='font', attrs={'style':['font-size:14pt; line-height:160%;']}), # for entertainment page title
dict(name='font', attrs={'color':['AA0000']}), # for column articles title dict(name='font', attrs={'color':['AA0000']}), # for column articles title
dict(attrs={'class':['heading']}), # for heading from txt
dict(attrs={'id':['newscontent']}), # entertainment and column page content dict(attrs={'id':['newscontent']}), # entertainment and column page content
dict(attrs={'id':['newscontent01','newscontent02']}), dict(attrs={'id':['newscontent01','newscontent02']}),
dict(attrs={'class':['content']}), # for content from txt
dict(attrs={'class':['photo']}), dict(attrs={'class':['photo']}),
dict(name='table', attrs={'width':['100%'], 'border':['0'], 'cellspacing':['5'], 'cellpadding':['0']}), # content in printed version of life.mingpao.com dict(name='table', attrs={'width':['100%'], 'border':['0'], 'cellspacing':['5'], 'cellpadding':['0']}), # content in printed version of life.mingpao.com
dict(name='img', attrs={'width':['180'], 'alt':['按圖放大']}) # images for source from life.mingpao.com dict(name='img', attrs={'width':['180'], 'alt':['????']}), # images for source from life.mingpao.com
dict(attrs={'class':['images']}) # for images from txt
] ]
if __KeepImages__: if __KeepImages__:
remove_tags = [dict(name='style'), remove_tags = [dict(name='style'),
@ -90,7 +122,10 @@ class MPRecipe(BasicNewsRecipe):
lambda match: "</b>") lambda match: "</b>")
] ]
elif __Region__ == 'Vancouver': elif __Region__ == 'Vancouver':
title = 'Ming Pao - Vancouver' if __UseChineseTitle__ == True:
title = u'\u660e\u5831 (\u6eab\u54e5\u83ef)'
else:
title = 'Ming Pao - Vancouver'
description = 'Vancouver Chinese Newspaper (http://www.mingpaovan.com)' description = 'Vancouver Chinese Newspaper (http://www.mingpaovan.com)'
category = 'Chinese, News, Vancouver' category = 'Chinese, News, Vancouver'
extra_css = 'img {display: block; margin-left: auto; margin-right: auto; margin-top: 10px; margin-bottom: 10px;} b>font {font-size:200%; font-weight:bold;}' extra_css = 'img {display: block; margin-left: auto; margin-right: auto; margin-top: 10px; margin-bottom: 10px;} b>font {font-size:200%; font-weight:bold;}'
@ -108,7 +143,10 @@ class MPRecipe(BasicNewsRecipe):
lambda match: ''), lambda match: ''),
] ]
elif __Region__ == 'Toronto': elif __Region__ == 'Toronto':
title = 'Ming Pao - Toronto' if __UseChineseTitle__ == True:
title = u'\u660e\u5831 (\u591a\u502b\u591a)'
else:
title = 'Ming Pao - Toronto'
description = 'Toronto Chinese Newspaper (http://www.mingpaotor.com)' description = 'Toronto Chinese Newspaper (http://www.mingpaotor.com)'
category = 'Chinese, News, Toronto' category = 'Chinese, News, Toronto'
extra_css = 'img {display: block; margin-left: auto; margin-right: auto; margin-top: 10px; margin-bottom: 10px;} b>font {font-size:200%; font-weight:bold;}' extra_css = 'img {display: block; margin-left: auto; margin-right: auto; margin-top: 10px; margin-bottom: 10px;} b>font {font-size:200%; font-weight:bold;}'
@ -139,49 +177,12 @@ class MPRecipe(BasicNewsRecipe):
conversion_options = {'linearize_tables':True} conversion_options = {'linearize_tables':True}
timefmt = '' timefmt = ''
def image_url_processor(cls, baseurl, url):
# trick: break the url at the first occurance of digit, add an additional
# '_' at the front
# not working, may need to move this to preprocess_html() method
# minIdx = 10000
# i0 = url.find('0')
# if i0 >= 0 and i0 < minIdx:
# minIdx = i0
# i1 = url.find('1')
# if i1 >= 0 and i1 < minIdx:
# minIdx = i1
# i2 = url.find('2')
# if i2 >= 0 and i2 < minIdx:
# minIdx = i2
# i3 = url.find('3')
# if i3 >= 0 and i0 < minIdx:
# minIdx = i3
# i4 = url.find('4')
# if i4 >= 0 and i4 < minIdx:
# minIdx = i4
# i5 = url.find('5')
# if i5 >= 0 and i5 < minIdx:
# minIdx = i5
# i6 = url.find('6')
# if i6 >= 0 and i6 < minIdx:
# minIdx = i6
# i7 = url.find('7')
# if i7 >= 0 and i7 < minIdx:
# minIdx = i7
# i8 = url.find('8')
# if i8 >= 0 and i8 < minIdx:
# minIdx = i8
# i9 = url.find('9')
# if i9 >= 0 and i9 < minIdx:
# minIdx = i9
return url
def get_dtlocal(self): def get_dtlocal(self):
dt_utc = datetime.datetime.utcnow() dt_utc = datetime.datetime.utcnow()
if __Region__ == 'Hong Kong': if __Region__ == 'Hong Kong':
# convert UTC to local hk time - at HKT 5.30am, all news are available # convert UTC to local hk time - at HKT 4.30am, all news are available
dt_local = dt_utc + datetime.timedelta(8.0/24) - datetime.timedelta(5.5/24) dt_local = dt_utc + datetime.timedelta(8.0/24) - datetime.timedelta(4.5/24)
# dt_local = dt_utc.astimezone(pytz.timezone('Asia/Hong_Kong')) - datetime.timedelta(5.5/24) # dt_local = dt_utc.astimezone(pytz.timezone('Asia/Hong_Kong')) - datetime.timedelta(4.5/24)
elif __Region__ == 'Vancouver': elif __Region__ == 'Vancouver':
# convert UTC to local Vancouver time - at PST time 5.30am, all news are available # convert UTC to local Vancouver time - at PST time 5.30am, all news are available
dt_local = dt_utc + datetime.timedelta(-8.0/24) - datetime.timedelta(5.5/24) dt_local = dt_utc + datetime.timedelta(-8.0/24) - datetime.timedelta(5.5/24)
@ -193,13 +194,34 @@ class MPRecipe(BasicNewsRecipe):
return dt_local return dt_local
def get_fetchdate(self): def get_fetchdate(self):
return self.get_dtlocal().strftime("%Y%m%d") if __Date__ <> '':
return __Date__
else:
return self.get_dtlocal().strftime("%Y%m%d")
def get_fetchformatteddate(self): def get_fetchformatteddate(self):
return self.get_dtlocal().strftime("%Y-%m-%d") if __Date__ <> '':
return __Date__[0:4]+'-'+__Date__[4:6]+'-'+__Date__[6:8]
else:
return self.get_dtlocal().strftime("%Y-%m-%d")
def get_fetchyear(self):
if __Date__ <> '':
return __Date__[0:4]
else:
return self.get_dtlocal().strftime("%Y")
def get_fetchmonth(self):
if __Date__ <> '':
return __Date__[4:6]
else:
return self.get_dtlocal().strftime("%m")
def get_fetchday(self): def get_fetchday(self):
return self.get_dtlocal().strftime("%d") if __Date__ <> '':
return __Date__[6:8]
else:
return self.get_dtlocal().strftime("%d")
def get_cover_url(self): def get_cover_url(self):
if __Region__ == 'Hong Kong': if __Region__ == 'Hong Kong':
@ -230,12 +252,23 @@ class MPRecipe(BasicNewsRecipe):
(u'\u570b\u969b World', 'http://life.mingpao.com/cfm/dailynews2.cfm?Issue=' + dateStr +'&Category=nalta', 'nal'), (u'\u570b\u969b World', 'http://life.mingpao.com/cfm/dailynews2.cfm?Issue=' + dateStr +'&Category=nalta', 'nal'),
(u'\u7d93\u6fdf Finance', 'http://life.mingpao.com/cfm/dailynews2.cfm?Issue=' + dateStr + '&Category=nalea', 'nal'), (u'\u7d93\u6fdf Finance', 'http://life.mingpao.com/cfm/dailynews2.cfm?Issue=' + dateStr + '&Category=nalea', 'nal'),
(u'\u9ad4\u80b2 Sport', 'http://life.mingpao.com/cfm/dailynews2.cfm?Issue=' + dateStr + '&Category=nalsp', 'nal'), (u'\u9ad4\u80b2 Sport', 'http://life.mingpao.com/cfm/dailynews2.cfm?Issue=' + dateStr + '&Category=nalsp', 'nal'),
(u'\u5f71\u8996 Film/TV', 'http://life.mingpao.com/cfm/dailynews2.cfm?Issue=' + dateStr + '&Category=nalma', 'nal'), (u'\u5f71\u8996 Film/TV', 'http://life.mingpao.com/cfm/dailynews2.cfm?Issue=' + dateStr + '&Category=nalma', 'nal')
(u'\u5c08\u6b04 Columns', 'http://life.mingpao.com/cfm/dailynews2.cfm?Issue=' + dateStr +'&Category=ncolumn', 'ncl')]: ]:
articles = self.parse_section2(url, keystr) if __InclPremium__ == True:
articles = self.parse_section2_txt(url, keystr)
else:
articles = self.parse_section2(url, keystr)
if articles: if articles:
feeds.append((title, articles)) feeds.append((title, articles))
if __InclPremium__ == True:
# parse column section articles directly from .txt files
for title, url, keystr in [(u'\u5c08\u6b04 Columns', 'http://life.mingpao.com/cfm/dailynews2.cfm?Issue=' + dateStr +'&Category=ncolumn', 'ncl')
]:
articles = self.parse_section2_txt(url, keystr)
if articles:
feeds.append((title, articles))
for title, url in [(u'\u526f\u520a Supplement', 'http://news.mingpao.com/' + dateStr + '/jaindex.htm'), for title, url in [(u'\u526f\u520a Supplement', 'http://news.mingpao.com/' + dateStr + '/jaindex.htm'),
(u'\u82f1\u6587 English', 'http://news.mingpao.com/' + dateStr + '/emindex.htm')]: (u'\u82f1\u6587 English', 'http://news.mingpao.com/' + dateStr + '/emindex.htm')]:
articles = self.parse_section(url) articles = self.parse_section(url)
@ -244,15 +277,16 @@ class MPRecipe(BasicNewsRecipe):
else: else:
for title, url in [(u'\u8981\u805e Headline', 'http://news.mingpao.com/' + dateStr + '/gaindex.htm'), for title, url in [(u'\u8981\u805e Headline', 'http://news.mingpao.com/' + dateStr + '/gaindex.htm'),
(u'\u6e2f\u805e Local', 'http://news.mingpao.com/' + dateStr + '/gbindex.htm'), (u'\u6e2f\u805e Local', 'http://news.mingpao.com/' + dateStr + '/gbindex.htm'),
(u'\u6559\u80b2 Education', 'http://news.mingpao.com/' + dateStr + '/gfindex.htm')]: (u'\u6559\u80b2 Education', 'http://news.mingpao.com/' + dateStr + '/gfindex.htm'),
(u'\u793e\u8a55/\u7b46\u9663 Editorial', 'http://news.mingpao.com/' + dateStr + '/mrindex.htm')]:
articles = self.parse_section(url) articles = self.parse_section(url)
if articles: if articles:
feeds.append((title, articles)) feeds.append((title, articles))
# special- editorial # special- editorial
ed_articles = self.parse_ed_section('http://life.mingpao.com/cfm/dailynews2.cfm?Issue=' + dateStr +'&Category=nalmr') #ed_articles = self.parse_ed_section('http://life.mingpao.com/cfm/dailynews2.cfm?Issue=' + dateStr +'&Category=nalmr')
if ed_articles: #if ed_articles:
feeds.append((u'\u793e\u8a55/\u7b46\u9663 Editorial', ed_articles)) # feeds.append((u'\u793e\u8a55/\u7b46\u9663 Editorial', ed_articles))
for title, url in [(u'\u8ad6\u58c7 Forum', 'http://news.mingpao.com/' + dateStr + '/faindex.htm'), for title, url in [(u'\u8ad6\u58c7 Forum', 'http://news.mingpao.com/' + dateStr + '/faindex.htm'),
(u'\u4e2d\u570b China', 'http://news.mingpao.com/' + dateStr + '/caindex.htm'), (u'\u4e2d\u570b China', 'http://news.mingpao.com/' + dateStr + '/caindex.htm'),
@ -263,32 +297,46 @@ class MPRecipe(BasicNewsRecipe):
# special - finance # special - finance
#fin_articles = self.parse_fin_section('http://www.mpfinance.com/htm/Finance/' + dateStr + '/News/ea,eb,ecindex.htm') #fin_articles = self.parse_fin_section('http://www.mpfinance.com/htm/Finance/' + dateStr + '/News/ea,eb,ecindex.htm')
fin_articles = self.parse_fin_section('http://life.mingpao.com/cfm/dailynews2.cfm?Issue=' + dateStr + '&Category=nalea') #fin_articles = self.parse_fin_section('http://life.mingpao.com/cfm/dailynews2.cfm?Issue=' + dateStr + '&Category=nalea')
if fin_articles: #if fin_articles:
feeds.append((u'\u7d93\u6fdf Finance', fin_articles)) # feeds.append((u'\u7d93\u6fdf Finance', fin_articles))
for title, url in [('Tech News', 'http://news.mingpao.com/' + dateStr + '/naindex.htm'), for title, url, keystr in [(u'\u7d93\u6fdf Finance', 'http://life.mingpao.com/cfm/dailynews2.cfm?Issue=' + dateStr + '&Category=nalea', 'nal')]:
(u'\u9ad4\u80b2 Sport', 'http://news.mingpao.com/' + dateStr + '/spindex.htm')]: articles = self.parse_section2_txt(url, keystr)
articles = self.parse_section(url)
if articles: if articles:
feeds.append((title, articles)) feeds.append((title, articles))
#for title, url in [('Tech News', 'http://news.mingpao.com/' + dateStr + '/naindex.htm'),
# (u'\u9ad4\u80b2 Sport', 'http://news.mingpao.com/' + dateStr + '/spindex.htm')]:
# articles = self.parse_section(url)
# if articles:
# feeds.append((title, articles))
# special - entertainment # special - entertainment
ent_articles = self.parse_ent_section('http://ol.mingpao.com/cfm/star1.cfm') #ent_articles = self.parse_ent_section('http://ol.mingpao.com/cfm/star1.cfm')
if ent_articles: #if ent_articles:
feeds.append((u'\u5f71\u8996 Film/TV', ent_articles)) # feeds.append((u'\u5f71\u8996 Film/TV', ent_articles))
for title, url, keystr in [(u'\u5f71\u8996 Film/TV', 'http://life.mingpao.com/cfm/dailynews2.cfm?Issue=' + dateStr + '&Category=nalma', 'nal')
]:
articles = self.parse_section2_txt(url, keystr)
if articles:
feeds.append((title, articles))
if __InclPremium__ == True:
# parse column section articles directly from .txt files
for title, url, keystr in [(u'\u5c08\u6b04 Columns', 'http://life.mingpao.com/cfm/dailynews2.cfm?Issue=' + dateStr +'&Category=ncolumn', 'ncl')
]:
articles = self.parse_section2_txt(url, keystr)
if articles:
feeds.append((title, articles))
for title, url in [(u'\u526f\u520a Supplement', 'http://news.mingpao.com/' + dateStr + '/jaindex.htm'), for title, url in [(u'\u526f\u520a Supplement', 'http://news.mingpao.com/' + dateStr + '/jaindex.htm'),
(u'\u82f1\u6587 English', 'http://news.mingpao.com/' + dateStr + '/emindex.htm')]: (u'\u82f1\u6587 English', 'http://news.mingpao.com/' + dateStr + '/emindex.htm')]:
articles = self.parse_section(url) articles = self.parse_section(url)
if articles: if articles:
feeds.append((title, articles)) feeds.append((title, articles))
# special- columns
col_articles = self.parse_col_section('http://life.mingpao.com/cfm/dailynews2.cfm?Issue=' + dateStr +'&Category=ncolumn')
if col_articles:
feeds.append((u'\u5c08\u6b04 Columns', col_articles))
elif __Region__ == 'Vancouver': elif __Region__ == 'Vancouver':
for title, url in [(u'\u8981\u805e Headline', 'http://www.mingpaovan.com/htm/News/' + dateStr + '/VAindex.htm'), for title, url in [(u'\u8981\u805e Headline', 'http://www.mingpaovan.com/htm/News/' + dateStr + '/VAindex.htm'),
(u'\u52a0\u570b Canada', 'http://www.mingpaovan.com/htm/News/' + dateStr + '/VBindex.htm'), (u'\u52a0\u570b Canada', 'http://www.mingpaovan.com/htm/News/' + dateStr + '/VBindex.htm'),
@ -332,6 +380,16 @@ class MPRecipe(BasicNewsRecipe):
title = self.tag_to_string(a) title = self.tag_to_string(a)
url = a.get('href', False) url = a.get('href', False)
url = 'http://news.mingpao.com/' + dateStr + '/' +url url = 'http://news.mingpao.com/' + dateStr + '/' +url
# replace the url to the print-friendly version
if __ParsePFF__ == True:
if url.rfind('Redirect') <> -1 and __InclPremium__ == True:
url = re.sub(dateStr + '.*' + dateStr, dateStr, url)
url = re.sub('%2F.*%2F', '/', url)
title = title.replace(u'\u6536\u8cbb\u5167\u5bb9', '')
url = url.replace('%2Etxt', '_print.htm')
url = url.replace('%5F', '_')
else:
url = url.replace('.htm', '_print.htm')
if url not in included_urls and url.rfind('Redirect') == -1: if url not in included_urls and url.rfind('Redirect') == -1:
current_articles.append({'title': title, 'url': url, 'description':'', 'date':''}) current_articles.append({'title': title, 'url': url, 'description':'', 'date':''})
included_urls.append(url) included_urls.append(url)
@ -340,6 +398,8 @@ class MPRecipe(BasicNewsRecipe):
# parse from life.mingpao.com # parse from life.mingpao.com
def parse_section2(self, url, keystr): def parse_section2(self, url, keystr):
br = mechanize.Browser()
br.set_handle_redirect(False)
self.get_fetchdate() self.get_fetchdate()
soup = self.index_to_soup(url) soup = self.index_to_soup(url)
a = soup.findAll('a', href=True) a = soup.findAll('a', href=True)
@ -350,12 +410,34 @@ class MPRecipe(BasicNewsRecipe):
title = self.tag_to_string(i) title = self.tag_to_string(i)
url = 'http://life.mingpao.com/cfm/' + i.get('href', False) url = 'http://life.mingpao.com/cfm/' + i.get('href', False)
if (url not in included_urls) and (not url.rfind('.txt') == -1) and (not url.rfind(keystr) == -1): if (url not in included_urls) and (not url.rfind('.txt') == -1) and (not url.rfind(keystr) == -1):
url = url.replace('dailynews3.cfm', 'dailynews3a.cfm') # use printed version of the article try:
br.open_novisit(url)
url = url.replace('dailynews3.cfm', 'dailynews3a.cfm') # use printed version of the article
current_articles.append({'title': title, 'url': url, 'description': ''})
included_urls.append(url)
except:
print 'skipping a premium article'
current_articles.reverse()
return current_articles
# parse from text file of life.mingpao.com
def parse_section2_txt(self, url, keystr):
self.get_fetchdate()
soup = self.index_to_soup(url)
a = soup.findAll('a', href=True)
a.reverse()
current_articles = []
included_urls = []
for i in a:
title = self.tag_to_string(i)
url = 'http://life.mingpao.com/cfm/' + i.get('href', False)
if (url not in included_urls) and (not url.rfind('.txt') == -1) and (not url.rfind(keystr) == -1):
url = url.replace('cfm/dailynews3.cfm?File=', 'ftp/Life3/') # use printed version of the article
current_articles.append({'title': title, 'url': url, 'description': ''}) current_articles.append({'title': title, 'url': url, 'description': ''})
included_urls.append(url) included_urls.append(url)
current_articles.reverse() current_articles.reverse()
return current_articles return current_articles
# parse from www.mingpaovan.com # parse from www.mingpaovan.com
def parse_section3(self, url, baseUrl): def parse_section3(self, url, baseUrl):
self.get_fetchdate() self.get_fetchdate()
@ -438,6 +520,162 @@ class MPRecipe(BasicNewsRecipe):
current_articles.reverse() current_articles.reverse()
return current_articles return current_articles
# preprocess those .txt and javascript based files
def preprocess_raw_html(self, raw_html, url):
new_html = raw_html
if url.rfind('ftp') <> -1 or url.rfind('_print.htm') <> -1:
if url.rfind('_print.htm') <> -1:
# javascript based file
splitter = re.compile(r'\n')
new_raw_html = '<html><head><title>Untitled</title></head>'
new_raw_html = new_raw_html + '<body>'
for item in splitter.split(raw_html):
if item.startswith('var heading1 ='):
heading = item.replace('var heading1 = \'', '')
heading = heading.replace('\'', '')
heading = heading.replace(';', '')
new_raw_html = new_raw_html + '<div class="heading">' + heading
if item.startswith('var heading2 ='):
heading = item.replace('var heading2 = \'', '')
heading = heading.replace('\'', '')
heading = heading.replace(';', '')
if heading <> '':
new_raw_html = new_raw_html + '<br>' + heading + '</div>'
else:
new_raw_html = new_raw_html + '</div>'
if item.startswith('var content ='):
content = item.replace("var content = ", '')
content = content.replace('\'', '')
content = content.replace(';', '')
new_raw_html = new_raw_html + '<div class="content">' + content + '</div>'
if item.startswith('var photocontent ='):
photo = item.replace('var photocontent = \'', '')
photo = photo.replace('\'', '')
photo = photo.replace(';', '')
photo = photo.replace('<tr>', '')
photo = photo.replace('<td>', '')
photo = photo.replace('</tr>', '')
photo = photo.replace('</td>', '<br>')
photo = photo.replace('class="photo"', '')
new_raw_html = new_raw_html + '<div class="images">' + photo + '</div>'
new_html = new_raw_html + '</body></html>'
else:
# .txt based file
splitter = re.compile(r'\n') # Match non-digits
new_raw_html = '<html><head><title>Untitled</title></head><body><div class="images">'
next_is_img_txt = False
title_started = False
title_break_reached = False
met_article_start_char = False
for item in splitter.split(raw_html):
item = item.strip()
# if title already reached but break between title and content not yet found, record title_break_reached
if title_started == True and title_break_reached == False and item == '':
title_break_reached = True
# if title reached and title_break_reached and met_article_start_char == False and item is not empty
# start content
elif title_started == True and title_break_reached == True and met_article_start_char == False:
if item <> '':
met_article_start_char = True
new_raw_html = new_raw_html + '</div><div class="content"><p>' + item + '<p>\n'
#if item.startswith(u'\u3010'):
# met_article_start_char = True
# new_raw_html = new_raw_html + '</div><div class="content"><p>' + item + '<p>\n'
else:
if next_is_img_txt == False:
if item.startswith("=@"):
print 'skip movie link'
elif item.startswith("=?"):
next_is_img_txt = True
new_raw_html += '<img src="' + str(item)[2:].strip() + '.gif" /><p>\n'
elif item.startswith('=='):
next_is_img_txt = True
if False:
# TODO: check existence of .gif first
newimg = '_' + item[2:].strip() + '.jpg'
new_raw_html += '<img src="' + newimg + '" /><p>\n'
else:
new_raw_html += '<img src="' + str(item)[2:].strip() + '.jpg" /><p>\n'
elif item.startswith('='):
next_is_img_txt = True
if False:
# TODO: check existence of .gif first
newimg = '_' + item[1:].strip() + '.jpg'
new_raw_html += '<img src="' + newimg + '" /><p>\n'
else:
new_raw_html += '<img src="' + str(item)[1:].strip() + '.jpg" /><p>\n'
else:
if next_is_img_txt == False and met_article_start_char == False:
if item <> '':
if title_started == False:
#print 'Title started at ', item
new_raw_html = new_raw_html + '</div><div class="heading">' + item + '\n'
title_started = True
else:
new_raw_html = new_raw_html + item + '\n'
else:
new_raw_html = new_raw_html + item + '<p>\n'
else:
next_is_img_txt = False
new_raw_html = new_raw_html + item + '\n'
new_html = new_raw_html + '</div></body></html>'
#raw_html = raw_html.replace(u'<p>\u3010', u'\u3010')
if __HiResImg__ == True:
# TODO: add a _ in front of an image url
if url.rfind('news.mingpao.com') > -1:
imglist = re.findall('src="?.*?jpg"', new_html)
br = mechanize.Browser()
br.set_handle_redirect(False)
for img in imglist:
gifimg = img.replace('jpg"', 'gif"')
try:
br.open_novisit(url + "/../" + gifimg[5:len(gifimg)-1])
new_html = new_html.replace(img, gifimg)
except:
# find the location of the first _
pos = img.find('_')
if pos > -1:
# if found, insert _ after the first _
newimg = img[0:pos] + '_' + img[pos:]
new_html = new_html.replace(img, newimg)
else:
# if not found, insert _ after "
new_html = new_html.replace(img[1:], '"_' + img[1:])
elif url.rfind('life.mingpao.com') > -1:
imglist = re.findall('src=\'?.*?jpg\'', new_html)
br = mechanize.Browser()
br.set_handle_redirect(False)
#print 'Img list: ', imglist, '\n'
for img in imglist:
#print 'Found img: ', img
gifimg = img.replace('jpg\'', 'gif\'')
try:
gifurl = re.sub(r'dailynews.*txt', '', url)
br.open_novisit(gifurl + gifimg[5:len(gifimg)-1])
new_html = new_html.replace(img, gifimg)
except:
pos = img.rfind('/')
newimg = img[0:pos+1] + '_' + img[pos+1:]
new_html = new_html.replace(img, newimg)
# repeat with src quoted by double quotes, for text parsed from src txt
imglist = re.findall('src="?.*?jpg"', new_html)
for img in imglist:
#print 'Found img: ', img
gifimg = img.replace('jpg"', 'gif"')
try:
#print 'url', url
pos = url.rfind('/')
gifurl = url[:pos+1]
#print 'try it:', gifurl + gifimg[5:len(gifimg)-1]
br.open_novisit(gifurl + gifimg[5:len(gifimg)-1])
new_html = new_html.replace(img, gifimg)
except:
pos = img.find('"')
newimg = img[0:pos+1] + '_' + img[pos+1:]
#print 'Use hi-res img', newimg
new_html = new_html.replace(img, newimg)
return new_html
def preprocess_html(self, soup): def preprocess_html(self, soup):
for item in soup.findAll(style=True): for item in soup.findAll(style=True):
del item['style'] del item['style']
@ -446,78 +684,154 @@ class MPRecipe(BasicNewsRecipe):
for item in soup.findAll(stype=True): for item in soup.findAll(stype=True):
del item['absmiddle'] del item['absmiddle']
return soup return soup
def populate_article_metadata(self, article, soup, first):
# thumbnails shouldn't be available if using hi-res images
if __IncludeThumbnails__ and __HiResImg__ == False and first and hasattr(self, 'add_toc_thumbnail'):
img = soup.find('img')
if img is not None:
self.add_toc_thumbnail(article, img['src'])
try:
if __IncludeSummary__ and len(article.text_summary.strip()) == 0:
# look for content
articlebodies = soup.findAll('div',attrs={'id':'newscontent'})
if not articlebodies:
articlebodies = soup.findAll('div',attrs={'id':'newscontent01'})
if not articlebodies:
articlebodies = soup.findAll('div',attrs={'class':'content'})
if not articlebodies:
articlebodies = soup.findAll('div', attrs={'id':'font'})
if articlebodies:
for articlebody in articlebodies:
if articlebody:
# the text may or may not be enclosed in <p></p> tag
paras = articlebody.findAll('p')
if not paras:
paras = articlebody
textFound = False
for p in paras:
if not textFound:
summary_candidate = self.tag_to_string(p).strip()
summary_candidate = summary_candidate.replace(u'\u3010\u660e\u5831\u5c08\u8a0a\u3011', '', 1)
if len(summary_candidate) > 0:
article.summary = article.text_summary = summary_candidate
textFound = True
else:
# display a simple text
#article.summary = article.text_summary = u'\u66f4\u591a......'
# display word counts
counts = 0
articlebodies = soup.findAll('div',attrs={'id':'newscontent'})
if not articlebodies:
articlebodies = soup.findAll('div',attrs={'id':'newscontent01'})
if not articlebodies:
articlebodies = soup.findAll('div',attrs={'class':'content'})
if not articlebodies:
articlebodies = soup.findAll('div', attrs={'id':'font'})
if articlebodies:
for articlebody in articlebodies:
# the text may or may not be enclosed in <p></p> tag
paras = articlebody.findAll('p')
if not paras:
paras = articlebody
for p in paras:
summary_candidate = self.tag_to_string(p).strip()
counts += len(summary_candidate)
article.summary = article.text_summary = u'\uff08' + str(counts) + u'\u5b57\uff09'
except:
self.log("Error creating article descriptions")
return
# override from the one in version 0.8.31
def create_opf(self, feeds, dir=None): def create_opf(self, feeds, dir=None):
if dir is None: if dir is None:
dir = self.output_dir dir = self.output_dir
if __UseChineseTitle__ == True: title = self.short_title()
if __Region__ == 'Hong Kong': # change 1: allow our own flag to tell if a periodical is to be generated
title = u'\u660e\u5831 (\u9999\u6e2f)' # also use customed date instead of current time
elif __Region__ == 'Vancouver': if __MakePeriodical__ == False or self.output_profile.periodical_date_in_title:
title = u'\u660e\u5831 (\u6eab\u54e5\u83ef)'
elif __Region__ == 'Toronto':
title = u'\u660e\u5831 (\u591a\u502b\u591a)'
else:
title = self.short_title()
# if not generating a periodical, force date to apply in title
if __MakePeriodical__ == False:
title = title + ' ' + self.get_fetchformatteddate() title = title + ' ' + self.get_fetchformatteddate()
if True: # end of change 1
mi = MetaInformation(title, [self.publisher]) # change 2: __appname__ replaced by newspaper publisher
mi.publisher = self.publisher __appname__ = self.publisher
mi.author_sort = self.publisher mi = MetaInformation(title, [__appname__])
if __MakePeriodical__ == True: mi.publisher = __appname__
mi.publication_type = 'periodical:'+self.publication_type+':'+self.short_title() mi.author_sort = __appname__
else: # change 3: use __MakePeriodical__ flag to tell if a periodical should be generated
mi.publication_type = self.publication_type+':'+self.short_title() if __MakePeriodical__ == True:
#mi.timestamp = nowf() mi.publication_type = 'periodical:'+self.publication_type+':'+self.short_title()
mi.timestamp = self.get_dtlocal() else:
mi.comments = self.description mi.publication_type = self.publication_type+':'+self.short_title()
if not isinstance(mi.comments, unicode): #mi.publication_type = 'periodical:'+self.publication_type+':'+self.short_title()
mi.comments = mi.comments.decode('utf-8', 'replace') # change 4: in the following, all the nowf() are changed to adjusted time
#mi.pubdate = nowf() # This one doesn't matter
mi.pubdate = self.get_dtlocal() mi.timestamp = nowf()
opf_path = os.path.join(dir, 'index.opf') # change 5: skip listing the articles
ncx_path = os.path.join(dir, 'index.ncx') #article_titles, aseen = [], set()
opf = OPFCreator(dir, mi) #for f in feeds:
# Add mastheadImage entry to <guide> section # for a in f:
mp = getattr(self, 'masthead_path', None) # if a.title and a.title not in aseen:
if mp is not None and os.access(mp, os.R_OK): # aseen.add(a.title)
from calibre.ebooks.metadata.opf2 import Guide # article_titles.append(force_unicode(a.title, 'utf-8'))
ref = Guide.Reference(os.path.basename(self.masthead_path), os.getcwdu())
ref.type = 'masthead'
ref.title = 'Masthead Image'
opf.guide.append(ref)
manifest = [os.path.join(dir, 'feed_%d'%i) for i in range(len(feeds))] #mi.comments = self.description
manifest.append(os.path.join(dir, 'index.html')) #if not isinstance(mi.comments, unicode):
manifest.append(os.path.join(dir, 'index.ncx')) # mi.comments = mi.comments.decode('utf-8', 'replace')
#mi.comments += ('\n\n' + _('Articles in this issue: ') + '\n' +
# '\n\n'.join(article_titles))
# Get cover language = canonicalize_lang(self.language)
cpath = getattr(self, 'cover_path', None) if language is not None:
if cpath is None: mi.language = language
pf = open(os.path.join(dir, 'cover.jpg'), 'wb') # This one affects the pub date shown in kindle title
if self.default_cover(pf): #mi.pubdate = nowf()
cpath = pf.name # now appears to need the time field to be > 12.00noon as well
if cpath is not None and os.access(cpath, os.R_OK): mi.pubdate = datetime.datetime(int(self.get_fetchyear()), int(self.get_fetchmonth()), int(self.get_fetchday()), 12, 30, 0)
opf.cover = cpath opf_path = os.path.join(dir, 'index.opf')
manifest.append(cpath) ncx_path = os.path.join(dir, 'index.ncx')
# Get masthead opf = OPFCreator(dir, mi)
mpath = getattr(self, 'masthead_path', None) # Add mastheadImage entry to <guide> section
if mpath is not None and os.access(mpath, os.R_OK): mp = getattr(self, 'masthead_path', None)
manifest.append(mpath) if mp is not None and os.access(mp, os.R_OK):
from calibre.ebooks.metadata.opf2 import Guide
ref = Guide.Reference(os.path.basename(self.masthead_path), os.getcwdu())
ref.type = 'masthead'
ref.title = 'Masthead Image'
opf.guide.append(ref)
manifest = [os.path.join(dir, 'feed_%d'%i) for i in range(len(feeds))]
manifest.append(os.path.join(dir, 'index.html'))
manifest.append(os.path.join(dir, 'index.ncx'))
# Get cover
cpath = getattr(self, 'cover_path', None)
if cpath is None:
pf = open(os.path.join(dir, 'cover.jpg'), 'wb')
if self.default_cover(pf):
cpath = pf.name
if cpath is not None and os.access(cpath, os.R_OK):
opf.cover = cpath
manifest.append(cpath)
# Get masthead
mpath = getattr(self, 'masthead_path', None)
if mpath is not None and os.access(mpath, os.R_OK):
manifest.append(mpath)
opf.create_manifest_from_files_in(manifest)
for mani in opf.manifest:
if mani.path.endswith('.ncx'):
mani.id = 'ncx'
if mani.path.endswith('mastheadImage.jpg'):
mani.id = 'masthead-image'
entries = ['index.html']
toc = TOC(base_path=dir)
self.play_order_counter = 0
self.play_order_map = {}
opf.create_manifest_from_files_in(manifest)
for mani in opf.manifest:
if mani.path.endswith('.ncx'):
mani.id = 'ncx'
if mani.path.endswith('mastheadImage.jpg'):
mani.id = 'masthead-image'
entries = ['index.html']
toc = TOC(base_path=dir)
self.play_order_counter = 0
self.play_order_map = {}
def feed_index(num, parent): def feed_index(num, parent):
f = feeds[num] f = feeds[num]
@ -532,13 +846,16 @@ class MPRecipe(BasicNewsRecipe):
desc = None desc = None
else: else:
desc = self.description_limiter(desc) desc = self.description_limiter(desc)
tt = a.toc_thumbnail if a.toc_thumbnail else None
entries.append('%sindex.html'%adir) entries.append('%sindex.html'%adir)
po = self.play_order_map.get(entries[-1], None) po = self.play_order_map.get(entries[-1], None)
if po is None: if po is None:
self.play_order_counter += 1 self.play_order_counter += 1
po = self.play_order_counter po = self.play_order_counter
parent.add_item('%sindex.html'%adir, None, a.title if a.title else _('Untitled Article'), parent.add_item('%sindex.html'%adir, None,
play_order=po, author=auth, description=desc) a.title if a.title else _('Untitled Article'),
play_order=po, author=auth,
description=desc, toc_thumbnail=tt)
last = os.path.join(self.output_dir, ('%sindex.html'%adir).replace('/', os.sep)) last = os.path.join(self.output_dir, ('%sindex.html'%adir).replace('/', os.sep))
for sp in a.sub_pages: for sp in a.sub_pages:
prefix = os.path.commonprefix([opf_path, sp]) prefix = os.path.commonprefix([opf_path, sp])
@ -555,7 +872,7 @@ class MPRecipe(BasicNewsRecipe):
prefix = '/'.join('..'for i in range(2*len(re.findall(r'link\d+', last)))) prefix = '/'.join('..'for i in range(2*len(re.findall(r'link\d+', last))))
templ = self.navbar.generate(True, num, j, len(f), templ = self.navbar.generate(True, num, j, len(f),
not self.has_single_feed, not self.has_single_feed,
a.orig_url, self.publisher, prefix=prefix, a.orig_url, __appname__, prefix=prefix,
center=self.center_navbar) center=self.center_navbar)
elem = BeautifulSoup(templ.render(doctype='xhtml').decode('utf-8')).find('div') elem = BeautifulSoup(templ.render(doctype='xhtml').decode('utf-8')).find('div')
body.insert(len(body.contents), elem) body.insert(len(body.contents), elem)
@ -578,7 +895,7 @@ class MPRecipe(BasicNewsRecipe):
if not desc: if not desc:
desc = None desc = None
feed_index(i, toc.add_item('feed_%d/index.html'%i, None, feed_index(i, toc.add_item('feed_%d/index.html'%i, None,
f.title, play_order=po, description=desc, author=auth)) f.title, play_order=po, description=desc, author=auth))
else: else:
entries.append('feed_%d/index.html'%0) entries.append('feed_%d/index.html'%0)
@ -591,4 +908,5 @@ class MPRecipe(BasicNewsRecipe):
with nested(open(opf_path, 'wb'), open(ncx_path, 'wb')) as (opf_file, ncx_file): with nested(open(opf_path, 'wb'), open(ncx_path, 'wb')) as (opf_file, ncx_file):
opf.render(opf_file, ncx_file) opf.render(opf_file, ncx_file)