__license__   = 'GPL v3'
__copyright__ = '2010-2011, Eddie Lau'

# ---------------------------------------------------------------------------
# User-configurable switches. They are read as module-level globals by the
# recipe class below.
# ---------------------------------------------------------------------------

# Edition to download: 'Hong Kong', 'Vancouver' or 'Toronto'
__Region__ = 'Vancouver'
# Users of Kindle 3 with limited system-level CJK support:
# please replace the following "True" with "False". (Default: True)
__MakePeriodical__ = True
# Set it to True if your device supports display of CJK titles (Default: False)
__UseChineseTitle__ = False
# Set it to False if you want to skip images (Default: True)
__KeepImages__ = True
# Set it to True if you want to include a summary in Kindle's article view (Default: False)
__IncludeSummary__ = False
# Set it to True if you want thumbnail images in Kindle's article view (Default: True)
__IncludeThumbnails__ = True
# (HK only) Set it to True if you wish to use life.mingpao.com as the main article source (Default: True)
__UseLife__ = True
# (HK only) Set it to True to include premium content; False leaves premium content out (Default: False)
__InclPremium__ = False
# (HK only) Set it to True if you wish to parse articles in news.mingpao.com with their printer-friendly formats (Default: True)
__ParsePFF__ = True
# (HK only) Set it to True if you wish to fetch hi-res images (Default: False)
__HiResImg__ = False
# Set to a date in YYYYMMDD format to override the issue date computed by the program;
# leave it empty ('') to use the computed date.
__Date__ = ''


'''
Change Log:
2011/12/18: update the overridden create_odf(.) routine with the one from Calibre version 0.8.31. Move __UseChineseTitle__ usage away
            from create_odf(.). Optional support of text_summary and thumbnail images in Kindle's article view. Start new day
            download of Hong Kong Mingpao at 4.30am. Set the actual publication date shown on kindle device.
2011/12/01: handle the situation in txt source parsing where the article content does not start with the special character u'\u3010'
2011/10/21: fix a bug that hi-res img is unavailable in pages parsed from source txt
2011/10/19: fix a bug in txt source parsing
2011/10/17: disable fetching of premium content, also improved txt source parsing
2011/10/04: option to get hi-res photos for the articles
2011/09/21: fetching "column" section is made optional.
2011/09/18: parse "column" section stuff from source text file directly.
2011/09/07: disable "column" section as it is no longer offered free.
2011/06/26: add fetching Vancouver and Toronto versions of the paper, also provide captions for images using life.mingpao fetch source
            provide options to remove all images in the file
2011/05/12: switch the main parse source to life.mingpao.com, which has more photos on the article pages
2011/03/06: add new articles for finance section, also a new section "Columns"
2011/02/28: rearrange the sections
            [Disabled until Kindle has better CJK support and can remember last (section,article) read in Sections & Articles
            View] make it the same title if generating a periodical, so past issue will be automatically put into "Past Issues"
            folder in Kindle 3
2011/02/20: skip duplicated links in finance section, put photos which may extend a whole page to the back of the articles
            clean up the indentation
2010/12/07: add entertainment section, use newspaper front page as ebook cover, suppress date display in section list
            (to avoid wrong date display in case the user generates the ebook in a time zone different from HKT)
2010/11/22: add English section, remove eco-news section which is not updated daily, correct
            ordering of articles
2010/11/12: add news image and eco-news section
2010/11/08: add parsing of finance section
2010/11/06: temporary work-around for Kindle device having no capability to display unicode
            in section/article list.
2010/10/31: skip repeated articles in section pages
'''

from calibre.utils.date import now as nowf
import os, datetime, re, mechanize
from calibre.web.feeds.recipes import BasicNewsRecipe
from contextlib import nested
from calibre.ebooks.BeautifulSoup import BeautifulSoup
from calibre.ebooks.metadata.opf2 import OPFCreator
from calibre.ebooks.metadata.toc import TOC
from calibre.ebooks.metadata import MetaInformation
from calibre.utils.localization import canonicalize_lang

# MAIN CLASS
class MPRecipe(BasicNewsRecipe):
    # Region-specific recipe settings. This if/elif chain runs once, when the class
    # body is executed, and is driven by the module-level __Region__ switch. Each
    # branch assigns the same set of class attributes: title, description, category,
    # extra_css, masthead_url, keep_only_tags, remove_tags, remove_attributes and
    # preprocess_regexps. If __Region__ matches none of the three names, this chain
    # assigns none of them.
    if __Region__ == 'Hong Kong':
        # Chinese title only when the device is declared able to display CJK titles
        if __UseChineseTitle__ == True:
            title = u'\u660e\u5831 (\u9999\u6e2f)'
        else:
            title   = 'Ming Pao - Hong Kong'
        description = 'Hong Kong Chinese Newspaper (http://news.mingpao.com)'
        category    = 'Chinese, News, Hong Kong'
        extra_css = 'img {display: block; margin-left: auto; margin-right: auto; margin-top: 10px; margin-bottom: 10px; max-height:90%;} font>b {font-size:200%; font-weight:bold;} div[class=heading] {font-size:200%; font-weight:bold;} div[class=images] {font-size:50%;}'
        masthead_url = 'http://news.mingpao.com/image/portals_top_logo_news.gif'
        # The Hong Kong articles come from several page layouts, so the selectors below
        # cover each of them (see the per-line notes).
        keep_only_tags = [dict(name='h1'),
                          dict(name='font', attrs={'style':['font-size:14pt; line-height:160%;']}), # for entertainment page title
                          dict(name='font', attrs={'color':['AA0000']}), # for column articles title
                          dict(attrs={'class':['heading']}),  # for heading from txt
                          dict(attrs={'id':['newscontent']}), # entertainment and column page content
                          dict(attrs={'id':['newscontent01','newscontent02']}),
                          dict(attrs={'class':['content']}),  # for content from txt
                          dict(attrs={'class':['photo']}),
                          dict(name='table', attrs={'width':['100%'], 'border':['0'], 'cellspacing':['5'], 'cellpadding':['0']}),  # content in printed version of life.mingpao.com
                          dict(name='img', attrs={'width':['180'], 'alt':['????']}), # images for source from life.mingpao.com
                          dict(attrs={'class':['images']})   # for images from txt
                          ]
        # The two remove_tags lists differ only in that the second also drops every
        # <img> tag when images are disabled.
        if __KeepImages__:
            remove_tags = [dict(name='style'),
                           dict(attrs={'id':['newscontent135']}),  # for the finance page from mpfinance.com
                           dict(name='font', attrs={'size':['2'], 'color':['666666']}), # article date in life.mingpao.com article
                           #dict(name='table')  # for content fetched from life.mingpao.com
                          ]
        else:
            remove_tags = [dict(name='style'),
                           dict(attrs={'id':['newscontent135']}),  # for the finance page from mpfinance.com
                           dict(name='font', attrs={'size':['2'], 'color':['666666']}), # article date in life.mingpao.com article
                           dict(name='img'),
                           #dict(name='table')  # for content fetched from life.mingpao.com
                          ]
        remove_attributes = ['width']
        # (pattern, replacement) pairs applied to the raw HTML
        preprocess_regexps = [
                              # turn <h5> headings into <h1> so that the 'h1' selector above keeps them
                              (re.compile(r'<h5>', re.DOTALL|re.IGNORECASE),
                              lambda match: '<h1>'),
                              (re.compile(r'</h5>', re.DOTALL|re.IGNORECASE),
                              lambda match: '</h1>'),
                              # drop paragraphs that consist solely of a link
                              (re.compile(r'<p><a href=.+?</a></p>', re.DOTALL|re.IGNORECASE), # for entertainment page
                              lambda match: ''),
                              # skip <br> after title in life.mingpao.com fetched article
                              (re.compile(r"<div id='newscontent'><br>", re.DOTALL|re.IGNORECASE),
                              lambda match: "<div id='newscontent'>"),
                              (re.compile(r"<br><br></b>", re.DOTALL|re.IGNORECASE),
                              lambda match: "</b>")
                             ]
    elif __Region__ == 'Vancouver':
        if __UseChineseTitle__ == True:
            title   = u'\u660e\u5831 (\u6eab\u54e5\u83ef)'
        else:
            title   = 'Ming Pao - Vancouver'
        description = 'Vancouver Chinese Newspaper (http://www.mingpaovan.com)'
        category    = 'Chinese, News, Vancouver'
        extra_css   = 'img {display: block; margin-left: auto; margin-right: auto; margin-top: 10px; margin-bottom: 10px;} b>font {font-size:200%; font-weight:bold;}'
        masthead_url = 'http://www.mingpaovan.com/image/mainlogo2_VAN2.gif'
        # Article content is selected by matching <table> tags on their exact layout attributes
        keep_only_tags = [dict(name='table', attrs={'width':['450'], 'border':['0'], 'cellspacing':['0'], 'cellpadding':['1']}),
                          dict(name='table', attrs={'width':['450'], 'border':['0'], 'cellspacing':['3'], 'cellpadding':['3'], 'id':['tblContent3']}),
                          dict(name='table', attrs={'width':['180'], 'border':['0'], 'cellspacing':['0'], 'cellpadding':['0'], 'bgcolor':['F0F0F0']}),
                          ]
        if __KeepImages__:
            remove_tags = [dict(name='img', attrs={'src':['../../../image/magnifier.gif']})]  # the magnifier icon
        else:
            remove_tags = [dict(name='img')]
        remove_attributes = ['width']
        # delete every &nbsp; entity from the raw HTML
        preprocess_regexps = [(re.compile(r'&nbsp;', re.DOTALL|re.IGNORECASE),
                              lambda match: ''),
                             ]
    elif __Region__ == 'Toronto':
        # Same settings as the Vancouver branch apart from title, description,
        # category and masthead_url.
        if __UseChineseTitle__ == True:
            title   = u'\u660e\u5831 (\u591a\u502b\u591a)'
        else:
            title   = 'Ming Pao - Toronto'
        description = 'Toronto Chinese Newspaper (http://www.mingpaotor.com)'
        category    = 'Chinese, News, Toronto'
        extra_css   = 'img {display: block; margin-left: auto; margin-right: auto; margin-top: 10px; margin-bottom: 10px;} b>font {font-size:200%; font-weight:bold;}'
        masthead_url = 'http://www.mingpaotor.com/image/mainlogo2_TOR2.gif'
        keep_only_tags = [dict(name='table', attrs={'width':['450'], 'border':['0'], 'cellspacing':['0'], 'cellpadding':['1']}),
                          dict(name='table', attrs={'width':['450'], 'border':['0'], 'cellspacing':['3'], 'cellpadding':['3'], 'id':['tblContent3']}),
                          dict(name='table', attrs={'width':['180'], 'border':['0'], 'cellspacing':['0'], 'cellpadding':['0'], 'bgcolor':['F0F0F0']}),
                          ]
        if __KeepImages__:
            remove_tags = [dict(name='img', attrs={'src':['../../../image/magnifier.gif']})]  # the magnifier icon
        else:
            remove_tags = [dict(name='img')]
        remove_attributes = ['width']
        # delete every &nbsp; entity from the raw HTML
        preprocess_regexps = [(re.compile(r'&nbsp;', re.DOTALL|re.IGNORECASE),
                              lambda match: ''),
                             ]

    # Settings shared by all three regional editions. These are plain class
    # attributes; their exact effect is defined by calibre's BasicNewsRecipe
    # base class rather than by this file.
    oldest_article = 1
    max_articles_per_feed = 100
    __author__            = 'Eddie Lau'
    publisher             = 'MingPao'
    remove_javascript = True
    use_embedded_content   = False
    no_stylesheets = True
    language = 'zh'
    # NOTE(review): presumably the character encoding of the downloaded pages - confirm
    encoding = 'Big5-HKSCS'
    recursions = 0
    conversion_options = {'linearize_tables':True}
    # NOTE(review): an empty format presumably suppresses the date calibre would
    # otherwise display (the change log mentions suppressing the date) - confirm
    timefmt = ''

    def get_dtlocal(self):
        """Return a naive datetime whose calendar date is the issue to fetch.

        The current UTC time is shifted by the region's UTC offset and then
        moved back by the local hour at which the whole day's news is
        available, so the date only rolls over once the new issue is complete
        (Hong Kong 4.30am, Vancouver 5.30am, Toronto 8.30am).

        NOTE: the UTC offsets are fixed (+8, -8 and -5 hours); daylight saving
        time is not taken into account.

        Raises:
            ValueError: if __Region__ is not one of the supported regions.
                (Previously this case failed with an UnboundLocalError.)
        """
        # region -> (UTC offset in hours, local hour when the issue is complete)
        region_hours = {
            'Hong Kong': (8.0, 4.5),
            'Vancouver': (-8.0, 5.5),
            'Toronto': (-5.0, 8.5),
        }
        if __Region__ not in region_hours:
            raise ValueError('Unsupported __Region__: %r' % (__Region__,))
        utc_offset, release_hour = region_hours[__Region__]
        dt_utc = datetime.datetime.utcnow()
        return dt_utc + datetime.timedelta(hours=utc_offset) - datetime.timedelta(hours=release_hour)

    def get_fetchdate(self):
        """Return the issue date as a 'YYYYMMDD' string.

        A non-empty __Date__ override is returned as is; otherwise the date is
        taken from get_dtlocal().
        """
        # '!=' replaces the '<>' operator, which no longer exists in Python 3
        if __Date__ != '':
            return __Date__
        return self.get_dtlocal().strftime("%Y%m%d")

    def get_fetchformatteddate(self):
        """Return the issue date as a 'YYYY-MM-DD' string.

        A non-empty __Date__ override (expected in YYYYMMDD form) is sliced
        into its year, month and day parts; otherwise the date is taken from
        get_dtlocal().
        """
        # '!=' replaces the '<>' operator, which no longer exists in Python 3
        if __Date__ != '':
            return __Date__[0:4] + '-' + __Date__[4:6] + '-' + __Date__[6:8]
        return self.get_dtlocal().strftime("%Y-%m-%d")

    def get_fetchyear(self):
        """Return the four-digit year of the issue date as a string.

        Uses the first four characters of a non-empty __Date__ override,
        otherwise the year of get_dtlocal().
        """
        # '!=' replaces the '<>' operator, which no longer exists in Python 3
        if __Date__ != '':
            return __Date__[0:4]
        return self.get_dtlocal().strftime("%Y")

    def get_fetchmonth(self):
        """Return the two-digit month of the issue date as a string.

        Uses characters 4-5 of a non-empty __Date__ override, otherwise the
        month of get_dtlocal().
        """
        # '!=' replaces the '<>' operator, which no longer exists in Python 3
        if __Date__ != '':
            return __Date__[4:6]
        return self.get_dtlocal().strftime("%m")

    def get_fetchday(self):
        """Return the two-digit day of month of the issue date as a string.

        Uses characters 6-7 of a non-empty __Date__ override, otherwise the
        day of get_dtlocal().
        """
        # '!=' replaces the '<>' operator, which no longer exists in Python 3
        if __Date__ != '':
            return __Date__[6:8]
        return self.get_dtlocal().strftime("%d")

    def get_cover_url(self):
        """Return the URL of the cover image for the issue, or None.

        The URL is built from the issue date for the configured __Region__ and
        then probed by opening it once; if it cannot be opened, None is
        returned. None is also returned for an unsupported region (previously
        that case failed with an UnboundLocalError).
        """
        fetch_date = self.get_fetchdate()
        fetch_day = self.get_fetchday()
        if __Region__ == 'Hong Kong':
            cover = 'http://news.mingpao.com/' + fetch_date + '/' + fetch_date + '_' + fetch_day + 'gacov.jpg'
        elif __Region__ == 'Vancouver':
            cover = 'http://www.mingpaovan.com/ftp/News/' + fetch_date + '/' + fetch_day + 'pgva1s.jpg'
        elif __Region__ == 'Toronto':
            cover = 'http://www.mingpaotor.com/ftp/News/' + fetch_date + '/' + fetch_day + 'pgtas.jpg'
        else:
            return None
        # Call get_browser through the instance: this works whether the base
        # class declares it as a classmethod or as a regular method.
        br = self.get_browser()
        try:
            br.open(cover)
        except Exception:
            # Best effort: any failure to fetch the image simply means "no cover".
            # 'except Exception' (rather than a bare except) lets KeyboardInterrupt
            # and SystemExit propagate.
            cover = None
        return cover

    def parse_index(self):
        """Build the list of feeds (sections) for the configured region.

        Returns:
            A list of (section title, list of article dicts) tuples, in the
            order the sections should appear. Sections for which no article
            was found are left out. An unsupported __Region__ yields an empty
            list.
        """
        feeds = []
        dateStr = self.get_fetchdate()

        def add_feed(title, articles):
            # Sections without any article are not added.
            if articles:
                feeds.append((title, articles))

        if __Region__ == 'Hong Kong':
            # Index page prefixes of the two Hong Kong sources
            life_index = 'http://life.mingpao.com/cfm/dailynews2.cfm?Issue=' + dateStr + '&Category='
            news_index = 'http://news.mingpao.com/' + dateStr + '/'

            if __UseLife__:
                # Everything comes from life.mingpao.com: (title, category code)
                life_sections = [
                    (u'\u8981\u805e Headline', 'nalga'),
                    (u'\u6e2f\u805e Local', 'nalgb'),
                    (u'\u6559\u80b2 Education', 'nalgf'),
                    (u'\u793e\u8a55/\u7b46\u9663 Editorial', 'nalmr'),
                    (u'\u8ad6\u58c7 Forum', 'nalfa'),
                    (u'\u4e2d\u570b China', 'nalca'),
                    (u'\u570b\u969b World', 'nalta'),
                    (u'\u7d93\u6fdf Finance', 'nalea'),
                    (u'\u9ad4\u80b2 Sport', 'nalsp'),
                    (u'\u5f71\u8996 Film/TV', 'nalma'),
                ]
                for title, category in life_sections:
                    # With premium content enabled the articles are taken from the
                    # .txt sources; otherwise parse_section2 is used.
                    if __InclPremium__:
                        articles = self.parse_section2_txt(life_index + category, 'nal')
                    else:
                        articles = self.parse_section2(life_index + category, 'nal')
                    add_feed(title, articles)
            else:
                # Main sections come from news.mingpao.com: (title, index page)
                news_sections = [
                    (u'\u8981\u805e Headline', 'gaindex.htm'),
                    (u'\u6e2f\u805e Local', 'gbindex.htm'),
                    (u'\u6559\u80b2 Education', 'gfindex.htm'),
                    (u'\u793e\u8a55/\u7b46\u9663 Editorial', 'mrindex.htm'),
                    (u'\u8ad6\u58c7 Forum', 'faindex.htm'),
                    (u'\u4e2d\u570b China', 'caindex.htm'),
                    (u'\u570b\u969b World', 'taindex.htm'),
                ]
                for title, page in news_sections:
                    add_feed(title, self.parse_section(news_index + page))

                # Finance and Film/TV are always taken from the life.mingpao.com .txt sources
                txt_sections = [
                    (u'\u7d93\u6fdf Finance', 'nalea'),
                    (u'\u5f71\u8996 Film/TV', 'nalma'),
                ]
                for title, category in txt_sections:
                    add_feed(title, self.parse_section2_txt(life_index + category, 'nal'))

            # The remaining Hong Kong sections are the same for both sources.
            if __InclPremium__:
                # parse column section articles directly from .txt files
                add_feed(u'\u5c08\u6b04 Columns', self.parse_section2_txt(life_index + 'ncolumn', 'ncl'))

            tail_sections = [
                (u'\u526f\u520a Supplement', 'jaindex.htm'),
                (u'\u82f1\u6587 English', 'emindex.htm'),
            ]
            for title, page in tail_sections:
                add_feed(title, self.parse_section(news_index + page))

        elif __Region__ == 'Vancouver':
            site = 'http://www.mingpaovan.com/'
            index_dir = site + 'htm/News/' + dateStr + '/'
            van_sections = [
                (u'\u8981\u805e Headline', 'VAindex.htm'),
                (u'\u52a0\u570b Canada', 'VBindex.htm'),
                (u'\u793e\u5340 Local', 'VDindex.htm'),
                (u'\u6e2f\u805e Hong Kong', 'HK-VGindex.htm'),
                (u'\u570b\u969b World', 'VTindex.htm'),
                (u'\u4e2d\u570b China', 'VCindex.htm'),
                (u'\u7d93\u6fdf Economics', 'VEindex.htm'),
                (u'\u9ad4\u80b2 Sports', 'VSindex.htm'),
                (u'\u5f71\u8996 Film/TV', 'HK-MAindex.htm'),
                (u'\u526f\u520a Supplements', 'WWindex.htm'),
            ]
            for title, page in van_sections:
                add_feed(title, self.parse_section3(index_dir + page, site))

        elif __Region__ == 'Toronto':
            site = 'http://www.mingpaotor.com/'
            index_dir = site + 'htm/News/' + dateStr + '/'
            tor_sections = [
                (u'\u8981\u805e Headline', 'TAindex.htm'),
                (u'\u52a0\u570b Canada', 'TDindex.htm'),
                (u'\u793e\u5340 Local', 'TFindex.htm'),
                (u'\u4e2d\u570b China', 'TCAindex.htm'),
                (u'\u570b\u969b World', 'TTAindex.htm'),
                (u'\u6e2f\u805e Hong Kong', 'HK-GAindex.htm'),
                (u'\u7d93\u6fdf Economics', 'THindex.htm'),
                (u'\u9ad4\u80b2 Sports', 'TSindex.htm'),
                (u'\u5f71\u8996 Film/TV', 'HK-MAindex.htm'),
                (u'\u526f\u520a Supplements', 'WWindex.htm'),
            ]
            for title, page in tor_sections:
                add_feed(title, self.parse_section3(index_dir + page, site))

        return feeds

    # parse from news.mingpao.com
    # parse from news.mingpao.com
    def parse_section(self, url):
        """Collect the articles listed on a news.mingpao.com section index page.

        Every element with class 'bullet' or 'bullet_grey' contributes the
        first link it contains. When __ParsePFF__ is set, article URLs are
        rewritten to their '_print.htm' (printer-friendly) form. Links whose
        URL contains 'Redirect' are only converted when __InclPremium__ is
        set; any URL that still contains 'Redirect' afterwards is dropped.

        Args:
            url: URL of the section index page.

        Returns:
            A list of article dicts (keys 'title', 'url', 'description',
            'date') in page order. When a URL appears more than once, only its
            last occurrence on the page is kept.
        """
        dateStr = self.get_fetchdate()
        soup = self.index_to_soup(url)
        divs = soup.findAll(attrs={'class': ['bullet', 'bullet_grey']})
        current_articles = []
        included_urls = set()
        # Walk the page backwards so that the last occurrence of a URL wins;
        # the result is put back into page order at the end.
        for div in reversed(divs):
            a = div.find('a', href=True)
            if a is None:
                # a bullet without a link has nothing to fetch
                continue
            title = self.tag_to_string(a)
            article_url = 'http://news.mingpao.com/' + dateStr + '/' + a['href']
            # replace the url to the print-friendly version
            if __ParsePFF__:
                if 'Redirect' in article_url and __InclPremium__:
                    # collapse the redirect link into a direct article URL
                    article_url = re.sub(dateStr + '.*' + dateStr, dateStr, article_url)
                    article_url = re.sub('%2F.*%2F', '/', article_url)
                    # strip the marker text that the index adds to these titles
                    title = title.replace(u'\u6536\u8cbb\u5167\u5bb9', '')
                    article_url = article_url.replace('%2Etxt', '_print.htm')
                    article_url = article_url.replace('%5F', '_')
                else:
                    article_url = article_url.replace('.htm', '_print.htm')
            if article_url in included_urls or 'Redirect' in article_url:
                continue
            current_articles.append({'title': title, 'url': article_url, 'description': '', 'date': ''})
            included_urls.add(article_url)
        current_articles.reverse()
        return current_articles

    # parse from life.mingpao.com
    # parse from life.mingpao.com
    def parse_section2(self, url, keystr):
        """Collect the freely accessible articles of a life.mingpao.com section.

        Every link on the index page whose URL contains '.txt' and `keystr` is
        probed once with a browser that does not follow redirects; links that
        cannot be opened that way are reported as premium articles and
        skipped. Accepted articles point at the 'dailynews3a.cfm' (printed)
        version of the page.

        Args:
            url: URL of the section index page.
            keystr: substring an article URL must contain to be accepted.

        Returns:
            A list of article dicts (keys 'title', 'url', 'description') in
            page order. When a link appears more than once, only its last
            occurrence on the page is kept.
        """
        br = mechanize.Browser()
        br.set_handle_redirect(False)
        soup = self.index_to_soup(url)
        current_articles = []
        # Duplicates are tracked by the URL as found on the page. (Comparing
        # the page URL against the rewritten 'dailynews3a.cfm' URLs, as was
        # done before, can never match, so duplicates were not filtered.)
        seen_urls = set()
        for anchor in reversed(soup.findAll('a', href=True)):
            article_url = 'http://life.mingpao.com/cfm/' + anchor['href']
            if article_url in seen_urls or '.txt' not in article_url or keystr not in article_url:
                continue
            seen_urls.add(article_url)
            try:
                br.open_novisit(article_url)
            except Exception:
                # Best effort: with redirects disabled, an article that cannot be
                # opened directly is treated as premium content and left out.
                print('skipping a premium article')
                continue
            current_articles.append({
                'title': self.tag_to_string(anchor),
                # use printed version of the article
                'url': article_url.replace('dailynews3.cfm', 'dailynews3a.cfm'),
                'description': '',
            })
        current_articles.reverse()
        return current_articles

    # parse from text file of life.mingpao.com
    # parse from text file of life.mingpao.com
    def parse_section2_txt(self, url, keystr):
        """Collect the articles of a life.mingpao.com section as .txt sources.

        Every link on the index page whose URL contains '.txt' and `keystr` is
        accepted, and its URL is rewritten from the 'cfm/dailynews3.cfm?File='
        viewer page to the corresponding file under 'ftp/Life3/'.

        Args:
            url: URL of the section index page.
            keystr: substring an article URL must contain to be accepted.

        Returns:
            A list of article dicts (keys 'title', 'url', 'description') in
            page order. When a link appears more than once, only its last
            occurrence on the page is kept.
        """
        soup = self.index_to_soup(url)
        current_articles = []
        # Duplicates are tracked by the URL as found on the page. (Comparing
        # the page URL against the rewritten 'ftp/Life3/' URLs, as was done
        # before, can never match, so duplicates were not filtered.)
        seen_urls = set()
        for anchor in reversed(soup.findAll('a', href=True)):
            article_url = 'http://life.mingpao.com/cfm/' + anchor['href']
            if article_url in seen_urls or '.txt' not in article_url or keystr not in article_url:
                continue
            seen_urls.add(article_url)
            current_articles.append({
                'title': self.tag_to_string(anchor),
                # point at the raw text file instead of the viewer page
                'url': article_url.replace('cfm/dailynews3.cfm?File=', 'ftp/Life3/'),
                'description': '',
            })
        current_articles.reverse()
        return current_articles

    # parse from www.mingpaovan.com
    # parse from www.mingpaovan.com
    def parse_section3(self, url, baseUrl):
        """Collect the articles listed on a section index page of a Canadian edition.

        Every element with class 'ListContentLargeLink' that carries an href
        contributes one article. The '../../../' prefix of the link is removed
        and the remainder is appended to `baseUrl`.

        Args:
            url: URL of the section index page.
            baseUrl: site root the article links are resolved against; a
                trailing '/' is accepted and does not produce a double slash.

        Returns:
            A list of article dicts (keys 'title', 'url', 'description',
            'date') in page order. When a URL appears more than once, only its
            last occurrence on the page is kept.
        """
        soup = self.index_to_soup(url)
        links = soup.findAll(attrs={'class': ['ListContentLargeLink']})
        site_root = baseUrl.rstrip('/')
        current_articles = []
        included_urls = set()
        for link in reversed(links):
            href = link.get('href', False)
            if not href:
                # the class is matched on any tag; skip elements that are not links
                continue
            urlstr = site_root + '/' + href.replace('../../../', '')
            if urlstr in included_urls:
                continue
            current_articles.append({'title': self.tag_to_string(link), 'url': urlstr, 'description': '', 'date': ''})
            included_urls.add(urlstr)
        current_articles.reverse()
        return current_articles

    def parse_ed_section(self, url):
        """Collect article links from a life.mingpao.com editorial index page.

        A link is accepted when its full URL contains both '.txt' and 'nal'
        and has not been accepted before. No active caller is visible in this
        part of the file.

        Returns a list of dicts with keys 'title', 'url' and 'description',
        in page order (for repeated links the last occurrence is kept).
        """
        self.get_fetchdate()
        page = self.index_to_soup(url)
        links = page.findAll('a', href=True)
        collected = []
        accepted_urls = []
        # iterate from the bottom of the page up; order is restored below
        for link in reversed(links):
            text = self.tag_to_string(link)
            target = 'http://life.mingpao.com/cfm/' + link.get('href', False)
            if target in accepted_urls:
                continue
            if '.txt' in target and 'nal' in target:
                collected.append({'title': text, 'url': target, 'description': ''})
                accepted_urls.append(target)
        collected.reverse()
        return collected

    def parse_fin_section(self, url):
        """Collect article links from a life.mingpao.com finance index page.

        A link is accepted when its full URL contains both 'txt' and 'nal'
        and has not been accepted before. Unlike the sibling parsers, the
        links are processed in page order, so the first occurrence of a
        repeated link is the one kept. No active caller is visible in this
        part of the file.

        Returns a list of dicts with keys 'title', 'url' and 'description'.
        """
        self.get_fetchdate()
        page = self.index_to_soup(url)
        collected = []
        accepted_urls = []
        for link in page.findAll('a', href=True):
            target = 'http://life.mingpao.com/cfm/' + link.get('href', False)
            if target in accepted_urls:
                continue
            if 'txt' not in target or 'nal' not in target:
                continue
            # the title is only extracted for links that are actually kept
            collected.append({'title': self.tag_to_string(link), 'url': target, 'description': ''})
            accepted_urls.append(target)
        return collected

    def parse_ent_section(self, url):
        """Collect article links from an ol.mingpao.com entertainment index page.

        A link is accepted when its full URL contains both '.txt' and 'star'
        and has not been accepted before. No active caller is visible in this
        part of the file.

        Returns a list of dicts with keys 'title', 'url' and 'description',
        in page order (for repeated links the last occurrence is kept).
        """
        self.get_fetchdate()
        page = self.index_to_soup(url)
        links = page.findAll('a', href=True)
        collected = []
        accepted_urls = []
        # iterate from the bottom of the page up; order is restored below
        for link in reversed(links):
            text = self.tag_to_string(link)
            target = 'http://ol.mingpao.com/cfm/' + link.get('href', False)
            if target in accepted_urls:
                continue
            if '.txt' in target and 'star' in target:
                collected.append({'title': text, 'url': target, 'description': ''})
                accepted_urls.append(target)
        collected.reverse()
        return collected

    def parse_col_section(self, url):
        """Collect article links from a life.mingpao.com column index page.

        A link is accepted when its full URL contains both '.txt' and 'ncl'
        and has not been accepted before. No active caller is visible in this
        part of the file.

        Returns a list of dicts with keys 'title', 'url' and 'description',
        in page order (for repeated links the last occurrence is kept).
        """
        self.get_fetchdate()
        page = self.index_to_soup(url)
        links = page.findAll('a', href=True)
        collected = []
        accepted_urls = []
        # iterate from the bottom of the page up; order is restored below
        for link in reversed(links):
            text = self.tag_to_string(link)
            target = 'http://life.mingpao.com/cfm/' + link.get('href', False)
            if target in accepted_urls:
                continue
            if '.txt' in target and 'ncl' in target:
                collected.append({'title': text, 'url': target, 'description': ''})
                accepted_urls.append(target)
        collected.reverse()
        return collected

    # preprocess those .txt and javascript based files
    def preprocess_raw_html(self, raw_html, url):
        """Turn non-HTML article sources into HTML and adjust image links.

        The kind of source is decided from ``url``:

        * ``_print.htm`` in the URL: the payload is scanned line by line for
          the JavaScript assignments ``var heading1``, ``var heading2``,
          ``var content`` and ``var photocontent``; their values are
          reassembled into an HTML page.
        * otherwise ``ftp`` in the URL: the payload is treated as a plain-text
          article (image markers, title lines, blank line, body lines) and
          converted into an HTML page.
        * anything else is passed through unchanged.

        When the module flag ``__HiResImg__`` is true and the URL belongs to
        news.mingpao.com or life.mingpao.com, every ``.jpg`` image reference
        is probed over the network for a ``.gif`` counterpart. If the probe
        succeeds the reference is switched to the ``.gif``; otherwise an
        underscore is inserted into the file name (presumably the server's
        naming for the hi-res variant -- TODO confirm).

        :param raw_html: downloaded payload as a string
        :param url: URL the payload was fetched from
        :return: the HTML to hand on to the parser
        """
        new_html = raw_html
        if '_print.htm' in url:
            # javascript based file
            pieces = ['<html><head><title>Untitled</title></head>', '<body>']
            for item in raw_html.split('\n'):
                if item.startswith('var heading1 ='):
                    heading = item.replace('var heading1 = \'', '')
                    heading = heading.replace('\'', '').replace(';', '')
                    # the heading <div> is closed when heading2 is handled
                    pieces.append('<div class="heading">' + heading)
                elif item.startswith('var heading2 ='):
                    heading = item.replace('var heading2 = \'', '')
                    heading = heading.replace('\'', '').replace(';', '')
                    if heading != '':
                        pieces.append('<br>' + heading + '</div>')
                    else:
                        pieces.append('</div>')
                elif item.startswith('var content ='):
                    content = item.replace("var content = ", '')
                    content = content.replace('\'', '').replace(';', '')
                    pieces.append('<div class="content">' + content + '</div>')
                elif item.startswith('var photocontent ='):
                    photo = item.replace('var photocontent = \'', '')
                    # drop quoting and table markup; each cell end becomes a line break
                    for old, new in (('\'', ''), (';', ''), ('<tr>', ''),
                                     ('<td>', ''), ('</tr>', ''),
                                     ('</td>', '<br>'), ('class="photo"', '')):
                        photo = photo.replace(old, new)
                    pieces.append('<div class="images">' + photo + '</div>')
            pieces.append('</body></html>')
            new_html = ''.join(pieces)
        elif 'ftp' in url:
            # .txt based file
            pieces = ['<html><head><title>Untitled</title></head><body><div class="images">']
            next_is_img_txt = False         # the next line is the caption of an image
            title_started = False           # first non-marker text line has been seen
            title_break_reached = False     # blank line after the title has been seen
            met_article_start_char = False  # first body paragraph has been emitted
            for item in raw_html.split('\n'):
                item = item.strip()
                if title_started and not title_break_reached and item == '':
                    # blank line separating the title from the content
                    title_break_reached = True
                elif title_started and title_break_reached and not met_article_start_char:
                    # first non-empty line after the break starts the content
                    if item != '':
                        met_article_start_char = True
                        pieces.append('</div><div class="content"><p>' + item + '<p>\n')
                elif next_is_img_txt:
                    # line following an image marker
                    next_is_img_txt = False
                    pieces.append(item + '\n')
                elif item.startswith("=@"):
                    print('skip movie link')
                elif item.startswith("=?"):
                    next_is_img_txt = True
                    pieces.append('<img src="' + str(item)[2:].strip() + '.gif" /><p>\n')
                elif item.startswith('=='):
                    next_is_img_txt = True
                    pieces.append('<img src="' + str(item)[2:].strip() + '.jpg" /><p>\n')
                elif item.startswith('='):
                    next_is_img_txt = True
                    pieces.append('<img src="' + str(item)[1:].strip() + '.jpg" /><p>\n')
                elif met_article_start_char:
                    # further body line
                    pieces.append(item + '<p>\n')
                elif item != '':
                    if not title_started:
                        pieces.append('</div><div class="heading">' + item + '\n')
                        title_started = True
                    else:
                        pieces.append(item + '\n')
            pieces.append('</div></body></html>')
            new_html = ''.join(pieces)

        if __HiResImg__:
            def add_underscore(img, quote):
                # Insert '_' in front of the image reference, i.e. right after
                # 'src=' and its opening quote (the quote is optional in the
                # patterns used below).
                start = 5 if img[4:5] == quote else 4
                return img[:start] + '_' + img[start:]

            if 'news.mingpao.com' in url:
                br = mechanize.Browser()
                br.set_handle_redirect(False)
                for img in re.findall(r'src="?.*?jpg"', new_html):
                    gifimg = img.replace('jpg"', 'gif"')
                    try:
                        # probe for a .gif of the same name next to the page
                        br.open_novisit(url + "/../" + gifimg[5:-1])
                    except Exception:
                        pos = img.find('_')
                        if pos > -1:
                            # double the first underscore
                            newimg = img[:pos] + '_' + img[pos:]
                        else:
                            newimg = add_underscore(img, '"')
                        new_html = new_html.replace(img, newimg)
                    else:
                        new_html = new_html.replace(img, gifimg)
            elif 'life.mingpao.com' in url:
                br = mechanize.Browser()
                br.set_handle_redirect(False)
                for img in re.findall(r"src='?.*?jpg'", new_html):
                    gifimg = img.replace('jpg\'', 'gif\'')
                    try:
                        gifurl = re.sub(r'dailynews.*txt', '', url)
                        br.open_novisit(gifurl + gifimg[5:-1])
                    except Exception:
                        pos = img.rfind('/')
                        if pos > -1:
                            # prefix the file name, keeping its directory part
                            newimg = img[:pos + 1] + '_' + img[pos + 1:]
                        else:
                            newimg = add_underscore(img, '\'')
                        new_html = new_html.replace(img, newimg)
                    else:
                        new_html = new_html.replace(img, gifimg)
                # repeat with src quoted by double quotes, for text parsed from src txt
                for img in re.findall(r'src="?.*?jpg"', new_html):
                    gifimg = img.replace('jpg"', 'gif"')
                    try:
                        gifurl = url[:url.rfind('/') + 1]
                        br.open_novisit(gifurl + gifimg[5:-1])
                    except Exception:
                        new_html = new_html.replace(img, add_underscore(img, '"'))
                    else:
                        new_html = new_html.replace(img, gifimg)
        return new_html

    def preprocess_html(self, soup):
        """Strip presentational attributes from the parsed article.

        Removes the ``style`` and ``width`` attributes from every element
        that carries them, then returns the same ``soup`` object.

        :param soup: parsed document; must offer ``findAll(**attrs)`` and
                     yield elements that support ``del element[attr]``
        :return: ``soup``, modified in place
        """
        for item in soup.findAll(style=True):
            del item['style']
        # Select by the attribute being removed: after the loop above no
        # element has a style attribute left, so selecting on style here
        # would never match anything and width would stay in place.
        for item in soup.findAll(width=True):
            del item['width']
        # NOTE(review): this selects elements that have an attribute literally
        # named 'stype' and deletes an attribute named 'absmiddle'. It looks
        # like it was meant to drop align="absmiddle" -- confirm the intent
        # before changing it.
        for item in soup.findAll(stype=True):
            del item['absmiddle']
        return soup

    def populate_article_metadata(self, article, soup, first):
        """Attach a TOC thumbnail and a short description to ``article``.

        Thumbnail: when ``__IncludeThumbnails__`` is on, ``__HiResImg__`` is
        off, ``first`` is true and the recipe offers ``add_toc_thumbnail``,
        the first ``<img>`` of the page that has a ``src`` is registered as
        the article's thumbnail.

        Description: the article body is looked up as the first non-empty
        match among ``div#newscontent``, ``div#newscontent01``,
        ``div.content`` and ``div#font``.

        * If ``__IncludeSummary__`` is on and the article has no summary yet,
          the summary becomes the first non-empty paragraph text of a body
          (with one leading source tag removed); when several bodies match,
          the last one that has text wins.
        * Otherwise, when a body was found, the summary is set to the total
          character count of the body text, wrapped in full-width
          parentheses.

        Any error while building the description is logged and suppressed.

        :param article: article object; ``text_summary`` is read, and
                        ``summary`` / ``text_summary`` are written
        :param soup: parsed page of the article
        :param first: thumbnails are only added when this is true
        """
        # thumbnails shouldn't be available if using hi-res images
        if __IncludeThumbnails__ and not __HiResImg__ and first and hasattr(self, 'add_toc_thumbnail'):
            img = soup.find('img')
            # an <img> without src cannot serve as a thumbnail
            src = img.get('src') if img is not None else None
            if src:
                self.add_toc_thumbnail(article, src)

        try:
            want_summary = __IncludeSummary__ and len(article.text_summary.strip()) == 0

            # look for content: the first selector that matches anything wins
            articlebodies = []
            for attrs in ({'id': 'newscontent'}, {'id': 'newscontent01'},
                          {'class': 'content'}, {'id': 'font'}):
                articlebodies = soup.findAll('div', attrs=attrs)
                if articlebodies:
                    break

            if want_summary:
                for articlebody in articlebodies:
                    if not articlebody:
                        continue
                    # the text may or may not be enclosed in <p></p> tag
                    paras = articlebody.findAll('p')
                    if not paras:
                        paras = articlebody
                    for p in paras:
                        summary_candidate = self.tag_to_string(p).strip()
                        summary_candidate = summary_candidate.replace(u'\u3010\u660e\u5831\u5c08\u8a0a\u3011', '', 1)
                        if len(summary_candidate) > 0:
                            article.summary = article.text_summary = summary_candidate
                            # only the first paragraph with text is used per body
                            break
            elif articlebodies:
                # display word counts
                counts = 0
                for articlebody in articlebodies:
                    # the text may or may not be enclosed in <p></p> tag
                    paras = articlebody.findAll('p')
                    if not paras:
                        paras = articlebody
                    for p in paras:
                        counts += len(self.tag_to_string(p).strip())
                article.summary = article.text_summary = u'\uff08' + str(counts) + u'\u5b57\uff09'
        except Exception:
            # best effort: a missing description must not abort the download
            self.log("Error creating article descriptions")

    # override from the one in version 0.8.31
    def create_opf(self, feeds, dir=None):
        """Write ``index.opf`` and ``index.ncx`` for the downloaded feeds.

        Builds the metadata, manifest, spine and table of contents for the
        issue, appends a navigation bar to the last page of every downloaded
        article, and renders the OPF and NCX files into ``dir``. The
        ``change N`` comments mark where this copy was modified by the recipe
        author.

        :param feeds: sequence of feeds; each feed is iterated for its articles
        :param dir: directory to write into; defaults to ``self.output_dir``
        :raises Exception: when ``feeds`` is empty
        """
        if dir is None:
            dir = self.output_dir
        title = self.short_title()
        # change 1: allow our own flag to tell if a periodical is to be generated
        # also use customed date instead of current time
        if __MakePeriodical__ == False or self.output_profile.periodical_date_in_title:
            title = title + ' ' + self.get_fetchformatteddate()
        # end of change 1
        # change 2: __appname__ replaced by newspaper publisher
        __appname__ = self.publisher
        mi = MetaInformation(title, [__appname__])
        mi.publisher = __appname__
        mi.author_sort = __appname__
        # change 3: use __MakePeriodical__ flag to tell if a periodical should be generated
        if __MakePeriodical__ == True:
            mi.publication_type = 'periodical:'+self.publication_type+':'+self.short_title()
        else:
            mi.publication_type = self.publication_type+':'+self.short_title()
        #mi.publication_type = 'periodical:'+self.publication_type+':'+self.short_title()
        # change 4: in the following, all the nowf() are changed to adjusted time
        # This one doesn't matter
        mi.timestamp = nowf()
        # change 5: skip listing the articles
        #article_titles, aseen = [], set()
        #for f in feeds:
        #    for a in f:
        #        if a.title and a.title not in aseen:
        #            aseen.add(a.title)
        #            article_titles.append(force_unicode(a.title, 'utf-8'))

        #mi.comments = self.description
        #if not isinstance(mi.comments, unicode):
        #    mi.comments = mi.comments.decode('utf-8', 'replace')
        #mi.comments += ('\n\n' + _('Articles in this issue: ') + '\n' +
        #        '\n\n'.join(article_titles))

        language = canonicalize_lang(self.language)
        if language is not None:
            mi.language = language
        # This one affects the pub date shown in kindle title
        #mi.pubdate = nowf()
        # now appears to need the time field to be > 12.00noon as well
        # (the publication date is the fetch date, pinned to 12:30:00)
        mi.pubdate = datetime.datetime(int(self.get_fetchyear()), int(self.get_fetchmonth()), int(self.get_fetchday()), 12, 30, 0)
        opf_path = os.path.join(dir, 'index.opf')
        ncx_path = os.path.join(dir, 'index.ncx')

        opf = OPFCreator(dir, mi)
        # Add mastheadImage entry to <guide> section
        mp = getattr(self, 'masthead_path', None)
        if mp is not None and os.access(mp, os.R_OK):
            from calibre.ebooks.metadata.opf2 import Guide
            ref = Guide.Reference(os.path.basename(self.masthead_path), os.getcwdu())
            ref.type = 'masthead'
            ref.title = 'Masthead Image'
            opf.guide.append(ref)

        # Files to list in the manifest: one directory per feed plus the index files
        manifest = [os.path.join(dir, 'feed_%d'%i) for i in range(len(feeds))]
        manifest.append(os.path.join(dir, 'index.html'))
        manifest.append(os.path.join(dir, 'index.ncx'))

        # Get cover
        cpath = getattr(self, 'cover_path', None)
        if cpath is None:
            # No cover set: let default_cover() write one into cover.jpg.
            # NOTE(review): pf is never closed in this method -- confirm that
            # default_cover() flushes or closes it before the file is used.
            pf = open(os.path.join(dir, 'cover.jpg'), 'wb')
            if self.default_cover(pf):
                cpath =  pf.name
        if cpath is not None and os.access(cpath, os.R_OK):
            opf.cover = cpath
            manifest.append(cpath)

        # Get masthead
        mpath = getattr(self, 'masthead_path', None)
        if mpath is not None and os.access(mpath, os.R_OK):
            manifest.append(mpath)

        opf.create_manifest_from_files_in(manifest)
        # Give the NCX and the masthead image fixed manifest ids
        for mani in opf.manifest:
            if mani.path.endswith('.ncx'):
                mani.id = 'ncx'
            if mani.path.endswith('mastheadImage.jpg'):
                mani.id = 'masthead-image'

        # entries collects the spine, in reading order, as '/'-separated
        # paths relative to dir; they are made absolute further below
        entries = ['index.html']
        toc = TOC(base_path=dir)
        # Play-order numbers come from the counter; nothing in this method
        # adds to play_order_map, so the lookups below fall through to it
        self.play_order_counter = 0
        self.play_order_map = {}


        def feed_index(num, parent):
            # Add every downloaded article of feed number `num` below the TOC
            # node `parent`, record its pages in `entries`, and append the
            # navigation bar to the article's last page on disk.
            f = feeds[num]
            for j, a in enumerate(f):
                if getattr(a, 'downloaded', False):
                    adir = 'feed_%d/article_%d/'%(num, j)
                    # empty author/description values are passed on as None
                    auth = a.author
                    if not auth:
                        auth = None
                    desc = a.text_summary
                    if not desc:
                        desc = None
                    else:
                        desc = self.description_limiter(desc)
                    tt = a.toc_thumbnail if a.toc_thumbnail else None
                    entries.append('%sindex.html'%adir)
                    po = self.play_order_map.get(entries[-1], None)
                    if po is None:
                        self.play_order_counter += 1
                        po = self.play_order_counter
                    parent.add_item('%sindex.html'%adir, None,
                            a.title if a.title else _('Untitled Article'),
                            play_order=po, author=auth,
                            description=desc, toc_thumbnail=tt)
                    # NOTE(review): this path is built from self.output_dir,
                    # not from the dir argument used elsewhere -- confirm
                    last = os.path.join(self.output_dir, ('%sindex.html'%adir).replace('/', os.sep))
                    # sub pages follow the article in the spine; the final
                    # one becomes the page that receives the navigation bar
                    for sp in a.sub_pages:
                        prefix = os.path.commonprefix([opf_path, sp])
                        relp = sp[len(prefix):]
                        entries.append(relp.replace(os.sep, '/'))
                        last = sp

                    if os.path.exists(last):
                        with open(last, 'rb') as fi:
                            src = fi.read().decode('utf-8')
                        soup = BeautifulSoup(src)
                        body = soup.find('body')
                        if body is not None:
                            # two '..' segments for every 'link<digits>'
                            # match in the path of the last page
                            prefix = '/'.join('..'for i in range(2*len(re.findall(r'link\d+', last))))
                            templ = self.navbar.generate(True, num, j, len(f),
                                            not self.has_single_feed,
                                            a.orig_url, __appname__, prefix=prefix,
                                            center=self.center_navbar)
                            elem = BeautifulSoup(templ.render(doctype='xhtml').decode('utf-8')).find('div')
                            # append the navigation bar at the end of <body>
                            # and write the page back
                            body.insert(len(body.contents), elem)
                            with open(last, 'wb') as fi:
                                fi.write(unicode(soup).encode('utf-8'))
        if len(feeds) == 0:
            raise Exception('All feeds are empty, aborting.')

        if len(feeds) > 1:
            # several feeds: each feed gets its own TOC node, with its
            # articles nested below it
            for i, f in enumerate(feeds):
                entries.append('feed_%d/index.html'%i)
                po = self.play_order_map.get(entries[-1], None)
                if po is None:
                    self.play_order_counter += 1
                    po = self.play_order_counter
                auth = getattr(f, 'author', None)
                if not auth:
                    auth = None
                desc = getattr(f, 'description', None)
                if not desc:
                    desc = None
                feed_index(i, toc.add_item('feed_%d/index.html'%i, None,
                    f.title, play_order=po, description=desc, author=auth))

        else:
            # single feed: its articles hang directly off the TOC root
            entries.append('feed_%d/index.html'%0)
            feed_index(0, toc)

        # turn the relative spine entries into paths below dir
        for i, p in enumerate(entries):
            entries[i] = os.path.join(dir, p.replace('/', os.sep))
        opf.create_spine(entries)
        opf.set_toc(toc)

        with nested(open(opf_path, 'wb'), open(ncx_path, 'wb')) as (opf_file, ncx_file):
            opf.render(opf_file, ncx_file)