calibre/recipes/singtaohk.recipe

# vim:fileencoding=UTF-8
from __future__ import unicode_literals
__license__ = 'GPL v3'
__copyright__ = '2011-2013, Eddie Lau'
# data source: 'normal' (desktop site) or 'mobile'
__Source__ = 'mobile'
# Set it to False if you do not want the download to be generated as a periodical (Default: True)
__MakePeriodical__ = True
# Set it to True if your device supports display of CJK titles
# (Default: False)
__UseChineseTitle__ = True
# Set it to False if you want to skip images (Default: True)
__KeepImages__ = True
# Set it to True if you want to include a summary in Kindle's article view
# (Default: False)
__IncludeSummary__ = True
# Set it to True if you want thumbnail images in Kindle's article view
# (Default: True)
__IncludeThumbnails__ = True
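# To try this recipe from the command line, something like the following works
# with a local calibre install (the output filename is arbitrary):
#   ebook-convert singtaohk.recipe singtaohk.epub --test -vv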
'''
Change Log:
2013/03/31 -- fix cover retrieval code and heading size, and remove &nbsp; in summary
2011/12/29 -- first version done
'''
from calibre.utils.date import now as nowf
import os
import datetime
import re
from calibre.web.feeds.recipes import BasicNewsRecipe
from calibre.ebooks.BeautifulSoup import BeautifulSoup
from calibre.ebooks.metadata.opf2 import OPFCreator
from calibre.ebooks.metadata.toc import TOC
from calibre.ebooks.metadata import MetaInformation
from calibre.utils.localization import canonicalize_lang
# MAIN CLASS
class STHKRecipe(BasicNewsRecipe):
if __UseChineseTitle__ is True:
title = u'\u661f\u5cf6\u65e5\u5831 (\u9999\u6e2f)'
else:
title = 'Sing Tao Daily - Hong Kong'
description = 'Hong Kong Chinese Newspaper (http://singtao.com)'
category = 'Chinese, News, Hong Kong'
extra_css = 'img {display: block; margin-left: auto; margin-right: auto; margin-top: 10px; margin-bottom: 10px; max-height:90%;} td[class=caption] {font-size:50%;} td[class=bodyhead]{font-weight:bold; font-size:150%;} td[class=stmobheadline]{font-weight:bold; font-size:200%;}' # noqa
masthead_url = 'http://upload.wikimedia.org/wikipedia/en/d/dd/Singtao-usa.png'
if __Source__ == 'normal':
keep_only_tags = [
dict(name='td', attrs={'class': ['bodyhead', 'bodytext']})]
else:
keep_only_tags = [dict(name='td', attrs={'class': ['stmobheadline']}),
dict(name='img', attrs={'width': ['146']}),
dict(name='td', attrs={'class': ['bodytextg']}),
]
if __KeepImages__:
remove_tags = [dict(name='hr')]
else:
remove_tags = [dict(name='hr'), dict(name='img')]
remove_attributes = ['align']
preprocess_regexps = [
(re.compile(r'<font class="bodytext">', re.DOTALL | re.IGNORECASE),
lambda match: '<br><br><font class="bodytext">'),
]
oldest_article = 1
max_articles_per_feed = 200
__author__ = 'Eddie Lau'
publisher = 'Sing Tao Ltd.'
remove_javascript = True
use_embedded_content = False
no_stylesheets = True
language = 'zh'
encoding = 'Big5-HKSCS'
recursions = 0
conversion_options = {'linearize_tables': True}
timefmt = ''
auto_cleanup = False
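# Date helpers: the issue title and publication date are based on the local
# Hong Kong date rather than on UTC.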
def get_dtlocal(self):
dt_utc = datetime.datetime.utcnow()
# convert UTC to Hong Kong time (UTC+8) and subtract 4 hours: the full edition is only available after 4:00am HKT
dt_local = dt_utc + \
datetime.timedelta(8.0 / 24) - datetime.timedelta(4.0 / 24)
return dt_local
def get_fetchdate(self):
return self.get_dtlocal().strftime("%Y%m%d")
def get_fetchformatteddate(self):
return self.get_dtlocal().strftime("%Y-%m-%d")
def get_fetchyear(self):
return self.get_dtlocal().strftime("%Y")
def get_fetchmonth(self):
return self.get_dtlocal().strftime("%m")
def get_fetchday(self):
return self.get_dtlocal().strftime("%d")
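# Cover: grab the front-page image flagged with class 'special' on the mobile
# site; return None if it cannot be found or downloaded so the build proceeds.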
def get_cover_url(self):
soup = self.index_to_soup('http://m.singtao.com/')
cover_item = soup.find(attrs={'class': 'special'})
cover = cover_item.get('src', None) if cover_item is not None else None
br = BasicNewsRecipe.get_browser(self)
try:
br.open(cover)
except Exception:
cover = None
return cover
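# Build the feed list. Depending on __Source__, section indexes are read either
# from the desktop "yesterday" pages or from the m.singtao.com mobile site.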
def parse_index(self):
feeds = []
if __Source__ == 'normal':
# single-item section
for title, url in [(u'\u793e\u8ad6 Editorial', 'http://singtao.com/yesterday/jou/j_index.html')]:
article = self.parse_singleitem_section(url)
if article:
feeds.append((title, article))
# multiple items
# for title, url in [(u'\u8981\u805e\u6e2f\u805e Local', 'http://singtao.com/yesterday/loc/a_index.html'),
# (u'\u8ca1\u7d93 Finance', 'http://singtao.com/yesterday/fin/d_index.html'),
# (u'\u5730\u7522 Properties', 'http://singtao.com/yesterday/pro/h_index.html'),
# (u'\u6559\u80b2 Education', 'http://singtao.com/yesterday/edu/g_index.asp'),
# (u'\u5a1b\u6a02 Entertainment', 'http://singtao.com/yesterday/ent/f_index.html'),
# (u'\u9ad4\u80b2 Sports', 'http://singtao.com/yesterday/spo/c_index.html'),
# (u'\u99ac\u7d93 Horse Racing', 'http://singtao.com/yesterday/rac/n_index.html')
# ]:
# articles = self.parse_section(url)
# if articles:
# feeds.append((title, articles))
# special: supplement
# for title, url, baseurl in [(u'\u526f\u520a Supplements', 'http://singtao.com/yesterday/sup/m_index.html', '/')]:
# articles = self.parse_section_withouttext(url, baseurl)
# if articles:
# feeds.append((title, articles))
# multiple-item sections
# for title, url in [(u'\u570b\u969b World', 'http://singtao.com/yesterday/int/b_index.html'),
# (u'\u4e2d\u570b China', 'http://singtao.com/yesterday/chi/e_index.html')
# ]:
# articles = self.parse_section(url)
# if articles:
# feeds.append((title, articles))
for title, url, baseurl in [(u'\u8981\u805e\u6e2f\u805e Local', 'http://singtao.com/yesterday/loc/a_index.html', '/'),
(u'\u8ca1\u7d93 Finance',
'http://singtao.com/yesterday/fin/d_index.html', '/'),
(u'\u5730\u7522 Properties',
'http://singtao.com/yesterday/pro/h_index.html', '/'),
(u'\u6559\u80b2 Education',
'http://singtao.com/yesterday/edu/g_index.asp', '/'),
(u'\u5a1b\u6a02 Entertainment',
'http://singtao.com/yesterday/ent/f_index.html', '/'),
(u'\u9ad4\u80b2 Sports',
'http://singtao.com/yesterday/spo/c_index.html', '/'),
(u'\u99ac\u7d93 Horse Racing',
'http://singtao.com/yesterday/rac/n_index.html', '/'),
(u'\u526f\u520a Supplements',
'http://singtao.com/yesterday/sup/m_index.html', '/'),
(u'\u570b\u969b World',
'http://singtao.com/yesterday/int/b_index.html', '/'),
(u'\u4e2d\u570b China', 'http://singtao.com/yesterday/chi/e_index.html', '/')]:
articles = self.parse_section_withouttext(url, baseurl)
if articles:
feeds.append((title, articles))
else: # use mobile
# single-item section
for title, url in [(u'\u793e\u8ad6 Editorial', 'http://m.singtao.com/showContent.php?main=paper&sub=0&title=0')]:
article = self.parse_singleitem_section_m(url)
if article:
feeds.append((title, article))
# multiple-item section
for title, url, baseurl in [(u'\u8981\u805e\u6e2f\u805e Local', 'http://m.singtao.com/showTitle.php?main=paper&sub=1', 'http://m.singtao.com/'),
(u'\u8ca1\u7d93 Finance', 'http://m.singtao.com/showTitle.php?main=paper&sub=2',
'http://m.singtao.com/'),
(u'\u5730\u7522 Properties',
'http://m.singtao.com/showTitle.php?main=paper&sub=3', 'http://m.singtao.com/'),
(u'\u6559\u80b2 Education',
'http://m.singtao.com/showTitle.php?main=paper&sub=4', 'http://m.singtao.com/'),
(u'\u5a1b\u6a02 Entertainment',
'http://m.singtao.com/showTitle.php?main=paper&sub=5', 'http://m.singtao.com/'),
(u'\u99ac\u7d93 Horse Racing',
'http://m.singtao.com/showTitle.php?main=paper&sub=6', 'http://m.singtao.com/'),
(u'\u9ad4\u80b2 Sports', 'http://m.singtao.com/showTitle.php?main=paper&sub=7',
'http://m.singtao.com/'),
(u'\u526f\u520a Supplements',
'http://m.singtao.com/showTitle.php?main=paper&sub=8', 'http://m.singtao.com/'),
(u'\u570b\u969b World', 'http://m.singtao.com/showTitle.php?main=paper&sub=9',
'http://m.singtao.com/'),
(u'\u4e2d\u570b China', 'http://m.singtao.com/showTitle.php?main=paper&sub=10', 'http://m.singtao.com/')]:
articles = self.parse_multiitem_section_m(url, baseurl)
if articles:
feeds.append((title, articles))
return feeds
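# The editorial is a single page, so it is wrapped directly into a one-article
# feed; its title is filled in later by populate_article_metadata().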
def parse_singleitem_section(self, url):
current_articles = []
current_articles.append(
{'title': '', 'url': url, 'description': '', 'date': ''})
return current_articles
def parse_singleitem_section_m(self, url):
current_articles = []
current_articles.append(
{'title': '', 'url': url, 'description': '', 'date': ''})
return current_articles
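# Desktop site: the article links of a section sit inside a 436px-wide table.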
def parse_section(self, url):
soup = self.index_to_soup(url)
# find the <table width=436 border=0 cellspacing=0 align=center cellpadding=0> tag
tables = soup.findAll(name={'table'}, attrs={'width': ['436']})
current_articles_all = []
for table in tables:
divs = table.findAll(name={'a'})
current_articles = []
included_urls = []
for i in divs:
title = self.tag_to_string(i)
urlstr = i.get('href', False)
urlstr = url + '/../' + urlstr
if urlstr not in included_urls:
current_articles.append(
{'title': title, 'url': urlstr, 'description': '', 'date': ''})
included_urls.append(urlstr)
current_articles_all.extend(current_articles)
return current_articles_all
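# Desktop site: collect every non-empty link on the section index, skip the
# 'secondhead'/'second02' navigation links, and resolve relative URLs against
# the section page.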
def parse_section_withouttext(self, url, baseurl):
soup = self.index_to_soup(url)
# find all <a> tags
links = soup.findAll(name={'a'})
linksexcluded = soup.findAll(name={'a'}, attrs={'class': 'secondhead'})
for elink in linksexcluded:
links.remove(elink)
linksexcluded = soup.findAll(name={'a'}, attrs={'class': 'second02'})
for elink in linksexcluded:
links.remove(elink)
current_articles_all = []
included_urls = []
for link in links:
title = self.tag_to_string(link)
if len(title.strip()) > 0:
urlstr = link.get('href', False)
if urlstr.rfind(baseurl) == -1 and urlstr.rfind('mailto:') == -1:
urlstr = url + '/../' + urlstr
if urlstr not in included_urls:
current_articles_all.append(
{'title': title, 'url': urlstr, 'description': '', 'date': ''})
included_urls.append(urlstr)
return current_articles_all
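# Mobile site: article links are listed inside <span class="urlurl"> blocks;
# the relative hrefs are resolved against http://m.singtao.com/.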
def parse_multiitem_section_m(self, url, baseurl):
soup = self.index_to_soup(url)
# find the <span class="urlurl"> wrappers around the article links
links = soup.findAll(name={'span'}, attrs={'class': 'urlurl'})
current_articles_all = []
included_urls = []
for linkraw in links:
linkclean = linkraw.findAll(name={'a'})  # search within the wrapper span, not the whole page
for link in linkclean:
title = self.tag_to_string(link)
if len(title.strip()) > 0:
urlstr = link.get('href', False)
urlstr = baseurl + urlstr
if urlstr not in included_urls:
current_articles_all.append(
{'title': title, 'url': urlstr, 'description': '', 'date': ''})
included_urls.append(urlstr)
return current_articles_all
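# Per-article post-processing: recover the title from the downloaded page,
# attach a thumbnail, and build a short summary (or a character count when no
# usable summary text is found).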
def populate_article_metadata(self, article, soup, first):
if __Source__ == 'normal':
# get title if not fetched in parse_section() function
if article.title == '' or len(article.title.strip()) == 0:
articletitle = soup.findAll('td', attrs={'class': 'bodyhead'})
if articletitle:
articletitlemod = articletitle[0].find('font')
if articletitlemod:
article.title = articletitlemod.string.strip()
else:
article.title = articletitle[0].string.strip()
else:
# use the title in the text in any case
articletitle = soup.findAll('td', attrs={'class': 'stmobheadline'})
if articletitle:
articletitle[0].br.extract()
article.title = articletitle[0].contents[0]
# get thumbnail image
if __IncludeThumbnails__ and first and hasattr(self, 'add_toc_thumbnail'):
img = soup.find('img')
if img is not None:
self.add_toc_thumbnail(article, img['src'])
try:
if __IncludeSummary__ and len(article.text_summary.strip()) == 0:
# look for content
if __Source__ == 'normal':
articlebodies = soup.findAll(
'font', attrs={'class': 'bodytext'})
else:
articlebodies = soup.findAll(
'div', attrs={'class': 'hkadj'})
if articlebodies:
for articlebody in articlebodies:
if articlebody:
# the text may or may not be enclosed in <p></p>
# tag
paras = articlebody.findAll('p')
if not paras:
paras = articlebody
textFound = False
for p in paras:
if not textFound:
summary_candidate = self.tag_to_string(
p).strip().replace('&nbsp;', '')
if len(summary_candidate) > 0:
summary_candidate = summary_candidate.replace(
u'(\u661f\u5cf6\u65e5\u5831\u5831\u9053)', '', 1)
article.summary = article.text_summary = summary_candidate
textFound = True
else:
# display a simple text
# article.summary = article.text_summary = u'\u66f4\u591a......'
# display word counts
counts = 0
if __Source__ == 'normal':
articlebodies = soup.findAll(
'font', attrs={'class': 'bodytext'})
else:
articlebodies = soup.findAll(
'div', attrs={'class': 'hkadj'})
if articlebodies:
for articlebody in articlebodies:
# the text may or may not be enclosed in <p></p> tag
paras = articlebody.findAll('p')
if not paras:
paras = articlebody
for p in paras:
summary_candidate = self.tag_to_string(p).strip()
counts += len(summary_candidate)
article.summary = article.text_summary = u'\uff08' + \
str(counts) + u'\u5b57\uff09'
except Exception:
self.log("Error creating article descriptions")
return
# create_opf() is overridden from the calibre 0.8.31 implementation; the changes are marked below
def create_opf(self, feeds, dir=None):
if dir is None:
dir = self.output_dir
title = self.short_title()
# change 1: allow our own flag to tell whether a periodical is to be generated
# and use the paper's own date instead of the current time
if __MakePeriodical__ is False or self.output_profile.periodical_date_in_title:
title = title + ' ' + self.get_fetchformatteddate()
# end of change 1
# change 2: __appname__ replaced by newspaper publisher
__appname__ = self.publisher
mi = MetaInformation(title, [__appname__])
mi.publisher = __appname__
mi.author_sort = __appname__
# change 3: use __MakePeriodical__ flag to tell if a periodical should
# be generated
if __MakePeriodical__ is True:
mi.publication_type = 'periodical:' + \
self.publication_type + ':' + self.short_title()
else:
mi.publication_type = self.publication_type + ':' + self.short_title()
# mi.publication_type = 'periodical:'+self.publication_type+':'+self.short_title()
# change 4: in the following, all the nowf() are changed to adjusted time
# This one doesn't matter
mi.timestamp = nowf()
# change 5: skip listing the articles
# article_titles, aseen = [], set()
# for f in feeds:
# for a in f:
# if a.title and a.title not in aseen:
# aseen.add(a.title)
# article_titles.append(force_unicode(a.title, 'utf-8'))
# mi.comments = self.description
# if not isinstance(mi.comments, unicode):
# mi.comments = mi.comments.decode('utf-8', 'replace')
# mi.comments += ('\n\n' + _('Articles in this issue: ') + '\n' +
# '\n\n'.join(article_titles))
language = canonicalize_lang(self.language)
if language is not None:
mi.language = language
# This one affects the pub date shown in kindle title
# mi.pubdate = nowf()
# the pub date now also appears to need a time later than 12:00 noon
mi.pubdate = datetime.datetime(int(self.get_fetchyear()), int(
self.get_fetchmonth()), int(self.get_fetchday()), 12, 30, 0)
opf_path = os.path.join(dir, 'index.opf')
ncx_path = os.path.join(dir, 'index.ncx')
opf = OPFCreator(dir, mi)
# Add mastheadImage entry to <guide> section
mp = getattr(self, 'masthead_path', None)
if mp is not None and os.access(mp, os.R_OK):
from calibre.ebooks.metadata.opf2 import Guide
ref = Guide.Reference(os.path.basename(
self.masthead_path), os.getcwd())
ref.type = 'masthead'
ref.title = 'Masthead Image'
opf.guide.append(ref)
manifest = [os.path.join(dir, 'feed_%d' % i)
for i in range(len(feeds))]
manifest.append(os.path.join(dir, 'index.html'))
manifest.append(os.path.join(dir, 'index.ncx'))
# Get cover
cpath = getattr(self, 'cover_path', None)
if cpath is None:
pf = open(os.path.join(dir, 'cover.jpg'), 'wb')
if self.default_cover(pf):
cpath = pf.name
if cpath is not None and os.access(cpath, os.R_OK):
opf.cover = cpath
manifest.append(cpath)
# Get masthead
mpath = getattr(self, 'masthead_path', None)
if mpath is not None and os.access(mpath, os.R_OK):
manifest.append(mpath)
opf.create_manifest_from_files_in(manifest)
for mani in opf.manifest:
if mani.path.endswith('.ncx'):
mani.id = 'ncx'
if mani.path.endswith('mastheadImage.jpg'):
mani.id = 'masthead-image'
entries = ['index.html']
toc = TOC(base_path=dir)
self.play_order_counter = 0
self.play_order_map = {}
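# Add one TOC entry per downloaded article and append the bottom navigation
# bar to each article page; this part is essentially the stock implementation.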
def feed_index(num, parent):
f = feeds[num]
for j, a in enumerate(f):
if getattr(a, 'downloaded', False):
adir = 'feed_%d/article_%d/' % (num, j)
auth = a.author
if not auth:
auth = None
desc = a.text_summary
if not desc:
desc = None
else:
desc = self.description_limiter(desc)
tt = a.toc_thumbnail if a.toc_thumbnail else None
entries.append('%sindex.html' % adir)
po = self.play_order_map.get(entries[-1], None)
if po is None:
self.play_order_counter += 1
po = self.play_order_counter
parent.add_item('%sindex.html' % adir, None,
a.title if a.title else (
'Untitled Article'),
play_order=po, author=auth,
description=desc, toc_thumbnail=tt)
last = os.path.join(
self.output_dir, ('%sindex.html' % adir).replace('/', os.sep))
for sp in a.sub_pages:
prefix = os.path.commonprefix([opf_path, sp])
relp = sp[len(prefix):]
entries.append(relp.replace(os.sep, '/'))
last = sp
if os.path.exists(last):
with open(last, 'rb') as fi:
src = fi.read().decode('utf-8')
soup = BeautifulSoup(src)
body = soup.find('body')
if body is not None:
prefix = '/'.join('..' for _ in range(2 *
len(re.findall(r'link\d+', last))))
templ = self.navbar.generate(True, num, j, len(f),
not self.has_single_feed,
a.orig_url, __appname__, prefix=prefix,
center=self.center_navbar)
elem = BeautifulSoup(templ.render(
doctype='xhtml').decode('utf-8')).find('div')
body.insert(len(body.contents), elem)
with open(last, 'wb') as fi:
fi.write(type(u'')(soup).encode('utf-8'))
if len(feeds) == 0:
raise Exception('All feeds are empty, aborting.')
if len(feeds) > 1:
for i, f in enumerate(feeds):
entries.append('feed_%d/index.html' % i)
po = self.play_order_map.get(entries[-1], None)
if po is None:
self.play_order_counter += 1
po = self.play_order_counter
auth = getattr(f, 'author', None)
if not auth:
auth = None
desc = getattr(f, 'description', None)
if not desc:
desc = None
feed_index(i, toc.add_item('feed_%d/index.html' % i, None,
f.title, play_order=po, description=desc, author=auth))
else:
entries.append('feed_%d/index.html' % 0)
feed_index(0, toc)
for i, p in enumerate(entries):
entries[i] = os.path.join(dir, p.replace('/', os.sep))
opf.create_spine(entries)
opf.set_toc(toc)
with open(opf_path, 'wb') as opf_file, open(ncx_path, 'wb') as ncx_file:
opf.render(opf_file, ncx_file)