calibre/recipes/apple_daily.recipe

# vim:fileencoding=UTF-8
from __future__ import unicode_literals
__license__ = 'GPL v3'
__copyright__ = '2013-2015, Eddie Lau'
__Date__ = ''

from calibre import (__appname__, force_unicode, strftime)
from calibre.utils.date import now as nowf
import os
import datetime
import re
from calibre.web.feeds.recipes import BasicNewsRecipe
from calibre.ebooks.BeautifulSoup import BeautifulSoup
from calibre.ebooks.metadata.opf2 import OPFCreator
from calibre.ebooks.metadata.toc import TOC
from calibre.ebooks.metadata import MetaInformation
from calibre.utils.localization import canonicalize_lang


class AppleDaily(BasicNewsRecipe):
    title = u'蘋果日報 (香港)'
    __author__ = 'Eddie Lau'
    publisher = '蘋果日報'
    publication_type= 'newspaper'
    oldest_article = 1
    max_articles_per_feed = 100
    auto_cleanup = False
    language = 'zh'
    encoding = 'utf-8'
    auto_cleanup = False
    remove_javascript = True
    use_embedded_content = False
    no_stylesheets = True
    description = 'http://hkm.appledaily.com/'
    category = 'Chinese, News, Hong Kong'
    masthead_url = 'https://upload.wikimedia.org/wikipedia/commons/8/86/Apple_Daily_Title.svg'

    extra_css = 'img {display: block; margin-left: auto; margin-right: auto; margin-top: 10px; margin-bottom: 10px; max-height:90%;} h1 {font-size:125%; text-align:left; font-weight:bold;} p{font-size:90%;} p[class=video-caption] {font-size:50%; margin-left:auto; margin-right:auto;}'  # noqa
    keep_only_tags = [dict(name='div', attrs={'id': 'content-article'})]
    remove_tags = [dict(name='div', attrs={'class': 'prev-next-btn'}),
                   dict(name='p', attrs={'class': 'next'}),
                   dict(name='meta'),
                   dict(name='link')]

    def get_dtlocal(self):
        dt_utc = datetime.datetime.utcnow()
        # convert UTC to local hk time - at HKT 6am, all news are available
        return dt_utc + datetime.timedelta(8.0 / 24) - datetime.timedelta(6.0 / 24)

    def get_fetchdate(self):
        if __Date__ != '':
            return __Date__
        else:
            return self.get_dtlocal().strftime("%Y%m%d")

    def get_fetchformatteddate(self):
        if __Date__ != '':
            return __Date__[0:4] + '-' + __Date__[4:6] + '-' + __Date__[6:8]
        else:
            return self.get_dtlocal().strftime("%Y-%m-%d")

    def get_fetchyear(self):
        if __Date__ != '':
            return __Date__[0:4]
        else:
            return self.get_dtlocal().strftime("%Y")

    def get_fetchmonth(self):
        if __Date__ != '':
            return __Date__[4:6]
        else:
            return self.get_dtlocal().strftime("%m")

    def get_fetchday(self):
        if __Date__ != '':
            return __Date__[6:8]
        else:
            return self.get_dtlocal().strftime("%d")

    # Note: does not work with custom date given by __Date__
    def get_weekday(self):
        return self.get_dtlocal().weekday()

    def get_cover_url(self):
        soup = self.index_to_soup('http://hkm.appledaily.com/')
        cover = soup.find(attrs={'class': 'top-news'}).get('src', False)
        br = BasicNewsRecipe.get_browser(self)
        try:
            br.open(cover)
        except:
            cover = None
        return cover

    def populate_article_metadata(self, article, soup, first):
        if first and hasattr(self, 'add_toc_thumbnail'):
            picdiv = soup.find('img')
            if picdiv is not None:
                self.add_toc_thumbnail(article, picdiv['src'])

    def parse_index(self):
        feeds = []
        soup = self.index_to_soup('http://hkm.appledaily.com/')
        ul = soup.find(attrs={'class': 'menu'})
        sectionList = []
        for li in ul.findAll('li'):
            relativea = li.find('a', href=True).get('href', False)
            a = 'http://hkm.appledaily.com/' + relativea
            title = li.find('a', text=True).strip()
            # if (time.tzname != 'HKT'):
            #     if (title == u'三藩市'):
            #         continue
            #     if (title == u'洛杉磯'):
            #         continue
            #     if (title == u'紐&nbsp; 約'):
            #         continue
            #     if (title == u'美&nbsp; 國'):
            #         continue
            # if (not title == u'動新聞') and (relativea.startswith('list.php')):
            if (relativea.find('category=daily')!= -1)and (relativea.startswith('list.php')):
                sectionList.append((title, a))
        for title, url in sectionList:
            title = title.replace("&nbsp;", "")
            articles = self.parse_section(url)
            if articles:
                feeds.append((title, articles))
        return feeds

    def parse_section(self, url):
        soup = self.index_to_soup(url)
        ul = soup.find(attrs={'class': 'list'})
        current_articles = []
        if ul is None :
            return current_articles
        for li in ul.findAll('li'):
            a = li.find('a', href=True)
            title = li.find('p', text=True).strip()
            if a is not None:
                current_articles.append(
                    {'title': title, 'url': 'http://hkm.appledaily.com/' + a.get('href', False)})
            pass
        return current_articles

    def create_opf(self, feeds, dir=None):
        if dir is None:
            dir = self.output_dir
        title = self.short_title()
        if self.output_profile.periodical_date_in_title:
            title += strftime(self.timefmt)
        mi = MetaInformation(title, [__appname__])
        mi.publisher = __appname__
        mi.author_sort = __appname__
        if self.publication_type:
            mi.publication_type = 'periodical:' + \
                self.publication_type + ':' + self.short_title()
        mi.timestamp = nowf()
        article_titles, aseen = [], set()
        for f in feeds:
            for a in f:
                if a.title and a.title not in aseen:
                    aseen.add(a.title)
                    article_titles.append(force_unicode(a.title, 'utf-8'))

        mi.comments = self.description
        if not isinstance(mi.comments, type(u'')):
            mi.comments = mi.comments.decode('utf-8', 'replace')
        mi.comments += ('\n\n' + _('Articles in this issue: ') + '\n' +
                        '\n\n'.join(article_titles))

        language = canonicalize_lang(self.language)
        if language is not None:
            mi.language = language
        # This one affects the pub date shown in kindle title
        # mi.pubdate = nowf()
        # now appears to need the time field to be > 12.00noon as well
        mi.pubdate = datetime.datetime(int(self.get_fetchyear()), int(
            self.get_fetchmonth()), int(self.get_fetchday()), 12, 30, 0)
        opf_path = os.path.join(dir, 'index.opf')
        ncx_path = os.path.join(dir, 'index.ncx')

        opf = OPFCreator(dir, mi)
        # Add mastheadImage entry to <guide> section
        mp = getattr(self, 'masthead_path', None)
        if mp is not None and os.access(mp, os.R_OK):
            from calibre.ebooks.metadata.opf2 import Guide
            ref = Guide.Reference(os.path.basename(
                self.masthead_path), os.getcwd())
            ref.type = 'masthead'
            ref.title = 'Masthead Image'
            opf.guide.append(ref)

        manifest = [os.path.join(dir, 'feed_%d' % i)
                    for i in range(len(feeds))]
        manifest.append(os.path.join(dir, 'index.html'))
        manifest.append(os.path.join(dir, 'index.ncx'))

        # Get cover
        cpath = getattr(self, 'cover_path', None)
        if cpath is None:
            pf = open(os.path.join(dir, 'cover.jpg'), 'wb')
            if self.default_cover(pf):
                cpath = pf.name
        if cpath is not None and os.access(cpath, os.R_OK):
            opf.cover = cpath
            manifest.append(cpath)

        # Get masthead
        mpath = getattr(self, 'masthead_path', None)
        if mpath is not None and os.access(mpath, os.R_OK):
            manifest.append(mpath)

        opf.create_manifest_from_files_in(manifest)
        for mani in opf.manifest:
            if mani.path.endswith('.ncx'):
                mani.id = 'ncx'
            if mani.path.endswith('mastheadImage.jpg'):
                mani.id = 'masthead-image'

        entries = ['index.html']
        toc = TOC(base_path=dir)
        self.play_order_counter = 0
        self.play_order_map = {}

        def feed_index(num, parent):
            f = feeds[num]
            for j, a in enumerate(f):
                if getattr(a, 'downloaded', False):
                    adir = 'feed_%d/article_%d/' % (num, j)
                    auth = a.author
                    if not auth:
                        auth = None
                    desc = a.text_summary
                    if not desc:
                        desc = None
                    else:
                        desc = self.description_limiter(desc)
                    tt = a.toc_thumbnail if a.toc_thumbnail else None
                    entries.append('%sindex.html' % adir)
                    po = self.play_order_map.get(entries[-1], None)
                    if po is None:
                        self.play_order_counter += 1
                        po = self.play_order_counter
                    parent.add_item('%sindex.html' % adir, None,
                                    a.title if a.title else _(
                                        'Untitled Article'),
                                    play_order=po, author=auth,
                                    description=desc, toc_thumbnail=tt)
                    last = os.path.join(
                        self.output_dir, ('%sindex.html' % adir).replace('/', os.sep))
                    for sp in a.sub_pages:
                        prefix = os.path.commonprefix([opf_path, sp])
                        relp = sp[len(prefix):]
                        entries.append(relp.replace(os.sep, '/'))
                        last = sp

                    if os.path.exists(last):
                        with open(last, 'rb') as fi:
                            src = fi.read().decode('utf-8')
                        src = src.replace('height:260px !important;','')  # fix flow-player div tag parent
                        soup = BeautifulSoup(src)
                        body = soup.find('body')
                        if body is not None:
                            prefix = '/'.join('..'for i in range(2 *
                                                                 len(re.findall(r'link\d+', last))))
                            templ = self.navbar.generate(True, num, j, len(f),
                                                         not self.has_single_feed,
                                                         a.orig_url, __appname__, prefix=prefix,
                                                         center=self.center_navbar)
                            translatedTempl =re.sub(
                                '<hr.*<br','<hr>本篇由 '+__appname__+
                                ' 快取自 <a href="http://hkm.appledaily.com/" >蘋果日報</a> ; <a href="'+a.orig_url+'">本篇來源位置</a>。'+
                                '<br',templ.render(doctype='xhtml').decode('utf-8'),flags=re.S)
                            elem = BeautifulSoup(translatedTempl).find('div')
                            body.insert(len(body.contents), elem)
                            with open(last, 'wb') as fi:
                                fi.write(type(u'')(soup).encode('utf-8'))
        if len(feeds) == 0:
            raise Exception('All feeds are empty, aborting.')

        if len(feeds) > 1:
            for i, f in enumerate(feeds):
                entries.append('feed_%d/index.html' % i)
                po = self.play_order_map.get(entries[-1], None)
                if po is None:
                    self.play_order_counter += 1
                    po = self.play_order_counter
                auth = getattr(f, 'author', None)
                if not auth:
                    auth = None
                desc = getattr(f, 'description', None)
                if not desc:
                    desc = None
                feed_index(i, toc.add_item('feed_%d/index.html' % i, None,
                                           f.title, play_order=po, description=desc, author=auth))

        else:
            entries.append('feed_%d/index.html' % 0)
            feed_index(0, toc)

        for i, p in enumerate(entries):
            entries[i] = os.path.join(dir, p.replace('/', os.sep))
        opf.create_spine(entries)
        opf.set_toc(toc)

        with open(opf_path, 'wb') as opf_file, open(ncx_path, 'wb') as ncx_file:
            opf.render(opf_file, ncx_file)