Update Sing Tao Daily - Hong Kong and Apple Daily - Hong Kong. AM730 by Eddie Lau

2025-11-02 18:47:01 -05:00 · 2013-04-04 10:38:11 +05:30 · 2013-04-04 10:38:11 +05:30 · d65fd352e9
commit d65fd352e9
parent 3823f8da9f
3 changed files with 553 additions and 152 deletions
--- a/recipes/am730.recipe
+++ b/recipes/am730.recipe
@ -0,0 +1,290 @@
 # vim:fileencoding=UTF-8
 from __future__ import unicode_literals
 __license__   = 'GPL v3'
 __copyright__ = '2013, Eddie Lau'
 __Date__ = ''
 __HiResImg__ = True
 '''
 Change Log:
 2013/03/30 -- first version
 '''
 from calibre import (__appname__, force_unicode, strftime)
 from calibre.utils.date import now as nowf
 import os, datetime, re
 from calibre.web.feeds.recipes import BasicNewsRecipe
 from contextlib import nested
 from calibre.ebooks.BeautifulSoup import BeautifulSoup, Tag
 from calibre.ebooks.metadata.opf2 import OPFCreator
 from calibre.ebooks.metadata.toc import TOC
 from calibre.ebooks.metadata import MetaInformation
 from calibre.utils.localization import canonicalize_lang
 class AppleDaily(BasicNewsRecipe):
    title          = u'AM730'
    __author__     = 'Eddie Lau'
    publisher      = 'AM730'
    oldest_article = 1
    max_articles_per_feed = 100
    auto_cleanup = False
    language = 'zh'
    encoding = 'utf-8'
    auto_cleanup = False
    remove_javascript = True
    use_embedded_content   = False
    no_stylesheets = True
    description = 'http://www.am730.com.hk'
    category    = 'Chinese, News, Hong Kong'
    masthead_url = 'http://www.am730.com.hk/images/logo.jpg'
    extra_css = 'img {display: block; margin-left: auto; margin-right: auto; margin-top: 10px; margin-bottom: 10px; max-height:90%;} div[id=articleHeader] {font-size:200%; text-align:left; font-weight:bold;} photocaption {font-size:50%; margin-left:auto; margin-right:auto;}'
    keep_only_tags = [dict(name='div', attrs={'id':'articleHeader'}),
                      dict(name='div', attrs={'class':'thecontent wordsnap'}),
                      dict(name='a', attrs={'class':'lightboximg'})]
    remove_tags = [dict(name='img', attrs={'src':'/images/am730_article_logo.jpg'}),
                   dict(name='img', attrs={'src':'/images/am_endmark.gif'})]
    def get_dtlocal(self):
        dt_utc = datetime.datetime.utcnow()
        # convert UTC to local hk time - at HKT 6am, all news are available
        return dt_utc + datetime.timedelta(8.0/24) - datetime.timedelta(6.0/24)
    def get_fetchdate(self):
        if __Date__ <> '':
            return __Date__
        else:
            return self.get_dtlocal().strftime("%Y%m%d")
    def get_fetchformatteddate(self):
        if __Date__ <> '':
            return __Date__[0:4]+'-'+__Date__[4:6]+'-'+__Date__[6:8]
        else:
            return self.get_dtlocal().strftime("%Y-%m-%d")
    def get_fetchyear(self):
        if __Date__ <> '':
            return __Date__[0:4]
        else:
            return self.get_dtlocal().strftime("%Y")
    def get_fetchmonth(self):
        if __Date__ <> '':
            return __Date__[4:6]
        else:
            return self.get_dtlocal().strftime("%m")
    def get_fetchday(self):
        if __Date__ <> '':
            return __Date__[6:8]
        else:
            return self.get_dtlocal().strftime("%d")
    # Note: does not work with custom date given by __Date__
    def get_weekday(self):
        return self.get_dtlocal().weekday()
    def populate_article_metadata(self, article, soup, first):
        if first and hasattr(self, 'add_toc_thumbnail'):
            picdiv = soup.find('img')
            if picdiv is not None:
                self.add_toc_thumbnail(article,picdiv['src'])
    def parse_index(self):
        feeds = []
        soup = self.index_to_soup('http://www.am730.com.hk/')
        ul = soup.find(attrs={'class':'nav-section'})
        sectionList = []
        for li in ul.findAll('li'):
            a = 'http://www.am730.com.hk/' + li.find('a', href=True).get('href', False)
            title = li.find('a').get('title', False).strip()
            sectionList.append((title, a))
        for title, url in sectionList:
            articles = self.parse_section(url)
            if articles:
                feeds.append((title, articles))
        return feeds
    def parse_section(self, url):
        soup = self.index_to_soup(url)
        items = soup.findAll(attrs={'style':'padding-bottom: 15px;'})
        current_articles = []
        for item in items:
            a = item.find(attrs={'class':'t6 f14'}).find('a', href=True)
            articlelink = 'http://www.am730.com.hk/' + a.get('href', True)
            title = self.tag_to_string(a)
            description = self.tag_to_string(item.find(attrs={'class':'t3 f14'}))
            current_articles.append({'title': title, 'url': articlelink, 'description': description})
        return current_articles
    def preprocess_html(self, soup):
        multia = soup.findAll('a')
        for a in multia:
            if not (a == None):
                image = a.find('img')
                if not (image == None):
                    if __HiResImg__:
                        image['src'] = image.get('src').replace('/thumbs/', '/')
                    caption = image.get('alt')
                    tag = Tag(soup, "photo", [])
                    tag2 = Tag(soup, "photocaption", [])
                    tag.insert(0, image)
                    if not caption == None:
                        tag2.insert(0, caption)
                        tag.insert(1, tag2)
                    a.replaceWith(tag)
        return soup
    def create_opf(self, feeds, dir=None):
        if dir is None:
            dir = self.output_dir
        title = self.short_title()
        if self.output_profile.periodical_date_in_title:
            title += strftime(self.timefmt)
        mi = MetaInformation(title, [__appname__])
        mi.publisher = __appname__
        mi.author_sort = __appname__
        if self.publication_type:
            mi.publication_type = 'periodical:'+self.publication_type+':'+self.short_title()
        mi.timestamp = nowf()
        article_titles, aseen = [], set()
        for f in feeds:
            for a in f:
                if a.title and a.title not in aseen:
                    aseen.add(a.title)
                    article_titles.append(force_unicode(a.title, 'utf-8'))
        mi.comments = self.description
        if not isinstance(mi.comments, unicode):
            mi.comments = mi.comments.decode('utf-8', 'replace')
        mi.comments += ('\n\n' + _('Articles in this issue: ') + '\n' +
                '\n\n'.join(article_titles))
        language = canonicalize_lang(self.language)
        if language is not None:
            mi.language = language
        # This one affects the pub date shown in kindle title
        #mi.pubdate = nowf()
        # now appears to need the time field to be > 12.00noon as well
        mi.pubdate = datetime.datetime(int(self.get_fetchyear()), int(self.get_fetchmonth()), int(self.get_fetchday()), 12, 30, 0)
        opf_path = os.path.join(dir, 'index.opf')
        ncx_path = os.path.join(dir, 'index.ncx')
        opf = OPFCreator(dir, mi)
        # Add mastheadImage entry to <guide> section
        mp = getattr(self, 'masthead_path', None)
        if mp is not None and os.access(mp, os.R_OK):
            from calibre.ebooks.metadata.opf2 import Guide
            ref = Guide.Reference(os.path.basename(self.masthead_path), os.getcwdu())
            ref.type = 'masthead'
            ref.title = 'Masthead Image'
            opf.guide.append(ref)
        manifest = [os.path.join(dir, 'feed_%d'%i) for i in range(len(feeds))]
        manifest.append(os.path.join(dir, 'index.html'))
        manifest.append(os.path.join(dir, 'index.ncx'))
        # Get cover
        cpath = getattr(self, 'cover_path', None)
        if cpath is None:
            pf = open(os.path.join(dir, 'cover.jpg'), 'wb')
            if self.default_cover(pf):
                cpath =  pf.name
        if cpath is not None and os.access(cpath, os.R_OK):
            opf.cover = cpath
            manifest.append(cpath)
        # Get masthead
        mpath = getattr(self, 'masthead_path', None)
        if mpath is not None and os.access(mpath, os.R_OK):
            manifest.append(mpath)
        opf.create_manifest_from_files_in(manifest)
        for mani in opf.manifest:
            if mani.path.endswith('.ncx'):
                mani.id = 'ncx'
            if mani.path.endswith('mastheadImage.jpg'):
                mani.id = 'masthead-image'
        entries = ['index.html']
        toc = TOC(base_path=dir)
        self.play_order_counter = 0
        self.play_order_map = {}
        def feed_index(num, parent):
            f = feeds[num]
            for j, a in enumerate(f):
                if getattr(a, 'downloaded', False):
                    adir = 'feed_%d/article_%d/'%(num, j)
                    auth = a.author
                    if not auth:
                        auth = None
                    desc = a.text_summary
                    if not desc:
                        desc = None
                    else:
                        desc = self.description_limiter(desc)
                    tt = a.toc_thumbnail if a.toc_thumbnail else None
                    entries.append('%sindex.html'%adir)
                    po = self.play_order_map.get(entries[-1], None)
                    if po is None:
                        self.play_order_counter += 1
                        po = self.play_order_counter
                    parent.add_item('%sindex.html'%adir, None,
                            a.title if a.title else _('Untitled Article'),
                            play_order=po, author=auth,
                            description=desc, toc_thumbnail=tt)
                    last = os.path.join(self.output_dir, ('%sindex.html'%adir).replace('/', os.sep))
                    for sp in a.sub_pages:
                        prefix = os.path.commonprefix([opf_path, sp])
                        relp = sp[len(prefix):]
                        entries.append(relp.replace(os.sep, '/'))
                        last = sp
                    if os.path.exists(last):
                        with open(last, 'rb') as fi:
                            src = fi.read().decode('utf-8')
                        soup = BeautifulSoup(src)
                        body = soup.find('body')
                        if body is not None:
                            prefix = '/'.join('..'for i in range(2*len(re.findall(r'link\d+', last))))
                            templ = self.navbar.generate(True, num, j, len(f),
                                            not self.has_single_feed,
                                            a.orig_url, __appname__, prefix=prefix,
                                            center=self.center_navbar)
                            elem = BeautifulSoup(templ.render(doctype='xhtml').decode('utf-8')).find('div')
                            body.insert(len(body.contents), elem)
                            with open(last, 'wb') as fi:
                                fi.write(unicode(soup).encode('utf-8'))
        if len(feeds) == 0:
            raise Exception('All feeds are empty, aborting.')
        if len(feeds) > 1:
            for i, f in enumerate(feeds):
                entries.append('feed_%d/index.html'%i)
                po = self.play_order_map.get(entries[-1], None)
                if po is None:
                    self.play_order_counter += 1
                    po = self.play_order_counter
                auth = getattr(f, 'author', None)
                if not auth:
                    auth = None
                desc = getattr(f, 'description', None)
                if not desc:
                    desc = None
                feed_index(i, toc.add_item('feed_%d/index.html'%i, None,
                    f.title, play_order=po, description=desc, author=auth))
        else:
            entries.append('feed_%d/index.html'%0)
            feed_index(0, toc)
        for i, p in enumerate(entries):
            entries[i] = os.path.join(dir, p.replace('/', os.sep))
        opf.create_spine(entries)
        opf.set_toc(toc)
        with nested(open(opf_path, 'wb'), open(ncx_path, 'wb')) as (opf_file, ncx_file):
            opf.render(opf_file, ncx_file)
--- a/recipes/apple_daily.recipe
+++ b/recipes/apple_daily.recipe
@ -1,161 +1,275 @@
-# -*- coding: utf-8 -*-
+# vim:fileencoding=UTF-8
-import re
+from __future__ import unicode_literals
 __license__   = 'GPL v3'
 __copyright__ = '2013, Eddie Lau'
 __Date__ = ''
 from calibre import (__appname__, force_unicode, strftime)
 from calibre.utils.date import now as nowf
 import os, datetime, re
 from calibre.web.feeds.recipes import BasicNewsRecipe
 from contextlib import nested
 from calibre.ebooks.BeautifulSoup import BeautifulSoup
 from calibre.ebooks.metadata.opf2 import OPFCreator
 from calibre.ebooks.metadata.toc import TOC
 from calibre.ebooks.metadata import MetaInformation
 from calibre.utils.localization import canonicalize_lang
 class AppleDaily(BasicNewsRecipe):
-
+    title          = u'蘋果日報 (香港)'
-    title       = u'蘋果日報'
+    __author__     = 'Eddie Lau'
-    __author__  = u'蘋果日報'
+    publisher      = '蘋果日報'
-    __publisher__  = u'蘋果日報'
+    oldest_article = 1
-    description = u'蘋果日報'
+    max_articles_per_feed = 100
-    masthead_url = 'http://hk.apple.nextmedia.com/template/common/header/2009/images/atnextheader_logo_appledaily.gif'
+    auto_cleanup = False
-    language = 'zh_TW'
+    language = 'zh'
-    encoding = 'UTF-8'
+    encoding = 'utf-8'
-    timefmt = ' [%a, %d %b, %Y]'
+    auto_cleanup = False
    needs_subscription = False
    remove_javascript = True
-    remove_tags_before = dict(name=['ul', 'h1'])
+    use_embedded_content   = False
    remove_tags_after  = dict(name='form')
    remove_tags = [dict(attrs={'class':['articleTools', 'post-tools', 'side_tool', 'nextArticleLink clearfix']}),
                dict(id=['footer', 'toolsRight', 'articleInline', 'navigation', 'archive', 'side_search', 'blog_sidebar', 'side_tool', 'side_index']),
                dict(name=['script', 'noscript', 'style', 'form'])]
    no_stylesheets = True
-    extra_css = '''
+    description = 'http://hkm.appledaily.com/'
-    	@font-face {font-family: "uming", serif, sans-serif;  src: url(res:///usr/share/fonts/truetype/arphic/uming.ttc); }\n
+    category    = 'Chinese, News, Hong Kong'
-	    body {margin-right: 8pt; font-family: 'uming', serif;}
+    masthead_url = 'http://upload.wikimedia.org/wikipedia/zh/c/cf/AppleDailyLogo1.png'
        h1 {font-family: 'uming', serif, sans-serif}
            '''
    #extra_css = 'h1 {font: sans-serif large;}\n.byline {font:monospace;}'
-    preprocess_regexps = [
+    extra_css = 'img {display: block; margin-left: auto; margin-right: auto; margin-top: 10px; margin-bottom: 10px; max-height:90%;} h1 {font-size:200%; text-align:left; font-weight:bold;} p[class=video-caption] {font-size:50%; margin-left:auto; margin-right:auto;}'
-       (re.compile(r'img.php?server=(?P<server>[^&]+)&path=(?P<path>[^&]+).*', re.DOTALL|re.IGNORECASE),
+    keep_only_tags = [dict(name='div', attrs={'id':'content-article'})]
-        lambda match: 'http://' + match.group('server') + '/' + match.group('path')),
+    remove_tags = [dict(name='div', attrs={'class':'prev-next-btn'}),
-    ]
+                   dict(name='p', attrs={'class':'next'})]
    def get_dtlocal(self):
        dt_utc = datetime.datetime.utcnow()
        # convert UTC to local hk time - at HKT 6am, all news are available
        return dt_utc + datetime.timedelta(8.0/24) - datetime.timedelta(6.0/24)
    def get_fetchdate(self):
        if __Date__ <> '':
            return __Date__
        else:
            return self.get_dtlocal().strftime("%Y%m%d")
    def get_fetchformatteddate(self):
        if __Date__ <> '':
            return __Date__[0:4]+'-'+__Date__[4:6]+'-'+__Date__[6:8]
        else:
            return self.get_dtlocal().strftime("%Y-%m-%d")
    def get_fetchyear(self):
        if __Date__ <> '':
            return __Date__[0:4]
        else:
            return self.get_dtlocal().strftime("%Y")
    def get_fetchmonth(self):
        if __Date__ <> '':
            return __Date__[4:6]
        else:
            return self.get_dtlocal().strftime("%m")
    def get_fetchday(self):
        if __Date__ <> '':
            return __Date__[6:8]
        else:
            return self.get_dtlocal().strftime("%d")
    # Note: does not work with custom date given by __Date__
    def get_weekday(self):
        return self.get_dtlocal().weekday()
    def get_cover_url(self):
-        return 'http://hk.apple.nextmedia.com/template/common/header/2009/images/atnextheader_logo_appledaily.gif'
+        soup = self.index_to_soup('http://hkm.appledaily.com/')
-
+        cover = soup.find(attrs={'class':'top-news'}).get('src', False)
-
+        br = BasicNewsRecipe.get_browser(self)
-    #def get_browser(self):
+        try:
-        #br = BasicNewsRecipe.get_browser(self)
+            br.open(cover)
-        #if self.username is not None and self.password is not None:
+        except:
-        #    br.open('http://www.nytimes.com/auth/login')
+            cover = None
-        #    br.select_form(name='login')
+        return cover
        #    br['USERID']   = self.username
        #    br['PASSWORD'] = self.password
        #    br.submit()
        #return br
    def preprocess_html(self, soup):
        #process all the images
        for tag in soup.findAll(lambda tag: tag.name.lower()=='img' and tag.has_key('src')):
            iurl = tag['src']
            #print 'checking image: ' + iurl
            #img\.php?server\=(?P<server>[^&]+)&path=(?P<path>[^&]+)
            p = re.compile(r'img\.php\?server=(?P<server>[^&]+)&path=(?P<path>[^&]+)', re.DOTALL|re.IGNORECASE)
            m = p.search(iurl)
            if m is not None:
                iurl = 'http://' + m.group('server') + '/' + m.group('path')
                #print 'working! new url: ' + iurl
                tag['src'] = iurl
            #else:
                #print 'not good'
        for tag in soup.findAll(lambda tag: tag.name.lower()=='a' and tag.has_key('href')):
            iurl = tag['href']
            #print 'checking image: ' + iurl
            #img\.php?server\=(?P<server>[^&]+)&path=(?P<path>[^&]+)
            p = re.compile(r'img\.php\?server=(?P<server>[^&]+)&path=(?P<path>[^&]+)', re.DOTALL|re.IGNORECASE)
            m = p.search(iurl)
            if m is not None:
                iurl = 'http://' + m.group('server') + '/' + m.group('path')
                #print 'working! new url: ' + iurl
                tag['href'] = iurl
            #else:
                #print 'not good'
        return soup
    def populate_article_metadata(self, article, soup, first):
        if first and hasattr(self, 'add_toc_thumbnail'):
            picdiv = soup.find('img')
            if picdiv is not None:
                self.add_toc_thumbnail(article,picdiv['src'])
    def parse_index(self):
-        base = 'http://news.hotpot.hk/fruit'
+        feeds = []
-        soup = self.index_to_soup('http://news.hotpot.hk/fruit/index.php')
+        soup = self.index_to_soup('http://hkm.appledaily.com/')
        ul = soup.find(attrs={'class':'menu'})
        sectionList = []
        for li in ul.findAll('li'):
            a = 'http://hkm.appledaily.com/' + li.find('a', href=True).get('href', False)
            title = li.find('a', text=True).strip()
            if not title == u'動新聞':
                sectionList.append((title, a))
        for title, url in sectionList:
            articles = self.parse_section(url)
            if articles:
                feeds.append((title, articles))
        return feeds
-        #def feed_title(div):
+    def parse_section(self, url):
-        #    return ''.join(div.findAll(text=True, recursive=False)).strip()
+        soup = self.index_to_soup(url)
        ul = soup.find(attrs={'class':'list'})
        current_articles = []
        for li in ul.findAll('li'):
            a = li.find('a', href=True)
            title = li.find('p', text=True).strip()
            if a is not None:
                current_articles.append({'title': title, 'url':'http://hkm.appledaily.com/' + a.get('href', False)})
            pass
        return current_articles
-        articles = {}
+    def create_opf(self, feeds, dir=None):
-        key = None
+        if dir is None:
-        ans = []
+            dir = self.output_dir
-        for div in soup.findAll('li'):
+        title = self.short_title()
-            key = div.find(text=True, recursive=True);
+        if self.output_profile.periodical_date_in_title:
-            #if key == u'豪情':
+            title += strftime(self.timefmt)
-           #    continue;
+        mi = MetaInformation(title, [__appname__])
        mi.publisher = __appname__
        mi.author_sort = __appname__
        if self.publication_type:
            mi.publication_type = 'periodical:'+self.publication_type+':'+self.short_title()
        mi.timestamp = nowf()
        article_titles, aseen = [], set()
        for f in feeds:
            for a in f:
                if a.title and a.title not in aseen:
                    aseen.add(a.title)
                    article_titles.append(force_unicode(a.title, 'utf-8'))
-            print 'section=' + key
+        mi.comments = self.description
        if not isinstance(mi.comments, unicode):
            mi.comments = mi.comments.decode('utf-8', 'replace')
        mi.comments += ('\n\n' + _('Articles in this issue: ') + '\n' +
                '\n\n'.join(article_titles))
-            articles[key] = []
+        language = canonicalize_lang(self.language)
        if language is not None:
            mi.language = language
        # This one affects the pub date shown in kindle title
        #mi.pubdate = nowf()
        # now appears to need the time field to be > 12.00noon as well
        mi.pubdate = datetime.datetime(int(self.get_fetchyear()), int(self.get_fetchmonth()), int(self.get_fetchday()), 12, 30, 0)
        opf_path = os.path.join(dir, 'index.opf')
        ncx_path = os.path.join(dir, 'index.ncx')
-            ans.append(key)
+        opf = OPFCreator(dir, mi)
        # Add mastheadImage entry to <guide> section
        mp = getattr(self, 'masthead_path', None)
        if mp is not None and os.access(mp, os.R_OK):
            from calibre.ebooks.metadata.opf2 import Guide
            ref = Guide.Reference(os.path.basename(self.masthead_path), os.getcwdu())
            ref.type = 'masthead'
            ref.title = 'Masthead Image'
            opf.guide.append(ref)
-            a = div.find('a', href=True)
+        manifest = [os.path.join(dir, 'feed_%d'%i) for i in range(len(feeds))]
        manifest.append(os.path.join(dir, 'index.html'))
        manifest.append(os.path.join(dir, 'index.ncx'))
-            if not a:
+        # Get cover
-                continue
+        cpath = getattr(self, 'cover_path', None)
        if cpath is None:
            pf = open(os.path.join(dir, 'cover.jpg'), 'wb')
            if self.default_cover(pf):
                cpath =  pf.name
        if cpath is not None and os.access(cpath, os.R_OK):
            opf.cover = cpath
            manifest.append(cpath)
-            url = base + '/' + a['href']
+        # Get masthead
-            print 'url=' + url
+        mpath = getattr(self, 'masthead_path', None)
        if mpath is not None and os.access(mpath, os.R_OK):
            manifest.append(mpath)
-            if not articles.has_key(key):
+        opf.create_manifest_from_files_in(manifest)
-                articles[key] = []
+        for mani in opf.manifest:
-            else:
+            if mani.path.endswith('.ncx'):
-                # sub page
+                mani.id = 'ncx'
-                subSoup = self.index_to_soup(url)
+            if mani.path.endswith('mastheadImage.jpg'):
                mani.id = 'masthead-image'
-                for subDiv in subSoup.findAll('li'):
+        entries = ['index.html']
-                    subA = subDiv.find('a', href=True)
+        toc = TOC(base_path=dir)
-                    subTitle = subDiv.find(text=True, recursive=True)
+        self.play_order_counter = 0
-                    subUrl = base + '/' + subA['href']
+        self.play_order_map = {}
                    print 'subUrl' + subUrl
                    articles[key].append(
                        dict(title=subTitle,
                         url=subUrl,
                         date='',
                         description='',
                         content=''))
-#             elif div['class'] in ['story', 'story headline']:
+        def feed_index(num, parent):
-#                 a = div.find('a', href=True)
+            f = feeds[num]
-#                 if not a:
+            for j, a in enumerate(f):
-#                     continue
+                if getattr(a, 'downloaded', False):
-#                 url = re.sub(r'\?.*', '', a['href'])
+                    adir = 'feed_%d/article_%d/'%(num, j)
-#                 url += '?pagewanted=all'
+                    auth = a.author
-#                 title = self.tag_to_string(a, use_alt=True).strip()
+                    if not auth:
-#                 description = ''
+                        auth = None
-#                 pubdate = strftime('%a, %d %b')
+                    desc = a.text_summary
-#                 summary = div.find(True, attrs={'class':'summary'})
+                    if not desc:
-#                 if summary:
+                        desc = None
-#                     description = self.tag_to_string(summary, use_alt=False)
+                    else:
-#
+                        desc = self.description_limiter(desc)
-#                 feed = key if key is not None else 'Uncategorized'
+                    tt = a.toc_thumbnail if a.toc_thumbnail else None
-#                 if not articles.has_key(feed):
+                    entries.append('%sindex.html'%adir)
-#                     articles[feed] = []
+                    po = self.play_order_map.get(entries[-1], None)
-#                 if not 'podcasts' in url:
+                    if po is None:
-#                     articles[feed].append(
+                        self.play_order_counter += 1
-#                               dict(title=title, url=url, date=pubdate,
+                        po = self.play_order_counter
-#                                    description=description,
+                    parent.add_item('%sindex.html'%adir, None,
-#                                    content=''))
+                            a.title if a.title else _('Untitled Article'),
-#        ans = self.sort_index_by(ans, {'The Front Page':-1, 'Dining In, Dining Out':1, 'Obituaries':2})
+                            play_order=po, author=auth,
-        ans = [(unicode(key), articles[key]) for key in ans if articles.has_key(key)]
+                            description=desc, toc_thumbnail=tt)
-        return ans
+                    last = os.path.join(self.output_dir, ('%sindex.html'%adir).replace('/', os.sep))
                    for sp in a.sub_pages:
                        prefix = os.path.commonprefix([opf_path, sp])
                        relp = sp[len(prefix):]
                        entries.append(relp.replace(os.sep, '/'))
                        last = sp
                    if os.path.exists(last):
                        with open(last, 'rb') as fi:
                            src = fi.read().decode('utf-8')
                        soup = BeautifulSoup(src)
                        body = soup.find('body')
                        if body is not None:
                            prefix = '/'.join('..'for i in range(2*len(re.findall(r'link\d+', last))))
                            templ = self.navbar.generate(True, num, j, len(f),
                                            not self.has_single_feed,
                                            a.orig_url, __appname__, prefix=prefix,
                                            center=self.center_navbar)
                            elem = BeautifulSoup(templ.render(doctype='xhtml').decode('utf-8')).find('div')
                            body.insert(len(body.contents), elem)
                            with open(last, 'wb') as fi:
                                fi.write(unicode(soup).encode('utf-8'))
        if len(feeds) == 0:
            raise Exception('All feeds are empty, aborting.')
        if len(feeds) > 1:
            for i, f in enumerate(feeds):
                entries.append('feed_%d/index.html'%i)
                po = self.play_order_map.get(entries[-1], None)
                if po is None:
                    self.play_order_counter += 1
                    po = self.play_order_counter
                auth = getattr(f, 'author', None)
                if not auth:
                    auth = None
                desc = getattr(f, 'description', None)
                if not desc:
                    desc = None
                feed_index(i, toc.add_item('feed_%d/index.html'%i, None,
                    f.title, play_order=po, description=desc, author=auth))
        else:
            entries.append('feed_%d/index.html'%0)
            feed_index(0, toc)
        for i, p in enumerate(entries):
            entries[i] = os.path.join(dir, p.replace('/', os.sep))
        opf.create_spine(entries)
        opf.set_toc(toc)
        with nested(open(opf_path, 'wb'), open(ncx_path, 'wb')) as (opf_file, ncx_file):
            opf.render(opf_file, ncx_file)
--- a/recipes/singtaohk.recipe
+++ b/recipes/singtaohk.recipe
@ -1,30 +1,30 @@
 # vim:fileencoding=UTF-8
 from __future__ import unicode_literals
 __license__   = 'GPL v3'
-__copyright__ = '2011, Eddie Lau'
+__copyright__ = '2011-2013, Eddie Lau'
 # data source: normal, mobile
 __Source__ = 'mobile'
 # please replace the following "True" with "False". (Default: True)
 __MakePeriodical__ = True
 # Turn below to True if your device supports display of CJK titles (Default: False)
-__UseChineseTitle__ = False
+__UseChineseTitle__ = True
 # Set it to False if you want to skip images (Default: True)
 __KeepImages__ = True
 # Set it to True if you want to include a summary in Kindle's article view (Default: False)
-__IncludeSummary__ = False
+__IncludeSummary__ = True
 # Set it to True if you want thumbnail images in Kindle's article view (Default: True)
 __IncludeThumbnails__ = True
 '''
 Change Log:
 2013/03/31 -- fix cover retrieval code and heading size, and remove &nbsp; in summary
 2011/12/29 -- first version done
 TODO:
 * use alternative source at http://m.singtao.com/index.php
 '''
 from calibre.utils.date import now as nowf
 import os, datetime, re
 from datetime import date
 from calibre.web.feeds.recipes import BasicNewsRecipe
 from contextlib import nested
 from calibre.ebooks.BeautifulSoup import BeautifulSoup
@ -41,7 +41,7 @@ class STHKRecipe(BasicNewsRecipe):
        title   = 'Sing Tao Daily - Hong Kong'
    description = 'Hong Kong Chinese Newspaper (http://singtao.com)'
    category    = 'Chinese, News, Hong Kong'
-    extra_css = 'img {display: block; margin-left: auto; margin-right: auto; margin-top: 10px; margin-bottom: 10px; max-height:90%;} td[class=caption] {font-size:50%;} td[class=bodyhead]{font-weight:bold; font-size:150%;} td[class=stmobheadline]{font-weight:bold; font-size:150%;}'
+    extra_css = 'img {display: block; margin-left: auto; margin-right: auto; margin-top: 10px; margin-bottom: 10px; max-height:90%;} td[class=caption] {font-size:50%;} td[class=bodyhead]{font-weight:bold; font-size:150%;} td[class=stmobheadline]{font-weight:bold; font-size:200%;}'
    masthead_url = 'http://upload.wikimedia.org/wikipedia/en/d/dd/Singtao-usa.png'
    if __Source__ == 'normal':
        keep_only_tags = [dict(name='td', attrs={'class':['bodyhead','bodytext']})]
@ -96,17 +96,13 @@ class STHKRecipe(BasicNewsRecipe):
        return self.get_dtlocal().strftime("%d")
    def get_cover_url(self):
-        #cover = 'http://singtao.com/media/a/a(2660).jpg'  # for 2011/12/29
+        soup = self.index_to_soup('http://m.singtao.com/')
-        base = 2660
+        cover = soup.find(attrs={'class':'special'}).get('src', False)
        todaydate = date(int(self.get_fetchyear()), int(self.get_fetchmonth()), int(self.get_fetchday()))
        diff = todaydate - date(2011, 12, 29)
        base = base + int(diff.total_seconds()/(3600*24))
        cover = 'http://singtao.com/media/a/a(' + str(base) +').jpg'
        br = BasicNewsRecipe.get_browser(self)
        try:
            br.open(cover)
        except:
-            cover = 'http://singtao.com/images/stlogo.gif'
+            cover = None
        return cover
    def parse_index(self):
@ -289,11 +285,11 @@ class STHKRecipe(BasicNewsRecipe):
                            # the text may or may not be enclosed in <p></p> tag
                            paras = articlebody.findAll('p')
                            if not paras:
-                            	paras = articlebody
+                                paras = articlebody
                            textFound = False
                            for p in paras:
                                if not textFound:
-                                    summary_candidate = self.tag_to_string(p).strip()
+                                    summary_candidate = self.tag_to_string(p).strip().replace('&nbsp;', '')
                                    if len(summary_candidate) > 0:
                                        summary_candidate = summary_candidate.replace(u'(\u661f\u5cf6\u65e5\u5831\u5831\u9053)', '', 1)
                                        article.summary = article.text_summary = summary_candidate
@ -489,3 +485,4 @@ class STHKRecipe(BasicNewsRecipe):