Merge from trunk
Commit 22a1422ec6

recipes/am730.recipe (new file, 290 lines)
@ -0,0 +1,290 @@
# vim:fileencoding=UTF-8
from __future__ import unicode_literals
__license__ = 'GPL v3'
__copyright__ = '2013, Eddie Lau'
__Date__ = ''
__HiResImg__ = True

'''
Change Log:
2013/03/30 -- first version
'''

from calibre import (__appname__, force_unicode, strftime)
from calibre.utils.date import now as nowf
import os, datetime, re
from calibre.web.feeds.recipes import BasicNewsRecipe
from contextlib import nested
from calibre.ebooks.BeautifulSoup import BeautifulSoup, Tag
from calibre.ebooks.metadata.opf2 import OPFCreator
from calibre.ebooks.metadata.toc import TOC
from calibre.ebooks.metadata import MetaInformation
from calibre.utils.localization import canonicalize_lang


class AM730(BasicNewsRecipe):
    title = u'AM730'
    __author__ = 'Eddie Lau'
    publisher = 'AM730'
    oldest_article = 1
    max_articles_per_feed = 100
    auto_cleanup = False
    language = 'zh'
    encoding = 'utf-8'
    remove_javascript = True
    use_embedded_content = False
    no_stylesheets = True
    description = 'http://www.am730.com.hk'
    category = 'Chinese, News, Hong Kong'
    masthead_url = 'http://www.am730.com.hk/images/logo.jpg'

    extra_css = 'img {display: block; margin-left: auto; margin-right: auto; margin-top: 10px; margin-bottom: 10px; max-height:90%;} div[id=articleHeader] {font-size:200%; text-align:left; font-weight:bold;} photocaption {font-size:50%; margin-left:auto; margin-right:auto;}'
    keep_only_tags = [dict(name='div', attrs={'id':'articleHeader'}),
                      dict(name='div', attrs={'class':'thecontent wordsnap'}),
                      dict(name='a', attrs={'class':'lightboximg'})]
    remove_tags = [dict(name='img', attrs={'src':'/images/am730_article_logo.jpg'}),
                   dict(name='img', attrs={'src':'/images/am_endmark.gif'})]

    def get_dtlocal(self):
        dt_utc = datetime.datetime.utcnow()
        # convert UTC to local HK time - at HKT 6am, all news are available
        return dt_utc + datetime.timedelta(8.0/24) - datetime.timedelta(6.0/24)

    def get_fetchdate(self):
        if __Date__ != '':
            return __Date__
        else:
            return self.get_dtlocal().strftime("%Y%m%d")

    def get_fetchformatteddate(self):
        if __Date__ != '':
            return __Date__[0:4]+'-'+__Date__[4:6]+'-'+__Date__[6:8]
        else:
            return self.get_dtlocal().strftime("%Y-%m-%d")

    def get_fetchyear(self):
        if __Date__ != '':
            return __Date__[0:4]
        else:
            return self.get_dtlocal().strftime("%Y")

    def get_fetchmonth(self):
        if __Date__ != '':
            return __Date__[4:6]
        else:
            return self.get_dtlocal().strftime("%m")

    def get_fetchday(self):
        if __Date__ != '':
            return __Date__[6:8]
        else:
            return self.get_dtlocal().strftime("%d")

    # Note: does not work with a custom date given by __Date__
    def get_weekday(self):
        return self.get_dtlocal().weekday()

    def populate_article_metadata(self, article, soup, first):
        if first and hasattr(self, 'add_toc_thumbnail'):
            picdiv = soup.find('img')
            if picdiv is not None:
                self.add_toc_thumbnail(article, picdiv['src'])

    def parse_index(self):
        feeds = []
        soup = self.index_to_soup('http://www.am730.com.hk/')
        ul = soup.find(attrs={'class':'nav-section'})
        sectionList = []
        for li in ul.findAll('li'):
            a = 'http://www.am730.com.hk/' + li.find('a', href=True).get('href', False)
            title = li.find('a').get('title', False).strip()
            sectionList.append((title, a))
        for title, url in sectionList:
            articles = self.parse_section(url)
            if articles:
                feeds.append((title, articles))
        return feeds

    def parse_section(self, url):
        soup = self.index_to_soup(url)
        items = soup.findAll(attrs={'style':'padding-bottom: 15px;'})
        current_articles = []
        for item in items:
            a = item.find(attrs={'class':'t6 f14'}).find('a', href=True)
            articlelink = 'http://www.am730.com.hk/' + a.get('href', True)
            title = self.tag_to_string(a)
            description = self.tag_to_string(item.find(attrs={'class':'t3 f14'}))
            current_articles.append({'title': title, 'url': articlelink, 'description': description})
        return current_articles

    def preprocess_html(self, soup):
        multia = soup.findAll('a')
        for a in multia:
            if a is not None:
                image = a.find('img')
                if image is not None:
                    if __HiResImg__:
                        image['src'] = image.get('src').replace('/thumbs/', '/')
                    caption = image.get('alt')
                    tag = Tag(soup, "photo", [])
                    tag2 = Tag(soup, "photocaption", [])
                    tag.insert(0, image)
                    if caption is not None:
                        tag2.insert(0, caption)
                    tag.insert(1, tag2)
                    a.replaceWith(tag)
        return soup

    def create_opf(self, feeds, dir=None):
        if dir is None:
            dir = self.output_dir
        title = self.short_title()
        if self.output_profile.periodical_date_in_title:
            title += strftime(self.timefmt)
        mi = MetaInformation(title, [__appname__])
        mi.publisher = __appname__
        mi.author_sort = __appname__
        if self.publication_type:
            mi.publication_type = 'periodical:'+self.publication_type+':'+self.short_title()
        mi.timestamp = nowf()
        article_titles, aseen = [], set()
        for f in feeds:
            for a in f:
                if a.title and a.title not in aseen:
                    aseen.add(a.title)
                    article_titles.append(force_unicode(a.title, 'utf-8'))

        mi.comments = self.description
        if not isinstance(mi.comments, unicode):
            mi.comments = mi.comments.decode('utf-8', 'replace')
        mi.comments += ('\n\n' + _('Articles in this issue: ') + '\n' +
                        '\n\n'.join(article_titles))

        language = canonicalize_lang(self.language)
        if language is not None:
            mi.language = language
        # This one affects the pub date shown in kindle title
        #mi.pubdate = nowf()
        # now appears to need the time field to be > 12.00noon as well
        mi.pubdate = datetime.datetime(int(self.get_fetchyear()), int(self.get_fetchmonth()), int(self.get_fetchday()), 12, 30, 0)
        opf_path = os.path.join(dir, 'index.opf')
        ncx_path = os.path.join(dir, 'index.ncx')

        opf = OPFCreator(dir, mi)
        # Add mastheadImage entry to <guide> section
        mp = getattr(self, 'masthead_path', None)
        if mp is not None and os.access(mp, os.R_OK):
            from calibre.ebooks.metadata.opf2 import Guide
            ref = Guide.Reference(os.path.basename(self.masthead_path), os.getcwdu())
            ref.type = 'masthead'
            ref.title = 'Masthead Image'
            opf.guide.append(ref)

        manifest = [os.path.join(dir, 'feed_%d'%i) for i in range(len(feeds))]
        manifest.append(os.path.join(dir, 'index.html'))
        manifest.append(os.path.join(dir, 'index.ncx'))

        # Get cover
        cpath = getattr(self, 'cover_path', None)
        if cpath is None:
            pf = open(os.path.join(dir, 'cover.jpg'), 'wb')
            if self.default_cover(pf):
                cpath = pf.name
        if cpath is not None and os.access(cpath, os.R_OK):
            opf.cover = cpath
            manifest.append(cpath)

        # Get masthead
        mpath = getattr(self, 'masthead_path', None)
        if mpath is not None and os.access(mpath, os.R_OK):
            manifest.append(mpath)

        opf.create_manifest_from_files_in(manifest)
        for mani in opf.manifest:
            if mani.path.endswith('.ncx'):
                mani.id = 'ncx'
            if mani.path.endswith('mastheadImage.jpg'):
                mani.id = 'masthead-image'

        entries = ['index.html']
        toc = TOC(base_path=dir)
        self.play_order_counter = 0
        self.play_order_map = {}

        def feed_index(num, parent):
            f = feeds[num]
            for j, a in enumerate(f):
                if getattr(a, 'downloaded', False):
                    adir = 'feed_%d/article_%d/'%(num, j)
                    auth = a.author
                    if not auth:
                        auth = None
                    desc = a.text_summary
                    if not desc:
                        desc = None
                    else:
                        desc = self.description_limiter(desc)
                    tt = a.toc_thumbnail if a.toc_thumbnail else None
                    entries.append('%sindex.html'%adir)
                    po = self.play_order_map.get(entries[-1], None)
                    if po is None:
                        self.play_order_counter += 1
                        po = self.play_order_counter
                    parent.add_item('%sindex.html'%adir, None,
                            a.title if a.title else _('Untitled Article'),
                            play_order=po, author=auth,
                            description=desc, toc_thumbnail=tt)
                    last = os.path.join(self.output_dir, ('%sindex.html'%adir).replace('/', os.sep))
                    for sp in a.sub_pages:
                        prefix = os.path.commonprefix([opf_path, sp])
                        relp = sp[len(prefix):]
                        entries.append(relp.replace(os.sep, '/'))
                        last = sp

                    if os.path.exists(last):
                        with open(last, 'rb') as fi:
                            src = fi.read().decode('utf-8')
                        soup = BeautifulSoup(src)
                        body = soup.find('body')
                        if body is not None:
                            prefix = '/'.join('..' for i in range(2*len(re.findall(r'link\d+', last))))
                            templ = self.navbar.generate(True, num, j, len(f),
                                            not self.has_single_feed,
                                            a.orig_url, __appname__, prefix=prefix,
                                            center=self.center_navbar)
                            elem = BeautifulSoup(templ.render(doctype='xhtml').decode('utf-8')).find('div')
                            body.insert(len(body.contents), elem)
                            with open(last, 'wb') as fi:
                                fi.write(unicode(soup).encode('utf-8'))

        if len(feeds) == 0:
            raise Exception('All feeds are empty, aborting.')

        if len(feeds) > 1:
            for i, f in enumerate(feeds):
                entries.append('feed_%d/index.html'%i)
                po = self.play_order_map.get(entries[-1], None)
                if po is None:
                    self.play_order_counter += 1
                    po = self.play_order_counter
                auth = getattr(f, 'author', None)
                if not auth:
                    auth = None
                desc = getattr(f, 'description', None)
                if not desc:
                    desc = None
                feed_index(i, toc.add_item('feed_%d/index.html'%i, None,
                    f.title, play_order=po, description=desc, author=auth))
        else:
            entries.append('feed_%d/index.html'%0)
            feed_index(0, toc)

        for i, p in enumerate(entries):
            entries[i] = os.path.join(dir, p.replace('/', os.sep))
        opf.create_spine(entries)
        opf.set_toc(toc)

        with nested(open(opf_path, 'wb'), open(ncx_path, 'wb')) as (opf_file, ncx_file):
            opf.render(opf_file, ncx_file)
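The timezone trick in get_dtlocal above deserves a word: instead of a timezone library, the recipe shifts UTC forward by Hong Kong's +8 hours and then back by 6, so the computed date only rolls over at 6am HKT, once the day's edition is complete. A minimal standalone sketch of the same idea (the function name hk_edition_date is ours, not calibre's):

import datetime

def hk_edition_date(utc_now=None):
    # +8h converts UTC to HKT; -6h keeps yesterday's date until 6am HKT,
    # the point at which all of today's news is available.
    if utc_now is None:
        utc_now = datetime.datetime.utcnow()
    return (utc_now + datetime.timedelta(hours=8)
                    - datetime.timedelta(hours=6)).strftime('%Y%m%d')

print(hk_edition_date(datetime.datetime(2013, 3, 30, 21, 0)))  # 05:00 HKT on the 31st -> '20130330'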
@ -1,161 +1,275 @@
-# -*- coding: utf-8 -*-
-import re
+# vim:fileencoding=UTF-8
+from __future__ import unicode_literals
+
+__license__ = 'GPL v3'
+__copyright__ = '2013, Eddie Lau'
+__Date__ = ''
+
+from calibre import (__appname__, force_unicode, strftime)
+from calibre.utils.date import now as nowf
+import os, datetime, re
 from calibre.web.feeds.recipes import BasicNewsRecipe
+from contextlib import nested
+from calibre.ebooks.BeautifulSoup import BeautifulSoup
+from calibre.ebooks.metadata.opf2 import OPFCreator
+from calibre.ebooks.metadata.toc import TOC
+from calibre.ebooks.metadata import MetaInformation
+from calibre.utils.localization import canonicalize_lang

 class AppleDaily(BasicNewsRecipe):
-    title = u'蘋果日報'
-    __author__ = u'蘋果日報'
-    __publisher__ = u'蘋果日報'
-    description = u'蘋果日報'
-    masthead_url = 'http://hk.apple.nextmedia.com/template/common/header/2009/images/atnextheader_logo_appledaily.gif'
-    language = 'zh_TW'
-    encoding = 'UTF-8'
-    timefmt = ' [%a, %d %b, %Y]'
-    needs_subscription = False
+    title = u'蘋果日報 (香港)'
+    __author__ = 'Eddie Lau'
+    publisher = '蘋果日報'
+    oldest_article = 1
+    max_articles_per_feed = 100
+    auto_cleanup = False
+    language = 'zh'
+    encoding = 'utf-8'
     remove_javascript = True
-    remove_tags_before = dict(name=['ul', 'h1'])
-    remove_tags_after = dict(name='form')
-    remove_tags = [dict(attrs={'class':['articleTools', 'post-tools', 'side_tool', 'nextArticleLink clearfix']}),
-                   dict(id=['footer', 'toolsRight', 'articleInline', 'navigation', 'archive', 'side_search', 'blog_sidebar', 'side_tool', 'side_index']),
-                   dict(name=['script', 'noscript', 'style', 'form'])]
+    use_embedded_content = False
     no_stylesheets = True
-    extra_css = '''
-        @font-face {font-family: "uming", serif, sans-serif; src: url(res:///usr/share/fonts/truetype/arphic/uming.ttc); }\n
-        body {margin-right: 8pt; font-family: 'uming', serif;}
-        h1 {font-family: 'uming', serif, sans-serif}
-        '''
-    #extra_css = 'h1 {font: sans-serif large;}\n.byline {font:monospace;}'
-
-    preprocess_regexps = [
-        (re.compile(r'img.php?server=(?P<server>[^&]+)&path=(?P<path>[^&]+).*', re.DOTALL|re.IGNORECASE),
-        lambda match: 'http://' + match.group('server') + '/' + match.group('path')),
-    ]
+    description = 'http://hkm.appledaily.com/'
+    category = 'Chinese, News, Hong Kong'
+    masthead_url = 'http://upload.wikimedia.org/wikipedia/zh/c/cf/AppleDailyLogo1.png'
+    extra_css = 'img {display: block; margin-left: auto; margin-right: auto; margin-top: 10px; margin-bottom: 10px; max-height:90%;} h1 {font-size:200%; text-align:left; font-weight:bold;} p[class=video-caption] {font-size:50%; margin-left:auto; margin-right:auto;}'
+    keep_only_tags = [dict(name='div', attrs={'id':'content-article'})]
+    remove_tags = [dict(name='div', attrs={'class':'prev-next-btn'}),
+                   dict(name='p', attrs={'class':'next'})]
+
+    def get_dtlocal(self):
+        dt_utc = datetime.datetime.utcnow()
+        # convert UTC to local hk time - at HKT 6am, all news are available
+        return dt_utc + datetime.timedelta(8.0/24) - datetime.timedelta(6.0/24)
+
+    def get_fetchdate(self):
+        if __Date__ != '':
+            return __Date__
+        else:
+            return self.get_dtlocal().strftime("%Y%m%d")
+
+    def get_fetchformatteddate(self):
+        if __Date__ != '':
+            return __Date__[0:4]+'-'+__Date__[4:6]+'-'+__Date__[6:8]
+        else:
+            return self.get_dtlocal().strftime("%Y-%m-%d")
+
+    def get_fetchyear(self):
+        if __Date__ != '':
+            return __Date__[0:4]
+        else:
+            return self.get_dtlocal().strftime("%Y")
+
+    def get_fetchmonth(self):
+        if __Date__ != '':
+            return __Date__[4:6]
+        else:
+            return self.get_dtlocal().strftime("%m")
+
+    def get_fetchday(self):
+        if __Date__ != '':
+            return __Date__[6:8]
+        else:
+            return self.get_dtlocal().strftime("%d")
+
+    # Note: does not work with a custom date given by __Date__
+    def get_weekday(self):
+        return self.get_dtlocal().weekday()

     def get_cover_url(self):
-        return 'http://hk.apple.nextmedia.com/template/common/header/2009/images/atnextheader_logo_appledaily.gif'
-
-    #def get_browser(self):
-        #br = BasicNewsRecipe.get_browser(self)
-        #if self.username is not None and self.password is not None:
-        #    br.open('http://www.nytimes.com/auth/login')
-        #    br.select_form(name='login')
-        #    br['USERID'] = self.username
-        #    br['PASSWORD'] = self.password
-        #    br.submit()
-        #return br
-
-    def preprocess_html(self, soup):
-        #process all the images
-        for tag in soup.findAll(lambda tag: tag.name.lower()=='img' and tag.has_key('src')):
-            iurl = tag['src']
-            #print 'checking image: ' + iurl
-            #img\.php?server\=(?P<server>[^&]+)&path=(?P<path>[^&]+)
-            p = re.compile(r'img\.php\?server=(?P<server>[^&]+)&path=(?P<path>[^&]+)', re.DOTALL|re.IGNORECASE)
-            m = p.search(iurl)
-            if m is not None:
-                iurl = 'http://' + m.group('server') + '/' + m.group('path')
-                #print 'working! new url: ' + iurl
-                tag['src'] = iurl
-            #else:
-                #print 'not good'
-
-        for tag in soup.findAll(lambda tag: tag.name.lower()=='a' and tag.has_key('href')):
-            iurl = tag['href']
-            #print 'checking image: ' + iurl
-            #img\.php?server\=(?P<server>[^&]+)&path=(?P<path>[^&]+)
-            p = re.compile(r'img\.php\?server=(?P<server>[^&]+)&path=(?P<path>[^&]+)', re.DOTALL|re.IGNORECASE)
-            m = p.search(iurl)
-            if m is not None:
-                iurl = 'http://' + m.group('server') + '/' + m.group('path')
-                #print 'working! new url: ' + iurl
-                tag['href'] = iurl
-            #else:
-                #print 'not good'
-
-        return soup
+        soup = self.index_to_soup('http://hkm.appledaily.com/')
+        cover = soup.find(attrs={'class':'top-news'}).get('src', False)
+        br = BasicNewsRecipe.get_browser(self)
+        try:
+            br.open(cover)
+        except:
+            cover = None
+        return cover
+
+    def populate_article_metadata(self, article, soup, first):
+        if first and hasattr(self, 'add_toc_thumbnail'):
+            picdiv = soup.find('img')
+            if picdiv is not None:
+                self.add_toc_thumbnail(article, picdiv['src'])

     def parse_index(self):
-        base = 'http://news.hotpot.hk/fruit'
-        soup = self.index_to_soup('http://news.hotpot.hk/fruit/index.php')
-
-        #def feed_title(div):
-        #    return ''.join(div.findAll(text=True, recursive=False)).strip()
-
-        articles = {}
-        key = None
-        ans = []
-        for div in soup.findAll('li'):
-            key = div.find(text=True, recursive=True);
-            #if key == u'豪情':
-            #    continue;
-            print 'section=' + key
-            articles[key] = []
-            ans.append(key)
-            a = div.find('a', href=True)
-            if not a:
-                continue
-            url = base + '/' + a['href']
-            print 'url=' + url
-            if not articles.has_key(key):
-                articles[key] = []
-            else:
-                # sub page
-                subSoup = self.index_to_soup(url)
-                for subDiv in subSoup.findAll('li'):
-                    subA = subDiv.find('a', href=True)
-                    subTitle = subDiv.find(text=True, recursive=True)
-                    subUrl = base + '/' + subA['href']
-                    print 'subUrl' + subUrl
-                    articles[key].append(
-                        dict(title=subTitle,
-                             url=subUrl,
-                             date='',
-                             description='',
-                             content=''))
-
-        # elif div['class'] in ['story', 'story headline']:
-        #     a = div.find('a', href=True)
-        #     if not a:
-        #         continue
-        #     url = re.sub(r'\?.*', '', a['href'])
-        #     url += '?pagewanted=all'
-        #     title = self.tag_to_string(a, use_alt=True).strip()
-        #     description = ''
-        #     pubdate = strftime('%a, %d %b')
-        #     summary = div.find(True, attrs={'class':'summary'})
-        #     if summary:
-        #         description = self.tag_to_string(summary, use_alt=False)
-        #
-        #     feed = key if key is not None else 'Uncategorized'
-        #     if not articles.has_key(feed):
-        #         articles[feed] = []
-        #     if not 'podcasts' in url:
-        #         articles[feed].append(
-        #             dict(title=title, url=url, date=pubdate,
-        #                  description=description,
-        #                  content=''))
-        # ans = self.sort_index_by(ans, {'The Front Page':-1, 'Dining In, Dining Out':1, 'Obituaries':2})
-        ans = [(unicode(key), articles[key]) for key in ans if articles.has_key(key)]
-        return ans
+        feeds = []
+        soup = self.index_to_soup('http://hkm.appledaily.com/')
+        ul = soup.find(attrs={'class':'menu'})
+        sectionList = []
+        for li in ul.findAll('li'):
+            a = 'http://hkm.appledaily.com/' + li.find('a', href=True).get('href', False)
+            title = li.find('a', text=True).strip()
+            if not title == u'動新聞':
+                sectionList.append((title, a))
+        for title, url in sectionList:
+            articles = self.parse_section(url)
+            if articles:
+                feeds.append((title, articles))
+        return feeds
+
+    def parse_section(self, url):
+        soup = self.index_to_soup(url)
+        ul = soup.find(attrs={'class':'list'})
+        current_articles = []
+        for li in ul.findAll('li'):
+            a = li.find('a', href=True)
+            title = li.find('p', text=True).strip()
+            if a is not None:
+                current_articles.append({'title': title, 'url': 'http://hkm.appledaily.com/' + a.get('href', False)})
+        return current_articles
+
+    def create_opf(self, feeds, dir=None):
+        if dir is None:
+            dir = self.output_dir
+        title = self.short_title()
+        if self.output_profile.periodical_date_in_title:
+            title += strftime(self.timefmt)
+        mi = MetaInformation(title, [__appname__])
+        mi.publisher = __appname__
+        mi.author_sort = __appname__
+        if self.publication_type:
+            mi.publication_type = 'periodical:'+self.publication_type+':'+self.short_title()
+        mi.timestamp = nowf()
+        article_titles, aseen = [], set()
+        for f in feeds:
+            for a in f:
+                if a.title and a.title not in aseen:
+                    aseen.add(a.title)
+                    article_titles.append(force_unicode(a.title, 'utf-8'))
+
+        mi.comments = self.description
+        if not isinstance(mi.comments, unicode):
+            mi.comments = mi.comments.decode('utf-8', 'replace')
+        mi.comments += ('\n\n' + _('Articles in this issue: ') + '\n' +
+                        '\n\n'.join(article_titles))
+
+        language = canonicalize_lang(self.language)
+        if language is not None:
+            mi.language = language
+        # This one affects the pub date shown in kindle title
+        #mi.pubdate = nowf()
+        # now appears to need the time field to be > 12.00noon as well
+        mi.pubdate = datetime.datetime(int(self.get_fetchyear()), int(self.get_fetchmonth()), int(self.get_fetchday()), 12, 30, 0)
+        opf_path = os.path.join(dir, 'index.opf')
+        ncx_path = os.path.join(dir, 'index.ncx')
+
+        opf = OPFCreator(dir, mi)
+        # Add mastheadImage entry to <guide> section
+        mp = getattr(self, 'masthead_path', None)
+        if mp is not None and os.access(mp, os.R_OK):
+            from calibre.ebooks.metadata.opf2 import Guide
+            ref = Guide.Reference(os.path.basename(self.masthead_path), os.getcwdu())
+            ref.type = 'masthead'
+            ref.title = 'Masthead Image'
+            opf.guide.append(ref)
+
+        manifest = [os.path.join(dir, 'feed_%d'%i) for i in range(len(feeds))]
+        manifest.append(os.path.join(dir, 'index.html'))
+        manifest.append(os.path.join(dir, 'index.ncx'))
+
+        # Get cover
+        cpath = getattr(self, 'cover_path', None)
+        if cpath is None:
+            pf = open(os.path.join(dir, 'cover.jpg'), 'wb')
+            if self.default_cover(pf):
+                cpath = pf.name
+        if cpath is not None and os.access(cpath, os.R_OK):
+            opf.cover = cpath
+            manifest.append(cpath)
+
+        # Get masthead
+        mpath = getattr(self, 'masthead_path', None)
+        if mpath is not None and os.access(mpath, os.R_OK):
+            manifest.append(mpath)
+
+        opf.create_manifest_from_files_in(manifest)
+        for mani in opf.manifest:
+            if mani.path.endswith('.ncx'):
+                mani.id = 'ncx'
+            if mani.path.endswith('mastheadImage.jpg'):
+                mani.id = 'masthead-image'
+
+        entries = ['index.html']
+        toc = TOC(base_path=dir)
+        self.play_order_counter = 0
+        self.play_order_map = {}
+
+        def feed_index(num, parent):
+            f = feeds[num]
+            for j, a in enumerate(f):
+                if getattr(a, 'downloaded', False):
+                    adir = 'feed_%d/article_%d/'%(num, j)
+                    auth = a.author
+                    if not auth:
+                        auth = None
+                    desc = a.text_summary
+                    if not desc:
+                        desc = None
+                    else:
+                        desc = self.description_limiter(desc)
+                    tt = a.toc_thumbnail if a.toc_thumbnail else None
+                    entries.append('%sindex.html'%adir)
+                    po = self.play_order_map.get(entries[-1], None)
+                    if po is None:
+                        self.play_order_counter += 1
+                        po = self.play_order_counter
+                    parent.add_item('%sindex.html'%adir, None,
+                            a.title if a.title else _('Untitled Article'),
+                            play_order=po, author=auth,
+                            description=desc, toc_thumbnail=tt)
+                    last = os.path.join(self.output_dir, ('%sindex.html'%adir).replace('/', os.sep))
+                    for sp in a.sub_pages:
+                        prefix = os.path.commonprefix([opf_path, sp])
+                        relp = sp[len(prefix):]
+                        entries.append(relp.replace(os.sep, '/'))
+                        last = sp
+
+                    if os.path.exists(last):
+                        with open(last, 'rb') as fi:
+                            src = fi.read().decode('utf-8')
+                        soup = BeautifulSoup(src)
+                        body = soup.find('body')
+                        if body is not None:
+                            prefix = '/'.join('..' for i in range(2*len(re.findall(r'link\d+', last))))
+                            templ = self.navbar.generate(True, num, j, len(f),
+                                            not self.has_single_feed,
+                                            a.orig_url, __appname__, prefix=prefix,
+                                            center=self.center_navbar)
+                            elem = BeautifulSoup(templ.render(doctype='xhtml').decode('utf-8')).find('div')
+                            body.insert(len(body.contents), elem)
+                            with open(last, 'wb') as fi:
+                                fi.write(unicode(soup).encode('utf-8'))
+
+        if len(feeds) == 0:
+            raise Exception('All feeds are empty, aborting.')
+
+        if len(feeds) > 1:
+            for i, f in enumerate(feeds):
+                entries.append('feed_%d/index.html'%i)
+                po = self.play_order_map.get(entries[-1], None)
+                if po is None:
+                    self.play_order_counter += 1
+                    po = self.play_order_counter
+                auth = getattr(f, 'author', None)
+                if not auth:
+                    auth = None
+                desc = getattr(f, 'description', None)
+                if not desc:
+                    desc = None
+                feed_index(i, toc.add_item('feed_%d/index.html'%i, None,
+                    f.title, play_order=po, description=desc, author=auth))
+        else:
+            entries.append('feed_%d/index.html'%0)
+            feed_index(0, toc)
+
+        for i, p in enumerate(entries):
+            entries[i] = os.path.join(dir, p.replace('/', os.sep))
+        opf.create_spine(entries)
+        opf.set_toc(toc)
+
+        with nested(open(opf_path, 'wb'), open(ncx_path, 'wb')) as (opf_file, ncx_file):
+            opf.render(opf_file, ncx_file)
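Both rewritten recipes honour the same parse_index contract: return a list of (section_title, articles) pairs in which every article is a dict carrying at least a title and a url. A hedged illustration of the expected shape (the sample values are invented, not scraped):

def example_index():
    # Shape consumed by the feeds machinery: [(section title, [article dicts]), ...]
    return [
        (u'要聞', [
            {'title': u'Sample headline',
             'url': 'http://hkm.appledaily.com/detail.php?guid=0',
             'description': ''},
        ]),
    ]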
@ -37,68 +37,15 @@ class BusinessWeek(BasicNewsRecipe):
                         , 'language' : language
                         }

-    #remove_tags = [
-    #dict(attrs={'class':'inStory'})
-    #,dict(name=['meta','link','iframe','base','embed','object','table','th','tr','td'])
-    #,dict(attrs={'id':['inset','videoDisplay']})
-    #]
-    #keep_only_tags = [dict(name='div', attrs={'id':['story-body','storyBody']})]
-    remove_attributes = ['lang']
-    match_regexps = [r'http://www.businessweek.com/.*_page_[1-9].*']
-
     feeds = [
-        (u'Top Stories', u'http://www.businessweek.com/topStories/rss/topStories.rss'),
-        (u'Top News', u'http://www.businessweek.com/rss/bwdaily.rss'),
-        (u'Asia', u'http://www.businessweek.com/rss/asia.rss'),
-        (u'Autos', u'http://www.businessweek.com/rss/autos/index.rss'),
-        (u'Classic Cars', u'http://rss.businessweek.com/bw_rss/classiccars'),
-        (u'Hybrids', u'http://rss.businessweek.com/bw_rss/hybrids'),
-        (u'Europe', u'http://www.businessweek.com/rss/europe.rss'),
-        (u'Auto Reviews', u'http://rss.businessweek.com/bw_rss/autoreviews'),
-        (u'Innovation & Design', u'http://www.businessweek.com/rss/innovate.rss'),
-        (u'Architecture', u'http://www.businessweek.com/rss/architecture.rss'),
-        (u'Brand Equity', u'http://www.businessweek.com/rss/brandequity.rss'),
-        (u'Auto Design', u'http://www.businessweek.com/rss/carbuff.rss'),
-        (u'Game Room', u'http://rss.businessweek.com/bw_rss/gameroom'),
-        (u'Technology', u'http://www.businessweek.com/rss/technology.rss'),
-        (u'Investing', u'http://rss.businessweek.com/bw_rss/investor'),
-        (u'Small Business', u'http://www.businessweek.com/rss/smallbiz.rss'),
-        (u'Careers', u'http://rss.businessweek.com/bw_rss/careers'),
-        (u'B-Schools', u'http://www.businessweek.com/rss/bschools.rss'),
-        (u'Magazine Selections', u'http://www.businessweek.com/rss/magazine.rss'),
-        (u'CEO Guide to Tech', u'http://www.businessweek.com/rss/ceo_guide_tech.rss'),
+        (u'Top Stories', u'http://www.businessweek.com/feeds/most-popular.rss'),
     ]

-    def get_article_url(self, article):
-        url = article.get('guid', None)
-        if 'podcasts' in url:
-            return None
-        if 'surveys' in url:
-            return None
-        if 'images' in url:
-            return None
-        if 'feedroom' in url:
-            return None
-        if '/magazine/toc/' in url:
-            return None
-        rurl, sep, rest = url.rpartition('?')
-        if rurl:
-            return rurl
-        return rest
-
     def print_version(self, url):
-        if '/news/' in url or '/blog/ in url':
-            return url
-        rurl = url.replace('http://www.businessweek.com/','http://www.businessweek.com/print/')
-        return rurl.replace('/investing/','/investor/')
-
-    def preprocess_html(self, soup):
-        for item in soup.findAll(style=True):
-            del item['style']
-        for alink in soup.findAll('a'):
-            if alink.string is not None:
-                tstr = alink.string
-                alink.replaceWith(tstr)
-        return soup
+        soup = self.index_to_soup(url)
+        prntver = soup.find('li', attrs={'class':'print tracked'})
+        rurl = prntver.find('a', href=True)['href']
+        return rurl
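The BusinessWeek change replaces hand-built print URLs with whatever the article page's own print button links to, which survives site restructuring. A self-contained sketch of that lookup over an already-fetched page (standalone bs4 stands in for calibre's index_to_soup; the function name is ours):

from bs4 import BeautifulSoup

def find_print_url(article_html):
    # The print version is the href of the <a> inside <li class="print tracked">.
    soup = BeautifulSoup(article_html, 'html.parser')
    li = soup.find('li', attrs={'class': 'print tracked'})
    if li is None:
        return None
    a = li.find('a', href=True)
    return a['href'] if a else None

html = '<ul><li class="print tracked"><a href="/print/x.html">Print</a></li></ul>'
print(find_print_url(html))  # -> /print/x.html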
@ -1,33 +1,23 @@
-# vim:fileencoding=UTF-8:ts=4:sw=4:sta:et:sts=4:ai
+# vim:fileencoding=UTF-8
+from __future__ import unicode_literals
 from calibre.web.feeds.news import BasicNewsRecipe

 class AListApart (BasicNewsRecipe):
-    __author__ = u'Marc Busqué <marc@lamarciana.com>'
+    __author__ = 'Marc Busqué <marc@lamarciana.com>'
     __url__ = 'http://www.lamarciana.com'
-    __version__ = '1.0'
+    __version__ = '2.0'
     __license__ = 'GPL v3'
-    __copyright__ = u'2012, Marc Busqué <marc@lamarciana.com>'
+    __copyright__ = '2012, Marc Busqué <marc@lamarciana.com>'
     title = u'A List Apart'
-    description = u'A List Apart Magazine (ISSN: 1534-0295) explores the design, development, and meaning of web content, with a special focus on web standards and best practices.'
+    description = u'A List Apart Magazine (ISSN: 1534-0295) explores the design, development, and meaning of web content, with a special focus on web standards and best practices. This recipe retrieves articles and columns.'
     language = 'en'
     tags = 'web development, software'
     oldest_article = 120
     remove_empty_feeds = True
-    no_stylesheets = True
     encoding = 'utf8'
     cover_url = u'http://alistapart.com/pix/alalogo.gif'
-    keep_only_tags = [
-        dict(name='div', attrs={'id': 'content'})
-    ]
-    remove_tags = [
-        dict(name='ul', attrs={'id': 'metastuff'}),
-        dict(name='div', attrs={'class': 'discuss'}),
-        dict(name='div', attrs={'class': 'discuss'}),
-        dict(name='div', attrs={'id': 'learnmore'}),
-    ]
-    remove_attributes = ['border', 'cellspacing', 'align', 'cellpadding', 'colspan', 'valign', 'vspace', 'hspace', 'alt', 'width', 'height']
-    extra_css = u'img {max-width: 100%; display: block; margin: auto;} #authorbio img {float: left; margin-right: 2%;}'
+    extra_css = u'img {max-width: 100%; display: block; margin: auto;}'

     feeds = [
-        (u'A List Apart', u'http://www.alistapart.com/site/rss'),
+        (u'A List Apart', u'http://feeds.feedburner.com/alistapart/abridged'),
     ]
@ -1,30 +1,30 @@
+# vim:fileencoding=UTF-8
+from __future__ import unicode_literals
 __license__ = 'GPL v3'
-__copyright__ = '2011, Eddie Lau'
+__copyright__ = '2011-2013, Eddie Lau'

 # data source: normal, mobile
 __Source__ = 'mobile'
 # please replace the following "True" with "False". (Default: True)
 __MakePeriodical__ = True
 # Turn below to True if your device supports display of CJK titles (Default: False)
-__UseChineseTitle__ = False
+__UseChineseTitle__ = True
 # Set it to False if you want to skip images (Default: True)
 __KeepImages__ = True
 # Set it to True if you want to include a summary in Kindle's article view (Default: False)
-__IncludeSummary__ = False
+__IncludeSummary__ = True
 # Set it to True if you want thumbnail images in Kindle's article view (Default: True)
 __IncludeThumbnails__ = True

 '''
 Change Log:
+2013/03/31 -- fix cover retrieval code and heading size, and remove &nbsp; in summary
 2011/12/29 -- first version done
-TODO:
-* use alternative source at http://m.singtao.com/index.php
 '''

 from calibre.utils.date import now as nowf
 import os, datetime, re
-from datetime import date
 from calibre.web.feeds.recipes import BasicNewsRecipe
 from contextlib import nested
 from calibre.ebooks.BeautifulSoup import BeautifulSoup
@ -41,7 +41,7 @@ class STHKRecipe(BasicNewsRecipe):
     title = 'Sing Tao Daily - Hong Kong'
     description = 'Hong Kong Chinese Newspaper (http://singtao.com)'
     category = 'Chinese, News, Hong Kong'
-    extra_css = 'img {display: block; margin-left: auto; margin-right: auto; margin-top: 10px; margin-bottom: 10px; max-height:90%;} td[class=caption] {font-size:50%;} td[class=bodyhead]{font-weight:bold; font-size:150%;} td[class=stmobheadline]{font-weight:bold; font-size:150%;}'
+    extra_css = 'img {display: block; margin-left: auto; margin-right: auto; margin-top: 10px; margin-bottom: 10px; max-height:90%;} td[class=caption] {font-size:50%;} td[class=bodyhead]{font-weight:bold; font-size:150%;} td[class=stmobheadline]{font-weight:bold; font-size:200%;}'
     masthead_url = 'http://upload.wikimedia.org/wikipedia/en/d/dd/Singtao-usa.png'
     if __Source__ == 'normal':
         keep_only_tags = [dict(name='td', attrs={'class':['bodyhead','bodytext']})]
@ -96,17 +96,13 @@ class STHKRecipe(BasicNewsRecipe):
         return self.get_dtlocal().strftime("%d")

     def get_cover_url(self):
-        #cover = 'http://singtao.com/media/a/a(2660).jpg' # for 2011/12/29
-        base = 2660
-        todaydate = date(int(self.get_fetchyear()), int(self.get_fetchmonth()), int(self.get_fetchday()))
-        diff = todaydate - date(2011, 12, 29)
-        base = base + int(diff.total_seconds()/(3600*24))
-        cover = 'http://singtao.com/media/a/a(' + str(base) +').jpg'
+        soup = self.index_to_soup('http://m.singtao.com/')
+        cover = soup.find(attrs={'class':'special'}).get('src', False)
         br = BasicNewsRecipe.get_browser(self)
         try:
             br.open(cover)
         except:
-            cover = 'http://singtao.com/images/stlogo.gif'
+            cover = None
         return cover

     def parse_index(self):
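The deleted cover logic is worth recording because of its magic constant: cover images were numbered serially, a(2660).jpg was the cover for 2011-12-29, and the number advanced by one per day, so the recipe derived today's number from the calendar. A sketch of that retired arithmetic (the function name is ours):

from datetime import date

def guess_cover_url(today):
    # a(2660).jpg was the cover on 2011-12-29; one new serial per day.
    serial = 2660 + (today - date(2011, 12, 29)).days
    return 'http://singtao.com/media/a/a(%d).jpg' % serial

print(guess_cover_url(date(2011, 12, 30)))  # -> .../a(2661).jpg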
@ -289,11 +285,11 @@ class STHKRecipe(BasicNewsRecipe):
                 # the text may or may not be enclosed in <p></p> tag
                 paras = articlebody.findAll('p')
                 if not paras:
                     paras = articlebody
                 textFound = False
                 for p in paras:
                     if not textFound:
-                        summary_candidate = self.tag_to_string(p).strip()
+                        summary_candidate = self.tag_to_string(p).strip().replace('&nbsp;', '')
                         if len(summary_candidate) > 0:
                             summary_candidate = summary_candidate.replace(u'(\u661f\u5cf6\u65e5\u5831\u5831\u9053)', '', 1)
                             article.summary = article.text_summary = summary_candidate
@ -489,3 +485,4 @@ class STHKRecipe(BasicNewsRecipe):



+
@ -47,6 +47,10 @@ binary_includes = [
     '/usr/lib/libgthread-2.0.so.0',
     '/usr/lib/libpng14.so.14',
     '/usr/lib/libexslt.so.0',
+    # Ensure that libimobiledevice is compiled against openssl, not gnutls
+    '/usr/lib/libimobiledevice.so.3',
+    '/usr/lib/libusbmuxd.so.2',
+    '/usr/lib/libplist.so.1',
     MAGICK_PREFIX+'/lib/libMagickWand.so.5',
     MAGICK_PREFIX+'/lib/libMagickCore.so.5',
     '/usr/lib/libgcrypt.so.11',
@ -399,7 +399,8 @@ class Py2App(object):
     @flush
     def add_fontconfig(self):
         info('\nAdding fontconfig')
-        for x in ('fontconfig.1', 'freetype.6', 'expat.1'):
+        for x in ('fontconfig.1', 'freetype.6', 'expat.1',
+                  'plist.1', 'usbmuxd.2', 'imobiledevice.3'):
             src = os.path.join(SW, 'lib', 'lib'+x+'.dylib')
             self.install_dylib(src)
         dst = os.path.join(self.resources_dir, 'fonts')
@ -262,6 +262,35 @@ def from_links(container):
             toc.remove(child)
     return toc

+def find_text(node):
+    LIMIT = 200
+    pat = re.compile(r'\s+')
+    for child in node:
+        if isinstance(child, etree._Element):
+            text = xml2text(child).strip()
+            text = pat.sub(' ', text)
+            if len(text) < 1:
+                continue
+            if len(text) > LIMIT:
+                # Look for less text in a child of this node, recursively
+                ntext = find_text(child)
+                return ntext or (text[:LIMIT] + '...')
+            else:
+                return text
+
+def from_files(container):
+    toc = TOC()
+    for spinepath in container.spine_items:
+        name = container.abspath_to_name(spinepath)
+        root = container.parsed(name)
+        body = XPath('//h:body')(root)
+        if not body:
+            continue
+        text = find_text(body[0])
+        if text:
+            toc.add(text, name)
+    return toc
+
 def add_id(container, name, loc):
     root = container.parsed(name)
     body = root.xpath('//*[local-name()="body"]')[0]
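from_files titles each spine entry with the file's "first line" of text; find_text recurses whenever a node's text exceeds LIMIT characters, so a short heading is preferred over a whole chapter body. The same recursion over plain lxml, as a self-contained sketch (itertext stands in for calibre's xml2text; the function name first_line is ours):

import re
from lxml import etree

LIMIT = 200
pat = re.compile(r'\s+')

def first_line(node):
    for child in node:
        if isinstance(child, etree._Element):
            text = pat.sub(' ', ''.join(child.itertext()).strip())
            if len(text) < 1:
                continue
            if len(text) > LIMIT:
                # Prefer a shorter run of text from a descendant
                return first_line(child) or (text[:LIMIT] + '...')
            else:
                return text

body = etree.fromstring(
    '<body><div><h1>Chapter One</h1><p>' + 'x' * 300 + '</p></div></body>')
print(first_line(body))  # -> 'Chapter One'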
@ -333,8 +333,8 @@ class OEBReader(object):
         guide = self.oeb.guide
         manifest = self.oeb.manifest
         for elem in xpath(opf, '/o2:package/o2:guide/o2:reference'):
-            href = elem.get('href')
-            path = urlnormalize(urldefrag(href)[0])
+            ref_href = elem.get('href')
+            path = urlnormalize(urldefrag(ref_href)[0])
             if path not in manifest.hrefs:
                 corrected_href = None
                 for href in manifest.hrefs:
@ -342,12 +342,12 @@ class OEBReader(object):
                         corrected_href = href
                         break
                 if corrected_href is None:
-                    self.logger.warn(u'Guide reference %r not found' % href)
+                    self.logger.warn(u'Guide reference %r not found' % ref_href)
                     continue
-                href = corrected_href
+                ref_href = corrected_href
             typ = elem.get('type')
             if typ not in guide:
-                guide.add(typ, elem.get('title'), href)
+                guide.add(typ, elem.get('title'), ref_href)

     def _find_ncx(self, opf):
         result = xpath(opf, '/o2:package/o2:spine/@toc')
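The href to ref_href rename fixes a classic shadowing bug: the inner `for href in manifest.hrefs` loop reuses the outer name, so by the time the warning fires or the guide entry is added, href no longer holds the value read from the <reference> element. A stripped-down illustration of the failure mode:

def lookup(candidates, wanted):
    href = wanted
    for href in candidates:   # silently clobbers the outer binding
        if href == wanted:
            break
    return href               # last candidate if nothing matched

print(lookup(['a.html', 'b.html'], 'missing.html'))  # -> 'b.html', not 'missing.html'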
@ -18,7 +18,7 @@ from PyQt4.Qt import (QPushButton, QFrame, QVariant, QMenu, QInputDialog,
 from calibre.ebooks.oeb.polish.container import get_container, AZW3Container
 from calibre.ebooks.oeb.polish.toc import (
-    get_toc, add_id, TOC, commit_toc, from_xpaths, from_links)
+    get_toc, add_id, TOC, commit_toc, from_xpaths, from_links, from_files)
 from calibre.gui2 import Application, error_dialog, gprefs
 from calibre.gui2.progress_indicator import ProgressIndicator
 from calibre.gui2.toc.location import ItemEdit
@ -126,6 +126,7 @@ class ItemView(QFrame): # {{{
     go_to_root = pyqtSignal()
     create_from_xpath = pyqtSignal(object)
     create_from_links = pyqtSignal()
+    create_from_files = pyqtSignal()
     flatten_toc = pyqtSignal()

     def __init__(self, parent):
@ -183,6 +184,15 @@ class ItemView(QFrame): # {{{
             )))
         l.addWidget(b)

+        self.cfb = b = QPushButton(_('Generate ToC from &files'))
+        b.clicked.connect(self.create_from_files)
+        b.setToolTip(textwrap.fill(_(
+            'Generate a Table of Contents from individual files in the book.'
+            ' Each entry in the ToC will point to the start of the file, the'
+            ' text of the entry will be the "first line" of text from the file.'
+        )))
+        l.addWidget(b)
+
         self.xpb = b = QPushButton(_('Generate ToC from &XPath'))
         b.clicked.connect(self.create_from_user_xpath)
         b.setToolTip(textwrap.fill(_(
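The GUI plumbing for the new button follows the file's existing pattern: a zero-argument signal on ItemView, a button whose clicked signal is chained straight to it, and a TOCView slot connected at setup time. A minimal sketch of that signal-to-signal chaining (PyQt4-era; the demo class is ours, stripped to the one button):

from PyQt4.Qt import QWidget, QPushButton, QVBoxLayout, pyqtSignal

class FilesButtonDemo(QWidget):
    create_from_files = pyqtSignal()

    def __init__(self, parent=None):
        QWidget.__init__(self, parent)
        l = QVBoxLayout(self)
        b = QPushButton('Generate ToC from files', self)
        # clicked(bool) chained to a zero-arg signal: Qt drops the argument
        b.clicked.connect(self.create_from_files)
        l.addWidget(b)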
@ -577,6 +587,7 @@ class TOCView(QWidget): # {{{
         i.add_new_item.connect(self.add_new_item)
         i.create_from_xpath.connect(self.create_from_xpath)
         i.create_from_links.connect(self.create_from_links)
+        i.create_from_files.connect(self.create_from_files)
         i.flatten_item.connect(self.flatten_item)
         i.flatten_toc.connect(self.flatten_toc)
         i.go_to_root.connect(self.go_to_root)
@ -778,6 +789,14 @@ class TOCView(QWidget): # {{{
                 _('No links were found that could be added to the Table of Contents.'), show=True)
         self.insert_toc_fragment(toc)

+    def create_from_files(self):
+        toc = from_files(self.ebook)
+        if len(toc) == 0:
+            return error_dialog(self, _('No items found'),
+                _('No files were found that could be added to the Table of Contents.'), show=True)
+        self.insert_toc_fragment(toc)
+
 # }}}

 class TOCEditor(QDialog): # {{{
@ -22507,7 +22507,7 @@ msgstr "Autoren beginnend mit '%s'"
 #: /home/kovid/work/calibre/src/calibre/library/catalogs/epub_mobi_builder.py:3477
 #, python-format
 msgid "Authors beginning with '%s'"
-msgstr "Autoren beginnen mit mit %s"
+msgstr "Autoren beginnen mit %s"

 #: /home/kovid/work/calibre/src/calibre/library/catalogs/epub_mobi_builder.py:3518
 msgid "NCX for Recently Added"