Mirror of https://github.com/kovidgoyal/calibre.git

Merge from trunk
commit 22a1422ec6

recipes/am730.recipe (new file, 290 lines)
@@ -0,0 +1,290 @@
# vim:fileencoding=UTF-8
from __future__ import unicode_literals
__license__ = 'GPL v3'
__copyright__ = '2013, Eddie Lau'
__Date__ = ''
__HiResImg__ = True

'''
Change Log:
2013/03/30 -- first version
'''

from calibre import (__appname__, force_unicode, strftime)
from calibre.utils.date import now as nowf
import os, datetime, re
from calibre.web.feeds.recipes import BasicNewsRecipe
from contextlib import nested
from calibre.ebooks.BeautifulSoup import BeautifulSoup, Tag
from calibre.ebooks.metadata.opf2 import OPFCreator
from calibre.ebooks.metadata.toc import TOC
from calibre.ebooks.metadata import MetaInformation
from calibre.utils.localization import canonicalize_lang


class AppleDaily(BasicNewsRecipe):
    title = u'AM730'
    __author__ = 'Eddie Lau'
    publisher = 'AM730'
    oldest_article = 1
    max_articles_per_feed = 100
    auto_cleanup = False
    language = 'zh'
    encoding = 'utf-8'
    remove_javascript = True
    use_embedded_content = False
    no_stylesheets = True
    description = 'http://www.am730.com.hk'
    category = 'Chinese, News, Hong Kong'
    masthead_url = 'http://www.am730.com.hk/images/logo.jpg'

    extra_css = 'img {display: block; margin-left: auto; margin-right: auto; margin-top: 10px; margin-bottom: 10px; max-height:90%;} div[id=articleHeader] {font-size:200%; text-align:left; font-weight:bold;} photocaption {font-size:50%; margin-left:auto; margin-right:auto;}'
    keep_only_tags = [dict(name='div', attrs={'id':'articleHeader'}),
                      dict(name='div', attrs={'class':'thecontent wordsnap'}),
                      dict(name='a', attrs={'class':'lightboximg'})]
    remove_tags = [dict(name='img', attrs={'src':'/images/am730_article_logo.jpg'}),
                   dict(name='img', attrs={'src':'/images/am_endmark.gif'})]

    def get_dtlocal(self):
        dt_utc = datetime.datetime.utcnow()
        # convert UTC to local hk time - at HKT 6am, all news are available
        return dt_utc + datetime.timedelta(8.0/24) - datetime.timedelta(6.0/24)

    def get_fetchdate(self):
        if __Date__ != '':
            return __Date__
        else:
            return self.get_dtlocal().strftime("%Y%m%d")

    def get_fetchformatteddate(self):
        if __Date__ != '':
            return __Date__[0:4]+'-'+__Date__[4:6]+'-'+__Date__[6:8]
        else:
            return self.get_dtlocal().strftime("%Y-%m-%d")

    def get_fetchyear(self):
        if __Date__ != '':
            return __Date__[0:4]
        else:
            return self.get_dtlocal().strftime("%Y")

    def get_fetchmonth(self):
        if __Date__ != '':
            return __Date__[4:6]
        else:
            return self.get_dtlocal().strftime("%m")

    def get_fetchday(self):
        if __Date__ != '':
            return __Date__[6:8]
        else:
            return self.get_dtlocal().strftime("%d")

    # Note: does not work with custom date given by __Date__
    def get_weekday(self):
        return self.get_dtlocal().weekday()

    def populate_article_metadata(self, article, soup, first):
        if first and hasattr(self, 'add_toc_thumbnail'):
            picdiv = soup.find('img')
            if picdiv is not None:
                self.add_toc_thumbnail(article, picdiv['src'])

    def parse_index(self):
        feeds = []
        soup = self.index_to_soup('http://www.am730.com.hk/')
        ul = soup.find(attrs={'class':'nav-section'})
        sectionList = []
        for li in ul.findAll('li'):
            a = 'http://www.am730.com.hk/' + li.find('a', href=True).get('href', False)
            title = li.find('a').get('title', False).strip()
            sectionList.append((title, a))
        for title, url in sectionList:
            articles = self.parse_section(url)
            if articles:
                feeds.append((title, articles))
        return feeds

    def parse_section(self, url):
        soup = self.index_to_soup(url)
        items = soup.findAll(attrs={'style':'padding-bottom: 15px;'})
        current_articles = []
        for item in items:
            a = item.find(attrs={'class':'t6 f14'}).find('a', href=True)
            articlelink = 'http://www.am730.com.hk/' + a.get('href', False)
            title = self.tag_to_string(a)
            description = self.tag_to_string(item.find(attrs={'class':'t3 f14'}))
            current_articles.append({'title': title, 'url': articlelink, 'description': description})
        return current_articles
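    # Each article dict above follows calibre's parse_index() convention:
    # 'title' and 'url' are required, while 'date', 'description' and
    # 'content' are optional keys.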
    def preprocess_html(self, soup):
        multia = soup.findAll('a')
        for a in multia:
            if a is not None:
                image = a.find('img')
                if image is not None:
                    if __HiResImg__:
                        image['src'] = image.get('src').replace('/thumbs/', '/')
                    caption = image.get('alt')
                    tag = Tag(soup, "photo", [])
                    tag2 = Tag(soup, "photocaption", [])
                    tag.insert(0, image)
                    if caption is not None:
                        tag2.insert(0, caption)
                    tag.insert(1, tag2)
                    a.replaceWith(tag)
        return soup
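    # <photo> and <photocaption> are not standard HTML tags; they exist so
    # the photocaption rule in extra_css above can style the caption text.
    # The __HiResImg__ substitution assumes the full-size image lives at
    # the same path minus the '/thumbs/' segment.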
    def create_opf(self, feeds, dir=None):
        if dir is None:
            dir = self.output_dir
        title = self.short_title()
        if self.output_profile.periodical_date_in_title:
            title += strftime(self.timefmt)
        mi = MetaInformation(title, [__appname__])
        mi.publisher = __appname__
        mi.author_sort = __appname__
        if self.publication_type:
            mi.publication_type = 'periodical:'+self.publication_type+':'+self.short_title()
        mi.timestamp = nowf()
        article_titles, aseen = [], set()
        for f in feeds:
            for a in f:
                if a.title and a.title not in aseen:
                    aseen.add(a.title)
                    article_titles.append(force_unicode(a.title, 'utf-8'))

        mi.comments = self.description
        if not isinstance(mi.comments, unicode):
            mi.comments = mi.comments.decode('utf-8', 'replace')
        mi.comments += ('\n\n' + _('Articles in this issue: ') + '\n' +
                        '\n\n'.join(article_titles))

        language = canonicalize_lang(self.language)
        if language is not None:
            mi.language = language
        # This one affects the pub date shown in kindle title
        #mi.pubdate = nowf()
        # now appears to need the time field to be > 12.00noon as well
        mi.pubdate = datetime.datetime(int(self.get_fetchyear()), int(self.get_fetchmonth()), int(self.get_fetchday()), 12, 30, 0)
        opf_path = os.path.join(dir, 'index.opf')
        ncx_path = os.path.join(dir, 'index.ncx')

        opf = OPFCreator(dir, mi)
        # Add mastheadImage entry to <guide> section
        mp = getattr(self, 'masthead_path', None)
        if mp is not None and os.access(mp, os.R_OK):
            from calibre.ebooks.metadata.opf2 import Guide
            ref = Guide.Reference(os.path.basename(self.masthead_path), os.getcwdu())
            ref.type = 'masthead'
            ref.title = 'Masthead Image'
            opf.guide.append(ref)

        manifest = [os.path.join(dir, 'feed_%d'%i) for i in range(len(feeds))]
        manifest.append(os.path.join(dir, 'index.html'))
        manifest.append(os.path.join(dir, 'index.ncx'))

        # Get cover
        cpath = getattr(self, 'cover_path', None)
        if cpath is None:
            pf = open(os.path.join(dir, 'cover.jpg'), 'wb')
            if self.default_cover(pf):
                cpath = pf.name
        if cpath is not None and os.access(cpath, os.R_OK):
            opf.cover = cpath
            manifest.append(cpath)

        # Get masthead
        mpath = getattr(self, 'masthead_path', None)
        if mpath is not None and os.access(mpath, os.R_OK):
            manifest.append(mpath)

        opf.create_manifest_from_files_in(manifest)
        for mani in opf.manifest:
            if mani.path.endswith('.ncx'):
                mani.id = 'ncx'
            if mani.path.endswith('mastheadImage.jpg'):
                mani.id = 'masthead-image'

        entries = ['index.html']
        toc = TOC(base_path=dir)
        self.play_order_counter = 0
        self.play_order_map = {}

        def feed_index(num, parent):
            f = feeds[num]
            for j, a in enumerate(f):
                if getattr(a, 'downloaded', False):
                    adir = 'feed_%d/article_%d/'%(num, j)
                    auth = a.author
                    if not auth:
                        auth = None
                    desc = a.text_summary
                    if not desc:
                        desc = None
                    else:
                        desc = self.description_limiter(desc)
                    tt = a.toc_thumbnail if a.toc_thumbnail else None
                    entries.append('%sindex.html'%adir)
                    po = self.play_order_map.get(entries[-1], None)
                    if po is None:
                        self.play_order_counter += 1
                        po = self.play_order_counter
                    parent.add_item('%sindex.html'%adir, None,
                                    a.title if a.title else _('Untitled Article'),
                                    play_order=po, author=auth,
                                    description=desc, toc_thumbnail=tt)
                    last = os.path.join(self.output_dir, ('%sindex.html'%adir).replace('/', os.sep))
                    for sp in a.sub_pages:
                        prefix = os.path.commonprefix([opf_path, sp])
                        relp = sp[len(prefix):]
                        entries.append(relp.replace(os.sep, '/'))
                        last = sp

                    if os.path.exists(last):
                        with open(last, 'rb') as fi:
                            src = fi.read().decode('utf-8')
                        soup = BeautifulSoup(src)
                        body = soup.find('body')
                        if body is not None:
                            prefix = '/'.join('..' for i in range(2*len(re.findall(r'link\d+', last))))
                            templ = self.navbar.generate(True, num, j, len(f),
                                                         not self.has_single_feed,
                                                         a.orig_url, __appname__, prefix=prefix,
                                                         center=self.center_navbar)
                            elem = BeautifulSoup(templ.render(doctype='xhtml').decode('utf-8')).find('div')
                            body.insert(len(body.contents), elem)
                            with open(last, 'wb') as fi:
                                fi.write(unicode(soup).encode('utf-8'))
        if len(feeds) == 0:
            raise Exception('All feeds are empty, aborting.')

        if len(feeds) > 1:
            for i, f in enumerate(feeds):
                entries.append('feed_%d/index.html'%i)
                po = self.play_order_map.get(entries[-1], None)
                if po is None:
                    self.play_order_counter += 1
                    po = self.play_order_counter
                auth = getattr(f, 'author', None)
                if not auth:
                    auth = None
                desc = getattr(f, 'description', None)
                if not desc:
                    desc = None
                feed_index(i, toc.add_item('feed_%d/index.html'%i, None,
                                           f.title, play_order=po, description=desc, author=auth))
        else:
            entries.append('feed_%d/index.html'%0)
            feed_index(0, toc)

        for i, p in enumerate(entries):
            entries[i] = os.path.join(dir, p.replace('/', os.sep))
        opf.create_spine(entries)
        opf.set_toc(toc)

        with nested(open(opf_path, 'wb'), open(ncx_path, 'wb')) as (opf_file, ncx_file):
            opf.render(opf_file, ncx_file)
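The date arithmetic above is shared by several of the Hong Kong recipes in this commit. A small standalone sketch (assumption: plain Python outside calibre; fetch_date is a hypothetical helper mirroring get_dtlocal() and get_fetchdate()) makes the 6am HKT rollover concrete:

    import datetime

    def fetch_date(dt_utc):
        # Mirrors get_dtlocal(): HKT is UTC+8, and pulling the clock back a
        # further 6 hours makes the fetch date roll over at 6am HKT, when
        # the full day's edition is online.
        dt_local = dt_utc + datetime.timedelta(8.0/24) - datetime.timedelta(6.0/24)
        return dt_local.strftime('%Y%m%d')

    # 21:30 UTC is 05:30 HKT the next day -- still the previous day's paper:
    print(fetch_date(datetime.datetime(2013, 3, 29, 21, 30)))  # 20130329
    # An hour later (06:30 HKT) the new edition counts as available:
    print(fetch_date(datetime.datetime(2013, 3, 29, 22, 30)))  # 20130330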
@@ -1,161 +1,275 @@
# vim:fileencoding=UTF-8
from __future__ import unicode_literals
__license__ = 'GPL v3'
__copyright__ = '2013, Eddie Lau'
__Date__ = ''

from calibre import (__appname__, force_unicode, strftime)
from calibre.utils.date import now as nowf
import os, datetime, re
from calibre.web.feeds.recipes import BasicNewsRecipe
from contextlib import nested
from calibre.ebooks.BeautifulSoup import BeautifulSoup
from calibre.ebooks.metadata.opf2 import OPFCreator
from calibre.ebooks.metadata.toc import TOC
from calibre.ebooks.metadata import MetaInformation
from calibre.utils.localization import canonicalize_lang


class AppleDaily(BasicNewsRecipe):
    title = u'蘋果日報 (香港)'
    __author__ = 'Eddie Lau'
    publisher = '蘋果日報'
    oldest_article = 1
    max_articles_per_feed = 100
    auto_cleanup = False
    language = 'zh'
    encoding = 'utf-8'
    remove_javascript = True
    use_embedded_content = False
    no_stylesheets = True
    description = 'http://hkm.appledaily.com/'
    category = 'Chinese, News, Hong Kong'
    masthead_url = 'http://upload.wikimedia.org/wikipedia/zh/c/cf/AppleDailyLogo1.png'

    extra_css = 'img {display: block; margin-left: auto; margin-right: auto; margin-top: 10px; margin-bottom: 10px; max-height:90%;} h1 {font-size:200%; text-align:left; font-weight:bold;} p[class=video-caption] {font-size:50%; margin-left:auto; margin-right:auto;}'
    keep_only_tags = [dict(name='div', attrs={'id':'content-article'})]
    remove_tags = [dict(name='div', attrs={'class':'prev-next-btn'}),
                   dict(name='p', attrs={'class':'next'})]

    def get_dtlocal(self):
        dt_utc = datetime.datetime.utcnow()
        # convert UTC to local hk time - at HKT 6am, all news are available
        return dt_utc + datetime.timedelta(8.0/24) - datetime.timedelta(6.0/24)

    def get_fetchdate(self):
        if __Date__ != '':
            return __Date__
        else:
            return self.get_dtlocal().strftime("%Y%m%d")

    def get_fetchformatteddate(self):
        if __Date__ != '':
            return __Date__[0:4]+'-'+__Date__[4:6]+'-'+__Date__[6:8]
        else:
            return self.get_dtlocal().strftime("%Y-%m-%d")

    def get_fetchyear(self):
        if __Date__ != '':
            return __Date__[0:4]
        else:
            return self.get_dtlocal().strftime("%Y")

    def get_fetchmonth(self):
        if __Date__ != '':
            return __Date__[4:6]
        else:
            return self.get_dtlocal().strftime("%m")

    def get_fetchday(self):
        if __Date__ != '':
            return __Date__[6:8]
        else:
            return self.get_dtlocal().strftime("%d")

    # Note: does not work with custom date given by __Date__
    def get_weekday(self):
        return self.get_dtlocal().weekday()

    def get_cover_url(self):
        soup = self.index_to_soup('http://hkm.appledaily.com/')
        cover = soup.find(attrs={'class':'top-news'}).get('src', False)
        br = BasicNewsRecipe.get_browser(self)
        try:
            br.open(cover)
        except:
            cover = None
        return cover

    def populate_article_metadata(self, article, soup, first):
        if first and hasattr(self, 'add_toc_thumbnail'):
            picdiv = soup.find('img')
            if picdiv is not None:
                self.add_toc_thumbnail(article, picdiv['src'])

    def parse_index(self):
        feeds = []
        soup = self.index_to_soup('http://hkm.appledaily.com/')
        ul = soup.find(attrs={'class':'menu'})
        sectionList = []
        for li in ul.findAll('li'):
            a = 'http://hkm.appledaily.com/' + li.find('a', href=True).get('href', False)
            title = li.find('a', text=True).strip()
            if not title == u'動新聞':
                sectionList.append((title, a))
        for title, url in sectionList:
            articles = self.parse_section(url)
            if articles:
                feeds.append((title, articles))
        return feeds

    def parse_section(self, url):
        soup = self.index_to_soup(url)
        ul = soup.find(attrs={'class':'list'})
        current_articles = []
        for li in ul.findAll('li'):
            a = li.find('a', href=True)
            title = li.find('p', text=True).strip()
            if a is not None:
                current_articles.append({'title': title, 'url': 'http://hkm.appledaily.com/' + a.get('href', False)})
        return current_articles

    def create_opf(self, feeds, dir=None):
        if dir is None:
            dir = self.output_dir
        title = self.short_title()
        if self.output_profile.periodical_date_in_title:
            title += strftime(self.timefmt)
        mi = MetaInformation(title, [__appname__])
        mi.publisher = __appname__
        mi.author_sort = __appname__
        if self.publication_type:
            mi.publication_type = 'periodical:'+self.publication_type+':'+self.short_title()
        mi.timestamp = nowf()
        article_titles, aseen = [], set()
        for f in feeds:
            for a in f:
                if a.title and a.title not in aseen:
                    aseen.add(a.title)
                    article_titles.append(force_unicode(a.title, 'utf-8'))

        mi.comments = self.description
        if not isinstance(mi.comments, unicode):
            mi.comments = mi.comments.decode('utf-8', 'replace')
        mi.comments += ('\n\n' + _('Articles in this issue: ') + '\n' +
                        '\n\n'.join(article_titles))

        language = canonicalize_lang(self.language)
        if language is not None:
            mi.language = language
        # This one affects the pub date shown in kindle title
        #mi.pubdate = nowf()
        # now appears to need the time field to be > 12.00noon as well
        mi.pubdate = datetime.datetime(int(self.get_fetchyear()), int(self.get_fetchmonth()), int(self.get_fetchday()), 12, 30, 0)
        opf_path = os.path.join(dir, 'index.opf')
        ncx_path = os.path.join(dir, 'index.ncx')

        opf = OPFCreator(dir, mi)
        # Add mastheadImage entry to <guide> section
        mp = getattr(self, 'masthead_path', None)
        if mp is not None and os.access(mp, os.R_OK):
            from calibre.ebooks.metadata.opf2 import Guide
            ref = Guide.Reference(os.path.basename(self.masthead_path), os.getcwdu())
            ref.type = 'masthead'
            ref.title = 'Masthead Image'
            opf.guide.append(ref)

        manifest = [os.path.join(dir, 'feed_%d'%i) for i in range(len(feeds))]
        manifest.append(os.path.join(dir, 'index.html'))
        manifest.append(os.path.join(dir, 'index.ncx'))

        # Get cover
        cpath = getattr(self, 'cover_path', None)
        if cpath is None:
            pf = open(os.path.join(dir, 'cover.jpg'), 'wb')
            if self.default_cover(pf):
                cpath = pf.name
        if cpath is not None and os.access(cpath, os.R_OK):
            opf.cover = cpath
            manifest.append(cpath)

        # Get masthead
        mpath = getattr(self, 'masthead_path', None)
        if mpath is not None and os.access(mpath, os.R_OK):
            manifest.append(mpath)

        opf.create_manifest_from_files_in(manifest)
        for mani in opf.manifest:
            if mani.path.endswith('.ncx'):
                mani.id = 'ncx'
            if mani.path.endswith('mastheadImage.jpg'):
                mani.id = 'masthead-image'

        entries = ['index.html']
        toc = TOC(base_path=dir)
        self.play_order_counter = 0
        self.play_order_map = {}

        def feed_index(num, parent):
            f = feeds[num]
            for j, a in enumerate(f):
                if getattr(a, 'downloaded', False):
                    adir = 'feed_%d/article_%d/'%(num, j)
                    auth = a.author
                    if not auth:
                        auth = None
                    desc = a.text_summary
                    if not desc:
                        desc = None
                    else:
                        desc = self.description_limiter(desc)
                    tt = a.toc_thumbnail if a.toc_thumbnail else None
                    entries.append('%sindex.html'%adir)
                    po = self.play_order_map.get(entries[-1], None)
                    if po is None:
                        self.play_order_counter += 1
                        po = self.play_order_counter
                    parent.add_item('%sindex.html'%adir, None,
                                    a.title if a.title else _('Untitled Article'),
                                    play_order=po, author=auth,
                                    description=desc, toc_thumbnail=tt)
                    last = os.path.join(self.output_dir, ('%sindex.html'%adir).replace('/', os.sep))
                    for sp in a.sub_pages:
                        prefix = os.path.commonprefix([opf_path, sp])
                        relp = sp[len(prefix):]
                        entries.append(relp.replace(os.sep, '/'))
                        last = sp

                    if os.path.exists(last):
                        with open(last, 'rb') as fi:
                            src = fi.read().decode('utf-8')
                        soup = BeautifulSoup(src)
                        body = soup.find('body')
                        if body is not None:
                            prefix = '/'.join('..' for i in range(2*len(re.findall(r'link\d+', last))))
                            templ = self.navbar.generate(True, num, j, len(f),
                                                         not self.has_single_feed,
                                                         a.orig_url, __appname__, prefix=prefix,
                                                         center=self.center_navbar)
                            elem = BeautifulSoup(templ.render(doctype='xhtml').decode('utf-8')).find('div')
                            body.insert(len(body.contents), elem)
                            with open(last, 'wb') as fi:
                                fi.write(unicode(soup).encode('utf-8'))
        if len(feeds) == 0:
            raise Exception('All feeds are empty, aborting.')

        if len(feeds) > 1:
            for i, f in enumerate(feeds):
                entries.append('feed_%d/index.html'%i)
                po = self.play_order_map.get(entries[-1], None)
                if po is None:
                    self.play_order_counter += 1
                    po = self.play_order_counter
                auth = getattr(f, 'author', None)
                if not auth:
                    auth = None
                desc = getattr(f, 'description', None)
                if not desc:
                    desc = None
                feed_index(i, toc.add_item('feed_%d/index.html'%i, None,
                                           f.title, play_order=po, description=desc, author=auth))
        else:
            entries.append('feed_%d/index.html'%0)
            feed_index(0, toc)

        for i, p in enumerate(entries):
            entries[i] = os.path.join(dir, p.replace('/', os.sep))
        opf.create_spine(entries)
        opf.set_toc(toc)

        with nested(open(opf_path, 'wb'), open(ncx_path, 'wb')) as (opf_file, ncx_file):
            opf.render(opf_file, ncx_file)
@@ -37,68 +37,15 @@ class BusinessWeek(BasicNewsRecipe):
        , 'language' : language
        }

    remove_attributes = ['lang']

    feeds = [
        (u'Top Stories', u'http://www.businessweek.com/feeds/most-popular.rss'),
    ]

    def get_article_url(self, article):
        url = article.get('guid', None)
        if 'podcasts' in url:
            return None
        if 'surveys' in url:
            return None
        if 'images' in url:
            return None
        if 'feedroom' in url:
            return None
        if '/magazine/toc/' in url:
            return None
        rurl, sep, rest = url.rpartition('?')
        if rurl:
            return rurl
        return rest

    def print_version(self, url):
        if '/news/' in url or '/blog/' in url:
            return url
        soup = self.index_to_soup(url)
        prntver = soup.find('li', attrs={'class':'print tracked'})
        rurl = prntver.find('a', href=True)['href']
        return rurl

    def preprocess_html(self, soup):
        for item in soup.findAll(style=True):
            del item['style']
        for alink in soup.findAll('a'):
            if alink.string is not None:
                tstr = alink.string
                alink.replaceWith(tstr)
        return soup
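The guid cleanup in get_article_url() leans on str.rpartition; a quick standalone check (plain Python, hypothetical URLs) shows it strips a trailing query string and passes bare URLs through unchanged:

    for url in ('http://www.businessweek.com/articles/x?campaign_id=rss',
                'http://www.businessweek.com/articles/x'):
        rurl, sep, rest = url.rpartition('?')
        # rpartition returns ('', '', url) when '?' is absent, so
        # `rurl or rest` is the query-stripped URL either way.
        print(rurl or rest)  # -> http://www.businessweek.com/articles/x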
@@ -1,33 +1,23 @@
# vim:fileencoding=UTF-8
from __future__ import unicode_literals
from calibre.web.feeds.news import BasicNewsRecipe


class AListApart (BasicNewsRecipe):
    __author__ = 'Marc Busqué <marc@lamarciana.com>'
    __url__ = 'http://www.lamarciana.com'
    __version__ = '2.0'
    __license__ = 'GPL v3'
    __copyright__ = '2012, Marc Busqué <marc@lamarciana.com>'
    title = u'A List Apart'
    description = u'A List Apart Magazine (ISSN: 1534-0295) explores the design, development, and meaning of web content, with a special focus on web standards and best practices. This recipe retrieves articles and columns.'
    language = 'en'
    tags = 'web development, software'
    oldest_article = 120
    remove_empty_feeds = True
    no_stylesheets = True
    encoding = 'utf8'
    cover_url = u'http://alistapart.com/pix/alalogo.gif'
    extra_css = u'img {max-width: 100%; display: block; margin: auto;}'

    feeds = [
        (u'A List Apart', u'http://feeds.feedburner.com/alistapart/abridged'),
    ]
@@ -1,30 +1,30 @@
# vim:fileencoding=UTF-8
from __future__ import unicode_literals
__license__ = 'GPL v3'
__copyright__ = '2011-2013, Eddie Lau'

# data source: normal, mobile
__Source__ = 'mobile'
# Users of Kindle 3 with limited system-level CJK support
# please replace the following "True" with "False". (Default: True)
__MakePeriodical__ = True
# Turn below to True if your device supports display of CJK titles (Default: False)
__UseChineseTitle__ = True
# Set it to False if you want to skip images (Default: True)
__KeepImages__ = True
# Set it to True if you want to include a summary in Kindle's article view (Default: False)
__IncludeSummary__ = True
# Set it to True if you want thumbnail images in Kindle's article view (Default: True)
__IncludeThumbnails__ = True


'''
Change Log:
2013/03/31 -- fix cover retrieval code and heading size, and remove &nbsp; in summary
2011/12/29 -- first version done
TODO:
* use alternative source at http://m.singtao.com/index.php
'''

from calibre.utils.date import now as nowf
import os, datetime, re
from datetime import date
from calibre.web.feeds.recipes import BasicNewsRecipe
from contextlib import nested
from calibre.ebooks.BeautifulSoup import BeautifulSoup

@@ -41,7 +41,7 @@ class STHKRecipe(BasicNewsRecipe):
    title = 'Sing Tao Daily - Hong Kong'
    description = 'Hong Kong Chinese Newspaper (http://singtao.com)'
    category = 'Chinese, News, Hong Kong'
    extra_css = 'img {display: block; margin-left: auto; margin-right: auto; margin-top: 10px; margin-bottom: 10px; max-height:90%;} td[class=caption] {font-size:50%;} td[class=bodyhead]{font-weight:bold; font-size:150%;} td[class=stmobheadline]{font-weight:bold; font-size:200%;}'
    masthead_url = 'http://upload.wikimedia.org/wikipedia/en/d/dd/Singtao-usa.png'
    if __Source__ == 'normal':
        keep_only_tags = [dict(name='td', attrs={'class':['bodyhead','bodytext']})]

@@ -96,17 +96,13 @@ class STHKRecipe(BasicNewsRecipe):
        return self.get_dtlocal().strftime("%d")

    def get_cover_url(self):
        soup = self.index_to_soup('http://m.singtao.com/')
        cover = soup.find(attrs={'class':'special'}).get('src', False)
        br = BasicNewsRecipe.get_browser(self)
        try:
            br.open(cover)
        except:
            cover = None
        return cover

    def parse_index(self):

@@ -293,7 +289,7 @@ class STHKRecipe(BasicNewsRecipe):
        textFound = False
        for p in paras:
            if not textFound:
                summary_candidate = self.tag_to_string(p).strip().replace(' ', '')
                if len(summary_candidate) > 0:
                    summary_candidate = summary_candidate.replace(u'(\u661f\u5cf6\u65e5\u5831\u5831\u9053)', '', 1)
                    article.summary = article.text_summary = summary_candidate

@@ -489,3 +485,4 @@ class STHKRecipe(BasicNewsRecipe):
@@ -47,6 +47,10 @@ binary_includes = [
    '/usr/lib/libgthread-2.0.so.0',
    '/usr/lib/libpng14.so.14',
    '/usr/lib/libexslt.so.0',
    # Ensure that libimobiledevice is compiled against openssl, not gnutls
    '/usr/lib/libimobiledevice.so.3',
    '/usr/lib/libusbmuxd.so.2',
    '/usr/lib/libplist.so.1',
    MAGICK_PREFIX+'/lib/libMagickWand.so.5',
    MAGICK_PREFIX+'/lib/libMagickCore.so.5',
    '/usr/lib/libgcrypt.so.11',
@@ -399,7 +399,8 @@ class Py2App(object):
    @flush
    def add_fontconfig(self):
        info('\nAdding fontconfig')
        for x in ('fontconfig.1', 'freetype.6', 'expat.1',
                  'plist.1', 'usbmuxd.2', 'imobiledevice.3'):
            src = os.path.join(SW, 'lib', 'lib'+x+'.dylib')
            self.install_dylib(src)
        dst = os.path.join(self.resources_dir, 'fonts')
@@ -262,6 +262,35 @@ def from_links(container):
        toc.remove(child)
    return toc


def find_text(node):
    LIMIT = 200
    pat = re.compile(r'\s+')
    for child in node:
        if isinstance(child, etree._Element):
            text = xml2text(child).strip()
            text = pat.sub(' ', text)
            if len(text) < 1:
                continue
            if len(text) > LIMIT:
                # Look for less text in a child of this node, recursively
                ntext = find_text(child)
                return ntext or (text[:LIMIT] + '...')
            else:
                return text


def from_files(container):
    toc = TOC()
    for spinepath in container.spine_items:
        name = container.abspath_to_name(spinepath)
        root = container.parsed(name)
        body = XPath('//h:body')(root)
        if not body:
            continue
        text = find_text(body[0])
        if text:
            toc.add(text, name)
    return toc

def add_id(container, name, loc):
    root = container.parsed(name)
    body = root.xpath('//*[local-name()="body"]')[0]
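find_text() above supplies the entry labels for the new "ToC from files" feature: the first non-empty, whitespace-normalized text run under a body element, recursing into a child when the text exceeds 200 characters. A standalone sketch of the same idea (assumptions: plain lxml, with xml2text replaced by a simplified stand-in for calibre's helper):

    import re
    from lxml import etree

    LIMIT = 200
    pat = re.compile(r'\s+')

    def xml2text(elem):  # simplified stand-in for calibre's xml2text
        return ''.join(elem.itertext())

    def find_text(node):
        for child in node:
            if isinstance(child, etree._Element):
                text = pat.sub(' ', xml2text(child).strip())
                if len(text) < 1:
                    continue
                if len(text) > LIMIT:
                    # too long for a ToC label: look for less text in a child
                    return find_text(child) or (text[:LIMIT] + '...')
                return text

    body = etree.fromstring('<body><p></p><h1>Chapter One</h1><p>It began...</p></body>')
    print(find_text(body))  # -> 'Chapter One It began...' (the empty <p> is skipped)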
@@ -333,8 +333,8 @@ class OEBReader(object):
        guide = self.oeb.guide
        manifest = self.oeb.manifest
        for elem in xpath(opf, '/o2:package/o2:guide/o2:reference'):
            ref_href = elem.get('href')
            path = urlnormalize(urldefrag(ref_href)[0])
            if path not in manifest.hrefs:
                corrected_href = None
                for href in manifest.hrefs:
@@ -342,12 +342,12 @@ class OEBReader(object):
                    corrected_href = href
                    break
                if corrected_href is None:
                    self.logger.warn(u'Guide reference %r not found' % ref_href)
                    continue
                ref_href = corrected_href
            typ = elem.get('type')
            if typ not in guide:
                guide.add(typ, elem.get('title'), ref_href)

    def _find_ncx(self, opf):
        result = xpath(opf, '/o2:package/o2:spine/@toc')
@@ -18,7 +18,7 @@ from PyQt4.Qt import (QPushButton, QFrame, QVariant, QMenu, QInputDialog,

from calibre.ebooks.oeb.polish.container import get_container, AZW3Container
from calibre.ebooks.oeb.polish.toc import (
    get_toc, add_id, TOC, commit_toc, from_xpaths, from_links, from_files)
from calibre.gui2 import Application, error_dialog, gprefs
from calibre.gui2.progress_indicator import ProgressIndicator
from calibre.gui2.toc.location import ItemEdit

@@ -126,6 +126,7 @@ class ItemView(QFrame): # {{{
    go_to_root = pyqtSignal()
    create_from_xpath = pyqtSignal(object)
    create_from_links = pyqtSignal()
    create_from_files = pyqtSignal()
    flatten_toc = pyqtSignal()

    def __init__(self, parent):

@@ -183,6 +184,15 @@ class ItemView(QFrame): # {{{
            )))
        l.addWidget(b)

        self.cfb = b = QPushButton(_('Generate ToC from &files'))
        b.clicked.connect(self.create_from_files)
        b.setToolTip(textwrap.fill(_(
            'Generate a Table of Contents from individual files in the book.'
            ' Each entry in the ToC will point to the start of the file, the'
            ' text of the entry will be the "first line" of text from the file.'
            )))
        l.addWidget(b)

        self.xpb = b = QPushButton(_('Generate ToC from &XPath'))
        b.clicked.connect(self.create_from_user_xpath)
        b.setToolTip(textwrap.fill(_(

@@ -577,6 +587,7 @@ class TOCView(QWidget): # {{{
        i.add_new_item.connect(self.add_new_item)
        i.create_from_xpath.connect(self.create_from_xpath)
        i.create_from_links.connect(self.create_from_links)
        i.create_from_files.connect(self.create_from_files)
        i.flatten_item.connect(self.flatten_item)
        i.flatten_toc.connect(self.flatten_toc)
        i.go_to_root.connect(self.go_to_root)

@@ -778,6 +789,14 @@ class TOCView(QWidget): # {{{
            _('No links were found that could be added to the Table of Contents.'), show=True)
        self.insert_toc_fragment(toc)

    def create_from_files(self):
        toc = from_files(self.ebook)
        if len(toc) == 0:
            return error_dialog(self, _('No items found'),
                _('No files were found that could be added to the Table of Contents.'), show=True)
        self.insert_toc_fragment(toc)

# }}}

class TOCEditor(QDialog): # {{{
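The ItemView/TOCView changes above follow Qt's signal-and-slot pattern: the button click is forwarded through a custom parameterless signal, so the view rather than the button owns the real handler. A minimal standalone sketch of that wiring (assumptions: PyQt4 installed; names shortened, and the handler is a hypothetical stand-in for TOCView.create_from_files):

    from PyQt4.Qt import QApplication, QPushButton, QWidget, pyqtSignal

    class ItemView(QWidget):
        create_from_files = pyqtSignal()

        def __init__(self, parent=None):
            QWidget.__init__(self, parent)
            self.cfb = b = QPushButton('Generate ToC from files', self)
            b.clicked.connect(self.create_from_files)  # button -> signal

    def handler():
        print('TOCView.create_from_files would build and insert the ToC here')

    app = QApplication([])
    view = ItemView()
    view.create_from_files.connect(handler)  # signal -> slot
    view.cfb.click()  # direct connections fire synchronously; no event loop needed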
@ -22507,7 +22507,7 @@ msgstr "Autoren beginnend mit '%s'"
|
||||
#: /home/kovid/work/calibre/src/calibre/library/catalogs/epub_mobi_builder.py:3477
|
||||
#, python-format
|
||||
msgid "Authors beginning with '%s'"
|
||||
msgstr "Autoren beginnen mit mit %s"
|
||||
msgstr "Autoren beginnen mit %s"
|
||||
|
||||
#: /home/kovid/work/calibre/src/calibre/library/catalogs/epub_mobi_builder.py:3518
|
||||
msgid "NCX for Recently Added"
|
||||
|