Mirror of https://github.com/kovidgoyal/calibre.git

Commit f9b6ce9470: Merge from trunk
@@ -2,6 +2,8 @@
.check-cache.pickle
src/calibre/plugins
resources/images.qrc
src/calibre/ebooks/oeb/display/test/*.js
resources/display/*.js
src/calibre/manual/.build/
src/calibre/manual/cli/
src/calibre/manual/template_ref.rst
@@ -19,6 +19,65 @@
# new recipes:
#   - title:

- version: 0.8.31
  date: 2011-12-16

  new features:
    - title: "Conversion engine: When parsing invalid XHTML use the HTML 5 algorithm, for greater robustness."
      tickets: [901466]

    - title: "Driver for PocketBook 611 and Lenovo IdeaPad"

    - title: "Allow customization of the order in which custom column editing is performed in the edit metadata dialog. Setting is available via Preferences->Tweaks."
      tickets: [902731]

    - title: "MOBI news download: Allow recipes to set a thumbnail for entries in the periodical table of contents. Currently used by the NYTimes, WSJ, Independent, Guardian and Globe and Mail recipes"
      tickets: [900130]

    - title: "E-book viewer: Add an option to the right click menu to search for the currently selected word"

    - title: "Automatically hide the no internet connection available error message if the connection is restored before the user clicks OK"

  bug fixes:
    - title: "Fix comments not hidden in Book details panel when they are turned off via Preferences->Look & Feel->Book Details"

    - title: "E-book viewer: Do not popup an error message if the user tries to use the mouse wheel to scroll before a document is loaded."
      tickets: [903449]

    - title: "Add docx to the list of ebook extensions."
      tickets: [903452]

    - title: "When downloading metadata from non-English Amazon websites, do not correct the case of book titles."

    - title: "Fix regression in 0.8.30 that broke bulk conversion of a single book."
      tickets: [902506]

    - title: "When minimized to system tray do not display the no internet connection error as a dialog box, instead use a system tray notification"

    - title: "Catalog generation: Include the series_index field for custom series columns as well"

    - title: "Comic Input: Do not rescale images when using the Tablet output profile (or any output profile with a screen size larger than 3000x3000)"

    - title: "HTML Input: Ignore unparseable URLs instead of crashing on them."
      tickets: [902372]

  improved recipes:
    - La Republica
    - CND
    - Berliner Zeitung
    - Zaman Gazetesi

  new recipes:
    - title: CND Weekly
      author: Derek Liang

    - title: descopera.org
      author: Marius Ignatescu

    - title: Rynek Zdrowia
      author: spi630

- version: 0.8.30
  date: 2011-12-09
@@ -1,61 +1,44 @@
from calibre.web.feeds.recipes import BasicNewsRecipe
import re

'''Calibre recipe to convert the RSS feeds of the Berliner Zeitung to an ebook.'''

class SportsIllustratedRecipe(BasicNewsRecipe) :
    __author__ = 'ape'
    __copyright__ = 'ape'
    __author__ = 'a.peter'
    __copyright__ = 'a.peter'
    __license__ = 'GPL v3'
    language = 'de'
    description = 'Berliner Zeitung'
    version = 2
    description = 'Berliner Zeitung RSS'
    version = 4
    title = u'Berliner Zeitung'
    timefmt = ' [%d.%m.%Y]'

    #oldest_article = 7.0
    no_stylesheets = True
    remove_javascript = True
    use_embedded_content = False
    publication_type = 'newspaper'

    keep_only_tags = [dict(name='div', attrs={'class':'teaser t_split t_artikel'})]
    remove_tags_before = dict(name='div', attrs={'class':'newstype'})
    remove_tags_after = [dict(id='article_text')]

    INDEX = 'http://www.berlinonline.de/berliner-zeitung/'

    def parse_index(self):
        base = 'http://www.berlinonline.de'
        answer = []
        articles = {}
        more = 1

        soup = self.index_to_soup(self.INDEX)

        # Get list of links to ressorts from index page
        ressort_list = soup.findAll('ul', attrs={'class': re.compile('ressortlist')})
        for ressort in ressort_list[0].findAll('a'):
            feed_title = ressort.string
            print 'Analyzing', feed_title
            if not articles.has_key(feed_title):
                articles[feed_title] = []
                answer.append(feed_title)
            # Load ressort page.
            feed = self.index_to_soup('http://www.berlinonline.de' + ressort['href'])
            # find mainbar div which contains the list of all articles
            for article_container in feed.findAll('div', attrs={'class': re.compile('mainbar')}):
                # iterate over all articles
                for article_teaser in article_container.findAll('div', attrs={'class': re.compile('teaser')}):
                    # extract title of article
                    if article_teaser.h3 != None:
                        article = {'title' : article_teaser.h3.a.string, 'date' : u'', 'url' : base + article_teaser.h3.a['href'], 'description' : u''}
                        articles[feed_title].append(article)
                    else:
                        # Skip teasers for missing photos
                        if article_teaser.div.p.contents[0].find('Foto:') > -1:
                            continue
                        article = {'title': 'Weitere Artikel ' + str(more), 'date': u'', 'url': base + article_teaser.div.p.a['href'], 'description': u''}
                        articles[feed_title].append(article)
                        more += 1
        answer = [[key, articles[key]] for key in answer if articles.has_key(key)]
        return answer
    feeds = [(u'Startseite', u'http://www.berliner-zeitung.de/home/10808950,10808950,view,asFeed.xml'),
             (u'Politik', u'http://www.berliner-zeitung.de/home/10808018,10808018,view,asFeed.xml'),
             (u'Wirtschaft', u'http://www.berliner-zeitung.de/home/10808230,10808230,view,asFeed.xml'),
             (u'Berlin', u'http://www.berliner-zeitung.de/home/10809148,10809148,view,asFeed.xml'),
             (u'Brandenburg', u'http://www.berliner-zeitung.de/home/10809312,10809312,view,asFeed.xml'),
             (u'Wissenschaft', u'http://www.berliner-zeitung.de/home/10808894,10808894,view,asFeed.xml'),
             (u'Digital', u'http://www.berliner-zeitung.de/home/10808718,10808718,view,asFeed.xml'),
             (u'Kultur', u'http://www.berliner-zeitung.de/home/10809150,10809150,view,asFeed.xml'),
             (u'Panorama', u'http://www.berliner-zeitung.de/home/10808334,10808334,view,asFeed.xml'),
             (u'Sport', u'http://www.berliner-zeitung.de/home/10808794,10808794,view,asFeed.xml'),
             (u'Hertha', u'http://www.berliner-zeitung.de/home/10808800,10808800,view,asFeed.xml'),
             (u'Union', u'http://www.berliner-zeitung.de/home/10808802,10808802,view,asFeed.xml'),
             (u'Verkehr', u'http://www.berliner-zeitung.de/home/10809298,10809298,view,asFeed.xml'),
             (u'Polizei', u'http://www.berliner-zeitung.de/home/10809296,10809296,view,asFeed.xml'),
             (u'Meinung', u'http://www.berliner-zeitung.de/home/10808020,10808020,view,asFeed.xml')]

    def get_masthead_url(self):
        return 'http://www.berlinonline.de/.img/berliner-zeitung/blz_logo.gif'
        return 'http://www.berliner-zeitung.de/image/view/10810244,7040611,data,logo.png'

    def print_version(self, url):
        return url.replace('.html', ',view,printVersion.html')
@@ -23,7 +23,9 @@ class TheCND(BasicNewsRecipe):
    remove_tags = [dict(name='table', attrs={'align':'right'}), dict(name='img', attrs={'src':'http://my.cnd.org/images/logo.gif'}), dict(name='hr', attrs={}), dict(name='small', attrs={})]
    no_stylesheets = True

    preprocess_regexps = [(re.compile(r'<!--.*?-->', re.DOTALL), lambda m: '')]
    preprocess_regexps = [ (re.compile(r'<!--.*?-->', re.DOTALL), lambda m: ''),
                           (re.compile('<table width.*?</table>', re.DOTALL), lambda m: ''),
                         ]

    def print_version(self, url):
        if url.find('news/article.php') >= 0:
@@ -46,16 +48,18 @@ class TheCND(BasicNewsRecipe):
            title = self.tag_to_string(a)
            self.log('\tFound article: ', title, 'at', url)
            date = a.nextSibling
            if re.search('cm', date):
                continue
            if (date is not None) and len(date)>2:
                if not articles.has_key(date):
                    articles[date] = []
                articles[date].append({'title':title, 'url':url, 'description': '', 'date':''})
                self.log('\t\tAppend to : ', date)

        self.log('log articles', articles)
        #self.log('log articles', articles)
        mostCurrent = sorted(articles).pop()
        self.title = 'CND ' + mostCurrent

        self.title = 'CND ' + mostCurrent

        feeds.append((self.title, articles[mostCurrent]))

        return feeds
recipes/cnd_weekly.recipe (new file, 72 lines)
@@ -0,0 +1,72 @@
#!/usr/bin/env python

__license__ = 'GPL v3'
__copyright__ = '2010, Derek Liang <Derek.liang.ca @@@at@@@ gmail.com>'
'''
cnd.org
'''
import re

from calibre.web.feeds.news import BasicNewsRecipe

class TheCND(BasicNewsRecipe):

    title = 'CND Weekly'
    __author__ = 'Derek Liang'
    description = ''
    INDEX = 'http://cnd.org'
    language = 'zh'
    conversion_options = {'linearize_tables':True}

    remove_tags_before = dict(name='div', id='articleHead')
    remove_tags_after = dict(id='copyright')
    remove_tags = [dict(name='table', attrs={'align':'right'}), dict(name='img', attrs={'src':'http://my.cnd.org/images/logo.gif'}), dict(name='hr', attrs={}), dict(name='small', attrs={})]
    no_stylesheets = True

    preprocess_regexps = [ (re.compile(r'<!--.*?-->', re.DOTALL), lambda m: ''),
                           (re.compile('<table width.*?</table>', re.DOTALL), lambda m: ''),
                         ]

    def print_version(self, url):
        if url.find('news/article.php') >= 0:
            return re.sub("^[^=]*", "http://my.cnd.org/modules/news/print.php?storyid", url)
        else:
            return re.sub("^[^=]*", "http://my.cnd.org/modules/wfsection/print.php?articleid", url)

    def parse_index(self):
        soup = self.index_to_soup(self.INDEX)

        feeds = []
        articles = {}

        for a in soup.findAll('a', attrs={'target':'_cnd'}):
            url = a['href']
            if url.find('article.php') < 0 :
                continue
            if url.startswith('/'):
                url = 'http://cnd.org'+url
            title = self.tag_to_string(a)
            date = a.nextSibling
            if not re.search('cm', date):
                continue
            self.log('\tFound article: ', title, 'at', url, '@', date)
            if (date is not None) and len(date)>2:
                if not articles.has_key(date):
                    articles[date] = []
                articles[date].append({'title':title, 'url':url, 'description': '', 'date':''})
                self.log('\t\tAppend to : ', date)

        sorted_articles = sorted(articles)
        while sorted_articles:
            mostCurrent = sorted_articles.pop()
            self.title = 'CND ' + mostCurrent
            feeds.append((self.title, articles[mostCurrent]))

        return feeds

    def populate_article_metadata(self, article, soup, first):
        header = soup.find('h3')
        self.log('header: ' + self.tag_to_string(header))
        pass
@@ -1,27 +1,27 @@
# -*- coding: utf-8 -*-
'''
descopera.org
'''

from calibre.web.feeds.news import BasicNewsRecipe

class Descopera(BasicNewsRecipe):
    title = u'Descoperă.org'
    __author__ = 'Marius Ignătescu'
    description = 'Descoperă. Placerea de a cunoaște'
    publisher = 'descopera.org'
    category = 'science, technology, culture, history, earth'
    language = 'ro'
    oldest_article = 14
    max_articles_per_feed = 100
    encoding = 'utf8'
    no_stylesheets = True
    extra_css = ' body{ font-family: Verdana,Helvetica,Arial,sans-serif } .introduction{font-weight: bold} .story-feature{display: block; padding: 0; border: 1px solid; width: 40%; font-size: small} .story-feature h2{text-align: center; text-transform: uppercase} '
    keep_only_tags = [dict(name='div', attrs={'class':['post']})]
    remove_tags = [dict(name='div', attrs={'class':['topnav', 'box_a', 'shr-bookmarks shr-bookmarks-expand shr-bookmarks-center shr-bookmarks-bg-knowledge']})]
    remove_attributes = ['width','height']
    cover_url = 'http://www.descopera.org/wp-content/themes/dorg/styles/default/img/b_top.png?width=400'
    feeds = [(u'Articles', u'http://www.descopera.org/feed/')]

    def preprocess_html(self, soup):
        return self.adeify_images(soup)
@@ -51,6 +51,13 @@ class AdvancedUserRecipe1287083651(BasicNewsRecipe):
                   {'class':['articleTools', 'pagination', 'Ads', 'topad',
                             'breadcrumbs', 'footerNav', 'footerUtil', 'downloadlinks']}]

    def populate_article_metadata(self, article, soup, first):
        if first and hasattr(self, 'add_toc_thumbnail'):
            picdiv = soup.find('img')
            if picdiv is not None:
                self.add_toc_thumbnail(article,picdiv['src'])

    #Use the mobile version rather than the web version
    def print_version(self, url):
        return url.rpartition('?')[0] + '?service=mobile'
@@ -79,6 +79,12 @@ class Guardian(BasicNewsRecipe):
                url = None
        return url

    def populate_article_metadata(self, article, soup, first):
        if first and hasattr(self, 'add_toc_thumbnail'):
            picdiv = soup.find('img')
            if picdiv is not None:
                self.add_toc_thumbnail(article,picdiv['src'])

    def preprocess_html(self, soup):

        # multiple html sections in soup, useful stuff in the first
@@ -104,6 +104,13 @@ class TheIndependentNew(BasicNewsRecipe):
            url = None
        return url

    def populate_article_metadata(self, article, soup, first):
        if first and hasattr(self, 'add_toc_thumbnail'):
            picdiv = soup.find('img')
            if picdiv is not None:
                self.add_toc_thumbnail(article,picdiv['src'])

    def preprocess_html(self, soup):

        #remove 'advertorial articles'
@@ -1,13 +1,12 @@
__license__ = 'GPL v3'
__author__ = 'Lorenzo Vigentini, based on Darko Miletic, Gabriele Marini'
__copyright__ = '2009-2011, Darko Miletic <darko.miletic at gmail.com>, Lorenzo Vigentini <l.vigentini at gmail.com>'
description = 'Italian daily newspaper - v1.01 (04, January 2010); 16.05.2010 new version; 17.10.2011 new version'
description = 'Italian daily newspaper - v1.01 (04, January 2010); 16.05.2010 new version; 17.10.2011 new version; 14.12.2011 new version'

'''
http://www.repubblica.it/
'''

import re
from calibre.ptempfile import PersistentTemporaryFile
from calibre.web.feeds.news import BasicNewsRecipe

@@ -25,27 +24,21 @@ class LaRepubblica(BasicNewsRecipe):
    use_embedded_content = False
    no_stylesheets = True
    publication_type = 'newspaper'
    articles_are_obfuscated = True
    temp_files = []
    articles_are_obfuscated = True
    temp_files = []
    extra_css = """
                   img{display: block}
                """

    remove_attributes = ['width','height','lang','xmlns:og','xmlns:fb']

    preprocess_regexps = [
        (re.compile(r'.*?<head>', re.DOTALL|re.IGNORECASE), lambda match: '<head>'),
        (re.compile(r'<head>.*?<title>', re.DOTALL|re.IGNORECASE), lambda match: '<head><title>'),
        (re.compile(r'</title>.*?</head>', re.DOTALL|re.IGNORECASE), lambda match: '</title></head>')
    ]

    def get_article_url(self, article):
        link = BasicNewsRecipe.get_article_url(self, article)
        if link and not '.repubblica.it/' in link:
            link2 = article.get('id', article.get('guid', None))
            if link2:
                link = link2
        return link.rpartition('?')[0]
        return link.rpartition('?')[0]

    def get_obfuscated_article(self, url):
        count = 0
@@ -56,12 +49,12 @@ class LaRepubblica(BasicNewsRecipe):
                count = 10
            except:
                print "Retrying download..."
            count += 1
            count += 1
        self.temp_files.append(PersistentTemporaryFile('_fa.html'))
        self.temp_files[-1].write(html)
        self.temp_files[-1].close()
        return self.temp_files[-1].name

    keep_only_tags = [
        dict(attrs={'class':'articolo'}),
        dict(attrs={'class':'body-text'}),
@@ -73,15 +66,15 @@ class LaRepubblica(BasicNewsRecipe):
    remove_tags = [
        dict(name=['object','link','meta','iframe','embed']),
        dict(name='span',attrs={'class':'linkindice'}),
        dict(name='div', attrs={'class':'bottom-mobile'}),
        dict(name='div', attrs={'id':['rssdiv','blocco']}),
        dict(name='div', attrs={'class':'utility'}),
        dict(name='div', attrs={'class':['bottom-mobile','adv adv-middle-inline']}),
        dict(name='div', attrs={'id':['rssdiv','blocco','fb-like-head']}),
        dict(name='div', attrs={'class':['utility','fb-like-button','archive-button']}),
        dict(name='div', attrs={'class':'generalbox'}),
        dict(name='ul', attrs={'id':'hystory'})
    ]

    feeds = [
        (u'Rilievo', u'http://www.repubblica.it/rss/homepage/rss2.0.xml'),
        (u'Homepage', u'http://www.repubblica.it/rss/homepage/rss2.0.xml'),
        (u'Cronaca', u'http://www.repubblica.it/rss/cronaca/rss2.0.xml'),
        (u'Esteri', u'http://www.repubblica.it/rss/esteri/rss2.0.xml'),
        (u'Economia', u'http://www.repubblica.it/rss/economia/rss2.0.xml'),
@@ -105,8 +98,10 @@ class LaRepubblica(BasicNewsRecipe):
    def preprocess_html(self, soup):
        for item in soup.findAll(['hgroup','deresponsabilizzazione','per']):
            item.name = 'div'
            item.attrs = []
            item.attrs = []
        for item in soup.findAll(style=True):
            del item['style']
            del item['style']
        return soup

    def preprocess_raw_html(self, raw, url):
        return '<html><head>'+raw[raw.find('</head>'):]
@@ -1,5 +1,5 @@
#!/usr/bin/env python

# -*- coding: utf-8 -*-
__license__ = 'GPL v3'
__copyright__ = '2008, Kovid Goyal <kovid at kovidgoyal.net>'
'''
@@ -707,6 +707,16 @@ class NYTimes(BasicNewsRecipe):
        return soup

    def populate_article_metadata(self, article, soup, first):
        if first and hasattr(self, 'add_toc_thumbnail'):
            idxdiv = soup.find('div',attrs={'class':'articleSpanImage'})
            if idxdiv is not None:
                if idxdiv.img:
                    self.add_toc_thumbnail(article, idxdiv.img['src'])
            else:
                img = soup.find('img')
                if img is not None:
                    self.add_toc_thumbnail(article, img['src'])

        shortparagraph = ""
        try:
            if len(article.text_summary.strip()) == 0:

@@ -855,6 +855,16 @@ class NYTimes(BasicNewsRecipe):

        return soup
    def populate_article_metadata(self, article, soup, first):
        if first and hasattr(self, 'add_toc_thumbnail'):
            idxdiv = soup.find('div',attrs={'class':'articleSpanImage'})
            if idxdiv is not None:
                if idxdiv.img:
                    self.add_toc_thumbnail(article, idxdiv.img['src'])
            else:
                img = soup.find('img')
                if img is not None:
                    self.add_toc_thumbnail(article, img['src'])

        shortparagraph = ""
        try:
            if len(article.text_summary.strip()) == 0:
@@ -12,39 +12,39 @@ class Sueddeutsche(BasicNewsRecipe):

    title = u'sueddeutsche.de'
    description = 'News from Germany'
    __author__ = 'Oliver Niesner and Armin Geller' #AGe 2011-11-25
    __author__ = 'Oliver Niesner and Armin Geller' #AGe 2011-12-16
    use_embedded_content = False
    timefmt = ' [%d %b %Y]'
    oldest_article = 7
    max_articles_per_feed = 50
    oldest_article = 1#7
    max_articles_per_feed = 2#50
    no_stylesheets = True
    language = 'de'

    auto_cleanup = True
    encoding = 'utf-8'
    remove_javascript = True
    cover_url = 'http://polpix.sueddeutsche.com/polopoly_fs/1.1219199.1322239289!/image/image.jpg_gen/derivatives/860x860/image.jpg' # 2011-11-25 AGe

    remove_tags = [ dict(name='link'), dict(name='iframe'),
                    dict(name='div', attrs={'id':["bookmarking","themenbox","artikelfoot","CAD_AD",
                                                  "SKY_AD","NT1_AD","navbar1","sdesiteheader"]}),

                    dict(name='div', attrs={'class':["similar-article-box","artikelliste","nteaser301bg",
                                                     "pages closed","basebox right narrow","headslot galleried"]}),

                    dict(name='div', attrs={'class':["articleDistractor","listHeader","listHeader2","hr2",
                                                     "item","videoBigButton","articlefooter full-column",
                                                     "bildbanderolle full-column","footerCopy padleft5"]}),

                    dict(name='p', attrs={'class':["ressortartikeln","artikelFliestext","entry-summary"]}),
                    dict(name='div', attrs={'style':["position:relative;"]}),
                    dict(name='span', attrs={'class':["nlinkheaderteaserschwarz","artikelLink","r10000000"]}),
                    dict(name='table', attrs={'class':["stoerBS","kommentare","footer","pageBoxBot","pageAktiv","bgcontent"]}),
                    dict(name='ul', attrs={'class':["breadcrumb","articles","activities","sitenav","actions"]}),
                    dict(name='td', attrs={'class':["artikelDruckenRight"]}),
                    dict(name='p', text = "ANZEIGE")
                  ]
    remove_tags_after = [dict(name='div', attrs={'class':["themenbox full-column"]})]

    cover_url = 'http://polpix.sueddeutsche.com/polopoly_fs/1.1236175.1323967473!/image/image.jpg_gen/derivatives/860x860/image.jpg' # 2011-12-16 AGe
    # 2011-12-16 AGe
    # remove_tags = [ dict(name='link'), dict(name='iframe'),
    #                 dict(name='div', attrs={'id':["bookmarking","themenbox","artikelfoot","CAD_AD",
    #                                               "SKY_AD","NT1_AD","navbar1","sdesiteheader"]}),
    #
    #                 dict(name='div', attrs={'class':["similar-article-box","artikelliste","nteaser301bg",
    #                                                  "pages closed","basebox right narrow","headslot galleried"]}),
    #
    #                 dict(name='div', attrs={'class':["articleDistractor","listHeader","listHeader2","hr2",
    #                                                  "item","videoBigButton","articlefooter full-column",
    #                                                  "bildbanderolle full-column","footerCopy padleft5"]}),
    #
    #                 dict(name='p', attrs={'class':["ressortartikeln","artikelFliestext","entry-summary"]}),
    #                 dict(name='div', attrs={'style':["position:relative;"]}),
    #                 dict(name='span', attrs={'class':["nlinkheaderteaserschwarz","artikelLink","r10000000"]}),
    #                 dict(name='table', attrs={'class':["stoerBS","kommentare","footer","pageBoxBot","pageAktiv","bgcontent"]}),
    #                 dict(name='ul', attrs={'class':["breadcrumb","articles","activities","sitenav","actions"]}),
    #                 dict(name='td', attrs={'class':["artikelDruckenRight"]}),
    #                 dict(name='p', text = "ANZEIGE")
    #               ]
    # remove_tags_after = [dict(name='div', attrs={'class':["themenbox full-column"]})]
    #
    extra_css = '''
        h2{font-family:Arial,Helvetica,sans-serif; font-size: x-small; color: #003399;}
        a{font-family:Arial,Helvetica,sans-serif; font-style:italic;}
@@ -53,30 +53,45 @@ class Sueddeutsche(BasicNewsRecipe):
        .artikelTeaser{font-family:Arial,Helvetica,sans-serif; font-size: x-small; font-weight:bold; }
        body{font-family:Arial,Helvetica,sans-serif; }
        .photo {font-family:Arial,Helvetica,sans-serif; font-size: x-small; color: #666666;} '''

    #
    feeds = [
        (u'Politik', u'http://suche.sueddeutsche.de/query/%23/sort/-docdatetime/drilldown/%C2%A7ressort%3A%5EPolitik%24?output=rss'),
        (u'Wirtschaft', u'http://suche.sueddeutsche.de/query/%23/sort/-docdatetime/drilldown/%C2%A7ressort%3A%5EWirtschaft%24?output=rss'),
        (u'Geld', u'http://suche.sueddeutsche.de/query/%23/sort/-docdatetime/drilldown/%C2%A7ressort%3A%5EGeld%24?output=rss'),
        (u'Kultur', u'http://suche.sueddeutsche.de/query/%23/sort/-docdatetime/drilldown/%C2%A7ressort%3A%5EKultur%24?output=rss'),
        (u'Sport', u'http://suche.sueddeutsche.de/query/%23/sort/-docdatetime/drilldown/%C2%A7ressort%3A%5ESport%24?output=rss'),
        (u'Leben', u'http://suche.sueddeutsche.de/query/%23/sort/-docdatetime/drilldown/%C2%A7ressort%3A%5ELeben%24?output=rss'),
        (u'Karriere', u'http://suche.sueddeutsche.de/query/%23/sort/-docdatetime/drilldown/%C2%A7ressort%3A%5EKarriere%24?output=rss'),
        (u'München & Region', u'http://www.sueddeutsche.de/app/service/rss/ressort/muenchen/rss.xml'), # AGe 2011-11-13
        (u'Bayern', u'http://suche.sueddeutsche.de/query/%23/sort/-docdatetime/drilldown/%C2%A7ressort%3A%5EBayern%24?output=rss'),
        (u'Medien', u'http://suche.sueddeutsche.de/query/%23/sort/-docdatetime/drilldown/%C2%A7ressort%3A%5EMedien%24?output=rss'),
        (u'Digital', u'http://suche.sueddeutsche.de/query/%23/sort/-docdatetime/drilldown/%C2%A7ressort%3A%5EDigital%24?output=rss'),
        (u'Auto', u'http://suche.sueddeutsche.de/query/%23/sort/-docdatetime/drilldown/%C2%A7ressort%3A%5EAuto%24?output=rss'),
        (u'Wissen', u'http://suche.sueddeutsche.de/query/%23/sort/-docdatetime/drilldown/%C2%A7ressort%3A%5EWissen%24?output=rss'),
        (u'Panorama', u'http://suche.sueddeutsche.de/query/%23/sort/-docdatetime/drilldown/%C2%A7ressort%3A%5EPanorama%24?output=rss'),
        (u'Reise', u'http://suche.sueddeutsche.de/query/%23/sort/-docdatetime/drilldown/%C2%A7ressort%3A%5EReise%24?output=rss'),
        (u'Technik', u'http://suche.sueddeutsche.de/query/%23/sort/-docdatetime/drilldown/%C2%A7ressort%3A%5ETechnik%24?output=rss'), # sometimes only
        (u'Macht', u'http://suche.sueddeutsche.de/query/%23/sort/-docdatetime/drilldown/%C2%A7ressort%3A%5EMacht%24?output=rss'), # sometimes only
        (u'Job', u'http://suche.sueddeutsche.de/query/%23/sort/-docdatetime/drilldown/%C2%A7ressort%3A%5EJob%24?output=rss'), # sometimes only
        (u'Service', u'http://suche.sueddeutsche.de/query/%23/sort/-docdatetime/drilldown/%C2%A7ressort%3A%5EService%24?output=rss'), # sometimes only
        (u'Verlag', u'http://suche.sueddeutsche.de/query/%23/sort/-docdatetime/drilldown/%C2%A7ressort%3A%5EVerlag%24?output=rss'), # sometimes only
        # (u'Politik', u'http://suche.sueddeutsche.de/query/%23/sort/-docdatetime/drilldown/%C2%A7ressort%3A%5EPolitik%24?output=rss'), #AGe 2011-12-16 deactivated
        # (u'Wirtschaft', u'http://suche.sueddeutsche.de/query/%23/sort/-docdatetime/drilldown/%C2%A7ressort%3A%5EWirtschaft%24?output=rss'), #AGe 2011-12-16 deactivated
        # (u'Geld', u'http://suche.sueddeutsche.de/query/%23/sort/-docdatetime/drilldown/%C2%A7ressort%3A%5EGeld%24?output=rss'), #AGe 2011-12-16 deactivated
        # (u'Kultur', u'http://suche.sueddeutsche.de/query/%23/sort/-docdatetime/drilldown/%C2%A7ressort%3A%5EKultur%24?output=rss'), #AGe 2011-12-16 deactivated
        # (u'Sport', u'http://suche.sueddeutsche.de/query/%23/sort/-docdatetime/drilldown/%C2%A7ressort%3A%5ESport%24?output=rss'), #AGe 2011-12-16 deactivated
        # (u'Leben', u'http://suche.sueddeutsche.de/query/%23/sort/-docdatetime/drilldown/%C2%A7ressort%3A%5ELeben%24?output=rss'), #AGe 2011-12-16 deactivated
        # (u'Karriere', u'http://suche.sueddeutsche.de/query/%23/sort/-docdatetime/drilldown/%C2%A7ressort%3A%5EKarriere%24?output=rss'), #AGe 2011-12-16 deactivated
        # (u'München & Region', u'http://www.sueddeutsche.de/app/service/rss/ressort/muenchen/rss.xml'), # AGe 2011-11-13
        # (u'Bayern', u'http://suche.sueddeutsche.de/query/%23/sort/-docdatetime/drilldown/%C2%A7ressort%3A%5EBayern%24?output=rss'), #AGe 2011-12-16 deactivated
        # (u'Medien', u'http://suche.sueddeutsche.de/query/%23/sort/-docdatetime/drilldown/%C2%A7ressort%3A%5EMedien%24?output=rss'), #AGe 2011-12-16 deactivated
        # (u'Digital', u'http://suche.sueddeutsche.de/query/%23/sort/-docdatetime/drilldown/%C2%A7ressort%3A%5EDigital%24?output=rss'), #AGe 2011-12-16 deactivated
        # (u'Auto', u'http://suche.sueddeutsche.de/query/%23/sort/-docdatetime/drilldown/%C2%A7ressort%3A%5EAuto%24?output=rss'), #AGe 2011-12-16 deactivated
        # (u'Wissen', u'http://suche.sueddeutsche.de/query/%23/sort/-docdatetime/drilldown/%C2%A7ressort%3A%5EWissen%24?output=rss'), #AGe 2011-12-16 deactivated
        # (u'Panorama', u'http://suche.sueddeutsche.de/query/%23/sort/-docdatetime/drilldown/%C2%A7ressort%3A%5EPanorama%24?output=rss'), #AGe 2011-12-16 deactivated
        # (u'Reise', u'http://suche.sueddeutsche.de/query/%23/sort/-docdatetime/drilldown/%C2%A7ressort%3A%5EReise%24?output=rss'), #AGe 2011-12-16 deactivated
        # (u'Technik', u'http://suche.sueddeutsche.de/query/%23/sort/-docdatetime/drilldown/%C2%A7ressort%3A%5ETechnik%24?output=rss'), # sometimes only #AGe 2011-12-16 deactivated
        # (u'Macht', u'http://suche.sueddeutsche.de/query/%23/sort/-docdatetime/drilldown/%C2%A7ressort%3A%5EMacht%24?output=rss'), # sometimes only #AGe 2011-12-16 deactivated
        # (u'Job', u'http://suche.sueddeutsche.de/query/%23/sort/-docdatetime/drilldown/%C2%A7ressort%3A%5EJob%24?output=rss'), # sometimes only #AGe 2011-12-16 deactivated
        # (u'Service', u'http://suche.sueddeutsche.de/query/%23/sort/-docdatetime/drilldown/%C2%A7ressort%3A%5EService%24?output=rss'), # sometimes only #AGe 2011-12-16 deactivated
        # (u'Verlag', u'http://suche.sueddeutsche.de/query/%23/sort/-docdatetime/drilldown/%C2%A7ressort%3A%5EVerlag%24?output=rss'), # sometimes only #AGe 2011-12-16 deactivated
        (u'Politik', u'http://www.sueddeutsche.de/app/service/rss/ressort/politik/rss.xml'),
        (u'Wirtschaft', u'http://www.sueddeutsche.de/app/service/rss/ressort/wirtschaft/rss.xml'),
        (u'Geld', u'http://www.sueddeutsche.de/app/service/rss/ressort/finanzen/rss.xml'),
        (u'Kultur', u'http://www.sueddeutsche.de/app/service/rss/ressort/kultur/rss.xml'),
        (u'Sport', u'http://www.sueddeutsche.de/app/service/rss/ressort/sport/rss.xml'),
        (u'Leben', u'http://www.sueddeutsche.de/app/service/rss/ressort/leben/rss.xml'),
        (u'Karriere', u'http://www.sueddeutsche.de/app/service/rss/ressort/karriere/rss.xml'),
        (u'München & Region', u'http://www.sueddeutsche.de/app/service/rss/ressort/muenchen/rss.xml'),
        (u'Bayern', u'http://www.sueddeutsche.de/app/service/rss/ressort/bayern/rss.xml'),
        (u'Medien', u'http://www.sueddeutsche.de/app/service/rss/ressort/medien/rss.xml'),
        (u'Digital', u'http://www.sueddeutsche.de/app/service/rss/ressort/computerwissen/rss.xml'),
        (u'Auto', u'http://www.sueddeutsche.de/app/service/rss/ressort/autoreise/rss.xml'),
        (u'Wissen', u'http://www.sueddeutsche.de/app/service/rss/ressort/wissen/rss.xml'),
        (u'Panorama', u'http://www.sueddeutsche.de/app/service/rss/ressort/panorama/rss.xml'),
        (u'Reise', u'http://www.sueddeutsche.de/app/service/rss/ressort/reise/rss.xml'),
    ]

    def print_version(self, url):
        main, sep, id = url.rpartition('/')
        return main + '/2.220/' + id
    # def print_version(self, url): #AGe 2011-12-16 deactivated
    #     main, sep, id = url.rpartition('/') #AGe 2011-12-16 deactivated
    #     return main + '/2.220/' + id #AGe 2011-12-16 deactivated
@@ -59,6 +59,11 @@ class TelegraphUK(BasicNewsRecipe):
        ,(u'Travel' , u'http://www.telegraph.co.uk/travel/rss' )
        ,(u'How about that?', u'http://www.telegraph.co.uk/news/newstopics/howaboutthat/rss' )
    ]
    def populate_article_metadata(self, article, soup, first):
        if first and hasattr(self, 'add_toc_thumbnail'):
            picdiv = soup.find('img')
            if picdiv is not None:
                self.add_toc_thumbnail(article,picdiv['src'])

    def get_article_url(self, article):
        url = article.get('link', None)
@@ -57,6 +57,12 @@ class WallStreetJournal(BasicNewsRecipe):
                'username and password')
        return br

    def populate_article_metadata(self, article, soup, first):
        if first and hasattr(self, 'add_toc_thumbnail'):
            picdiv = soup.find('img')
            if picdiv is not None:
                self.add_toc_thumbnail(article,picdiv['src'])

    def postprocess_html(self, soup, first):
        for tag in soup.findAll(name=['table', 'tr', 'td']):
            tag.name = 'div'
@@ -44,6 +44,12 @@ class WallStreetJournal(BasicNewsRecipe):
    ]
    remove_tags_after = [dict(id="article_story_body"), {'class':"article story"},]

    def populate_article_metadata(self, article, soup, first):
        if first and hasattr(self, 'add_toc_thumbnail'):
            picdiv = soup.find('img')
            if picdiv is not None:
                self.add_toc_thumbnail(article,picdiv['src'])

    def postprocess_html(self, soup, first):
        for tag in soup.findAll(name=['table', 'tr', 'td']):
            tag.name = 'div'
@ -1,5 +1,5 @@
|
||||
" Project wide builtins
|
||||
let g:pyflakes_builtins = ["_", "dynamic_property", "__", "P", "I", "lopen", "icu_lower", "icu_upper", "icu_title", "ngettext"]
|
||||
let $PYFLAKES_BUILTINS = "_,dynamic_property,__,P,I,lopen,icu_lower,icu_upper,icu_title,ngettext"
|
||||
|
||||
python << EOFPY
|
||||
import os, sys
|
||||
|
@@ -11,7 +11,7 @@ __all__ = [
        'build', 'build_pdf2xml', 'server',
        'gui',
        'develop', 'install',
        'kakasi', 'resources',
        'kakasi', 'coffee', 'resources',
        'check',
        'sdist',
        'manual', 'tag_release',
@@ -49,9 +49,10 @@ gui = GUI()
from setup.check import Check
check = Check()

from setup.resources import Resources, Kakasi
from setup.resources import Resources, Kakasi, Coffee
resources = Resources()
kakasi = Kakasi()
coffee = Coffee()

from setup.publish import Manual, TagRelease, Stage1, Stage2, \
        Stage3, Stage4, Stage5, Publish
@@ -12,14 +12,14 @@ msgstr ""
"Report-Msgid-Bugs-To: Debian iso-codes team <pkg-isocodes-"
"devel@lists.alioth.debian.org>\n"
"POT-Creation-Date: 2011-11-25 14:01+0000\n"
"PO-Revision-Date: 2011-11-22 16:45+0000\n"
"PO-Revision-Date: 2011-12-14 19:48+0000\n"
"Last-Translator: Ferran Rius <frius64@hotmail.com>\n"
"Language-Team: Catalan <linux@softcatala.org>\n"
"MIME-Version: 1.0\n"
"Content-Type: text/plain; charset=UTF-8\n"
"Content-Transfer-Encoding: 8bit\n"
"X-Launchpad-Export-Date: 2011-11-26 05:10+0000\n"
"X-Generator: Launchpad (build 14381)\n"
"X-Launchpad-Export-Date: 2011-12-15 05:18+0000\n"
"X-Generator: Launchpad (build 14487)\n"
"Language: ca\n"

#. name for aaa
@@ -9348,7 +9348,7 @@ msgstr "Seit-Kaitetu"

#. name for hil
msgid "Hiligaynon"
msgstr ""
msgstr "Hiligainon"

#. name for hin
msgid "Hindi"
@@ -9356,39 +9356,39 @@ msgstr "Hindi"

#. name for hio
msgid "Tsoa"
msgstr ""
msgstr "Tsoa"

#. name for hir
msgid "Himarimã"
msgstr ""
msgstr "Himarimà"

#. name for hit
msgid "Hittite"
msgstr ""
msgstr "Hittita"

#. name for hiw
msgid "Hiw"
msgstr ""
msgstr "Hiw"

#. name for hix
msgid "Hixkaryána"
msgstr ""
msgstr "Hishkaryana"

#. name for hji
msgid "Haji"
msgstr ""
msgstr "Aji"

#. name for hka
msgid "Kahe"
msgstr ""
msgstr "Kahe"

#. name for hke
msgid "Hunde"
msgstr ""
msgstr "Hunde"

#. name for hkk
msgid "Hunjara-Kaina Ke"
msgstr ""
msgstr "Hunjara"

#. name for hks
msgid "Hong Kong Sign Language"
@@ -9396,27 +9396,27 @@ msgstr "Llenguatge de signes de Hong Kong"

#. name for hla
msgid "Halia"
msgstr ""
msgstr "Halia"

#. name for hlb
msgid "Halbi"
msgstr ""
msgstr "Halbi"

#. name for hld
msgid "Halang Doan"
msgstr ""
msgstr "Halang Doan"

#. name for hle
msgid "Hlersu"
msgstr ""
msgstr "Sansu"

#. name for hlt
msgid "Nga La"
msgstr ""
msgstr "Nga La"

#. name for hlu
msgid "Luwian; Hieroglyphic"
msgstr ""
msgstr "Luvi; jeroglífic"

#. name for hma
msgid "Miao; Southern Mashan"
@@ -9424,7 +9424,7 @@ msgstr "Miao; Mashan meridional"

#. name for hmb
msgid "Songhay; Humburi Senni"
msgstr ""
msgstr "Songhai; central"

#. name for hmc
msgid "Miao; Central Huishui"
@@ -9440,11 +9440,11 @@ msgstr "Miao; Huishui oriental"

#. name for hmf
msgid "Hmong Don"
msgstr ""
msgstr "Miao; Don"

#. name for hmg
msgid "Hmong; Southwestern Guiyang"
msgstr ""
msgstr "Miao; Guiyang sudoccidental"

#. name for hmh
msgid "Miao; Southwestern Huishui"
@@ -9456,11 +9456,11 @@ msgstr "Miao; Huishui septentrional"

#. name for hmj
msgid "Ge"
msgstr ""
msgstr "Ge"

#. name for hmk
msgid "Maek"
msgstr ""
msgstr "Maek"

#. name for hml
msgid "Miao; Luopohe"
@@ -9472,11 +9472,11 @@ msgstr "Miao; Mashan central"

#. name for hmn
msgid "Hmong"
msgstr ""
msgstr "Hmong (macrollengua)"

#. name for hmo
msgid "Hiri Motu"
msgstr ""
msgstr "Hiri Motu"

#. name for hmp
msgid "Miao; Northern Mashan"
@@ -9488,7 +9488,7 @@ msgstr "Miao; Qiandong oriental"

#. name for hmr
msgid "Hmar"
msgstr ""
msgstr "Hmar"

#. name for hms
msgid "Miao; Southern Qiandong"
@@ -9496,15 +9496,15 @@ msgstr "Miao; Qiandong meridional"

#. name for hmt
msgid "Hamtai"
msgstr ""
msgstr "Hamtai"

#. name for hmu
msgid "Hamap"
msgstr ""
msgstr "Hamap"

#. name for hmv
msgid "Hmong Dô"
msgstr ""
msgstr "Miao; Do"

#. name for hmw
msgid "Miao; Western Mashan"
@@ -9520,19 +9520,19 @@ msgstr "Miao; Shua"

#. name for hna
msgid "Mina (Cameroon)"
msgstr ""
msgstr "Mina (Camerun)"

#. name for hnd
msgid "Hindko; Southern"
msgstr ""
msgstr "Hindko; meridional"

#. name for hne
msgid "Chhattisgarhi"
msgstr ""
msgstr "Chattisgarbi"

#. name for hnh
msgid "//Ani"
msgstr ""
msgstr "Ani"

#. name for hni
msgid "Hani"
@@ -9540,7 +9540,7 @@ msgstr ""

#. name for hnj
msgid "Hmong Njua"
msgstr ""
msgstr "Miao; Hmong Njua"

#. name for hnn
msgid "Hanunoo"
@@ -9548,7 +9548,7 @@ msgstr ""

#. name for hno
msgid "Hindko; Northern"
msgstr ""
msgstr "Hindko; septentrional"

#. name for hns
msgid "Hindustani; Caribbean"
@@ -11800,7 +11800,7 @@ msgstr ""

#. name for khq
msgid "Songhay; Koyra Chiini"
msgstr ""
msgstr "Songhai; Koyra"

#. name for khr
msgid "Kharia"
@@ -17288,7 +17288,7 @@ msgstr ""

#. name for mww
msgid "Hmong Daw"
msgstr ""
msgstr "Miao; blanc"

#. name for mwx
msgid "Mediak"
@@ -28680,7 +28680,7 @@ msgstr ""

#. name for xlu
msgid "Luwian; Cuneiform"
msgstr ""
msgstr "Luvi; cuneïforme"

#. name for xly
msgid "Elymian"
@@ -6,7 +6,7 @@ __license__ = 'GPL v3'
__copyright__ = '2009, Kovid Goyal <kovid@kovidgoyal.net>'
__docformat__ = 'restructuredtext en'

import os, cPickle, re, shutil, marshal, zipfile, glob
import os, cPickle, re, shutil, marshal, zipfile, glob, subprocess, time
from zlib import compress

from setup import Command, basenames, __appname__
@@ -23,7 +23,70 @@ def get_opts_from_parser(parser):
    for o in g.option_list:
        for x in do_opt(o): yield x

class Kakasi(Command):
class Coffee(Command): # {{{

    description = 'Compile coffeescript files into javascript'
    COFFEE_DIRS = {'ebooks/oeb/display': 'display'}

    def add_options(self, parser):
        parser.add_option('--watch', '-w', action='store_true', default=False,
                help='Autocompile when .coffee files are changed')
        parser.add_option('--show-js', action='store_true', default=False,
                help='Display the generated javascript')

    def run(self, opts):
        self.do_coffee_compile(opts)
        if opts.watch:
            try:
                while True:
                    time.sleep(0.5)
                    self.do_coffee_compile(opts, timestamp=True,
                            ignore_errors=True)
            except KeyboardInterrupt:
                pass

    def show_js(self, jsfile):
        from pygments.lexers import JavascriptLexer
        from pygments.formatters import TerminalFormatter
        from pygments import highlight
        with open(jsfile, 'rb') as f:
            raw = f.read()
        print highlight(raw, JavascriptLexer(), TerminalFormatter())

    def do_coffee_compile(self, opts, timestamp=False, ignore_errors=False):
        for toplevel, dest in self.COFFEE_DIRS.iteritems():
            dest = self.j(self.RESOURCES, dest)
            for x in glob.glob(self.j(self.SRC, __appname__, toplevel, '*.coffee')):
                js = self.j(dest, os.path.basename(x.rpartition('.')[0]+'.js'))
                if self.newer(js, x):
                    print ('\t%sCompiling %s'%(time.strftime('[%H:%M:%S] ') if
                        timestamp else '', os.path.basename(x)))
                    try:
                        subprocess.check_call(['coffee', '-c', '-o', dest, x])
                    except:
                        print ('\n\tCompilation of %s failed'%os.path.basename(x))
                        if ignore_errors:
                            with open(js, 'wb') as f:
                                f.write('# Compilation from coffeescript failed')
                        else:
                            raise SystemExit(1)
                    else:
                        if opts.show_js:
                            self.show_js(js)
                            print ('#'*80)
                            print ('#'*80)

    def clean(self):
        for toplevel, dest in self.COFFEE_DIRS.iteritems():
            dest = self.j(self.RESOURCES, dest)
            for x in glob.glob(self.j(self.SRC, __appname__, toplevel, '*.coffee')):
                x = x.rpartition('.')[0] + '.js'
                x = self.j(dest, os.path.basename(x))
                if os.path.exists(x):
                    os.remove(x)
# }}}
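The new Coffee command is registered as 'coffee' in setup/commands.py above and, per the sub_commands change further below, also runs automatically before the resources build. Assuming the standard CoffeeScript compiler is on PATH, each out-of-date source file is handed to it roughly as in this standalone sketch (the file name is invented for illustration; the real command derives paths from COFFEE_DIRS under src/calibre):

import subprocess

src = 'src/calibre/ebooks/oeb/display/example.coffee'  # hypothetical source file
dest = 'resources/display'                             # matches the new .gitignore entry
subprocess.check_call(['coffee', '-c', '-o', dest, src])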
class Kakasi(Command): # {{{

    description = 'Compile resources for unihandecode'

@@ -62,9 +125,6 @@ class Kakasi(Command):
        self.info('\tGenerating kanadict')
        self.mkkanadict(src, dest)

        return

    def mkitaiji(self, src, dst):
        dic = {}
        for line in open(src, "r"):
@@ -125,11 +185,12 @@ class Kakasi(Command):
        kakasi = self.j(self.RESOURCES, 'localization', 'pykakasi')
        if os.path.exists(kakasi):
            shutil.rmtree(kakasi)
# }}}

class Resources(Command):
class Resources(Command): # {{{

    description = 'Compile various needed calibre resources'
    sub_commands = ['kakasi']
    sub_commands = ['kakasi', 'coffee']

    def run(self, opts):
        scripts = {}
@@ -223,13 +284,13 @@ class Resources(Command):
            x = self.j(self.RESOURCES, x+'.pickle')
            if os.path.exists(x):
                os.remove(x)
        from setup.commands import kakasi
        from setup.commands import kakasi, coffee
        kakasi.clean()
        coffee.clean()
        for x in ('builtin_recipes.xml', 'builtin_recipes.zip',
                'template-functions.json'):
            x = self.j(self.RESOURCES, x)
            if os.path.exists(x):
                os.remove(x)

# }}}
@@ -215,32 +215,34 @@ class GetTranslations(Translations): # {{{
    description = 'Get updated translations from Launchpad'
    BRANCH = 'lp:~kovid/calibre/translations'

    @classmethod
    def modified_translations(cls):
        raw = subprocess.Popen(['bzr', 'status'],
    @property
    def modified_translations(self):
        raw = subprocess.Popen(['bzr', 'status', '-S', self.PATH],
                stdout=subprocess.PIPE).stdout.read().strip()
        ans = []
        for line in raw.splitlines():
            line = line.strip()
            if line.startswith(cls.PATH) and line.endswith('.po'):
                yield line
            if line.startswith('M') and line.endswith('.po'):
                ans.append(line.split()[-1])
        return ans

    def run(self, opts):
        if len(list(self.modified_translations())) == 0:
        if not self.modified_translations:
            subprocess.check_call(['bzr', 'merge', self.BRANCH])
        if len(list(self.modified_translations())) == 0:
            print 'No updated translations available'
        else:
            subprocess.check_call(['bzr', 'commit', '-m',
                'IGN:Updated translations', self.PATH])
        self.check_for_errors()

    @classmethod
    def check_for_errors(cls):
        if self.modified_translations:
            subprocess.check_call(['bzr', 'commit', '-m',
                'IGN:Updated translations', self.PATH])
        else:
            print('No updated translations available')

    def check_for_errors(self):
        errors = os.path.join(tempfile.gettempdir(), 'calibre-translation-errors')
        if os.path.exists(errors):
            shutil.rmtree(errors)
        os.mkdir(errors)
        pofilter = ('pofilter', '-i', cls.PATH, '-o', errors,
        pofilter = ('pofilter', '-i', self.PATH, '-o', errors,
                '-t', 'accelerators', '-t', 'escapes', '-t', 'variables',
                #'-t', 'xmltags',
                #'-t', 'brackets',
@@ -253,23 +255,20 @@ class GetTranslations(Translations): # {{{
                '-t', 'printf')
        subprocess.check_call(pofilter)
        errfiles = glob.glob(errors+os.sep+'*.po')
        subprocess.check_call(['gvim', '-f', '-p', '--']+errfiles)
        for f in errfiles:
            with open(f, 'r+b') as f:
                raw = f.read()
                raw = re.sub(r'# \(pofilter\).*', '', raw)
                f.seek(0)
                f.truncate()
                f.write(raw)
        if errfiles:
            subprocess.check_call(['gvim', '-f', '-p', '--']+errfiles)
            for f in errfiles:
                with open(f, 'r+b') as f:
                    raw = f.read()
                    raw = re.sub(r'# \(pofilter\).*', '', raw)
                    f.seek(0)
                    f.truncate()
                    f.write(raw)

        subprocess.check_call(['pomerge', '-t', cls.PATH, '-i', errors, '-o',
            cls.PATH])
        if len(list(cls.modified_translations())) > 0:
            subprocess.call(['bzr', 'diff', cls.PATH])
            yes = raw_input('Merge corrections? [y/n]: ').strip()
            if yes in ['', 'y']:
                subprocess.check_call(['bzr', 'commit', '-m',
                    'IGN:Translation corrections', cls.PATH])
            subprocess.check_call(['pomerge', '-t', self.PATH, '-i', errors, '-o',
                self.PATH])
            return True
        return False

# }}}
@@ -558,11 +558,11 @@ xml_entity_to_unicode = partial(entity_to_unicode, result_exceptions = {
    '>' : '&gt;',
    '&' : '&amp;'})

def replace_entities(raw):
    return _ent_pat.sub(entity_to_unicode, raw)
def replace_entities(raw, encoding='cp1252'):
    return _ent_pat.sub(partial(entity_to_unicode, encoding=encoding), raw)

def xml_replace_entities(raw):
    return _ent_pat.sub(xml_entity_to_unicode, raw)
def xml_replace_entities(raw, encoding='cp1252'):
    return _ent_pat.sub(partial(xml_entity_to_unicode, encoding=encoding), raw)

def prepare_string_for_xml(raw, attribute=False):
    raw = _ent_pat.sub(entity_to_unicode, raw)
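Both helpers keep cp1252 as the default, so existing callers are unchanged; the new keyword only adds a way to override how entity_to_unicode decodes entities. A small usage sketch, assuming these functions are imported from the calibre package where the hunk above lives (the sample strings are invented):

from calibre import replace_entities, xml_replace_entities

print replace_entities('Caf&eacute; &amp; more')      # named entities resolved to unicode text
print xml_replace_entities('Caf&eacute; &amp; more')  # '&amp;' stays escaped so the result remains XML-safe
print replace_entities('&#233;l&#233;gant', encoding='cp1252')  # the new keyword makes the decoding choice explicit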
@@ -4,7 +4,7 @@ __license__ = 'GPL v3'
__copyright__ = '2008, Kovid Goyal kovid@kovidgoyal.net'
__docformat__ = 'restructuredtext en'
__appname__ = u'calibre'
numeric_version = (0, 8, 30)
numeric_version = (0, 8, 31)
__version__ = u'.'.join(map(unicode, numeric_version))
__author__ = u"Kovid Goyal <kovid@kovidgoyal.net>"
@@ -173,8 +173,9 @@ class INVESBOOK(EB600):
    FORMATS = ['epub', 'mobi', 'prc', 'fb2', 'html', 'pdf', 'rtf', 'txt']
    BCD = [0x110, 0x323]

    VENDOR_NAME = ['INVES_E6', 'INVES-WI']
    WINDOWS_MAIN_MEM = WINDOWS_CARD_A_MEM = ['00INVES_E600', 'INVES-WIBOOK']
    VENDOR_NAME = ['INVES_E6', 'INVES-WI', 'POCKETBO']
    WINDOWS_MAIN_MEM = WINDOWS_CARD_A_MEM = ['00INVES_E600', 'INVES-WIBOOK',
            'OK_POCKET_611_61']

class BOOQ(EB600):
    name = 'Booq Device Interface'
@@ -30,7 +30,7 @@ BOOK_EXTENSIONS = ['lrf', 'rar', 'zip', 'rtf', 'lit', 'txt', 'txtz', 'text', 'ht
        'html', 'htmlz', 'xhtml', 'pdf', 'pdb', 'pdr', 'prc', 'mobi', 'azw', 'doc',
        'epub', 'fb2', 'djv', 'djvu', 'lrx', 'cbr', 'cbz', 'cbc', 'oebzip',
        'rb', 'imp', 'odt', 'chm', 'tpz', 'azw1', 'pml', 'pmlz', 'mbp', 'tan', 'snb',
        'xps', 'oxps', 'azw4', 'book', 'zbf', 'pobi']
        'xps', 'oxps', 'azw4', 'book', 'zbf', 'pobi', 'docx']

class HTMLRenderer(object):
@@ -229,7 +229,10 @@ class EPUBOutput(OutputFormatPlugin):
        if opts.extract_to is not None:
            from calibre.utils.zipfile import ZipFile
            if os.path.exists(opts.extract_to):
                shutil.rmtree(opts.extract_to)
            if os.path.isdir(opts.extract_to):
                shutil.rmtree(opts.extract_to)
            else:
                os.remove(opts.extract_to)
            os.mkdir(opts.extract_to)
            with ZipFile(output_path) as zf:
                zf.extractall(path=opts.extract_to)
@@ -16,7 +16,8 @@ from lxml.html import tostring

from calibre import as_unicode
from calibre.ebooks.metadata import check_isbn
from calibre.ebooks.metadata.sources.base import Source, Option
from calibre.ebooks.metadata.sources.base import (Source, Option, fixcase,
        fixauthors)
from calibre.utils.cleantext import clean_ascii_chars
from calibre.ebooks.chardet import xml_to_unicode
from calibre.ebooks.metadata.book.base import Metadata
@@ -509,6 +510,15 @@ class Amazon(Source):

        return domain

    def clean_downloaded_metadata(self, mi):
        if mi.title and self.domain in ('com', 'uk'):
            mi.title = fixcase(mi.title)
        mi.authors = fixauthors(mi.authors)
        if self.domain in ('com', 'uk'):
            mi.tags = list(map(fixcase, mi.tags))
        mi.isbn = check_isbn(mi.isbn)

    def create_query(self, log, title=None, authors=None, identifiers={}, # {{{
            domain=None):
        if domain is None:
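This is the code behind the changelog entry about non-English Amazon sites: fixcase() is only applied to titles and tags when the query went to amazon.com or amazon.co.uk. A hedged illustration of that branch (the sample titles and domains are invented, and the exact output of fixcase() is not shown in this diff):

# Illustration only; mirrors the domain check in clean_downloaded_metadata() above.
from calibre.ebooks.metadata.sources.base import fixcase

for domain, title in (('com', 'the moonstone'), ('de', 'der prozess')):
    if domain in ('com', 'uk'):
        title = fixcase(title)  # English storefronts get their title case normalised
    # Any other storefront (here 'de') keeps the title exactly as Amazon supplied it.
    print domain, '->', title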
@@ -31,7 +31,7 @@ class TOC(list):

    def __init__(self, href=None, fragment=None, text=None, parent=None, play_order=0,
                 base_path=os.getcwd(), type='unknown', author=None,
                 description=None):
                 description=None, toc_thumbnail=None):
        self.href = href
        self.fragment = fragment
        if not self.fragment:
@@ -43,6 +43,7 @@ class TOC(list):
        self.type = type
        self.author = author
        self.description = description
        self.toc_thumbnail = toc_thumbnail

    def __str__(self):
        lines = ['TOC: %s#%s'%(self.href, self.fragment)]
@@ -72,12 +73,12 @@ class TOC(list):
            entry.parent = None

    def add_item(self, href, fragment, text, play_order=None, type='unknown',
            author=None, description=None):
            author=None, description=None, toc_thumbnail=None):
        if play_order is None:
            play_order = (self[-1].play_order if len(self) else self.play_order) + 1
        self.append(TOC(href=href, fragment=fragment, text=text, parent=self,
            base_path=self.base_path, play_order=play_order,
            type=type, author=author, description=description, toc_thumbnail=toc_thumbnail))
        return self[-1]

    def top_level_items(self):
@@ -269,6 +270,9 @@ class TOC(list):
            if desc:
                desc = re.sub(r'\s+', ' ', desc)
                elem.append(C.meta(desc, name='description'))
            idx = getattr(np, 'toc_thumbnail', None)
            if idx:
                elem.append(C.meta(idx, name='toc_thumbnail'))
            parent.append(elem)
            for np2 in np:
                navpoint(elem, np2)
@@ -656,11 +656,11 @@ class Tag(object): # {{{
                ' image record associated with this article',
                'image_index'),
            70 : ('Description offset in cncx', 'desc_offset'),
            71 : ('Image attribution offset in cncx',
                'image_attr_offset'),
            71 : ('Author offset in cncx', 'author_offset'),
            72 : ('Image caption offset in cncx',
                'image_caption_offset'),
            73 : ('Author offset in cncx', 'author_offset'),
            73 : ('Image attribution offset in cncx',
                'image_attr_offset'),
        },

        'chapter_with_subchapters' : {
@@ -973,7 +973,8 @@ class MobiReader(object):
                continue
            processed_records.append(i)
            data = self.sections[i][0]
            if data[:4] in (b'FLIS', b'FCIS', b'SRCS', b'\xe9\x8e\r\n'):
            if data[:4] in {b'FLIS', b'FCIS', b'SRCS', b'\xe9\x8e\r\n',
                    b'RESC', b'BOUN', b'FDST', b'DATP'}:
                # A FLIS, FCIS, SRCS or EOF record, ignore
                continue
            buf = cStringIO.StringIO(data)
@@ -136,7 +136,8 @@ class IndexEntry(object):
        'last_child_index': 23,
        'image_index': 69,
        'desc_offset': 70,
        'author_offset': 73,
        'author_offset': 71,

    }
    RTAG_MAP = {v:k for k, v in TAG_VALUES.iteritems()}
@@ -754,6 +755,13 @@ class Indexer(object): # {{{
                normalized_articles.append(article)
                article.author_offset = self.cncx[art.author]
                article.desc_offset = self.cncx[art.description]
                if getattr(art, 'toc_thumbnail', None) is not None:
                    try:
                        ii = self.serializer.images[art.toc_thumbnail] - 1
                        if ii > -1:
                            article.image_index = ii
                    except KeyError:
                        pass # Image not found in serializer

            if normalized_articles:
                normalized_articles.sort(key=lambda x:x.offset)
@@ -161,7 +161,7 @@ class MobiWriter(object):
        index = 1

        mh_href = None
        if 'masthead' in oeb.guide:
        if 'masthead' in oeb.guide and oeb.guide['masthead'].href:
            mh_href = oeb.guide['masthead'].href
            self.image_records.append(None)
            index += 1
@@ -16,15 +16,13 @@ from urllib import unquote as urlunquote
from lxml import etree, html
from calibre.constants import filesystem_encoding, __version__
from calibre.translations.dynamic import translate
from calibre.ebooks.chardet import xml_to_unicode, strip_encoding_declarations
from calibre.ebooks.oeb.entitydefs import ENTITYDEFS
from calibre.ebooks.chardet import xml_to_unicode
from calibre.ebooks.conversion.preprocess import CSSPreProcessor
from calibre import isbytestring, as_unicode, get_types_map

RECOVER_PARSER = etree.XMLParser(recover=True, no_network=True)
from calibre import (isbytestring, as_unicode, get_types_map)
from calibre.ebooks.oeb.parse_utils import (barename, XHTML_NS, RECOVER_PARSER,
        namespace, XHTML, parse_html, NotHTML)

XML_NS = 'http://www.w3.org/XML/1998/namespace'
XHTML_NS = 'http://www.w3.org/1999/xhtml'
OEB_DOC_NS = 'http://openebook.org/namespaces/oeb-document/1.0/'
OPF1_NS = 'http://openebook.org/namespaces/oeb-package/1.0/'
OPF2_NS = 'http://www.idpf.org/2007/opf'
@@ -55,9 +53,6 @@ OPF2_NSMAP = {'opf': OPF2_NS, 'dc': DC11_NS, 'dcterms': DCTERMS_NS,
def XML(name):
    return '{%s}%s' % (XML_NS, name)

def XHTML(name):
    return '{%s}%s' % (XHTML_NS, name)

def OPF(name):
    return '{%s}%s' % (OPF2_NS, name)

@@ -279,22 +274,11 @@ PREFIXNAME_RE = re.compile(r'^[^:]+[:][^:]+')
XMLDECL_RE = re.compile(r'^\s*<[?]xml.*?[?]>')
CSSURL_RE = re.compile(r'''url[(](?P<q>["']?)(?P<url>[^)]+)(?P=q)[)]''')

def element(parent, *args, **kwargs):
    if parent is not None:
        return etree.SubElement(parent, *args, **kwargs)
    return etree.Element(*args, **kwargs)

def namespace(name):
    if '}' in name:
        return name.split('}', 1)[0][1:]
    return ''

def barename(name):
    if '}' in name:
        return name.split('}', 1)[1]
    return name

def prefixname(name, nsrmap):
    if not isqname(name):
        return name
@@ -373,25 +357,6 @@ def urlnormalize(href):
    parts = (urlquote(part) for part in parts)
    return urlunparse(parts)

def merge_multiple_html_heads_and_bodies(root, log=None):
    heads, bodies = xpath(root, '//h:head'), xpath(root, '//h:body')
    if not (len(heads) > 1 or len(bodies) > 1): return root
    for child in root: root.remove(child)
    head = root.makeelement(XHTML('head'))
    body = root.makeelement(XHTML('body'))
    for h in heads:
        for x in h:
            head.append(x)
    for b in bodies:
        for x in b:
            body.append(x)
    map(root.append, (head, body))
    if log is not None:
        log.warn('Merging multiple <head> and <body> sections')
    return root

class DummyHandler(logging.Handler):
@@ -418,10 +383,6 @@ class OEBError(Exception):
    """Generic OEB-processing error."""
    pass

class NotHTML(OEBError):
    '''Raised when a file that should be HTML (as per manifest) is not'''
    pass

class NullContainer(object):
    """An empty container.

@@ -801,7 +762,6 @@ class Manifest(object):
    """

    NUM_RE = re.compile('^(.*)([0-9][0-9.]*)(?=[.]|$)')
    META_XP = XPath('/h:html/h:head/h:meta[@http-equiv="Content-Type"]')

    def __init__(self, oeb, id, href, media_type,
            fallback=None, loader=str, data=None):
@@ -830,244 +790,17 @@ class Manifest(object):
            return None
        return etree.fromstring(data, parser=RECOVER_PARSER)

    def clean_word_doc(self, data):
|
||||
prefixes = []
|
||||
for match in re.finditer(r'xmlns:(\S+?)=".*?microsoft.*?"', data):
|
||||
prefixes.append(match.group(1))
|
||||
if prefixes:
|
||||
self.oeb.log.warn('Found microsoft markup, cleaning...')
|
||||
# Remove empty tags as they are not rendered by browsers
|
||||
# but can become renderable HTML tags like <p/> if the
|
||||
# document is parsed by an HTML parser
|
||||
pat = re.compile(
|
||||
r'<(%s):([a-zA-Z0-9]+)[^>/]*?></\1:\2>'%('|'.join(prefixes)),
|
||||
re.DOTALL)
|
||||
data = pat.sub('', data)
|
||||
pat = re.compile(
|
||||
r'<(%s):([a-zA-Z0-9]+)[^>/]*?/>'%('|'.join(prefixes)))
|
||||
data = pat.sub('', data)
|
||||
return data
|
||||
|
||||
def _parse_xhtml(self, data):
|
||||
orig_data = data
|
||||
self.oeb.log.debug('Parsing', self.href, '...')
|
||||
# Convert to Unicode and normalize line endings
|
||||
data = self.oeb.decode(data)
|
||||
data = strip_encoding_declarations(data)
|
||||
data = self.oeb.html_preprocessor(data)
|
||||
# There could be null bytes in data if it had � entities in it
|
||||
data = data.replace('\0', '')
|
||||
|
||||
# Remove DOCTYPE declaration as it messes up parsing
|
||||
# In particular, it causes tostring to insert xmlns
|
||||
# declarations, which messes up the coercing logic
|
||||
idx = data.find('<html')
|
||||
if idx == -1:
|
||||
idx = data.find('<HTML')
|
||||
if idx > -1:
|
||||
pre = data[:idx]
|
||||
data = data[idx:]
|
||||
if '<!DOCTYPE' in pre:
|
||||
user_entities = {}
|
||||
for match in re.finditer(r'<!ENTITY\s+(\S+)\s+([^>]+)', pre):
|
||||
val = match.group(2)
|
||||
if val.startswith('"') and val.endswith('"'):
|
||||
val = val[1:-1]
|
||||
user_entities[match.group(1)] = val
|
||||
if user_entities:
|
||||
pat = re.compile(r'&(%s);'%('|'.join(user_entities.keys())))
|
||||
data = pat.sub(lambda m:user_entities[m.group(1)], data)
|
||||
|
||||
# Setting huge_tree=True causes crashes in windows with large files
|
||||
parser = etree.XMLParser(no_network=True)
|
||||
# Try with more & more drastic measures to parse
|
||||
def first_pass(data):
|
||||
try:
|
||||
data = etree.fromstring(data, parser=parser)
|
||||
except etree.XMLSyntaxError as err:
|
||||
self.oeb.log.debug('Initial parse failed, using more'
|
||||
' forgiving parsers')
|
||||
repl = lambda m: ENTITYDEFS.get(m.group(1), m.group(0))
|
||||
data = ENTITY_RE.sub(repl, data)
|
||||
try:
|
||||
data = etree.fromstring(data, parser=parser)
|
||||
except etree.XMLSyntaxError as err:
|
||||
self.oeb.logger.warn('Parsing file %r as HTML' % self.href)
|
||||
if err.args and err.args[0].startswith('Excessive depth'):
|
||||
from calibre.utils.soupparser import fromstring
|
||||
data = fromstring(data)
|
||||
else:
|
||||
data = html.fromstring(data)
|
||||
data.attrib.pop('xmlns', None)
|
||||
for elem in data.iter(tag=etree.Comment):
|
||||
if elem.text:
|
||||
elem.text = elem.text.strip('-')
|
||||
data = etree.tostring(data, encoding=unicode)
|
||||
try:
|
||||
data = etree.fromstring(data, parser=parser)
|
||||
except etree.XMLSyntaxError:
|
||||
data = etree.fromstring(data, parser=RECOVER_PARSER)
|
||||
return data
|
||||
fname = urlunquote(self.href)
|
||||
self.oeb.log.debug('Parsing', fname, '...')
|
||||
try:
|
||||
data = self.clean_word_doc(data)
|
||||
except:
|
||||
pass
|
||||
data = first_pass(data)
|
||||
|
||||
if data.tag == 'HTML':
|
||||
# Lower case all tag and attribute names
|
||||
data.tag = data.tag.lower()
|
||||
for x in data.iterdescendants():
|
||||
try:
|
||||
x.tag = x.tag.lower()
|
||||
for key, val in list(x.attrib.iteritems()):
|
||||
del x.attrib[key]
|
||||
key = key.lower()
|
||||
x.attrib[key] = val
|
||||
except:
|
||||
pass
|
||||
|
||||
# Handle weird (non-HTML/fragment) files
|
||||
if barename(data.tag) != 'html':
|
||||
if barename(data.tag) == 'ncx':
|
||||
return self._parse_xml(orig_data)
|
||||
self.oeb.log.warn('File %r does not appear to be (X)HTML'%self.href)
|
||||
nroot = etree.fromstring('<html></html>')
|
||||
has_body = False
|
||||
for child in list(data):
|
||||
if isinstance(child.tag, (unicode, str)) and barename(child.tag) == 'body':
|
||||
has_body = True
|
||||
break
|
||||
parent = nroot
|
||||
if not has_body:
|
||||
self.oeb.log.warn('File %r appears to be a HTML fragment'%self.href)
|
||||
nroot = etree.fromstring('<html><body/></html>')
|
||||
parent = nroot[0]
|
||||
for child in list(data.iter()):
|
||||
oparent = child.getparent()
|
||||
if oparent is not None:
|
||||
oparent.remove(child)
|
||||
parent.append(child)
|
||||
data = nroot
|
||||
|
||||
|
||||
# Force into the XHTML namespace
|
||||
if not namespace(data.tag):
|
||||
self.oeb.log.warn('Forcing', self.href, 'into XHTML namespace')
|
||||
data.attrib['xmlns'] = XHTML_NS
|
||||
data = etree.tostring(data, encoding=unicode)
|
||||
|
||||
try:
|
||||
data = etree.fromstring(data, parser=parser)
|
||||
except:
|
||||
data = data.replace(':=', '=').replace(':>', '>')
|
||||
data = data.replace('<http:/>', '')
|
||||
try:
|
||||
data = etree.fromstring(data, parser=parser)
|
||||
except etree.XMLSyntaxError:
|
||||
self.oeb.logger.warn('Stripping comments from %s'%
|
||||
self.href)
|
||||
data = re.compile(r'<!--.*?-->', re.DOTALL).sub('',
|
||||
data)
|
||||
data = data.replace(
|
||||
"<?xml version='1.0' encoding='utf-8'?><o:p></o:p>",
|
||||
'')
|
||||
data = data.replace("<?xml version='1.0' encoding='utf-8'??>", '')
|
||||
try:
|
||||
data = etree.fromstring(data,
|
||||
parser=RECOVER_PARSER)
|
||||
except etree.XMLSyntaxError:
|
||||
self.oeb.logger.warn('Stripping meta tags from %s'%
|
||||
self.href)
|
||||
data = re.sub(r'<meta\s+[^>]+?>', '', data)
|
||||
data = etree.fromstring(data, parser=RECOVER_PARSER)
|
||||
elif namespace(data.tag) != XHTML_NS:
|
||||
# OEB_DOC_NS, but possibly others
|
||||
ns = namespace(data.tag)
|
||||
attrib = dict(data.attrib)
|
||||
nroot = etree.Element(XHTML('html'),
|
||||
nsmap={None: XHTML_NS}, attrib=attrib)
|
||||
for elem in data.iterdescendants():
|
||||
if isinstance(elem.tag, basestring) and \
|
||||
namespace(elem.tag) == ns:
|
||||
elem.tag = XHTML(barename(elem.tag))
|
||||
for elem in data:
|
||||
nroot.append(elem)
|
||||
data = nroot
|
||||
|
||||
data = merge_multiple_html_heads_and_bodies(data, self.oeb.logger)
|
||||
# Ensure has a <head/>
|
||||
head = xpath(data, '/h:html/h:head')
|
||||
head = head[0] if head else None
|
||||
if head is None:
|
||||
self.oeb.logger.warn(
|
||||
'File %r missing <head/> element' % self.href)
|
||||
head = etree.Element(XHTML('head'))
|
||||
data.insert(0, head)
|
||||
title = etree.SubElement(head, XHTML('title'))
|
||||
title.text = self.oeb.translate(__('Unknown'))
|
||||
elif not xpath(data, '/h:html/h:head/h:title'):
|
||||
self.oeb.logger.warn(
|
||||
'File %r missing <title/> element' % self.href)
|
||||
title = etree.SubElement(head, XHTML('title'))
|
||||
title.text = self.oeb.translate(__('Unknown'))
|
||||
# Remove any encoding-specifying <meta/> elements
|
||||
for meta in self.META_XP(data):
|
||||
meta.getparent().remove(meta)
|
||||
etree.SubElement(head, XHTML('meta'),
|
||||
attrib={'http-equiv': 'Content-Type',
|
||||
'content': '%s; charset=utf-8' % XHTML_NS})
|
||||
# Ensure has a <body/>
|
||||
if not xpath(data, '/h:html/h:body'):
|
||||
body = xpath(data, '//h:body')
|
||||
if body:
|
||||
body = body[0]
|
||||
body.getparent().remove(body)
|
||||
data.append(body)
|
||||
else:
|
||||
self.oeb.logger.warn(
|
||||
'File %r missing <body/> element' % self.href)
|
||||
etree.SubElement(data, XHTML('body'))
|
||||
|
||||
# Remove microsoft office markup
|
||||
r = [x for x in data.iterdescendants(etree.Element) if 'microsoft-com' in x.tag]
|
||||
for x in r:
|
||||
x.tag = XHTML('span')
|
||||
|
||||
# Remove lang redefinition inserted by the amazing Microsoft Word!
|
||||
body = xpath(data, '/h:html/h:body')[0]
|
||||
for key in list(body.attrib.keys()):
|
||||
if key == 'lang' or key.endswith('}lang'):
|
||||
body.attrib.pop(key)
|
||||
|
||||
def remove_elem(a):
|
||||
p = a.getparent()
|
||||
idx = p.index(a) -1
|
||||
p.remove(a)
|
||||
if a.tail:
|
||||
if idx <= 0:
|
||||
if p.text is None:
|
||||
p.text = ''
|
||||
p.text += a.tail
|
||||
else:
|
||||
if p[idx].tail is None:
|
||||
p[idx].tail = ''
|
||||
p[idx].tail += a.tail
|
||||
|
||||
# Remove hyperlinks with no content as they cause rendering
|
||||
# artifacts in browser based renderers
|
||||
# Also remove empty <b>, <u> and <i> tags
|
||||
for a in xpath(data, '//h:a[@href]|//h:i|//h:b|//h:u'):
|
||||
if a.get('id', None) is None and a.get('name', None) is None \
|
||||
and len(a) == 0 and not a.text:
|
||||
remove_elem(a)
|
||||
|
||||
# Convert <br>s with content into paragraphs as ADE can't handle
|
||||
# them
|
||||
for br in xpath(data, '//h:br'):
|
||||
if len(br) > 0 or br.text:
|
||||
br.tag = XHTML('div')
|
||||
|
||||
data = parse_html(data, log=self.oeb.log,
|
||||
decoder=self.oeb.decode,
|
||||
preprocessor=self.oeb.html_preprocessor,
|
||||
filename=fname, non_html_file_tags={'ncx'})
|
||||
except NotHTML:
|
||||
return self._parse_xml(orig_data)
|
||||
return data
|
||||
|
||||
def _parse_txt(self, data):
|
||||
@ -1629,9 +1362,10 @@ class TOC(object):
|
||||
:attr:`id`: Option unique identifier for this node.
|
||||
:attr:`author`: Optional author attribution for periodicals <mbp:>
|
||||
:attr:`description`: Optional description attribute for periodicals <mbp:>
|
||||
:attr:`toc_thumbnail`: Optional toc thumbnail image
|
||||
"""
|
||||
def __init__(self, title=None, href=None, klass=None, id=None,
|
||||
play_order=None, author=None, description=None):
|
||||
play_order=None, author=None, description=None, toc_thumbnail=None):
|
||||
self.title = title
|
||||
self.href = urlnormalize(href) if href else href
|
||||
self.klass = klass
|
||||
@ -1643,10 +1377,11 @@ class TOC(object):
|
||||
self.play_order = play_order
|
||||
self.author = author
|
||||
self.description = description
|
||||
self.toc_thumbnail = toc_thumbnail
|
||||
|
||||
def add(self, title, href, klass=None, id=None, play_order=0, author=None, description=None):
|
||||
def add(self, title, href, klass=None, id=None, play_order=0, author=None, description=None, toc_thumbnail=None):
|
||||
"""Create and return a new sub-node of this node."""
|
||||
node = TOC(title, href, klass, id, play_order, author, description)
|
||||
node = TOC(title, href, klass, id, play_order, author, description, toc_thumbnail)
|
||||
self.nodes.append(node)
|
||||
return node
|
||||
|
||||
|
src/calibre/ebooks/oeb/display/cfi.coffee (new file, 225 lines)
@ -0,0 +1,225 @@
|
||||
#!/usr/bin/env coffee
|
||||
# vim:fileencoding=UTF-8:ts=4:sw=4:sta:et:sts=4:ai
|
||||
|
||||
###
|
||||
Copyright 2011, Kovid Goyal <kovid@kovidgoyal.net>
|
||||
Released under the GPLv3 License
|
||||
###
|
||||
#
|
||||
log = (error) ->
|
||||
if error
|
||||
if window?.console?.log
|
||||
window.console.log(error)
|
||||
else if process?.stdout?.write
|
||||
process.stdout.write(error + '\n')
|
||||
|
||||
# CFI escaping {{{
|
||||
escape_for_cfi = (raw) ->
|
||||
if raw
|
||||
for c in ['^', '[', ']', ',', '(', ')', ';', '~', '@', '-', '!']
|
||||
raw = raw.replace(c, '^'+c)
|
||||
raw
|
||||
|
||||
unescape_from_cfi = (raw) ->
|
||||
ans = raw
|
||||
if raw
|
||||
dropped = false
|
||||
ans = []
|
||||
for c in raw
|
||||
if not dropped and c == '^'
|
||||
dropped = true
|
||||
continue
|
||||
dropped = false
|
||||
ans.push(c)
|
||||
ans = ans.join('')
|
||||
ans
|
||||
# }}}
|
||||
|
||||
fstr = (d) -> # {{{
|
||||
# Convert a timestamp floating point number to a string
|
||||
ans = ""
|
||||
if ( d < 0 )
|
||||
ans = "-"
|
||||
d = -d
|
||||
n = Math.floor(d)
|
||||
ans += n
|
||||
n = Math.round((d-n)*100)
|
||||
if( n != 0 )
|
||||
ans += "."
|
||||
ans += if (n % 10 == 0) then (n/10) else n
|
||||
ans
|
||||
# }}}
|
||||
|
||||
class CanonicalFragmentIdentifier
|
||||
|
||||
# This class is a namespace to expose CFI functions via the window.cfi
|
||||
# object
|
||||
|
||||
constructor: () ->
|
||||
|
||||
encode: (doc, node, offset, tail) -> # {{{
|
||||
cfi = tail or ""
|
||||
|
||||
# Handle the offset, if any
|
||||
switch node.nodeType
|
||||
when 1 # Element node
|
||||
if typeof(offset) == 'number'
|
||||
node = node.childNodes.item(offset)
|
||||
when 3, 4, 5, 6 # Text/entity/CDATA node
|
||||
offset or= 0
|
||||
while true
|
||||
p = node.previousSibling
|
||||
if (p?.nodeType not in [3, 4, 5, 6])
|
||||
break
|
||||
offset += p.nodeValue.length
|
||||
node = p
|
||||
cfi = ":" + offset + cfi
|
||||
else # Not handled
|
||||
log("Offsets for nodes of type #{ node.nodeType } are not handled")
|
||||
|
||||
# Construct the path to node from root
|
||||
until node == doc
|
||||
p = node.parentNode
|
||||
if not p
|
||||
if node.nodeType == 9 # Document node (iframe)
|
||||
win = node.defaultView
|
||||
if win.frameElement
|
||||
node = win.frameElement
|
||||
cfi = "!" + cfi
|
||||
continue
|
||||
break
|
||||
# Increase index by the length of all previous sibling text nodes
|
||||
index = 0
|
||||
child = p.firstChild
|
||||
while true
|
||||
index |= 1
|
||||
if child.nodeType in [1, 7]
|
||||
index++
|
||||
if child == node
|
||||
break
|
||||
child = child.nextSibling
|
||||
|
||||
# Add id assertions for robustness where possible
|
||||
id = node.getAttribute?('id')
|
||||
idspec = if id then "[#{ escape_for_cfi(id) }]" else ''
|
||||
cfi = '/' + index + idspec + cfi
|
||||
node = p
|
||||
|
||||
cfi
|
||||
# }}}
|
||||
|
||||
decode: (cfi, doc=window?.document) -> # {{{
|
||||
simple_node_regex = ///
|
||||
^/(\d+) # The node count
|
||||
(\[[^\]]*\])? # The optional id assertion
|
||||
///
|
||||
error = null
|
||||
node = doc
|
||||
|
||||
until cfi.length <= 0 or error
|
||||
if ( (r = cfi.match(simple_node_regex)) is not null ) # Path step
|
||||
target = parseInt(r[1])
|
||||
assertion = r[2]
|
||||
if assertion
|
||||
assertion = unescape_from_cfi(assertion.slice(1, assertion.length-1))
|
||||
index = 0
|
||||
child = node.firstChild
|
||||
|
||||
while true
|
||||
if not child
|
||||
if assertion # Try to use the assertion to find the node
|
||||
child = doc.getElementById(assertion)
|
||||
if child
|
||||
node = child
|
||||
if not child
|
||||
error = "No matching child found for CFI: " + cfi
|
||||
break
|
||||
index |= 1 # Increment index by 1 if it is even
|
||||
if child.nodeType in [1, 7] # We have an element or a PI
|
||||
index++
|
||||
if ( index == target )
|
||||
cfi = cfi.substr(r[0].length)
|
||||
node = child
|
||||
break
|
||||
child = child.nextSibling
|
||||
|
||||
else if cfi[0] == '!' # Indirection
|
||||
if node.contentDocument
|
||||
node = node.contentDocument
|
||||
cfi = cfi.substr(1)
|
||||
else
|
||||
error = "Cannot reference #{ node.nodeName }'s content:" + cfi
|
||||
|
||||
else
|
||||
break
|
||||
|
||||
if error
|
||||
log(error)
|
||||
return null
|
||||
|
||||
point = {}
|
||||
error = null
|
||||
|
||||
point
|
||||
|
||||
# }}}
|
||||
|
||||
at: (x, y, doc=window?.document) -> # {{{
|
||||
cdoc = doc
|
||||
target = null
|
||||
cwin = cdoc.defaultView
|
||||
tail = ''
|
||||
offset = null
|
||||
name = null
|
||||
|
||||
# Drill down into iframes, etc.
|
||||
while true
|
||||
target = cdoc.elementFromPoint x, y
|
||||
if not target or target.localName == 'html'
|
||||
log("No element at (#{ x }, #{ y })")
|
||||
return null
|
||||
|
||||
name = target.localName
|
||||
if name not in ['iframe', 'embed', 'object']
|
||||
break
|
||||
|
||||
cd = target.contentDocument
|
||||
if not cd
|
||||
break
|
||||
|
||||
x = x + cwin.pageXOffset - target.offsetLeft
|
||||
y = y + cwin.pageYOffset - target.offsetTop
|
||||
cdoc = cd
|
||||
cwin = cdoc.defaultView
|
||||
|
||||
target.normalize()
|
||||
|
||||
if name in ['audio', 'video']
|
||||
tail = "~" + fstr target.currentTime
|
||||
|
||||
if name in ['img', 'video']
|
||||
px = ((x + cwin.scrollX - target.offsetLeft)*100)/target.offsetWidth
|
||||
py = ((y + cwin.scrollY - target.offsetTop)*100)/target.offsetHeight
|
||||
tail = "#{ tail }@#{ fstr px },#{ fstr py }"
|
||||
else if name != 'audio'
|
||||
if cdoc.caretRangeFromPoint # WebKit
|
||||
range = cdoc.caretRangeFromPoint(x, y)
|
||||
if range
|
||||
target = range.startContainer
|
||||
offset = range.startOffset
|
||||
else
|
||||
# TODO: implement a span bisection algorithm for UAs
|
||||
# without caretRangeFromPoint (Gecko, IE)
|
||||
|
||||
this.encode(doc, target, offset, tail)
|
||||
# }}}
|
||||
|
||||
if window?
|
||||
window.cfi = new CanonicalFragmentIdentifier()
|
||||
else if process?
|
||||
# Some debugging code goes here to be run with the coffee interpreter
|
||||
cfi = new CanonicalFragmentIdentifier()
|
||||
t = 'a^!,1'
|
||||
log(t)
|
||||
log(escape_for_cfi(t))
|
||||
log(unescape_from_cfi(escape_for_cfi(t)))
|
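The escaping scheme used by cfi.coffee above is small enough to mirror outside the browser. A Python sketch of the same prefix-caret convention, for illustration only (not part of this commit):

    # Illustrative mirror of escape_for_cfi()/unescape_from_cfi() above.
    SPECIAL = set('^[](),;~@-!')

    def escape_for_cfi(raw):
        # Prefix every CFI special character with a caret.
        return ''.join(('^' + c) if c in SPECIAL else c for c in raw)

    def unescape_from_cfi(raw):
        out, dropped = [], False
        for c in raw:
            if not dropped and c == '^':
                dropped = True
                continue
            dropped = False
            out.append(c)
        return ''.join(out)

    assert unescape_from_cfi(escape_for_cfi('a^!,1')) == 'a^!,1'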
src/calibre/ebooks/oeb/display/test/cfi-test.coffee (new file, 24 lines)
@ -0,0 +1,24 @@
|
||||
#!/usr/bin/env coffee
|
||||
# vim:fileencoding=UTF-8:ts=4:sw=4:sta:et:sts=4:ai
|
||||
|
||||
###
|
||||
Copyright 2011, Kovid Goyal <kovid@kovidgoyal.net>
|
||||
Released under the GPLv3 License
|
||||
###
|
||||
|
||||
viewport_top = (node) ->
|
||||
$(node).offset().top - window.pageYOffset
|
||||
|
||||
viewport_left = (node) ->
|
||||
$(node).offset().left - window.pageXOffset
|
||||
|
||||
window.onload = ->
|
||||
h1 = document.getElementsByTagName('h1')[0]
|
||||
x = h1.scrollLeft + 150
|
||||
y = viewport_top(h1) + h1.offsetHeight/2
|
||||
e = document.elementFromPoint x, y
|
||||
if e.getAttribute('id') != 'first-h1'
|
||||
alert 'Failed to find top h1'
|
||||
return
|
||||
alert window.cfi.at x, y
|
||||
|
src/calibre/ebooks/oeb/display/test/test.html (new file, 14 lines)
@ -0,0 +1,14 @@
|
||||
<!DOCTYPE html>
|
||||
<html>
|
||||
<head>
|
||||
<title>Testing CFI functionality</title>
|
||||
<script type="text/javascript" src="cfi.js"></script>
|
||||
<script type="text/javascript" src="jquery.js"></script>
|
||||
<script type="text/javascript" src="cfi-test.js"></script>
|
||||
</head>
|
||||
<body>
|
||||
<h1 id="first-h1" style="border: solid 1px red">Testing CFI functionality</h1>
|
||||
</body>
|
||||
</html>
|
||||
|
||||
|
src/calibre/ebooks/oeb/display/test/test.py (new file, 26 lines)
@ -0,0 +1,26 @@
|
||||
#!/usr/bin/env python
|
||||
# vim:fileencoding=UTF-8:ts=4:sw=4:sta:et:sts=4:ai
|
||||
from __future__ import (unicode_literals, division, absolute_import,
|
||||
print_function)
|
||||
|
||||
__license__ = 'GPL v3'
|
||||
__copyright__ = '2011, Kovid Goyal <kovid@kovidgoyal.net>'
|
||||
__docformat__ = 'restructuredtext en'
|
||||
|
||||
import os
|
||||
|
||||
try:
|
||||
from calibre.utils.coffeescript import serve
|
||||
except ImportError:
|
||||
import init_calibre
|
||||
if False: init_calibre, serve
|
||||
from calibre.utils.coffeescript import serve
|
||||
|
||||
|
||||
def run_devel_server():
|
||||
os.chdir(os.path.dirname(__file__))
|
||||
serve(['../cfi.coffee', 'cfi-test.coffee'])
|
||||
|
||||
if __name__ == '__main__':
|
||||
run_devel_server()
|
||||
|
@ -1,256 +0,0 @@
|
||||
"""
|
||||
Replacement for htmlentitydefs which uses purely numeric entities.
|
||||
"""
|
||||
|
||||
__license__ = 'GPL v3'
|
||||
__copyright__ = '2008, Marshall T. Vandegrift <llasram@gmail.com>'
|
||||
|
||||
ENTITYDEFS = \
|
||||
{'AElig': 'Æ',
|
||||
'Aacute': 'Á',
|
||||
'Acirc': 'Â',
|
||||
'Agrave': 'À',
|
||||
'Alpha': 'Α',
|
||||
'Aring': 'Å',
|
||||
'Atilde': 'Ã',
|
||||
'Auml': 'Ä',
|
||||
'Beta': 'Β',
|
||||
'Ccedil': 'Ç',
|
||||
'Chi': 'Χ',
|
||||
'Dagger': '‡',
|
||||
'Delta': 'Δ',
|
||||
'ETH': 'Ð',
|
||||
'Eacute': 'É',
|
||||
'Ecirc': 'Ê',
|
||||
'Egrave': 'È',
|
||||
'Epsilon': 'Ε',
|
||||
'Eta': 'Η',
|
||||
'Euml': 'Ë',
|
||||
'Gamma': 'Γ',
|
||||
'Iacute': 'Í',
|
||||
'Icirc': 'Î',
|
||||
'Igrave': 'Ì',
|
||||
'Iota': 'Ι',
|
||||
'Iuml': 'Ï',
|
||||
'Kappa': 'Κ',
|
||||
'Lambda': 'Λ',
|
||||
'Mu': 'Μ',
|
||||
'Ntilde': 'Ñ',
|
||||
'Nu': 'Ν',
|
||||
'OElig': 'Œ',
|
||||
'Oacute': 'Ó',
|
||||
'Ocirc': 'Ô',
|
||||
'Ograve': 'Ò',
|
||||
'Omega': 'Ω',
|
||||
'Omicron': 'Ο',
|
||||
'Oslash': 'Ø',
|
||||
'Otilde': 'Õ',
|
||||
'Ouml': 'Ö',
|
||||
'Phi': 'Φ',
|
||||
'Pi': 'Π',
|
||||
'Prime': '″',
|
||||
'Psi': 'Ψ',
|
||||
'Rho': 'Ρ',
|
||||
'Scaron': 'Š',
|
||||
'Sigma': 'Σ',
|
||||
'THORN': 'Þ',
|
||||
'Tau': 'Τ',
|
||||
'Theta': 'Θ',
|
||||
'Uacute': 'Ú',
|
||||
'Ucirc': 'Û',
|
||||
'Ugrave': 'Ù',
|
||||
'Upsilon': 'Υ',
|
||||
'Uuml': 'Ü',
|
||||
'Xi': 'Ξ',
|
||||
'Yacute': 'Ý',
|
||||
'Yuml': 'Ÿ',
|
||||
'Zeta': 'Ζ',
|
||||
'aacute': 'á',
|
||||
'acirc': 'â',
|
||||
'acute': '´',
|
||||
'aelig': 'æ',
|
||||
'agrave': 'à',
|
||||
'alefsym': 'ℵ',
|
||||
'alpha': 'α',
|
||||
'and': '∧',
|
||||
'ang': '∠',
|
||||
'aring': 'å',
|
||||
'asymp': '≈',
|
||||
'atilde': 'ã',
|
||||
'auml': 'ä',
|
||||
'bdquo': '„',
|
||||
'beta': 'β',
|
||||
'brvbar': '¦',
|
||||
'bull': '•',
|
||||
'cap': '∩',
|
||||
'ccedil': 'ç',
|
||||
'cedil': '¸',
|
||||
'cent': '¢',
|
||||
'chi': 'χ',
|
||||
'circ': 'ˆ',
|
||||
'clubs': '♣',
|
||||
'cong': '≅',
|
||||
'copy': '©',
|
||||
'crarr': '↵',
|
||||
'cup': '∪',
|
||||
'curren': '¤',
|
||||
'dArr': '⇓',
|
||||
'dagger': '†',
|
||||
'darr': '↓',
|
||||
'deg': '°',
|
||||
'delta': 'δ',
|
||||
'diams': '♦',
|
||||
'divide': '÷',
|
||||
'eacute': 'é',
|
||||
'ecirc': 'ê',
|
||||
'egrave': 'è',
|
||||
'empty': '∅',
|
||||
'emsp': ' ',
|
||||
'ensp': ' ',
|
||||
'epsilon': 'ε',
|
||||
'equiv': '≡',
|
||||
'eta': 'η',
|
||||
'eth': 'ð',
|
||||
'euml': 'ë',
|
||||
'euro': '€',
|
||||
'exist': '∃',
|
||||
'fnof': 'ƒ',
|
||||
'forall': '∀',
|
||||
'frac12': '½',
|
||||
'frac14': '¼',
|
||||
'frac34': '¾',
|
||||
'frasl': '⁄',
|
||||
'gamma': 'γ',
|
||||
'ge': '≥',
|
||||
'hArr': '⇔',
|
||||
'harr': '↔',
|
||||
'hearts': '♥',
|
||||
'hellip': '…',
|
||||
'iacute': 'í',
|
||||
'icirc': 'î',
|
||||
'iexcl': '¡',
|
||||
'igrave': 'ì',
|
||||
'image': 'ℑ',
|
||||
'infin': '∞',
|
||||
'int': '∫',
|
||||
'iota': 'ι',
|
||||
'iquest': '¿',
|
||||
'isin': '∈',
|
||||
'iuml': 'ï',
|
||||
'kappa': 'κ',
|
||||
'lArr': '⇐',
|
||||
'lambda': 'λ',
|
||||
'lang': '〈',
|
||||
'laquo': '«',
|
||||
'larr': '←',
|
||||
'lceil': '⌈',
|
||||
'ldquo': '“',
|
||||
'le': '≤',
|
||||
'lfloor': '⌊',
|
||||
'lowast': '∗',
|
||||
'loz': '◊',
|
||||
'lrm': '‎',
|
||||
'lsaquo': '‹',
|
||||
'lsquo': '‘',
|
||||
'macr': '¯',
|
||||
'mdash': '—',
|
||||
'micro': 'µ',
|
||||
'middot': '·',
|
||||
'minus': '−',
|
||||
'mu': 'μ',
|
||||
'nabla': '∇',
|
||||
'nbsp': ' ',
|
||||
'ndash': '–',
|
||||
'ne': '≠',
|
||||
'ni': '∋',
|
||||
'not': '¬',
|
||||
'notin': '∉',
|
||||
'nsub': '⊄',
|
||||
'ntilde': 'ñ',
|
||||
'nu': 'ν',
|
||||
'oacute': 'ó',
|
||||
'ocirc': 'ô',
|
||||
'oelig': 'œ',
|
||||
'ograve': 'ò',
|
||||
'oline': '‾',
|
||||
'omega': 'ω',
|
||||
'omicron': 'ο',
|
||||
'oplus': '⊕',
|
||||
'or': '∨',
|
||||
'ordf': 'ª',
|
||||
'ordm': 'º',
|
||||
'oslash': 'ø',
|
||||
'otilde': 'õ',
|
||||
'otimes': '⊗',
|
||||
'ouml': 'ö',
|
||||
'para': '¶',
|
||||
'part': '∂',
|
||||
'permil': '‰',
|
||||
'perp': '⊥',
|
||||
'phi': 'φ',
|
||||
'pi': 'π',
|
||||
'piv': 'ϖ',
|
||||
'plusmn': '±',
|
||||
'pound': '£',
|
||||
'prime': '′',
|
||||
'prod': '∏',
|
||||
'prop': '∝',
|
||||
'psi': 'ψ',
|
||||
'rArr': '⇒',
|
||||
'radic': '√',
|
||||
'rang': '〉',
|
||||
'raquo': '»',
|
||||
'rarr': '→',
|
||||
'rceil': '⌉',
|
||||
'rdquo': '”',
|
||||
'real': 'ℜ',
|
||||
'reg': '®',
|
||||
'rfloor': '⌋',
|
||||
'rho': 'ρ',
|
||||
'rlm': '‏',
|
||||
'rsaquo': '›',
|
||||
'rsquo': '’',
|
||||
'sbquo': '‚',
|
||||
'scaron': 'š',
|
||||
'sdot': '⋅',
|
||||
'sect': '§',
|
||||
'shy': '­',
|
||||
'sigma': 'σ',
|
||||
'sigmaf': 'ς',
|
||||
'sim': '∼',
|
||||
'spades': '♠',
|
||||
'sub': '⊂',
|
||||
'sube': '⊆',
|
||||
'sum': '∑',
|
||||
'sup': '⊃',
|
||||
'sup1': '¹',
|
||||
'sup2': '²',
|
||||
'sup3': '³',
|
||||
'supe': '⊇',
|
||||
'szlig': 'ß',
|
||||
'tau': 'τ',
|
||||
'there4': '∴',
|
||||
'theta': 'θ',
|
||||
'thetasym': 'ϑ',
|
||||
'thinsp': ' ',
|
||||
'thorn': 'þ',
|
||||
'tilde': '˜',
|
||||
'times': '×',
|
||||
'trade': '™',
|
||||
'uArr': '⇑',
|
||||
'uacute': 'ú',
|
||||
'uarr': '↑',
|
||||
'ucirc': 'û',
|
||||
'ugrave': 'ù',
|
||||
'uml': '¨',
|
||||
'upsih': 'ϒ',
|
||||
'upsilon': 'υ',
|
||||
'uuml': 'ü',
|
||||
'weierp': '℘',
|
||||
'xi': 'ξ',
|
||||
'yacute': 'ý',
|
||||
'yen': '¥',
|
||||
'yuml': 'ÿ',
|
||||
'zeta': 'ζ',
|
||||
'zwj': '‍',
|
||||
'zwnj': '‌'}
|
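With entitydefs.py removed, named HTML entities are handled by calibre's xml_replace_entities helper, which both the new parse_utils.py below and the updated OEB reader call. A minimal sketch of the idea; the markup string is an invented example:

    from calibre import xml_replace_entities

    markup = '<p>Caf&eacute; &ndash; &frac12; price</p>'
    # Named entities are replaced by the characters they stand for, while the
    # XML-predefined ones (&amp;, &lt;, ...) are assumed to be left alone so
    # the result can still be fed to an XML parser.
    print(xml_replace_entities(markup, encoding=None))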
src/calibre/ebooks/oeb/parse_utils.py (new file, 347 lines)
@ -0,0 +1,347 @@
|
||||
#!/usr/bin/env python
|
||||
# vim:fileencoding=UTF-8:ts=4:sw=4:sta:et:sts=4:ai
|
||||
from __future__ import (unicode_literals, division, absolute_import,
|
||||
print_function)
|
||||
|
||||
__license__ = 'GPL v3'
|
||||
__copyright__ = '2011, Kovid Goyal <kovid@kovidgoyal.net>'
|
||||
__docformat__ = 'restructuredtext en'
|
||||
|
||||
import re
|
||||
|
||||
from lxml import etree, html
|
||||
|
||||
from calibre import xml_replace_entities, force_unicode
|
||||
from calibre.constants import filesystem_encoding
|
||||
from calibre.ebooks.chardet import xml_to_unicode, strip_encoding_declarations
|
||||
|
||||
RECOVER_PARSER = etree.XMLParser(recover=True, no_network=True)
|
||||
XHTML_NS = 'http://www.w3.org/1999/xhtml'
|
||||
|
||||
class NotHTML(Exception):
|
||||
|
||||
def __init__(self, root_tag):
|
||||
Exception.__init__(self, 'Data is not HTML')
|
||||
self.root_tag = root_tag
|
||||
|
||||
def barename(name):
|
||||
return name.rpartition('}')[-1]
|
||||
|
||||
def namespace(name):
|
||||
if '}' in name:
|
||||
return name.split('}', 1)[0][1:]
|
||||
return ''
|
||||
|
||||
def XHTML(name):
|
||||
return '{%s}%s' % (XHTML_NS, name)
|
||||
|
||||
def xpath(elem, expr):
|
||||
return elem.xpath(expr, namespaces={'h':XHTML_NS})
|
||||
|
||||
def XPath(expr):
|
||||
return etree.XPath(expr, namespaces={'h':XHTML_NS})
|
||||
|
||||
META_XP = XPath('/h:html/h:head/h:meta[@http-equiv="Content-Type"]')
|
||||
|
||||
def merge_multiple_html_heads_and_bodies(root, log=None):
|
||||
heads, bodies = xpath(root, '//h:head'), xpath(root, '//h:body')
|
||||
if not (len(heads) > 1 or len(bodies) > 1): return root
|
||||
for child in root: root.remove(child)
|
||||
head = root.makeelement(XHTML('head'))
|
||||
body = root.makeelement(XHTML('body'))
|
||||
for h in heads:
|
||||
for x in h:
|
||||
head.append(x)
|
||||
for b in bodies:
|
||||
for x in b:
|
||||
body.append(x)
|
||||
map(root.append, (head, body))
|
||||
if log is not None:
|
||||
log.warn('Merging multiple <head> and <body> sections')
|
||||
return root
|
||||
|
||||
def _html5_parse(data):
|
||||
import html5lib
|
||||
data = html5lib.parse(data, treebuilder='lxml').getroot()
|
||||
html_ns = [ns for ns, val in data.nsmap.iteritems() if (val == XHTML_NS and
|
||||
ns is not None)]
|
||||
if html_ns:
|
||||
# html5lib causes the XHTML namespace to not
|
||||
# be set as the default namespace
|
||||
nsmap = dict(data.nsmap)
|
||||
nsmap[None] = XHTML_NS
|
||||
for x in html_ns:
|
||||
nsmap.pop(x)
|
||||
nroot = etree.Element(data.tag, nsmap=nsmap,
|
||||
attrib=dict(data.attrib))
|
||||
nroot.text = data.text
|
||||
nroot.tail = data.tail
|
||||
for child in data:
|
||||
nroot.append(child)
|
||||
data = nroot
|
||||
return data
|
||||
|
||||
def _html4_parse(data, prefer_soup=False):
|
||||
if prefer_soup:
|
||||
from calibre.utils.soupparser import fromstring
|
||||
data = fromstring(data)
|
||||
else:
|
||||
data = html.fromstring(data)
|
||||
data.attrib.pop('xmlns', None)
|
||||
for elem in data.iter(tag=etree.Comment):
|
||||
if elem.text:
|
||||
elem.text = elem.text.strip('-')
|
||||
data = etree.tostring(data, encoding=unicode)
|
||||
|
||||
# Setting huge_tree=True causes crashes in windows with large files
|
||||
parser = etree.XMLParser(no_network=True)
|
||||
try:
|
||||
data = etree.fromstring(data, parser=parser)
|
||||
except etree.XMLSyntaxError:
|
||||
data = etree.fromstring(data, parser=RECOVER_PARSER)
|
||||
return data
|
||||
|
||||
def clean_word_doc(data, log):
|
||||
prefixes = []
|
||||
for match in re.finditer(r'xmlns:(\S+?)=".*?microsoft.*?"', data):
|
||||
prefixes.append(match.group(1))
|
||||
if prefixes:
|
||||
log.warn('Found microsoft markup, cleaning...')
|
||||
# Remove empty tags as they are not rendered by browsers
|
||||
# but can become renderable HTML tags like <p/> if the
|
||||
# document is parsed by an HTML parser
|
||||
pat = re.compile(
|
||||
r'<(%s):([a-zA-Z0-9]+)[^>/]*?></\1:\2>'%('|'.join(prefixes)),
|
||||
re.DOTALL)
|
||||
data = pat.sub('', data)
|
||||
pat = re.compile(
|
||||
r'<(%s):([a-zA-Z0-9]+)[^>/]*?/>'%('|'.join(prefixes)))
|
||||
data = pat.sub('', data)
|
||||
return data
|
||||
|
||||
def parse_html(data, log=None, decoder=None, preprocessor=None,
|
||||
filename='<string>', non_html_file_tags=frozenset()):
|
||||
if log is None:
|
||||
from calibre.utils.logging import default_log
|
||||
log = default_log
|
||||
|
||||
filename = force_unicode(filename, enc=filesystem_encoding)
|
||||
|
||||
if not isinstance(data, unicode):
|
||||
if decoder is not None:
|
||||
data = decoder(data)
|
||||
else:
|
||||
data = xml_to_unicode(data)[0]
|
||||
|
||||
data = strip_encoding_declarations(data)
|
||||
if preprocessor is not None:
|
||||
data = preprocessor(data)
|
||||
|
||||
# There could be null bytes in data if it had � entities in it
|
||||
data = data.replace('\0', '')
|
||||
|
||||
# Remove DOCTYPE declaration as it messes up parsing
|
||||
# In particular, it causes tostring to insert xmlns
|
||||
# declarations, which messes up the coercing logic
|
||||
idx = data.find('<html')
|
||||
if idx == -1:
|
||||
idx = data.find('<HTML')
|
||||
if idx > -1:
|
||||
pre = data[:idx]
|
||||
data = data[idx:]
|
||||
if '<!DOCTYPE' in pre: # Handle user defined entities
|
||||
user_entities = {}
|
||||
for match in re.finditer(r'<!ENTITY\s+(\S+)\s+([^>]+)', pre):
|
||||
val = match.group(2)
|
||||
if val.startswith('"') and val.endswith('"'):
|
||||
val = val[1:-1]
|
||||
user_entities[match.group(1)] = val
|
||||
if user_entities:
|
||||
pat = re.compile(r'&(%s);'%('|'.join(user_entities.keys())))
|
||||
data = pat.sub(lambda m:user_entities[m.group(1)], data)
|
||||
|
||||
data = clean_word_doc(data, log)
|
||||
|
||||
# Setting huge_tree=True causes crashes in windows with large files
|
||||
parser = etree.XMLParser(no_network=True)
|
||||
|
||||
# Try with more & more drastic measures to parse
|
||||
try:
|
||||
data = etree.fromstring(data, parser=parser)
|
||||
except etree.XMLSyntaxError:
|
||||
log.debug('Initial parse failed, using more'
|
||||
' forgiving parsers')
|
||||
data = xml_replace_entities(data)
|
||||
try:
|
||||
data = etree.fromstring(data, parser=parser)
|
||||
except etree.XMLSyntaxError:
|
||||
log.debug('Parsing %s as HTML' % filename)
|
||||
try:
|
||||
data = _html5_parse(data)
|
||||
except:
|
||||
log.exception(
|
||||
'HTML 5 parsing failed, falling back to older parsers')
|
||||
data = _html4_parse(data)
|
||||
|
||||
if data.tag == 'HTML':
|
||||
# Lower case all tag and attribute names
|
||||
data.tag = data.tag.lower()
|
||||
for x in data.iterdescendants():
|
||||
try:
|
||||
x.tag = x.tag.lower()
|
||||
for key, val in list(x.attrib.iteritems()):
|
||||
del x.attrib[key]
|
||||
key = key.lower()
|
||||
x.attrib[key] = val
|
||||
except:
|
||||
pass
|
||||
|
||||
if barename(data.tag) != 'html':
|
||||
if barename(data.tag) in non_html_file_tags:
|
||||
raise NotHTML(data.tag)
|
||||
log.warn('File %r does not appear to be (X)HTML'%filename)
|
||||
nroot = etree.fromstring('<html></html>')
|
||||
has_body = False
|
||||
for child in list(data):
|
||||
if isinstance(child.tag, (unicode, str)) and barename(child.tag) == 'body':
|
||||
has_body = True
|
||||
break
|
||||
parent = nroot
|
||||
if not has_body:
|
||||
log.warn('File %r appears to be a HTML fragment'%filename)
|
||||
nroot = etree.fromstring('<html><body/></html>')
|
||||
parent = nroot[0]
|
||||
for child in list(data.iter()):
|
||||
oparent = child.getparent()
|
||||
if oparent is not None:
|
||||
oparent.remove(child)
|
||||
parent.append(child)
|
||||
data = nroot
|
||||
|
||||
# Force into the XHTML namespace
|
||||
if not namespace(data.tag):
|
||||
log.warn('Forcing', filename, 'into XHTML namespace')
|
||||
data.attrib['xmlns'] = XHTML_NS
|
||||
data = etree.tostring(data, encoding=unicode)
|
||||
|
||||
try:
|
||||
data = etree.fromstring(data, parser=parser)
|
||||
except:
|
||||
data = data.replace(':=', '=').replace(':>', '>')
|
||||
data = data.replace('<http:/>', '')
|
||||
try:
|
||||
data = etree.fromstring(data, parser=parser)
|
||||
except etree.XMLSyntaxError:
|
||||
log.warn('Stripping comments from %s'%
|
||||
filename)
|
||||
data = re.compile(r'<!--.*?-->', re.DOTALL).sub('',
|
||||
data)
|
||||
data = data.replace(
|
||||
"<?xml version='1.0' encoding='utf-8'?><o:p></o:p>",
|
||||
'')
|
||||
data = data.replace("<?xml version='1.0' encoding='utf-8'??>", '')
|
||||
try:
|
||||
data = etree.fromstring(data,
|
||||
parser=RECOVER_PARSER)
|
||||
except etree.XMLSyntaxError:
|
||||
log.warn('Stripping meta tags from %s'% filename)
|
||||
data = re.sub(r'<meta\s+[^>]+?>', '', data)
|
||||
data = etree.fromstring(data, parser=RECOVER_PARSER)
|
||||
elif namespace(data.tag) != XHTML_NS:
|
||||
# OEB_DOC_NS, but possibly others
|
||||
ns = namespace(data.tag)
|
||||
attrib = dict(data.attrib)
|
||||
nroot = etree.Element(XHTML('html'),
|
||||
nsmap={None: XHTML_NS}, attrib=attrib)
|
||||
for elem in data.iterdescendants():
|
||||
if isinstance(elem.tag, basestring) and \
|
||||
namespace(elem.tag) == ns:
|
||||
elem.tag = XHTML(barename(elem.tag))
|
||||
for elem in data:
|
||||
nroot.append(elem)
|
||||
data = nroot
|
||||
|
||||
data = merge_multiple_html_heads_and_bodies(data, log)
|
||||
# Ensure has a <head/>
|
||||
head = xpath(data, '/h:html/h:head')
|
||||
head = head[0] if head else None
|
||||
if head is None:
|
||||
log.warn('File %s missing <head/> element' % filename)
|
||||
head = etree.Element(XHTML('head'))
|
||||
data.insert(0, head)
|
||||
title = etree.SubElement(head, XHTML('title'))
|
||||
title.text = _('Unknown')
|
||||
elif not xpath(data, '/h:html/h:head/h:title'):
|
||||
log.warn('File %s missing <title/> element' % filename)
|
||||
title = etree.SubElement(head, XHTML('title'))
|
||||
title.text = _('Unknown')
|
||||
# Remove any encoding-specifying <meta/> elements
|
||||
for meta in META_XP(data):
|
||||
meta.getparent().remove(meta)
|
||||
etree.SubElement(head, XHTML('meta'),
|
||||
attrib={'http-equiv': 'Content-Type',
|
||||
'content': '%s; charset=utf-8' % XHTML_NS})
|
||||
# Ensure has a <body/>
|
||||
if not xpath(data, '/h:html/h:body'):
|
||||
body = xpath(data, '//h:body')
|
||||
if body:
|
||||
body = body[0]
|
||||
body.getparent().remove(body)
|
||||
data.append(body)
|
||||
else:
|
||||
log.warn('File %s missing <body/> element' % filename)
|
||||
etree.SubElement(data, XHTML('body'))
|
||||
|
||||
# Remove microsoft office markup
|
||||
r = [x for x in data.iterdescendants(etree.Element) if 'microsoft-com' in x.tag]
|
||||
for x in r:
|
||||
x.tag = XHTML('span')
|
||||
|
||||
# Remove lang redefinition inserted by the amazing Microsoft Word!
|
||||
body = xpath(data, '/h:html/h:body')[0]
|
||||
for key in list(body.attrib.keys()):
|
||||
if key == 'lang' or key.endswith('}lang'):
|
||||
body.attrib.pop(key)
|
||||
|
||||
def remove_elem(a):
|
||||
p = a.getparent()
|
||||
idx = p.index(a) -1
|
||||
p.remove(a)
|
||||
if a.tail:
|
||||
if idx <= 0:
|
||||
if p.text is None:
|
||||
p.text = ''
|
||||
p.text += a.tail
|
||||
else:
|
||||
if p[idx].tail is None:
|
||||
p[idx].tail = ''
|
||||
p[idx].tail += a.tail
|
||||
|
||||
# Remove hyperlinks with no content as they cause rendering
|
||||
# artifacts in browser based renderers
|
||||
# Also remove empty <b>, <u> and <i> tags
|
||||
for a in xpath(data, '//h:a[@href]|//h:i|//h:b|//h:u'):
|
||||
if a.get('id', None) is None and a.get('name', None) is None \
|
||||
and len(a) == 0 and not a.text:
|
||||
remove_elem(a)
|
||||
|
||||
# Convert <br>s with content into paragraphs as ADE can't handle
|
||||
# them
|
||||
for br in xpath(data, '//h:br'):
|
||||
if len(br) > 0 or br.text:
|
||||
br.tag = XHTML('div')
|
||||
|
||||
# Remove any stray text in the <head> section and format it nicely
|
||||
data.text = '\n '
|
||||
head = xpath(data, '//h:head')
|
||||
if head:
|
||||
head = head[0]
|
||||
head.text = '\n '
|
||||
head.tail = '\n '
|
||||
for child in head:
|
||||
child.tail = '\n '
|
||||
child.tail = '\n '
|
||||
|
||||
return data
|
||||
|
||||
|
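The module above gives the conversion pipeline a single parse_html() entry point. A quick way to exercise it on its own, assuming a calibre development environment and using an invented fragment:

    from lxml import etree
    from calibre.ebooks.oeb.parse_utils import parse_html, XHTML_NS

    root = parse_html('<p>Broken &nbsp; fragment with <b>no</b> html wrapper')
    # Fragments are wrapped, entities resolved, and the result is forced into
    # the XHTML namespace with <head>, <title> and <body> guaranteed to exist.
    assert root.tag == '{%s}html' % XHTML_NS
    print(etree.tostring(root, pretty_print=True))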
@ -19,16 +19,15 @@ from calibre.ebooks.oeb.base import OPF1_NS, OPF2_NS, OPF2_NSMAP, DC11_NS, \
from calibre.ebooks.oeb.base import OEB_DOCS, OEB_STYLES, OEB_IMAGES, \
PAGE_MAP_MIME, JPEG_MIME, NCX_MIME, SVG_MIME
from calibre.ebooks.oeb.base import XMLDECL_RE, COLLAPSE_RE, \
ENTITY_RE, MS_COVER_TYPE, iterlinks
MS_COVER_TYPE, iterlinks
from calibre.ebooks.oeb.base import namespace, barename, XPath, xpath, \
urlnormalize, BINARY_MIME, \
OEBError, OEBBook, DirContainer
from calibre.ebooks.oeb.writer import OEBWriter
from calibre.ebooks.oeb.entitydefs import ENTITYDEFS
from calibre.utils.localization import get_lang
from calibre.ptempfile import TemporaryDirectory
from calibre.constants import __appname__, __version__
from calibre import guess_type
from calibre import guess_type, xml_replace_entities

__all__ = ['OEBReader']

@ -107,8 +106,7 @@ class OEBReader(object):
try:
opf = etree.fromstring(data)
except etree.XMLSyntaxError:
repl = lambda m: ENTITYDEFS.get(m.group(1), m.group(0))
data = ENTITY_RE.sub(repl, data)
data = xml_replace_entities(data, encoding=None)
try:
opf = etree.fromstring(data)
self.logger.warn('OPF contains invalid HTML named entities')

@ -371,8 +369,15 @@ class OEBReader(object):
else :
description = None

index_image = xpath(child,
'descendant::calibre:meta[@name = "toc_thumbnail"]')
toc_thumbnail = (index_image[0].text if index_image else None)
if not toc_thumbnail or not toc_thumbnail.strip():
toc_thumbnail = None

node = toc.add(title, href, id=id, klass=klass,
play_order=po, description=description, author=author)
play_order=po, description=description, author=author,
toc_thumbnail=toc_thumbnail)

self._toc_from_navpoint(item, node, child)
@ -56,8 +56,11 @@ def render_html(mi, css, vertical, widget, all_fields=False): # {{{
</body>
<html>
'''%(f, c, css)
fm = getattr(mi, 'field_metadata', field_metadata)
fl = dict(get_field_list(fm))
show_comments = (all_fields or fl.get('comments', True))
comments = u''
if mi.comments:
if mi.comments and show_comments:
comments = comments_to_html(force_unicode(mi.comments))
right_pane = u'<div id="comments" class="comments">%s</div>'%comments
@ -429,7 +429,7 @@ def populate_metadata_page(layout, db, book_id, bulk=False, two_column=False, pa
# The fields named here must be first in the widget list
tweak_cols = tweaks['metadata_edit_custom_column_order']
comments_in_tweak = 0
for key in tweak_cols:
for key in (tweak_cols or ()):
# Add the key if it really exists in the database
if key in cols_to_display:
cols.append(key)
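The (tweak_cols or ()) guard above keeps the loop safe when the tweak is unset or empty. In Preferences->Tweaks the order is specified with an entry along these lines (the column names are invented):

    # Custom columns named here are edited first, in this order, in the
    # edit metadata dialog.
    metadata_edit_custom_column_order = ['#genre', '#myseries', '#read']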
@ -441,7 +441,7 @@ class Scheduler(QObject):
self.news_menu.addAction(self.cac)
self.news_menu.addSeparator()
self.all_action = self.news_menu.addAction(
_('Download all scheduled new sources'),
_('Download all scheduled news sources'),
self.download_all_scheduled)

self.timer = QTimer(self)
@ -758,11 +758,12 @@ class EbookViewer(MainWindow, Ui_EbookViewer):
self.set_page_number(frac)

def next_document(self):
if self.current_index < len(self.iterator.spine) - 1:
if (hasattr(self, 'current_index') and self.current_index <
len(self.iterator.spine) - 1):
self.load_path(self.iterator.spine[self.current_index+1])

def previous_document(self):
if self.current_index > 0:
if hasattr(self, 'current_index') and self.current_index > 0:
self.load_path(self.iterator.spine[self.current_index-1], pos=1.0)

def keyPressEvent(self, event):
(The remaining file diffs, including src/calibre/translations/ku.po added as a new file with 19163 lines, are suppressed because they are too large, and some changed files are not shown at all because too many files changed in this commit.)