Merge from trunk

commit f9b6ce9470
Author: Charles Haley
Date:   2011-12-16 09:24:15 +01:00

156 changed files with 54033 additions and 20129 deletions


@@ -2,6 +2,8 @@
 .check-cache.pickle
 src/calibre/plugins
 resources/images.qrc
+src/calibre/ebooks/oeb/display/test/*.js
+resources/display/*.js
 src/calibre/manual/.build/
 src/calibre/manual/cli/
 src/calibre/manual/template_ref.rst


@@ -19,6 +19,65 @@
 # new recipes:
 #  - title:

+- version: 0.8.31
+  date: 2011-12-16
+
+  new features:
+    - title: "Conversion engine: When parsing invalid XHTML use the HTML 5 algorithm, for greater robustness."
+      tickets: [901466]
+
+    - title: "Driver for PocketBook 611 and Lenovo IdeaPad"
+
+    - title: "Allow customization of the order in which custom column editing is performed in the edit metadata dialog. Setting is available via Preferences->Tweaks."
+      tickets: [902731]
+
+    - title: "MOBI news download: Allow recipes to set a thumbnail for entries in the periodical table of contents. Currently used by the NYTimes, WSJ, Independent, Guardian and Globe and Mail recipes"
+      tickets: [900130]
+
+    - title: "E-book viewer: Add an option to the right click menu to search for the currently selected word"
+
+    - title: "Automatically hide the no internet connection available error message if the connection is restored before the user clicks OK"
+
+  bug fixes:
+    - title: "Fix comments not being hidden in the Book details panel when they are turned off via Preferences->Look & Feel->Book Details"
+
+    - title: "E-book viewer: Do not popup an error message if the user tries to use the mouse wheel to scroll before a document is loaded."
+      tickets: [903449]
+
+    - title: "Add docx to the list of ebook extensions."
+      tickets: [903452]
+
+    - title: "When downloading metadata from non-English Amazon websites, do not correct the case of book titles."
+
+    - title: "Fix regression in 0.8.30 that broke bulk conversion of a single book."
+      tickets: [902506]
+
+    - title: "When minimized to system tray do not display the no internet connection error as a dialog box, instead use a system tray notification"
+
+    - title: "Catalog generation: Include the series_index field for custom series columns as well"
+
+    - title: "Comic Input: Do not rescale images when using the Tablet output profile (or any output profile with a screen size larger than 3000x3000)"
+
+    - title: "HTML Input: Ignore unparseable URLs instead of crashing on them."
+      tickets: [902372]
+
+  improved recipes:
+    - La Republica
+    - CND
+    - Berliner Zeitung
+    - Zaman Gazetesi
+
+  new recipes:
+    - title: CND Weekly
+      author: Derek Liang
+
+    - title: descopera.org
+      author: Marius Ignatescu
+
+    - title: Rynek Zdrowia
+      author: spi630

 - version: 0.8.30
   date: 2011-12-09
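Several recipe hunks below wire the new TOC-thumbnail feature up with the same pattern: override populate_article_metadata and hand the first usable <img> on the page to add_toc_thumbnail. A minimal sketch of the pattern, extracted from those hunks (the class name here is illustrative; the hasattr guard keeps the recipe loadable on older calibre builds that lack the new API):

    from calibre.web.feeds.news import BasicNewsRecipe

    class ExampleRecipe(BasicNewsRecipe):

        def populate_article_metadata(self, article, soup, first):
            # Only the first page of an article picks the thumbnail, and
            # only when this calibre build supports TOC thumbnails at all
            if first and hasattr(self, 'add_toc_thumbnail'):
                img = soup.find('img')
                if img is not None:
                    self.add_toc_thumbnail(article, img['src'])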


@@ -1,61 +1,44 @@
 from calibre.web.feeds.recipes import BasicNewsRecipe
-import re
+
+'''Calibre recipe to convert the RSS feeds of the Berliner Zeitung to an ebook.'''

 class SportsIllustratedRecipe(BasicNewsRecipe) :
-    __author__    = 'ape'
-    __copyright__ = 'ape'
+    __author__    = 'a.peter'
+    __copyright__ = 'a.peter'
     __license__   = 'GPL v3'
     language      = 'de'
-    description   = 'Berliner Zeitung'
-    version       = 2
+    description   = 'Berliner Zeitung RSS'
+    version       = 4
     title         = u'Berliner Zeitung'
     timefmt       = ' [%d.%m.%Y]'
-    #oldest_article = 7.0
     no_stylesheets = True
     remove_javascript = True
     use_embedded_content = False
     publication_type = 'newspaper'

-    keep_only_tags = [dict(name='div', attrs={'class':'teaser t_split t_artikel'})]
+    remove_tags_before = dict(name='div', attrs={'class':'newstype'})
+    remove_tags_after = [dict(id='article_text')]

-    INDEX = 'http://www.berlinonline.de/berliner-zeitung/'
-
-    def parse_index(self):
-        base = 'http://www.berlinonline.de'
-        answer = []
-        articles = {}
-        more = 1
-
-        soup = self.index_to_soup(self.INDEX)
-
-        # Get list of links to ressorts from index page
-        ressort_list = soup.findAll('ul', attrs={'class': re.compile('ressortlist')})
-        for ressort in ressort_list[0].findAll('a'):
-            feed_title = ressort.string
-            print 'Analyzing', feed_title
-            if not articles.has_key(feed_title):
-                articles[feed_title] = []
-            answer.append(feed_title)
-            # Load ressort page.
-            feed = self.index_to_soup('http://www.berlinonline.de' + ressort['href'])
-            # find mainbar div which contains the list of all articles
-            for article_container in feed.findAll('div', attrs={'class': re.compile('mainbar')}):
-                # iterate over all articles
-                for article_teaser in article_container.findAll('div', attrs={'class': re.compile('teaser')}):
-                    # extract title of article
-                    if article_teaser.h3 != None:
-                        article = {'title' : article_teaser.h3.a.string, 'date' : u'', 'url' : base + article_teaser.h3.a['href'], 'description' : u''}
-                        articles[feed_title].append(article)
-                    else:
-                        # Skip teasers for missing photos
-                        if article_teaser.div.p.contents[0].find('Foto:') > -1:
-                            continue
-                        article = {'title': 'Weitere Artikel ' + str(more), 'date': u'', 'url': base + article_teaser.div.p.a['href'], 'description': u''}
-                        articles[feed_title].append(article)
-                        more += 1
-        answer = [[key, articles[key]] for key in answer if articles.has_key(key)]
-        return answer
+    feeds = [(u'Startseite', u'http://www.berliner-zeitung.de/home/10808950,10808950,view,asFeed.xml'),
+             (u'Politik', u'http://www.berliner-zeitung.de/home/10808018,10808018,view,asFeed.xml'),
+             (u'Wirtschaft', u'http://www.berliner-zeitung.de/home/10808230,10808230,view,asFeed.xml'),
+             (u'Berlin', u'http://www.berliner-zeitung.de/home/10809148,10809148,view,asFeed.xml'),
+             (u'Brandenburg', u'http://www.berliner-zeitung.de/home/10809312,10809312,view,asFeed.xml'),
+             (u'Wissenschaft', u'http://www.berliner-zeitung.de/home/10808894,10808894,view,asFeed.xml'),
+             (u'Digital', u'http://www.berliner-zeitung.de/home/10808718,10808718,view,asFeed.xml'),
+             (u'Kultur', u'http://www.berliner-zeitung.de/home/10809150,10809150,view,asFeed.xml'),
+             (u'Panorama', u'http://www.berliner-zeitung.de/home/10808334,10808334,view,asFeed.xml'),
+             (u'Sport', u'http://www.berliner-zeitung.de/home/10808794,10808794,view,asFeed.xml'),
+             (u'Hertha', u'http://www.berliner-zeitung.de/home/10808800,10808800,view,asFeed.xml'),
+             (u'Union', u'http://www.berliner-zeitung.de/home/10808802,10808802,view,asFeed.xml'),
+             (u'Verkehr', u'http://www.berliner-zeitung.de/home/10809298,10809298,view,asFeed.xml'),
+             (u'Polizei', u'http://www.berliner-zeitung.de/home/10809296,10809296,view,asFeed.xml'),
+             (u'Meinung', u'http://www.berliner-zeitung.de/home/10808020,10808020,view,asFeed.xml')]

     def get_masthead_url(self):
-        return 'http://www.berlinonline.de/.img/berliner-zeitung/blz_logo.gif'
+        return 'http://www.berliner-zeitung.de/image/view/10810244,7040611,data,logo.png'
+
+    def print_version(self, url):
+        return url.replace('.html', ',view,printVersion.html')


@@ -23,7 +23,9 @@ class TheCND(BasicNewsRecipe):
     remove_tags = [dict(name='table', attrs={'align':'right'}), dict(name='img', attrs={'src':'http://my.cnd.org/images/logo.gif'}), dict(name='hr', attrs={}), dict(name='small', attrs={})]
     no_stylesheets = True

-    preprocess_regexps = [(re.compile(r'<!--.*?-->', re.DOTALL), lambda m: '')]
+    preprocess_regexps = [ (re.compile(r'<!--.*?-->', re.DOTALL), lambda m: ''),
+                           (re.compile('<table width.*?</table>', re.DOTALL), lambda m: ''),
+                         ]

     def print_version(self, url):
         if url.find('news/article.php') >= 0:
@@ -46,13 +48,15 @@ class TheCND(BasicNewsRecipe):
             title = self.tag_to_string(a)
             self.log('\tFound article: ', title, 'at', url)
             date = a.nextSibling
+            if re.search('cm', date):
+                continue
             if (date is not None) and len(date)>2:
                 if not articles.has_key(date):
                     articles[date] = []
                 articles[date].append({'title':title, 'url':url, 'description': '', 'date':''})
                 self.log('\t\tAppend to : ', date)

-        self.log('log articles', articles)
+        #self.log('log articles', articles)
         mostCurrent = sorted(articles).pop()
         self.title = 'CND ' + mostCurrent

recipes/cnd_weekly.recipe (new file, 72 lines)

@@ -0,0 +1,72 @@
+#!/usr/bin/env python
+
+__license__   = 'GPL v3'
+__copyright__ = '2010, Derek Liang <Derek.liang.ca @@@at@@@ gmail.com>'
+'''
+cnd.org
+'''
+
+import re
+
+from calibre.web.feeds.news import BasicNewsRecipe
+
+class TheCND(BasicNewsRecipe):
+
+    title = 'CND Weekly'
+    __author__ = 'Derek Liang'
+    description = ''
+    INDEX = 'http://cnd.org'
+    language = 'zh'
+    conversion_options = {'linearize_tables':True}
+
+    remove_tags_before = dict(name='div', id='articleHead')
+    remove_tags_after  = dict(id='copyright')
+    remove_tags = [dict(name='table', attrs={'align':'right'}), dict(name='img', attrs={'src':'http://my.cnd.org/images/logo.gif'}), dict(name='hr', attrs={}), dict(name='small', attrs={})]
+    no_stylesheets = True
+
+    preprocess_regexps = [ (re.compile(r'<!--.*?-->', re.DOTALL), lambda m: ''),
+                           (re.compile('<table width.*?</table>', re.DOTALL), lambda m: ''),
+                         ]
+
+    def print_version(self, url):
+        if url.find('news/article.php') >= 0:
+            return re.sub("^[^=]*", "http://my.cnd.org/modules/news/print.php?storyid", url)
+        else:
+            return re.sub("^[^=]*", "http://my.cnd.org/modules/wfsection/print.php?articleid", url)
+
+    def parse_index(self):
+        soup = self.index_to_soup(self.INDEX)
+
+        feeds = []
+        articles = {}
+
+        for a in soup.findAll('a', attrs={'target':'_cnd'}):
+            url = a['href']
+            if url.find('article.php') < 0 :
+                continue
+            if url.startswith('/'):
+                url = 'http://cnd.org'+url
+            title = self.tag_to_string(a)
+            date = a.nextSibling
+            if not re.search('cm', date):
+                continue
+            self.log('\tFound article: ', title, 'at', url, '@', date)
+            if (date is not None) and len(date)>2:
+                if not articles.has_key(date):
+                    articles[date] = []
+                articles[date].append({'title':title, 'url':url, 'description': '', 'date':''})
+                self.log('\t\tAppend to : ', date)
+
+        sorted_articles = sorted(articles)
+        while sorted_articles:
+            mostCurrent = sorted_articles.pop()
+            self.title = 'CND ' + mostCurrent
+            feeds.append((self.title, articles[mostCurrent]))
+
+        return feeds
+
+    def populate_article_metadata(self, article, soup, first):
+        header = soup.find('h3')
+        self.log('header: ' + self.tag_to_string(header))
+        pass
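The only functional difference from the daily CND recipe above is the date filter: this weekly variant keeps only the entries whose date marker matches 'cm', while the daily recipe was just changed to skip exactly those. A tiny illustration (the sample date strings are made up, and what 'cm' denotes on cnd.org is an assumption here):

    import re

    for date in ('12/16/11', '12/16/11 cm'):
        # mirrors the re.search('cm', date) test used by both recipes
        print date, '->', 'weekly digest' if re.search('cm', date) else 'daily'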


@@ -1,4 +1,4 @@
 # -*- coding: utf-8 -*-
 '''
 descopera.org
 '''


@@ -51,6 +51,13 @@ class AdvancedUserRecipe1287083651(BasicNewsRecipe):
                     {'class':['articleTools', 'pagination', 'Ads', 'topad',
                               'breadcrumbs', 'footerNav', 'footerUtil', 'downloadlinks']}]

+    def populate_article_metadata(self, article, soup, first):
+        if first and hasattr(self, 'add_toc_thumbnail'):
+            picdiv = soup.find('img')
+            if picdiv is not None:
+                self.add_toc_thumbnail(article,picdiv['src'])
+
     #Use the mobile version rather than the web version
     def print_version(self, url):
         return url.rpartition('?')[0] + '?service=mobile'


@@ -79,6 +79,12 @@ class Guardian(BasicNewsRecipe):
                 url = None
         return url

+    def populate_article_metadata(self, article, soup, first):
+        if first and hasattr(self, 'add_toc_thumbnail'):
+            picdiv = soup.find('img')
+            if picdiv is not None:
+                self.add_toc_thumbnail(article,picdiv['src'])
+
     def preprocess_html(self, soup):
         # multiple html sections in soup, useful stuff in the first


@@ -104,6 +104,13 @@ class TheIndependentNew(BasicNewsRecipe):
                 url = None
         return url

+    def populate_article_metadata(self, article, soup, first):
+        if first and hasattr(self, 'add_toc_thumbnail'):
+            picdiv = soup.find('img')
+            if picdiv is not None:
+                self.add_toc_thumbnail(article,picdiv['src'])
+
     def preprocess_html(self, soup):
         #remove 'advertorial articles'


@@ -1,13 +1,12 @@
 __license__   = 'GPL v3'
 __author__    = 'Lorenzo Vigentini, based on Darko Miletic, Gabriele Marini'
 __copyright__ = '2009-2011, Darko Miletic <darko.miletic at gmail.com>, Lorenzo Vigentini <l.vigentini at gmail.com>'
-description   = 'Italian daily newspaper - v1.01 (04, January 2010); 16.05.2010 new version; 17.10.2011 new version'
+description   = 'Italian daily newspaper - v1.01 (04, January 2010); 16.05.2010 new version; 17.10.2011 new version; 14.12.2011 new version'

 '''
 http://www.repubblica.it/
 '''

-import re
 from calibre.ptempfile import PersistentTemporaryFile
 from calibre.web.feeds.news import BasicNewsRecipe

@@ -33,12 +32,6 @@ class LaRepubblica(BasicNewsRecipe):
     remove_attributes = ['width','height','lang','xmlns:og','xmlns:fb']

-    preprocess_regexps = [
-        (re.compile(r'.*?<head>', re.DOTALL|re.IGNORECASE), lambda match: '<head>'),
-        (re.compile(r'<head>.*?<title>', re.DOTALL|re.IGNORECASE), lambda match: '<head><title>'),
-        (re.compile(r'</title>.*?</head>', re.DOTALL|re.IGNORECASE), lambda match: '</title></head>')
-    ]
-
     def get_article_url(self, article):
         link = BasicNewsRecipe.get_article_url(self, article)
         if link and not '.repubblica.it/' in link:

@@ -73,15 +66,15 @@ class LaRepubblica(BasicNewsRecipe):
     remove_tags = [
                    dict(name=['object','link','meta','iframe','embed']),
                    dict(name='span',attrs={'class':'linkindice'}),
-                   dict(name='div', attrs={'class':'bottom-mobile'}),
-                   dict(name='div', attrs={'id':['rssdiv','blocco']}),
-                   dict(name='div', attrs={'class':'utility'}),
+                   dict(name='div', attrs={'class':['bottom-mobile','adv adv-middle-inline']}),
+                   dict(name='div', attrs={'id':['rssdiv','blocco','fb-like-head']}),
+                   dict(name='div', attrs={'class':['utility','fb-like-button','archive-button']}),
                    dict(name='div', attrs={'class':'generalbox'}),
                    dict(name='ul', attrs={'id':'hystory'})
                   ]

     feeds = [
-             (u'Rilievo', u'http://www.repubblica.it/rss/homepage/rss2.0.xml'),
+             (u'Homepage', u'http://www.repubblica.it/rss/homepage/rss2.0.xml'),
              (u'Cronaca', u'http://www.repubblica.it/rss/cronaca/rss2.0.xml'),
              (u'Esteri', u'http://www.repubblica.it/rss/esteri/rss2.0.xml'),
              (u'Economia', u'http://www.repubblica.it/rss/economia/rss2.0.xml'),

@@ -110,3 +103,5 @@ class LaRepubblica(BasicNewsRecipe):
                 del item['style']
         return soup
+
+    def preprocess_raw_html(self, raw, url):
+        return '<html><head>'+raw[raw.find('</head>'):]
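The new preprocess_raw_html replaces the three head-cleaning regexps removed above: rather than patching up the messy markup around <title> piecemeal, it discards everything up to the closing </head>. A quick demonstration of the effect (sample markup invented; note the title is thrown away too):

    raw = '<html><head><junk/><title>t</title></head><body>x</body></html>'
    print '<html><head>' + raw[raw.find('</head>'):]
    # -> <html><head></head><body>x</body></html>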


@@ -1,5 +1,5 @@
 #!/usr/bin/env python
+# -*- coding: utf-8 -*-
 __license__ = 'GPL v3'
 __copyright__ = '2008, Kovid Goyal <kovid at kovidgoyal.net>'
 '''

@@ -707,6 +707,16 @@ class NYTimes(BasicNewsRecipe):
         return soup

     def populate_article_metadata(self, article, soup, first):
+        if first and hasattr(self, 'add_toc_thumbnail'):
+            idxdiv = soup.find('div',attrs={'class':'articleSpanImage'})
+            if idxdiv is not None:
+                if idxdiv.img:
+                    self.add_toc_thumbnail(article, idxdiv.img['src'])
+            else:
+                img = soup.find('img')
+                if img is not None:
+                    self.add_toc_thumbnail(article, img['src'])
         shortparagraph = ""
         try:
             if len(article.text_summary.strip()) == 0:


@@ -855,6 +855,16 @@ class NYTimes(BasicNewsRecipe):
         return soup

     def populate_article_metadata(self, article, soup, first):
+        if first and hasattr(self, 'add_toc_thumbnail'):
+            idxdiv = soup.find('div',attrs={'class':'articleSpanImage'})
+            if idxdiv is not None:
+                if idxdiv.img:
+                    self.add_toc_thumbnail(article, idxdiv.img['src'])
+            else:
+                img = soup.find('img')
+                if img is not None:
+                    self.add_toc_thumbnail(article, img['src'])
         shortparagraph = ""
         try:
             if len(article.text_summary.strip()) == 0:


@@ -12,39 +12,39 @@ class Sueddeutsche(BasicNewsRecipe):
     title = u'sueddeutsche.de'
     description = 'News from Germany'
-    __author__ = 'Oliver Niesner and Armin Geller' #AGe 2011-11-25
+    __author__ = 'Oliver Niesner and Armin Geller' #AGe 2011-12-16
     use_embedded_content = False
     timefmt = ' [%d %b %Y]'
-    oldest_article = 7
-    max_articles_per_feed = 50
+    oldest_article = 1#7
+    max_articles_per_feed = 2#50
     no_stylesheets = True
     language = 'de'
+    auto_cleanup = True
     encoding = 'utf-8'
     remove_javascript = True
-    cover_url = 'http://polpix.sueddeutsche.com/polopoly_fs/1.1219199.1322239289!/image/image.jpg_gen/derivatives/860x860/image.jpg' # 2011-11-25 AGe
+    cover_url = 'http://polpix.sueddeutsche.com/polopoly_fs/1.1236175.1323967473!/image/image.jpg_gen/derivatives/860x860/image.jpg' # 2011-12-16 AGe

-    remove_tags = [ dict(name='link'), dict(name='iframe'),
-                    dict(name='div', attrs={'id':["bookmarking","themenbox","artikelfoot","CAD_AD",
-                                                  "SKY_AD","NT1_AD","navbar1","sdesiteheader"]}),
-                    dict(name='div', attrs={'class':["similar-article-box","artikelliste","nteaser301bg",
-                                                     "pages closed","basebox right narrow","headslot galleried"]}),
-                    dict(name='div', attrs={'class':["articleDistractor","listHeader","listHeader2","hr2",
-                                                     "item","videoBigButton","articlefooter full-column",
-                                                     "bildbanderolle full-column","footerCopy padleft5"]}),
-                    dict(name='p', attrs={'class':["ressortartikeln","artikelFliestext","entry-summary"]}),
-                    dict(name='div', attrs={'style':["position:relative;"]}),
-                    dict(name='span', attrs={'class':["nlinkheaderteaserschwarz","artikelLink","r10000000"]}),
-                    dict(name='table', attrs={'class':["stoerBS","kommentare","footer","pageBoxBot","pageAktiv","bgcontent"]}),
-                    dict(name='ul', attrs={'class':["breadcrumb","articles","activities","sitenav","actions"]}),
-                    dict(name='td', attrs={'class':["artikelDruckenRight"]}),
-                    dict(name='p', text = "ANZEIGE")
-                  ]
-    remove_tags_after = [dict(name='div', attrs={'class':["themenbox full-column"]})]
+#   remove_tags = [ dict(name='link'), dict(name='iframe'),
+#                   dict(name='div', attrs={'id':["bookmarking","themenbox","artikelfoot","CAD_AD",
+#                                                 "SKY_AD","NT1_AD","navbar1","sdesiteheader"]}),
+#
+#                   dict(name='div', attrs={'class':["similar-article-box","artikelliste","nteaser301bg",
+#                                                    "pages closed","basebox right narrow","headslot galleried"]}),
+#
+#                   dict(name='div', attrs={'class':["articleDistractor","listHeader","listHeader2","hr2",
+#                                                    "item","videoBigButton","articlefooter full-column",
+#                                                    "bildbanderolle full-column","footerCopy padleft5"]}),
+#
+#                   dict(name='p', attrs={'class':["ressortartikeln","artikelFliestext","entry-summary"]}),
+#                   dict(name='div', attrs={'style':["position:relative;"]}),
+#                   dict(name='span', attrs={'class':["nlinkheaderteaserschwarz","artikelLink","r10000000"]}),
+#                   dict(name='table', attrs={'class':["stoerBS","kommentare","footer","pageBoxBot","pageAktiv","bgcontent"]}),
+#                   dict(name='ul', attrs={'class':["breadcrumb","articles","activities","sitenav","actions"]}),
+#                   dict(name='td', attrs={'class':["artikelDruckenRight"]}),
+#                   dict(name='p', text = "ANZEIGE")
+#                 ]
+#   remove_tags_after = [dict(name='div', attrs={'class':["themenbox full-column"]})]
+#
     extra_css = '''
         h2{font-family:Arial,Helvetica,sans-serif; font-size: x-small; color: #003399;}
         a{font-family:Arial,Helvetica,sans-serif; font-style:italic;}

@@ -53,30 +53,45 @@ class Sueddeutsche(BasicNewsRecipe):
         .artikelTeaser{font-family:Arial,Helvetica,sans-serif; font-size: x-small; font-weight:bold; }
         body{font-family:Arial,Helvetica,sans-serif; }
         .photo {font-family:Arial,Helvetica,sans-serif; font-size: x-small; color: #666666;} '''
+#
     feeds = [
-             (u'Politik', u'http://suche.sueddeutsche.de/query/%23/sort/-docdatetime/drilldown/%C2%A7ressort%3A%5EPolitik%24?output=rss'),
-             (u'Wirtschaft', u'http://suche.sueddeutsche.de/query/%23/sort/-docdatetime/drilldown/%C2%A7ressort%3A%5EWirtschaft%24?output=rss'),
-             (u'Geld', u'http://suche.sueddeutsche.de/query/%23/sort/-docdatetime/drilldown/%C2%A7ressort%3A%5EGeld%24?output=rss'),
-             (u'Kultur', u'http://suche.sueddeutsche.de/query/%23/sort/-docdatetime/drilldown/%C2%A7ressort%3A%5EKultur%24?output=rss'),
-             (u'Sport', u'http://suche.sueddeutsche.de/query/%23/sort/-docdatetime/drilldown/%C2%A7ressort%3A%5ESport%24?output=rss'),
-             (u'Leben', u'http://suche.sueddeutsche.de/query/%23/sort/-docdatetime/drilldown/%C2%A7ressort%3A%5ELeben%24?output=rss'),
-             (u'Karriere', u'http://suche.sueddeutsche.de/query/%23/sort/-docdatetime/drilldown/%C2%A7ressort%3A%5EKarriere%24?output=rss'),
-             (u'München & Region', u'http://www.sueddeutsche.de/app/service/rss/ressort/muenchen/rss.xml'), # AGe 2011-11-13
-             (u'Bayern', u'http://suche.sueddeutsche.de/query/%23/sort/-docdatetime/drilldown/%C2%A7ressort%3A%5EBayern%24?output=rss'),
-             (u'Medien', u'http://suche.sueddeutsche.de/query/%23/sort/-docdatetime/drilldown/%C2%A7ressort%3A%5EMedien%24?output=rss'),
-             (u'Digital', u'http://suche.sueddeutsche.de/query/%23/sort/-docdatetime/drilldown/%C2%A7ressort%3A%5EDigital%24?output=rss'),
-             (u'Auto', u'http://suche.sueddeutsche.de/query/%23/sort/-docdatetime/drilldown/%C2%A7ressort%3A%5EAuto%24?output=rss'),
-             (u'Wissen', u'http://suche.sueddeutsche.de/query/%23/sort/-docdatetime/drilldown/%C2%A7ressort%3A%5EWissen%24?output=rss'),
-             (u'Panorama', u'http://suche.sueddeutsche.de/query/%23/sort/-docdatetime/drilldown/%C2%A7ressort%3A%5EPanorama%24?output=rss'),
-             (u'Reise', u'http://suche.sueddeutsche.de/query/%23/sort/-docdatetime/drilldown/%C2%A7ressort%3A%5EReise%24?output=rss'),
-             (u'Technik', u'http://suche.sueddeutsche.de/query/%23/sort/-docdatetime/drilldown/%C2%A7ressort%3A%5ETechnik%24?output=rss'), # sometimes only
-             (u'Macht', u'http://suche.sueddeutsche.de/query/%23/sort/-docdatetime/drilldown/%C2%A7ressort%3A%5EMacht%24?output=rss'), # sometimes only
-             (u'Job', u'http://suche.sueddeutsche.de/query/%23/sort/-docdatetime/drilldown/%C2%A7ressort%3A%5EJob%24?output=rss'), # sometimes only
-             (u'Service', u'http://suche.sueddeutsche.de/query/%23/sort/-docdatetime/drilldown/%C2%A7ressort%3A%5EService%24?output=rss'), # sometimes only
-             (u'Verlag', u'http://suche.sueddeutsche.de/query/%23/sort/-docdatetime/drilldown/%C2%A7ressort%3A%5EVerlag%24?output=rss'), # sometimes only
+#            (u'Politik', u'http://suche.sueddeutsche.de/query/%23/sort/-docdatetime/drilldown/%C2%A7ressort%3A%5EPolitik%24?output=rss'), #AGe 2011-12-16 deactivated
+#            (u'Wirtschaft', u'http://suche.sueddeutsche.de/query/%23/sort/-docdatetime/drilldown/%C2%A7ressort%3A%5EWirtschaft%24?output=rss'), #AGe 2011-12-16 deactivated
+#            (u'Geld', u'http://suche.sueddeutsche.de/query/%23/sort/-docdatetime/drilldown/%C2%A7ressort%3A%5EGeld%24?output=rss'), #AGe 2011-12-16 deactivated
+#            (u'Kultur', u'http://suche.sueddeutsche.de/query/%23/sort/-docdatetime/drilldown/%C2%A7ressort%3A%5EKultur%24?output=rss'), #AGe 2011-12-16 deactivated
+#            (u'Sport', u'http://suche.sueddeutsche.de/query/%23/sort/-docdatetime/drilldown/%C2%A7ressort%3A%5ESport%24?output=rss'), #AGe 2011-12-16 deactivated
+#            (u'Leben', u'http://suche.sueddeutsche.de/query/%23/sort/-docdatetime/drilldown/%C2%A7ressort%3A%5ELeben%24?output=rss'), #AGe 2011-12-16 deactivated
+#            (u'Karriere', u'http://suche.sueddeutsche.de/query/%23/sort/-docdatetime/drilldown/%C2%A7ressort%3A%5EKarriere%24?output=rss'), #AGe 2011-12-16 deactivated
+#            (u'München & Region', u'http://www.sueddeutsche.de/app/service/rss/ressort/muenchen/rss.xml'), # AGe 2011-11-13
+#            (u'Bayern', u'http://suche.sueddeutsche.de/query/%23/sort/-docdatetime/drilldown/%C2%A7ressort%3A%5EBayern%24?output=rss'), #AGe 2011-12-16 deactivated
+#            (u'Medien', u'http://suche.sueddeutsche.de/query/%23/sort/-docdatetime/drilldown/%C2%A7ressort%3A%5EMedien%24?output=rss'), #AGe 2011-12-16 deactivated
+#            (u'Digital', u'http://suche.sueddeutsche.de/query/%23/sort/-docdatetime/drilldown/%C2%A7ressort%3A%5EDigital%24?output=rss'), #AGe 2011-12-16 deactivated
+#            (u'Auto', u'http://suche.sueddeutsche.de/query/%23/sort/-docdatetime/drilldown/%C2%A7ressort%3A%5EAuto%24?output=rss'), #AGe 2011-12-16 deactivated
+#            (u'Wissen', u'http://suche.sueddeutsche.de/query/%23/sort/-docdatetime/drilldown/%C2%A7ressort%3A%5EWissen%24?output=rss'), #AGe 2011-12-16 deactivated
+#            (u'Panorama', u'http://suche.sueddeutsche.de/query/%23/sort/-docdatetime/drilldown/%C2%A7ressort%3A%5EPanorama%24?output=rss'), #AGe 2011-12-16 deactivated
+#            (u'Reise', u'http://suche.sueddeutsche.de/query/%23/sort/-docdatetime/drilldown/%C2%A7ressort%3A%5EReise%24?output=rss'), #AGe 2011-12-16 deactivated
+#            (u'Technik', u'http://suche.sueddeutsche.de/query/%23/sort/-docdatetime/drilldown/%C2%A7ressort%3A%5ETechnik%24?output=rss'), # sometimes only #AGe 2011-12-16 deactivated
+#            (u'Macht', u'http://suche.sueddeutsche.de/query/%23/sort/-docdatetime/drilldown/%C2%A7ressort%3A%5EMacht%24?output=rss'), # sometimes only #AGe 2011-12-16 deactivated
+#            (u'Job', u'http://suche.sueddeutsche.de/query/%23/sort/-docdatetime/drilldown/%C2%A7ressort%3A%5EJob%24?output=rss'), # sometimes only #AGe 2011-12-16 deactivated
+#            (u'Service', u'http://suche.sueddeutsche.de/query/%23/sort/-docdatetime/drilldown/%C2%A7ressort%3A%5EService%24?output=rss'), # sometimes only #AGe 2011-12-16 deactivated
+#            (u'Verlag', u'http://suche.sueddeutsche.de/query/%23/sort/-docdatetime/drilldown/%C2%A7ressort%3A%5EVerlag%24?output=rss'), # sometimes only #AGe 2011-12-16 deactivated
+             (u'Politik', u'http://www.sueddeutsche.de/app/service/rss/ressort/politik/rss.xml'),
+             (u'Wirtschaft', u'http://www.sueddeutsche.de/app/service/rss/ressort/wirtschaft/rss.xml'),
+             (u'Geld', u'http://www.sueddeutsche.de/app/service/rss/ressort/finanzen/rss.xml'),
+             (u'Kultur', u'http://www.sueddeutsche.de/app/service/rss/ressort/kultur/rss.xml'),
+             (u'Sport', u'http://www.sueddeutsche.de/app/service/rss/ressort/sport/rss.xml'),
+             (u'Leben', u'http://www.sueddeutsche.de/app/service/rss/ressort/leben/rss.xml'),
+             (u'Karriere', u'http://www.sueddeutsche.de/app/service/rss/ressort/karriere/rss.xml'),
+             (u'München & Region', u'http://www.sueddeutsche.de/app/service/rss/ressort/muenchen/rss.xml'),
+             (u'Bayern', u'http://www.sueddeutsche.de/app/service/rss/ressort/bayern/rss.xml'),
+             (u'Medien', u'http://www.sueddeutsche.de/app/service/rss/ressort/medien/rss.xml'),
+             (u'Digital', u'http://www.sueddeutsche.de/app/service/rss/ressort/computerwissen/rss.xml'),
+             (u'Auto', u'http://www.sueddeutsche.de/app/service/rss/ressort/autoreise/rss.xml'),
+             (u'Wissen', u'http://www.sueddeutsche.de/app/service/rss/ressort/wissen/rss.xml'),
+             (u'Panorama', u'http://www.sueddeutsche.de/app/service/rss/ressort/panorama/rss.xml'),
+             (u'Reise', u'http://www.sueddeutsche.de/app/service/rss/ressort/reise/rss.xml'),
            ]

-    def print_version(self, url):
-        main, sep, id = url.rpartition('/')
-        return main + '/2.220/' + id
+#   def print_version(self, url):                  #AGe 2011-12-16 deactivated
+#       main, sep, id = url.rpartition('/')        #AGe 2011-12-16 deactivated
+#       return main + '/2.220/' + id               #AGe 2011-12-16 deactivated


@@ -59,6 +59,11 @@ class TelegraphUK(BasicNewsRecipe):
             ,(u'Travel', u'http://www.telegraph.co.uk/travel/rss' )
             ,(u'How about that?', u'http://www.telegraph.co.uk/news/newstopics/howaboutthat/rss' )
            ]

+    def populate_article_metadata(self, article, soup, first):
+        if first and hasattr(self, 'add_toc_thumbnail'):
+            picdiv = soup.find('img')
+            if picdiv is not None:
+                self.add_toc_thumbnail(article,picdiv['src'])
+
     def get_article_url(self, article):
         url = article.get('link', None)


@@ -57,6 +57,12 @@ class WallStreetJournal(BasicNewsRecipe):
                 'username and password')
         return br

+    def populate_article_metadata(self, article, soup, first):
+        if first and hasattr(self, 'add_toc_thumbnail'):
+            picdiv = soup.find('img')
+            if picdiv is not None:
+                self.add_toc_thumbnail(article,picdiv['src'])
+
     def postprocess_html(self, soup, first):
         for tag in soup.findAll(name=['table', 'tr', 'td']):
             tag.name = 'div'


@@ -44,6 +44,12 @@ class WallStreetJournal(BasicNewsRecipe):
         ]
     remove_tags_after = [dict(id="article_story_body"), {'class':"article story"},]

+    def populate_article_metadata(self, article, soup, first):
+        if first and hasattr(self, 'add_toc_thumbnail'):
+            picdiv = soup.find('img')
+            if picdiv is not None:
+                self.add_toc_thumbnail(article,picdiv['src'])
+
     def postprocess_html(self, soup, first):
         for tag in soup.findAll(name=['table', 'tr', 'td']):
             tag.name = 'div'


@@ -1,5 +1,5 @@
 " Project wide builtins
-let g:pyflakes_builtins = ["_", "dynamic_property", "__", "P", "I", "lopen", "icu_lower", "icu_upper", "icu_title", "ngettext"]
+let $PYFLAKES_BUILTINS = "_,dynamic_property,__,P,I,lopen,icu_lower,icu_upper,icu_title,ngettext"

 python << EOFPY
 import os, sys


@@ -11,7 +11,7 @@ __all__ = [
         'build', 'build_pdf2xml', 'server',
         'gui',
         'develop', 'install',
-        'kakasi', 'resources',
+        'kakasi', 'coffee', 'resources',
        'check',
        'sdist',
        'manual', 'tag_release',

@@ -49,9 +49,10 @@ gui = GUI()
 from setup.check import Check
 check = Check()

-from setup.resources import Resources, Kakasi
+from setup.resources import Resources, Kakasi, Coffee
 resources = Resources()
 kakasi = Kakasi()
+coffee = Coffee()

 from setup.publish import Manual, TagRelease, Stage1, Stage2, \
         Stage3, Stage4, Stage5, Publish


@@ -12,14 +12,14 @@ msgstr ""
 "Report-Msgid-Bugs-To: Debian iso-codes team <pkg-isocodes-"
 "devel@lists.alioth.debian.org>\n"
 "POT-Creation-Date: 2011-11-25 14:01+0000\n"
-"PO-Revision-Date: 2011-11-22 16:45+0000\n"
+"PO-Revision-Date: 2011-12-14 19:48+0000\n"
 "Last-Translator: Ferran Rius <frius64@hotmail.com>\n"
 "Language-Team: Catalan <linux@softcatala.org>\n"
 "MIME-Version: 1.0\n"
 "Content-Type: text/plain; charset=UTF-8\n"
 "Content-Transfer-Encoding: 8bit\n"
-"X-Launchpad-Export-Date: 2011-11-26 05:10+0000\n"
-"X-Generator: Launchpad (build 14381)\n"
+"X-Launchpad-Export-Date: 2011-12-15 05:18+0000\n"
+"X-Generator: Launchpad (build 14487)\n"
 "Language: ca\n"

 #. name for aaa
@@ -9348,7 +9348,7 @@ msgstr "Seit-Kaitetu"

 #. name for hil
 msgid "Hiligaynon"
-msgstr ""
+msgstr "Hiligainon"

 #. name for hin
 msgid "Hindi"

@@ -9356,39 +9356,39 @@ msgstr "Hindi"

 #. name for hio
 msgid "Tsoa"
-msgstr ""
+msgstr "Tsoa"

 #. name for hir
 msgid "Himarimã"
-msgstr ""
+msgstr "Himarimà"

 #. name for hit
 msgid "Hittite"
-msgstr ""
+msgstr "Hittita"

 #. name for hiw
 msgid "Hiw"
-msgstr ""
+msgstr "Hiw"

 #. name for hix
 msgid "Hixkaryána"
-msgstr ""
+msgstr "Hishkaryana"

 #. name for hji
 msgid "Haji"
-msgstr ""
+msgstr "Aji"

 #. name for hka
 msgid "Kahe"
-msgstr ""
+msgstr "Kahe"

 #. name for hke
 msgid "Hunde"
-msgstr ""
+msgstr "Hunde"

 #. name for hkk
 msgid "Hunjara-Kaina Ke"
-msgstr ""
+msgstr "Hunjara"

 #. name for hks
 msgid "Hong Kong Sign Language"

@@ -9396,27 +9396,27 @@ msgstr "Llenguatge de signes de Hong Kong"

 #. name for hla
 msgid "Halia"
-msgstr ""
+msgstr "Halia"

 #. name for hlb
 msgid "Halbi"
-msgstr ""
+msgstr "Halbi"

 #. name for hld
 msgid "Halang Doan"
-msgstr ""
+msgstr "Halang Doan"

 #. name for hle
 msgid "Hlersu"
-msgstr ""
+msgstr "Sansu"

 #. name for hlt
 msgid "Nga La"
-msgstr ""
+msgstr "Nga La"

 #. name for hlu
 msgid "Luwian; Hieroglyphic"
-msgstr ""
+msgstr "Luvi; jeroglífic"

 #. name for hma
 msgid "Miao; Southern Mashan"

@@ -9424,7 +9424,7 @@ msgstr "Miao; Mashan meridional"

 #. name for hmb
 msgid "Songhay; Humburi Senni"
-msgstr ""
+msgstr "Songhai; central"

 #. name for hmc
 msgid "Miao; Central Huishui"

@@ -9440,11 +9440,11 @@ msgstr "Miao; Huishui oriental"

 #. name for hmf
 msgid "Hmong Don"
-msgstr ""
+msgstr "Miao; Don"

 #. name for hmg
 msgid "Hmong; Southwestern Guiyang"
-msgstr ""
+msgstr "Miao; Guiyang sudoccidental"

 #. name for hmh
 msgid "Miao; Southwestern Huishui"

@@ -9456,11 +9456,11 @@ msgstr "Miao; Huishui septentrional"

 #. name for hmj
 msgid "Ge"
-msgstr ""
+msgstr "Ge"

 #. name for hmk
 msgid "Maek"
-msgstr ""
+msgstr "Maek"

 #. name for hml
 msgid "Miao; Luopohe"

@@ -9472,11 +9472,11 @@ msgstr "Miao; Mashan central"

 #. name for hmn
 msgid "Hmong"
-msgstr ""
+msgstr "Hmong (macrollengua)"

 #. name for hmo
 msgid "Hiri Motu"
-msgstr ""
+msgstr "Hiri Motu"

 #. name for hmp
 msgid "Miao; Northern Mashan"

@@ -9488,7 +9488,7 @@ msgstr "Miao; Qiandong oriental"

 #. name for hmr
 msgid "Hmar"
-msgstr ""
+msgstr "Hmar"

 #. name for hms
 msgid "Miao; Southern Qiandong"

@@ -9496,15 +9496,15 @@ msgstr "Miao; Qiandong meridional"

 #. name for hmt
 msgid "Hamtai"
-msgstr ""
+msgstr "Hamtai"

 #. name for hmu
 msgid "Hamap"
-msgstr ""
+msgstr "Hamap"

 #. name for hmv
 msgid "Hmong Dô"
-msgstr ""
+msgstr "Miao; Do"

 #. name for hmw
 msgid "Miao; Western Mashan"

@@ -9520,19 +9520,19 @@ msgstr "Miao; Shua"

 #. name for hna
 msgid "Mina (Cameroon)"
-msgstr ""
+msgstr "Mina (Camerun)"

 #. name for hnd
 msgid "Hindko; Southern"
-msgstr ""
+msgstr "Hindko; meridional"

 #. name for hne
 msgid "Chhattisgarhi"
-msgstr ""
+msgstr "Chattisgarbi"

 #. name for hnh
 msgid "//Ani"
-msgstr ""
+msgstr "Ani"

 #. name for hni
 msgid "Hani"

@@ -9540,7 +9540,7 @@ msgstr ""

 #. name for hnj
 msgid "Hmong Njua"
-msgstr ""
+msgstr "Miao; Hmong Njua"

 #. name for hnn
 msgid "Hanunoo"

@@ -9548,7 +9548,7 @@ msgstr ""

 #. name for hno
 msgid "Hindko; Northern"
-msgstr ""
+msgstr "Hindko; septentrional"

 #. name for hns
 msgid "Hindustani; Caribbean"

@@ -11800,7 +11800,7 @@ msgstr ""

 #. name for khq
 msgid "Songhay; Koyra Chiini"
-msgstr ""
+msgstr "Songhai; Koyra"

 #. name for khr
 msgid "Kharia"

@@ -17288,7 +17288,7 @@ msgstr ""

 #. name for mww
 msgid "Hmong Daw"
-msgstr ""
+msgstr "Miao; blanc"

 #. name for mwx
 msgid "Mediak"

@@ -28680,7 +28680,7 @@ msgstr ""

 #. name for xlu
 msgid "Luwian; Cuneiform"
-msgstr ""
+msgstr "Luvi; cuneïforme"

 #. name for xly
 msgid "Elymian"


@@ -6,7 +6,7 @@ __license__ = 'GPL v3'
 __copyright__ = '2009, Kovid Goyal <kovid@kovidgoyal.net>'
 __docformat__ = 'restructuredtext en'

-import os, cPickle, re, shutil, marshal, zipfile, glob
+import os, cPickle, re, shutil, marshal, zipfile, glob, subprocess, time
 from zlib import compress

 from setup import Command, basenames, __appname__

@@ -23,7 +23,70 @@ def get_opts_from_parser(parser):
         for o in g.option_list:
             for x in do_opt(o): yield x

-class Kakasi(Command):
+class Coffee(Command): # {{{
+
+    description = 'Compile coffeescript files into javascript'
+
+    COFFEE_DIRS = {'ebooks/oeb/display': 'display'}
+
+    def add_options(self, parser):
+        parser.add_option('--watch', '-w', action='store_true', default=False,
+                help='Autocompile when .coffee files are changed')
+        parser.add_option('--show-js', action='store_true', default=False,
+                help='Display the generated javascript')
+
+    def run(self, opts):
+        self.do_coffee_compile(opts)
+        if opts.watch:
+            try:
+                while True:
+                    time.sleep(0.5)
+                    self.do_coffee_compile(opts, timestamp=True,
+                            ignore_errors=True)
+            except KeyboardInterrupt:
+                pass
+
+    def show_js(self, jsfile):
+        from pygments.lexers import JavascriptLexer
+        from pygments.formatters import TerminalFormatter
+        from pygments import highlight
+        with open(jsfile, 'rb') as f:
+            raw = f.read()
+        print highlight(raw, JavascriptLexer(), TerminalFormatter())
+
+    def do_coffee_compile(self, opts, timestamp=False, ignore_errors=False):
+        for toplevel, dest in self.COFFEE_DIRS.iteritems():
+            dest = self.j(self.RESOURCES, dest)
+            for x in glob.glob(self.j(self.SRC, __appname__, toplevel, '*.coffee')):
+                js = self.j(dest, os.path.basename(x.rpartition('.')[0]+'.js'))
+                if self.newer(js, x):
+                    print ('\t%sCompiling %s'%(time.strftime('[%H:%M:%S] ') if
+                        timestamp else '', os.path.basename(x)))
+                    try:
+                        subprocess.check_call(['coffee', '-c', '-o', dest, x])
+                    except:
+                        print ('\n\tCompilation of %s failed'%os.path.basename(x))
+                        if ignore_errors:
+                            with open(js, 'wb') as f:
+                                f.write('# Compilation from coffeescript failed')
+                        else:
+                            raise SystemExit(1)
+                else:
+                    if opts.show_js:
+                        self.show_js(js)
+                        print ('#'*80)
+                        print ('#'*80)
+
+    def clean(self):
+        for toplevel, dest in self.COFFEE_DIRS.iteritems():
+            dest = self.j(self.RESOURCES, dest)
+            for x in glob.glob(self.j(self.SRC, __appname__, toplevel, '*.coffee')):
+                x = x.rpartition('.')[0] + '.js'
+                x = self.j(dest, os.path.basename(x))
+                if os.path.exists(x):
+                    os.remove(x)
+# }}}
+
+class Kakasi(Command): # {{{

     description = 'Compile resources for unihandecode'

@@ -62,9 +125,6 @@ class Kakasi(Command):
         self.info('\tGenerating kanadict')
         self.mkkanadict(src, dest)

-        return
-
     def mkitaiji(self, src, dst):
         dic = {}
         for line in open(src, "r"):

@@ -125,11 +185,12 @@ class Kakasi(Command):
         kakasi = self.j(self.RESOURCES, 'localization', 'pykakasi')
         if os.path.exists(kakasi):
             shutil.rmtree(kakasi)
+# }}}

-class Resources(Command):
+class Resources(Command): # {{{

     description = 'Compile various needed calibre resources'
-    sub_commands = ['kakasi']
+    sub_commands = ['kakasi', 'coffee']

     def run(self, opts):
         scripts = {}

@@ -223,13 +284,13 @@ class Resources(Command):
             x = self.j(self.RESOURCES, x+'.pickle')
             if os.path.exists(x):
                 os.remove(x)
-        from setup.commands import kakasi
+        from setup.commands import kakasi, coffee
         kakasi.clean()
+        coffee.clean()
         for x in ('builtin_recipes.xml', 'builtin_recipes.zip',
                 'template-functions.json'):
             x = self.j(self.RESOURCES, x)
             if os.path.exists(x):
                 os.remove(x)
+# }}}
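At its core the new command is an ordinary compile-if-newer loop. A standalone sketch of that logic, stripped of the setup.Command plumbing (the directory arguments and helper name are made up, and the 'coffee' binary from the CoffeeScript package is assumed to be on PATH):

    import os, glob, subprocess

    def compile_coffee(src_dir, dest_dir):
        for src in glob.glob(os.path.join(src_dir, '*.coffee')):
            js = os.path.join(dest_dir,
                    os.path.basename(src).rpartition('.')[0] + '.js')
            # recompile only when the .coffee source is newer than its .js output
            if not os.path.exists(js) or os.path.getmtime(src) > os.path.getmtime(js):
                subprocess.check_call(['coffee', '-c', '-o', dest_dir, src])

With the registration in setup/commands.py above, the command would be invoked as ./setup.py coffee, optionally with --watch to recompile whenever a .coffee file changes.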


@@ -215,32 +215,34 @@ class GetTranslations(Translations): # {{{
     description = 'Get updated translations from Launchpad'
     BRANCH = 'lp:~kovid/calibre/translations'

-    @classmethod
-    def modified_translations(cls):
-        raw = subprocess.Popen(['bzr', 'status'],
+    @property
+    def modified_translations(self):
+        raw = subprocess.Popen(['bzr', 'status', '-S', self.PATH],
                 stdout=subprocess.PIPE).stdout.read().strip()
+        ans = []
         for line in raw.splitlines():
             line = line.strip()
-            if line.startswith(cls.PATH) and line.endswith('.po'):
-                yield line
+            if line.startswith('M') and line.endswith('.po'):
+                ans.append(line.split()[-1])
+        return ans

     def run(self, opts):
-        if len(list(self.modified_translations())) == 0:
+        if not self.modified_translations:
             subprocess.check_call(['bzr', 'merge', self.BRANCH])
-            if len(list(self.modified_translations())) == 0:
-                print 'No updated translations available'
-            else:
-                subprocess.check_call(['bzr', 'commit', '-m',
-                    'IGN:Updated translations', self.PATH])
         self.check_for_errors()

-    @classmethod
-    def check_for_errors(cls):
+        if self.modified_translations:
+            subprocess.check_call(['bzr', 'commit', '-m',
+                'IGN:Updated translations', self.PATH])
+        else:
+            print('No updated translations available')
+
+    def check_for_errors(self):
         errors = os.path.join(tempfile.gettempdir(), 'calibre-translation-errors')
         if os.path.exists(errors):
             shutil.rmtree(errors)
         os.mkdir(errors)
-        pofilter = ('pofilter', '-i', cls.PATH, '-o', errors,
+        pofilter = ('pofilter', '-i', self.PATH, '-o', errors,
                 '-t', 'accelerators', '-t', 'escapes', '-t', 'variables',
                 #'-t', 'xmltags',
                 #'-t', 'brackets',

@@ -253,23 +255,20 @@ class GetTranslations(Translations): # {{{
                 '-t', 'printf')
         subprocess.check_call(pofilter)
         errfiles = glob.glob(errors+os.sep+'*.po')
-        subprocess.check_call(['gvim', '-f', '-p', '--']+errfiles)
-        for f in errfiles:
-            with open(f, 'r+b') as f:
-                raw = f.read()
-                raw = re.sub(r'# \(pofilter\).*', '', raw)
-                f.seek(0)
-                f.truncate()
-                f.write(raw)
-
-        subprocess.check_call(['pomerge', '-t', cls.PATH, '-i', errors, '-o',
-            cls.PATH])
-        if len(list(cls.modified_translations())) > 0:
-            subprocess.call(['bzr', 'diff', cls.PATH])
-            yes = raw_input('Merge corrections? [y/n]: ').strip()
-            if yes in ['', 'y']:
-                subprocess.check_call(['bzr', 'commit', '-m',
-                    'IGN:Translation corrections', cls.PATH])
+        if errfiles:
+            subprocess.check_call(['gvim', '-f', '-p', '--']+errfiles)
+            for f in errfiles:
+                with open(f, 'r+b') as f:
+                    raw = f.read()
+                    raw = re.sub(r'# \(pofilter\).*', '', raw)
+                    f.seek(0)
+                    f.truncate()
+                    f.write(raw)
+            subprocess.check_call(['pomerge', '-t', self.PATH, '-i', errors, '-o',
+                self.PATH])
+            return True
+        return False
 # }}}
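The refactored modified_translations leans on bzr's short status format, which prints a one-or-two-letter state code followed by the path. A quick sketch of the parsing it now does (the sample output is invented):

    raw = ' M  src/calibre/translations/ca.po\n?   scratch.txt'
    mods = [line.split()[-1] for line in raw.splitlines()
            if line.strip().startswith('M') and line.endswith('.po')]
    print mods   # -> ['src/calibre/translations/ca.po']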


@@ -558,11 +558,11 @@ xml_entity_to_unicode = partial(entity_to_unicode, result_exceptions = {
                 '>' : '&gt;',
                 '&' : '&amp;'})

-def replace_entities(raw):
-    return _ent_pat.sub(entity_to_unicode, raw)
+def replace_entities(raw, encoding='cp1252'):
+    return _ent_pat.sub(partial(entity_to_unicode, encoding=encoding), raw)

-def xml_replace_entities(raw):
-    return _ent_pat.sub(xml_entity_to_unicode, raw)
+def xml_replace_entities(raw, encoding='cp1252'):
+    return _ent_pat.sub(partial(xml_entity_to_unicode, encoding=encoding), raw)

 def prepare_string_for_xml(raw, attribute=False):
     raw = _ent_pat.sub(entity_to_unicode, raw)
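Why the encoding parameter matters: numeric character references in the 0x80-0x9f range are technically C1 control characters in Unicode, but real-world HTML almost always means the Windows-1252 glyphs at those positions. A self-contained sketch of the idea (this is an illustration, not calibre's actual entity_to_unicode):

    import re
    _num_ent = re.compile(r'&#(\d+);')

    def replace_numeric_entities(raw, encoding='cp1252'):
        def sub(m):
            num = int(m.group(1))
            if 0x80 <= num <= 0x9f:
                # interpret the C1 range as a code page byte instead
                return chr(num).decode(encoding)
            return unichr(num)
        return _num_ent.sub(sub, raw)

    print replace_numeric_entities(u'It&#146;s')  # -> It's, with a cp1252 right quote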


@@ -4,7 +4,7 @@ __license__ = 'GPL v3'
 __copyright__ = '2008, Kovid Goyal kovid@kovidgoyal.net'
 __docformat__ = 'restructuredtext en'
 __appname__   = u'calibre'
-numeric_version = (0, 8, 30)
+numeric_version = (0, 8, 31)
 __version__   = u'.'.join(map(unicode, numeric_version))
 __author__    = u"Kovid Goyal <kovid@kovidgoyal.net>"


@@ -173,8 +173,9 @@ class INVESBOOK(EB600):
     FORMATS = ['epub', 'mobi', 'prc', 'fb2', 'html', 'pdf', 'rtf', 'txt']
     BCD = [0x110, 0x323]

-    VENDOR_NAME = ['INVES_E6', 'INVES-WI']
-    WINDOWS_MAIN_MEM = WINDOWS_CARD_A_MEM = ['00INVES_E600', 'INVES-WIBOOK']
+    VENDOR_NAME = ['INVES_E6', 'INVES-WI', 'POCKETBO']
+    WINDOWS_MAIN_MEM = WINDOWS_CARD_A_MEM = ['00INVES_E600', 'INVES-WIBOOK',
+            'OK_POCKET_611_61']

 class BOOQ(EB600):
     name = 'Booq Device Interface'


@@ -30,7 +30,7 @@ BOOK_EXTENSIONS = ['lrf', 'rar', 'zip', 'rtf', 'lit', 'txt', 'txtz', 'text', 'ht
         'html', 'htmlz', 'xhtml', 'pdf', 'pdb', 'pdr', 'prc', 'mobi', 'azw', 'doc',
         'epub', 'fb2', 'djv', 'djvu', 'lrx', 'cbr', 'cbz', 'cbc', 'oebzip',
         'rb', 'imp', 'odt', 'chm', 'tpz', 'azw1', 'pml', 'pmlz', 'mbp', 'tan', 'snb',
-        'xps', 'oxps', 'azw4', 'book', 'zbf', 'pobi']
+        'xps', 'oxps', 'azw4', 'book', 'zbf', 'pobi', 'docx']

 class HTMLRenderer(object):


@@ -229,7 +229,10 @@ class EPUBOutput(OutputFormatPlugin):
         if opts.extract_to is not None:
             from calibre.utils.zipfile import ZipFile
             if os.path.exists(opts.extract_to):
-                shutil.rmtree(opts.extract_to)
+                if os.path.isdir(opts.extract_to):
+                    shutil.rmtree(opts.extract_to)
+                else:
+                    os.remove(opts.extract_to)
             os.mkdir(opts.extract_to)
             with ZipFile(output_path) as zf:
                 zf.extractall(path=opts.extract_to)


@@ -16,7 +16,8 @@ from lxml.html import tostring

 from calibre import as_unicode
 from calibre.ebooks.metadata import check_isbn
-from calibre.ebooks.metadata.sources.base import Source, Option
+from calibre.ebooks.metadata.sources.base import (Source, Option, fixcase,
+        fixauthors)
 from calibre.utils.cleantext import clean_ascii_chars
 from calibre.ebooks.chardet import xml_to_unicode
 from calibre.ebooks.metadata.book.base import Metadata

@@ -509,6 +510,15 @@ class Amazon(Source):

         return domain

+    def clean_downloaded_metadata(self, mi):
+        if mi.title and self.domain in ('com', 'uk'):
+            mi.title = fixcase(mi.title)
+        mi.authors = fixauthors(mi.authors)
+        if self.domain in ('com', 'uk'):
+            mi.tags = list(map(fixcase, mi.tags))
+        mi.isbn = check_isbn(mi.isbn)
+
     def create_query(self, log, title=None, authors=None, identifiers={}, # {{{
             domain=None):
         if domain is None:


@@ -31,7 +31,7 @@ class TOC(list):

     def __init__(self, href=None, fragment=None, text=None, parent=None, play_order=0,
                  base_path=os.getcwd(), type='unknown', author=None,
-                 description=None):
+                 description=None, toc_thumbnail=None):
         self.href = href
         self.fragment = fragment
         if not self.fragment:

@@ -43,6 +43,7 @@ class TOC(list):
         self.type = type
         self.author = author
         self.description = description
+        self.toc_thumbnail = toc_thumbnail

     def __str__(self):
         lines = ['TOC: %s#%s'%(self.href, self.fragment)]

@@ -72,12 +73,12 @@ class TOC(list):
             entry.parent = None

     def add_item(self, href, fragment, text, play_order=None, type='unknown',
-                 author=None, description=None):
+                 author=None, description=None, toc_thumbnail=None):
         if play_order is None:
             play_order = (self[-1].play_order if len(self) else self.play_order) + 1
         self.append(TOC(href=href, fragment=fragment, text=text, parent=self,
                         base_path=self.base_path, play_order=play_order,
-                        type=type, author=author, description=description))
+                        type=type, author=author, description=description, toc_thumbnail=toc_thumbnail))
         return self[-1]

     def top_level_items(self):

@@ -269,6 +270,9 @@ class TOC(list):
                 if desc:
                     desc = re.sub(r'\s+', ' ', desc)
                     elem.append(C.meta(desc, name='description'))
+                idx = getattr(np, 'toc_thumbnail', None)
+                if idx:
+                    elem.append(C.meta(idx, name='toc_thumbnail'))
                 parent.append(elem)
             for np2 in np:
                 navpoint(elem, np2)
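End to end, this class is the carrier for the recipe thumbnails: the value handed to add_toc_thumbnail ends up on the TOC node and is serialized into the NCX navPoint as a <meta name="toc_thumbnail"> element. A small sketch of the data flow (the module path is assumed from calibre's source layout; the hrefs are made up):

    from calibre.ebooks.metadata.toc import TOC

    toc = TOC()
    art = toc.add_item('feed_0/article_3/index.html', None, 'Article title',
                       toc_thumbnail='images/thumb_3.jpg')
    print art.toc_thumbnail   # -> images/thumb_3.jpg, later written to the NCX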


@@ -656,11 +656,11 @@ class Tag(object): # {{{
                 ' image record associated with this article',
                 'image_index'),
         70 : ('Description offset in cncx', 'desc_offset'),
-        71 : ('Image attribution offset in cncx',
-                'image_attr_offset'),
+        71 : ('Author offset in cncx', 'author_offset'),
         72 : ('Image caption offset in cncx',
                 'image_caption_offset'),
-        73 : ('Author offset in cncx', 'author_offset'),
+        73 : ('Image attribution offset in cncx',
+                'image_attr_offset'),
     },

     'chapter_with_subchapters' : {

@@ -973,7 +973,8 @@ class MobiReader(object):
                 continue
             processed_records.append(i)
             data = self.sections[i][0]
-            if data[:4] in (b'FLIS', b'FCIS', b'SRCS', b'\xe9\x8e\r\n'):
+            if data[:4] in {b'FLIS', b'FCIS', b'SRCS', b'\xe9\x8e\r\n',
+                    b'RESC', b'BOUN', b'FDST', b'DATP'}:
                 # A FLIS, FCIS, SRCS or EOF record, ignore
                 continue
             buf = cStringIO.StringIO(data)


@@ -136,7 +136,8 @@ class IndexEntry(object):
             'last_child_index': 23,
             'image_index': 69,
             'desc_offset': 70,
-            'author_offset': 73,
+            'author_offset': 71,
     }
     RTAG_MAP = {v:k for k, v in TAG_VALUES.iteritems()}

@@ -754,6 +755,13 @@ class Indexer(object): # {{{
                 normalized_articles.append(article)
                 article.author_offset = self.cncx[art.author]
                 article.desc_offset = self.cncx[art.description]
+                if getattr(art, 'toc_thumbnail', None) is not None:
+                    try:
+                        ii = self.serializer.images[art.toc_thumbnail] - 1
+                        if ii > -1:
+                            article.image_index = ii
+                    except KeyError:
+                        pass # Image not found in serializer

             if normalized_articles:
                 normalized_articles.sort(key=lambda x:x.offset)

@@ -161,7 +161,7 @@ class MobiWriter(object):
         index = 1

         mh_href = None
-        if 'masthead' in oeb.guide:
+        if 'masthead' in oeb.guide and oeb.guide['masthead'].href:
             mh_href = oeb.guide['masthead'].href
             self.image_records.append(None)
             index += 1


@ -16,15 +16,13 @@ from urllib import unquote as urlunquote
from lxml import etree, html from lxml import etree, html
from calibre.constants import filesystem_encoding, __version__ from calibre.constants import filesystem_encoding, __version__
from calibre.translations.dynamic import translate from calibre.translations.dynamic import translate
from calibre.ebooks.chardet import xml_to_unicode, strip_encoding_declarations from calibre.ebooks.chardet import xml_to_unicode
from calibre.ebooks.oeb.entitydefs import ENTITYDEFS
from calibre.ebooks.conversion.preprocess import CSSPreProcessor from calibre.ebooks.conversion.preprocess import CSSPreProcessor
from calibre import isbytestring, as_unicode, get_types_map from calibre import (isbytestring, as_unicode, get_types_map)
from calibre.ebooks.oeb.parse_utils import (barename, XHTML_NS, RECOVER_PARSER,
RECOVER_PARSER = etree.XMLParser(recover=True, no_network=True) namespace, XHTML, parse_html, NotHTML)
XML_NS = 'http://www.w3.org/XML/1998/namespace' XML_NS = 'http://www.w3.org/XML/1998/namespace'
XHTML_NS = 'http://www.w3.org/1999/xhtml'
OEB_DOC_NS = 'http://openebook.org/namespaces/oeb-document/1.0/' OEB_DOC_NS = 'http://openebook.org/namespaces/oeb-document/1.0/'
OPF1_NS = 'http://openebook.org/namespaces/oeb-package/1.0/' OPF1_NS = 'http://openebook.org/namespaces/oeb-package/1.0/'
OPF2_NS = 'http://www.idpf.org/2007/opf' OPF2_NS = 'http://www.idpf.org/2007/opf'
@@ -55,9 +53,6 @@ OPF2_NSMAP = {'opf': OPF2_NS, 'dc': DC11_NS, 'dcterms': DCTERMS_NS,
 def XML(name):
     return '{%s}%s' % (XML_NS, name)
 
-def XHTML(name):
-    return '{%s}%s' % (XHTML_NS, name)
-
 def OPF(name):
     return '{%s}%s' % (OPF2_NS, name)
@@ -279,22 +274,11 @@ PREFIXNAME_RE = re.compile(r'^[^:]+[:][^:]+')
 XMLDECL_RE = re.compile(r'^\s*<[?]xml.*?[?]>')
 CSSURL_RE = re.compile(r'''url[(](?P<q>["']?)(?P<url>[^)]+)(?P=q)[)]''')
 
 def element(parent, *args, **kwargs):
     if parent is not None:
         return etree.SubElement(parent, *args, **kwargs)
     return etree.Element(*args, **kwargs)
 
-def namespace(name):
-    if '}' in name:
-        return name.split('}', 1)[0][1:]
-    return ''
-
-def barename(name):
-    if '}' in name:
-        return name.split('}', 1)[1]
-    return name
-
 def prefixname(name, nsrmap):
     if not isqname(name):
         return name
@@ -373,25 +357,6 @@ def urlnormalize(href):
     parts = (urlquote(part) for part in parts)
     return urlunparse(parts)
 
-def merge_multiple_html_heads_and_bodies(root, log=None):
-    heads, bodies = xpath(root, '//h:head'), xpath(root, '//h:body')
-    if not (len(heads) > 1 or len(bodies) > 1): return root
-    for child in root: root.remove(child)
-    head = root.makeelement(XHTML('head'))
-    body = root.makeelement(XHTML('body'))
-    for h in heads:
-        for x in h:
-            head.append(x)
-    for b in bodies:
-        for x in b:
-            body.append(x)
-    map(root.append, (head, body))
-    if log is not None:
-        log.warn('Merging multiple <head> and <body> sections')
-    return root
-
 class DummyHandler(logging.Handler):
@@ -418,10 +383,6 @@ class OEBError(Exception):
     """Generic OEB-processing error."""
     pass
 
-class NotHTML(OEBError):
-    '''Raised when a file that should be HTML (as per manifest) is not'''
-    pass
-
 class NullContainer(object):
     """An empty container.
@@ -801,7 +762,6 @@ class Manifest(object):
         """
 
         NUM_RE = re.compile('^(.*)([0-9][0-9.]*)(?=[.]|$)')
-        META_XP = XPath('/h:html/h:head/h:meta[@http-equiv="Content-Type"]')
 
         def __init__(self, oeb, id, href, media_type,
                      fallback=None, loader=str, data=None):
@@ -830,244 +790,17 @@ class Manifest(object):
                 return None
             return etree.fromstring(data, parser=RECOVER_PARSER)
 
-    def clean_word_doc(self, data):
-        prefixes = []
-        for match in re.finditer(r'xmlns:(\S+?)=".*?microsoft.*?"', data):
-            prefixes.append(match.group(1))
-        if prefixes:
-            self.oeb.log.warn('Found microsoft markup, cleaning...')
-            # Remove empty tags as they are not rendered by browsers
-            # but can become renderable HTML tags like <p/> if the
-            # document is parsed by an HTML parser
-            pat = re.compile(
-                    r'<(%s):([a-zA-Z0-9]+)[^>/]*?></\1:\2>'%('|'.join(prefixes)),
-                    re.DOTALL)
-            data = pat.sub('', data)
-            pat = re.compile(
-                    r'<(%s):([a-zA-Z0-9]+)[^>/]*?/>'%('|'.join(prefixes)))
-            data = pat.sub('', data)
-        return data
-
     def _parse_xhtml(self, data):
         orig_data = data
-        self.oeb.log.debug('Parsing', self.href, '...')
-        # Convert to Unicode and normalize line endings
-        data = self.oeb.decode(data)
-        data = strip_encoding_declarations(data)
-        data = self.oeb.html_preprocessor(data)
-        # There could be null bytes in data if it had &#0; entities in it
-        data = data.replace('\0', '')
-        # Remove DOCTYPE declaration as it messes up parsing
-        # In particular, it causes tostring to insert xmlns
-        # declarations, which messes up the coercing logic
-        idx = data.find('<html')
-        if idx == -1:
-            idx = data.find('<HTML')
-        if idx > -1:
-            pre = data[:idx]
-            data = data[idx:]
-            if '<!DOCTYPE' in pre:
-                user_entities = {}
-                for match in re.finditer(r'<!ENTITY\s+(\S+)\s+([^>]+)', pre):
-                    val = match.group(2)
-                    if val.startswith('"') and val.endswith('"'):
-                        val = val[1:-1]
-                    user_entities[match.group(1)] = val
-                if user_entities:
-                    pat = re.compile(r'&(%s);'%('|'.join(user_entities.keys())))
-                    data = pat.sub(lambda m:user_entities[m.group(1)], data)
-        # Setting huge_tree=True causes crashes in windows with large files
-        parser = etree.XMLParser(no_network=True)
-        # Try with more & more drastic measures to parse
-        def first_pass(data):
-            try:
-                data = etree.fromstring(data, parser=parser)
-            except etree.XMLSyntaxError as err:
-                self.oeb.log.debug('Initial parse failed, using more'
-                        ' forgiving parsers')
-                repl = lambda m: ENTITYDEFS.get(m.group(1), m.group(0))
-                data = ENTITY_RE.sub(repl, data)
-                try:
-                    data = etree.fromstring(data, parser=parser)
-                except etree.XMLSyntaxError as err:
-                    self.oeb.logger.warn('Parsing file %r as HTML' % self.href)
-                    if err.args and err.args[0].startswith('Excessive depth'):
-                        from calibre.utils.soupparser import fromstring
-                        data = fromstring(data)
-                    else:
-                        data = html.fromstring(data)
-                    data.attrib.pop('xmlns', None)
-                    for elem in data.iter(tag=etree.Comment):
-                        if elem.text:
-                            elem.text = elem.text.strip('-')
-                    data = etree.tostring(data, encoding=unicode)
-                    try:
-                        data = etree.fromstring(data, parser=parser)
-                    except etree.XMLSyntaxError:
-                        data = etree.fromstring(data, parser=RECOVER_PARSER)
-            return data
-        try:
-            data = self.clean_word_doc(data)
-        except:
-            pass
-        data = first_pass(data)
-
-        if data.tag == 'HTML':
-            # Lower case all tag and attribute names
-            data.tag = data.tag.lower()
-            for x in data.iterdescendants():
-                try:
-                    x.tag = x.tag.lower()
-                    for key, val in list(x.attrib.iteritems()):
-                        del x.attrib[key]
-                        key = key.lower()
-                        x.attrib[key] = val
-                except:
-                    pass
-
-        # Handle weird (non-HTML/fragment) files
-        if barename(data.tag) != 'html':
-            if barename(data.tag) == 'ncx':
-                return self._parse_xml(orig_data)
-            self.oeb.log.warn('File %r does not appear to be (X)HTML'%self.href)
-            nroot = etree.fromstring('<html></html>')
-            has_body = False
-            for child in list(data):
-                if isinstance(child.tag, (unicode, str)) and barename(child.tag) == 'body':
-                    has_body = True
-                    break
-            parent = nroot
-            if not has_body:
-                self.oeb.log.warn('File %r appears to be a HTML fragment'%self.href)
-                nroot = etree.fromstring('<html><body/></html>')
-                parent = nroot[0]
-            for child in list(data.iter()):
-                oparent = child.getparent()
-                if oparent is not None:
-                    oparent.remove(child)
-                parent.append(child)
-            data = nroot
-
-        # Force into the XHTML namespace
-        if not namespace(data.tag):
-            self.oeb.log.warn('Forcing', self.href, 'into XHTML namespace')
-            data.attrib['xmlns'] = XHTML_NS
-            data = etree.tostring(data, encoding=unicode)
-            try:
-                data = etree.fromstring(data, parser=parser)
-            except:
-                data = data.replace(':=', '=').replace(':>', '>')
-                data = data.replace('<http:/>', '')
-                try:
-                    data = etree.fromstring(data, parser=parser)
-                except etree.XMLSyntaxError:
-                    self.oeb.logger.warn('Stripping comments from %s'%
-                            self.href)
-                    data = re.compile(r'<!--.*?-->', re.DOTALL).sub('',
-                            data)
-                    data = data.replace(
-                            "<?xml version='1.0' encoding='utf-8'?><o:p></o:p>",
-                            '')
-                    data = data.replace("<?xml version='1.0' encoding='utf-8'??>", '')
-                    try:
-                        data = etree.fromstring(data,
-                                parser=RECOVER_PARSER)
-                    except etree.XMLSyntaxError:
-                        self.oeb.logger.warn('Stripping meta tags from %s'%
-                                self.href)
-                        data = re.sub(r'<meta\s+[^>]+?>', '', data)
-                        data = etree.fromstring(data, parser=RECOVER_PARSER)
-        elif namespace(data.tag) != XHTML_NS:
-            # OEB_DOC_NS, but possibly others
-            ns = namespace(data.tag)
-            attrib = dict(data.attrib)
-            nroot = etree.Element(XHTML('html'),
-                nsmap={None: XHTML_NS}, attrib=attrib)
-            for elem in data.iterdescendants():
-                if isinstance(elem.tag, basestring) and \
-                    namespace(elem.tag) == ns:
-                    elem.tag = XHTML(barename(elem.tag))
-            for elem in data:
-                nroot.append(elem)
-            data = nroot
-
-        data = merge_multiple_html_heads_and_bodies(data, self.oeb.logger)
-        # Ensure has a <head/>
-        head = xpath(data, '/h:html/h:head')
-        head = head[0] if head else None
-        if head is None:
-            self.oeb.logger.warn(
-                'File %r missing <head/> element' % self.href)
-            head = etree.Element(XHTML('head'))
-            data.insert(0, head)
-            title = etree.SubElement(head, XHTML('title'))
-            title.text = self.oeb.translate(__('Unknown'))
-        elif not xpath(data, '/h:html/h:head/h:title'):
-            self.oeb.logger.warn(
-                'File %r missing <title/> element' % self.href)
-            title = etree.SubElement(head, XHTML('title'))
-            title.text = self.oeb.translate(__('Unknown'))
-        # Remove any encoding-specifying <meta/> elements
-        for meta in self.META_XP(data):
-            meta.getparent().remove(meta)
-        etree.SubElement(head, XHTML('meta'),
-            attrib={'http-equiv': 'Content-Type',
-                    'content': '%s; charset=utf-8' % XHTML_NS})
-        # Ensure has a <body/>
-        if not xpath(data, '/h:html/h:body'):
-            body = xpath(data, '//h:body')
-            if body:
-                body = body[0]
-                body.getparent().remove(body)
-                data.append(body)
-            else:
-                self.oeb.logger.warn(
-                    'File %r missing <body/> element' % self.href)
-                etree.SubElement(data, XHTML('body'))
-
-        # Remove microsoft office markup
-        r = [x for x in data.iterdescendants(etree.Element) if 'microsoft-com' in x.tag]
-        for x in r:
-            x.tag = XHTML('span')
-
-        # Remove lang redefinition inserted by the amazing Microsoft Word!
-        body = xpath(data, '/h:html/h:body')[0]
-        for key in list(body.attrib.keys()):
-            if key == 'lang' or key.endswith('}lang'):
-                body.attrib.pop(key)
-
-        def remove_elem(a):
-            p = a.getparent()
-            idx = p.index(a) -1
-            p.remove(a)
-            if a.tail:
-                if idx <= 0:
-                    if p.text is None:
-                        p.text = ''
-                    p.text += a.tail
-                else:
-                    if p[idx].tail is None:
-                        p[idx].tail = ''
-                    p[idx].tail += a.tail
-
-        # Remove hyperlinks with no content as they cause rendering
-        # artifacts in browser based renderers
-        # Also remove empty <b>, <u> and <i> tags
-        for a in xpath(data, '//h:a[@href]|//h:i|//h:b|//h:u'):
-            if a.get('id', None) is None and a.get('name', None) is None \
-                    and len(a) == 0 and not a.text:
-                remove_elem(a)
-
-        # Convert <br>s with content into paragraphs as ADE can't handle
-        # them
-        for br in xpath(data, '//h:br'):
-            if len(br) > 0 or br.text:
-                br.tag = XHTML('div')
-
+        fname = urlunquote(self.href)
+        self.oeb.log.debug('Parsing', fname, '...')
+        try:
+            data = parse_html(data, log=self.oeb.log,
+                    decoder=self.oeb.decode,
+                    preprocessor=self.oeb.html_preprocessor,
+                    filename=fname, non_html_file_tags={'ncx'})
+        except NotHTML:
+            return self._parse_xml(orig_data)
         return data
 
     def _parse_txt(self, data):
@@ -1629,9 +1362,10 @@ class TOC(object):
     :attr:`id`: Option unique identifier for this node.
     :attr:`author`: Optional author attribution for periodicals <mbp:>
     :attr:`description`: Optional description attribute for periodicals <mbp:>
+    :attr:`toc_thumbnail`: Optional toc thumbnail image
     """
     def __init__(self, title=None, href=None, klass=None, id=None,
-            play_order=None, author=None, description=None):
+            play_order=None, author=None, description=None, toc_thumbnail=None):
         self.title = title
         self.href = urlnormalize(href) if href else href
         self.klass = klass
@@ -1643,10 +1377,11 @@ class TOC(object):
         self.play_order = play_order
         self.author = author
         self.description = description
+        self.toc_thumbnail = toc_thumbnail
 
-    def add(self, title, href, klass=None, id=None, play_order=0, author=None, description=None):
+    def add(self, title, href, klass=None, id=None, play_order=0, author=None, description=None, toc_thumbnail=None):
         """Create and return a new sub-node of this node."""
-        node = TOC(title, href, klass, id, play_order, author, description)
+        node = TOC(title, href, klass, id, play_order, author, description, toc_thumbnail)
         self.nodes.append(node)
         return node
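
A minimal usage sketch of the extended API (run inside a calibre environment; hrefs and titles are made up, and only arguments visible in this hunk are used):

from calibre.ebooks.oeb.base import TOC

root = TOC()
feed = root.add('News', 'feed_0/index.html')
art = feed.add('First article', 'feed_0/article_0/index.html',
        author='A. Writer', description='Optional description',
        toc_thumbnail='images/thumb_0.jpg')  # the new optional argument
print(art.toc_thumbnail)                     # images/thumb_0.jpg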

View File

@@ -0,0 +1,225 @@
#!/usr/bin/env coffee
# vim:fileencoding=UTF-8:ts=4:sw=4:sta:et:sts=4:ai
###
Copyright 2011, Kovid Goyal <kovid@kovidgoyal.net>
Released under the GPLv3 License
###
#
log = (error) ->
if error
if window?.console?.log
window.console.log(error)
else if process?.stdout?.write
process.stdout.write(error + '\n')
# CFI escaping {{{
escape_for_cfi = (raw) ->
if raw
for c in ['^', '[', ']', ',', '(', ')', ';', '~', '@', '-', '!']
raw = raw.replace(c, '^'+c)
raw
unescape_from_cfi = (raw) ->
ans = raw
if raw
dropped = false
ans = []
for c in raw
if not dropped and c == '^'
dropped = true
continue
dropped = false
ans.push(c)
ans = ans.join('')
ans
# }}}
fstr = (d) -> # {{{
# Convert a timestamp floating point number to a string
ans = ""
if ( d < 0 )
ans = "-"
d = -d
n = Math.floor(d)
ans += n
n = Math.round((d-n)*100)
if( n != 0 )
ans += "."
ans += if (n % 10 == 0) then (n/10) else n
ans
# }}}
class CanonicalFragmentIdentifier
# This class is a namespace to expose CFI functions via the window.cfi
# object
constructor: () ->
encode: (doc, node, offset, tail) -> # {{{
cfi = tail or ""
# Handle the offset, if any
switch node.nodeType
when 1 # Element node
if typeof(offset) == 'number'
node = node.childNodes.item(offset)
when 3, 4, 5, 6 # Text/entity/CDATA node
offset or= 0
while true
p = node.previousSibling
if (p?.nodeType not in [3, 4, 5, 6])
break
offset += p.nodeValue.length
node = p
cfi = ":" + offset + cfi
else # Not handled
log("Offsets for nodes of type #{ node.nodeType } are not handled")
# Construct the path to node from root
until node == doc
p = node.parentNode
if not p
if node.nodeType == 9 # Document node (iframe)
win = node.defaultView
if win.frameElement
node = win.frameElement
cfi = "!" + cfi
continue
break
# Increase index by the length of all previous sibling text nodes
index = 0
child = p.firstChild
while true
index |= 1
if child.nodeType in [1, 7]
index++
if child == node
break
child = child.nextSibling
# Add id assertions for robustness where possible
id = node.getAttribute?('id')
idspec = if id then "[#{ escape_for_cfi(id) }]" else ''
cfi = '/' + index + idspec + cfi
node = p
cfi
# }}}
decode: (cfi, doc=window?.document) -> # {{{
simple_node_regex = ///
^/(\d+) # The node count
(\[[^\]]*\])? # The optional id assertion
///
error = null
node = doc
until cfi.length <= 0 or error
if ( (r = cfi.match(simple_node_regex)) isnt null ) # Path step
target = parseInt(r[1])
assertion = r[2]
if assertion
assertion = unescape_from_cfi(assertion.slice(1, assertion.length-1))
index = 0
child = node.firstChild
while true
if not child
if assertion # Try to use the assertion to find the node
child = doc.getElementById(assertion)
if child
node = child
if not child
error = "No matching child found for CFI: " + cfi
break
index |= 1 # Increment index by 1 if it is even
if child.nodeType in [1, 7] # We have an element or a PI
index++
if ( index == target )
cfi = cfi.substr(r[0].length)
node = child
break
child = child.nextSibling
else if cfi[0] == '!' # Indirection
if node.contentDocument
node = node.contentDocument
cfi = cfi.substr(1)
else
error = "Cannot reference #{ node.nodeName }'s content:" + cfi
else
break
if error
log(error)
return null
point = {}
error = null
point
# }}}
at: (x, y, doc=window?.document) -> # {{{
cdoc = doc
target = null
cwin = cdoc.defaultView
tail = ''
offset = null
name = null
# Drill down into iframes, etc.
while true
target = cdoc.elementFromPoint x, y
if not target or target.localName == 'html'
log("No element at (#{ x }, #{ y })")
return null
name = target.localName
if name not in ['iframe', 'embed', 'object']
break
cd = target.contentDocument
if not cd
break
x = x + cwin.pageXOffset - target.offsetLeft
y = y + cwin.pageYOffset - target.offsetTop
cdoc = cd
cwin = cdoc.defaultView
target.normalize()
if name in ['audio', 'video']
tail = "~" + fstr target.currentTime
if name in ['img', 'video']
px = ((x + cwin.scrollX - target.offsetLeft)*100)/target.offsetWidth
py = ((y + cwin.scrollY - target.offsetTop)*100)/target.offsetHeight
tail = "#{ tail }@#{ fstr px },#{ fstr py }"
else if name != 'audio'
if cdoc.caretRangeFromPoint # WebKit
range = cdoc.caretRangeFromPoint(x, y)
if range
target = range.startContainer
offset = range.startOffset
else
# TODO: implement a span bisection algorithm for UAs
# without caretRangeFromPoint (Gecko, IE)
this.encode(doc, target, offset, tail)
# }}}
if window?
window.cfi = new CanonicalFragmentIdentifier()
else if process?
# Some debugging code goes here to be run with the coffee interpreter
cfi = new CanonicalFragmentIdentifier()
t = 'a^!,1'
log(t)
log(escape_for_cfi(t))
log(unescape_from_cfi(escape_for_cfi(t)))

View File

@@ -0,0 +1,24 @@
#!/usr/bin/env coffee
# vim:fileencoding=UTF-8:ts=4:sw=4:sta:et:sts=4:ai
###
Copyright 2011, Kovid Goyal <kovid@kovidgoyal.net>
Released under the GPLv3 License
###
viewport_top = (node) ->
$(node).offset().top - window.pageYOffset
viewport_left = (node) ->
$(node).offset().left - window.pageXOffset
window.onload = ->
h1 = document.getElementsByTagName('h1')[0]
x = h1.scrollLeft + 150
y = viewport_top(h1) + h1.offsetHeight/2
e = document.elementFromPoint x, y
if e.getAttribute('id') != 'first-h1'
alert 'Failed to find top h1'
return
alert window.cfi.at x, y

View File

@@ -0,0 +1,14 @@
<!DOCTYPE html>
<html>
<head>
<title>Testing CFI functionality</title>
<script type="text/javascript" src="cfi.js"></script>
<script type="text/javascript" src="jquery.js"></script>
<script type="text/javascript" src="cfi-test.js"></script>
</head>
<body>
<h1 id="first-h1" style="border: solid 1px red">Testing CFI functionality</h1>
</body>
</html>

View File

@@ -0,0 +1,26 @@
#!/usr/bin/env python
# vim:fileencoding=UTF-8:ts=4:sw=4:sta:et:sts=4:ai
from __future__ import (unicode_literals, division, absolute_import,
print_function)
__license__ = 'GPL v3'
__copyright__ = '2011, Kovid Goyal <kovid@kovidgoyal.net>'
__docformat__ = 'restructuredtext en'
import os
try:
from calibre.utils.coffeescript import serve
except ImportError:
import init_calibre
if False: init_calibre, serve
from calibre.utils.coffeescript import serve
def run_devel_server():
os.chdir(os.path.dirname(__file__))
serve(['../cfi.coffee', 'cfi-test.coffee'])
if __name__ == '__main__':
run_devel_server()

View File

@@ -1,256 +0,0 @@
"""
Replacement for htmlentitydefs which uses purely numeric entities.
"""
__license__ = 'GPL v3'
__copyright__ = '2008, Marshall T. Vandegrift <llasram@gmail.com>'
ENTITYDEFS = \
{'AElig': '&#198;',
'Aacute': '&#193;',
'Acirc': '&#194;',
'Agrave': '&#192;',
'Alpha': '&#913;',
'Aring': '&#197;',
'Atilde': '&#195;',
'Auml': '&#196;',
'Beta': '&#914;',
'Ccedil': '&#199;',
'Chi': '&#935;',
'Dagger': '&#8225;',
'Delta': '&#916;',
'ETH': '&#208;',
'Eacute': '&#201;',
'Ecirc': '&#202;',
'Egrave': '&#200;',
'Epsilon': '&#917;',
'Eta': '&#919;',
'Euml': '&#203;',
'Gamma': '&#915;',
'Iacute': '&#205;',
'Icirc': '&#206;',
'Igrave': '&#204;',
'Iota': '&#921;',
'Iuml': '&#207;',
'Kappa': '&#922;',
'Lambda': '&#923;',
'Mu': '&#924;',
'Ntilde': '&#209;',
'Nu': '&#925;',
'OElig': '&#338;',
'Oacute': '&#211;',
'Ocirc': '&#212;',
'Ograve': '&#210;',
'Omega': '&#937;',
'Omicron': '&#927;',
'Oslash': '&#216;',
'Otilde': '&#213;',
'Ouml': '&#214;',
'Phi': '&#934;',
'Pi': '&#928;',
'Prime': '&#8243;',
'Psi': '&#936;',
'Rho': '&#929;',
'Scaron': '&#352;',
'Sigma': '&#931;',
'THORN': '&#222;',
'Tau': '&#932;',
'Theta': '&#920;',
'Uacute': '&#218;',
'Ucirc': '&#219;',
'Ugrave': '&#217;',
'Upsilon': '&#933;',
'Uuml': '&#220;',
'Xi': '&#926;',
'Yacute': '&#221;',
'Yuml': '&#376;',
'Zeta': '&#918;',
'aacute': '&#225;',
'acirc': '&#226;',
'acute': '&#180;',
'aelig': '&#230;',
'agrave': '&#224;',
'alefsym': '&#8501;',
'alpha': '&#945;',
'and': '&#8743;',
'ang': '&#8736;',
'aring': '&#229;',
'asymp': '&#8776;',
'atilde': '&#227;',
'auml': '&#228;',
'bdquo': '&#8222;',
'beta': '&#946;',
'brvbar': '&#166;',
'bull': '&#8226;',
'cap': '&#8745;',
'ccedil': '&#231;',
'cedil': '&#184;',
'cent': '&#162;',
'chi': '&#967;',
'circ': '&#710;',
'clubs': '&#9827;',
'cong': '&#8773;',
'copy': '&#169;',
'crarr': '&#8629;',
'cup': '&#8746;',
'curren': '&#164;',
'dArr': '&#8659;',
'dagger': '&#8224;',
'darr': '&#8595;',
'deg': '&#176;',
'delta': '&#948;',
'diams': '&#9830;',
'divide': '&#247;',
'eacute': '&#233;',
'ecirc': '&#234;',
'egrave': '&#232;',
'empty': '&#8709;',
'emsp': '&#8195;',
'ensp': '&#8194;',
'epsilon': '&#949;',
'equiv': '&#8801;',
'eta': '&#951;',
'eth': '&#240;',
'euml': '&#235;',
'euro': '&#8364;',
'exist': '&#8707;',
'fnof': '&#402;',
'forall': '&#8704;',
'frac12': '&#189;',
'frac14': '&#188;',
'frac34': '&#190;',
'frasl': '&#8260;',
'gamma': '&#947;',
'ge': '&#8805;',
'hArr': '&#8660;',
'harr': '&#8596;',
'hearts': '&#9829;',
'hellip': '&#8230;',
'iacute': '&#237;',
'icirc': '&#238;',
'iexcl': '&#161;',
'igrave': '&#236;',
'image': '&#8465;',
'infin': '&#8734;',
'int': '&#8747;',
'iota': '&#953;',
'iquest': '&#191;',
'isin': '&#8712;',
'iuml': '&#239;',
'kappa': '&#954;',
'lArr': '&#8656;',
'lambda': '&#955;',
'lang': '&#9001;',
'laquo': '&#171;',
'larr': '&#8592;',
'lceil': '&#8968;',
'ldquo': '&#8220;',
'le': '&#8804;',
'lfloor': '&#8970;',
'lowast': '&#8727;',
'loz': '&#9674;',
'lrm': '&#8206;',
'lsaquo': '&#8249;',
'lsquo': '&#8216;',
'macr': '&#175;',
'mdash': '&#8212;',
'micro': '&#181;',
'middot': '&#183;',
'minus': '&#8722;',
'mu': '&#956;',
'nabla': '&#8711;',
'nbsp': '&#160;',
'ndash': '&#8211;',
'ne': '&#8800;',
'ni': '&#8715;',
'not': '&#172;',
'notin': '&#8713;',
'nsub': '&#8836;',
'ntilde': '&#241;',
'nu': '&#957;',
'oacute': '&#243;',
'ocirc': '&#244;',
'oelig': '&#339;',
'ograve': '&#242;',
'oline': '&#8254;',
'omega': '&#969;',
'omicron': '&#959;',
'oplus': '&#8853;',
'or': '&#8744;',
'ordf': '&#170;',
'ordm': '&#186;',
'oslash': '&#248;',
'otilde': '&#245;',
'otimes': '&#8855;',
'ouml': '&#246;',
'para': '&#182;',
'part': '&#8706;',
'permil': '&#8240;',
'perp': '&#8869;',
'phi': '&#966;',
'pi': '&#960;',
'piv': '&#982;',
'plusmn': '&#177;',
'pound': '&#163;',
'prime': '&#8242;',
'prod': '&#8719;',
'prop': '&#8733;',
'psi': '&#968;',
'rArr': '&#8658;',
'radic': '&#8730;',
'rang': '&#9002;',
'raquo': '&#187;',
'rarr': '&#8594;',
'rceil': '&#8969;',
'rdquo': '&#8221;',
'real': '&#8476;',
'reg': '&#174;',
'rfloor': '&#8971;',
'rho': '&#961;',
'rlm': '&#8207;',
'rsaquo': '&#8250;',
'rsquo': '&#8217;',
'sbquo': '&#8218;',
'scaron': '&#353;',
'sdot': '&#8901;',
'sect': '&#167;',
'shy': '&#173;',
'sigma': '&#963;',
'sigmaf': '&#962;',
'sim': '&#8764;',
'spades': '&#9824;',
'sub': '&#8834;',
'sube': '&#8838;',
'sum': '&#8721;',
'sup': '&#8835;',
'sup1': '&#185;',
'sup2': '&#178;',
'sup3': '&#179;',
'supe': '&#8839;',
'szlig': '&#223;',
'tau': '&#964;',
'there4': '&#8756;',
'theta': '&#952;',
'thetasym': '&#977;',
'thinsp': '&#8201;',
'thorn': '&#254;',
'tilde': '&#732;',
'times': '&#215;',
'trade': '&#8482;',
'uArr': '&#8657;',
'uacute': '&#250;',
'uarr': '&#8593;',
'ucirc': '&#251;',
'ugrave': '&#249;',
'uml': '&#168;',
'upsih': '&#978;',
'upsilon': '&#965;',
'uuml': '&#252;',
'weierp': '&#8472;',
'xi': '&#958;',
'yacute': '&#253;',
'yen': '&#165;',
'yuml': '&#255;',
'zeta': '&#950;',
'zwj': '&#8205;',
'zwnj': '&#8204;'}

View File

@@ -0,0 +1,347 @@
#!/usr/bin/env python
# vim:fileencoding=UTF-8:ts=4:sw=4:sta:et:sts=4:ai
from __future__ import (unicode_literals, division, absolute_import,
print_function)
__license__ = 'GPL v3'
__copyright__ = '2011, Kovid Goyal <kovid@kovidgoyal.net>'
__docformat__ = 'restructuredtext en'
import re
from lxml import etree, html
from calibre import xml_replace_entities, force_unicode
from calibre.constants import filesystem_encoding
from calibre.ebooks.chardet import xml_to_unicode, strip_encoding_declarations
RECOVER_PARSER = etree.XMLParser(recover=True, no_network=True)
XHTML_NS = 'http://www.w3.org/1999/xhtml'
class NotHTML(Exception):
def __init__(self, root_tag):
Exception.__init__(self, 'Data is not HTML')
self.root_tag = root_tag
def barename(name):
return name.rpartition('}')[-1]
def namespace(name):
if '}' in name:
return name.split('}', 1)[0][1:]
return ''
def XHTML(name):
return '{%s}%s' % (XHTML_NS, name)
def xpath(elem, expr):
return elem.xpath(expr, namespaces={'h':XHTML_NS})
def XPath(expr):
return etree.XPath(expr, namespaces={'h':XHTML_NS})
META_XP = XPath('/h:html/h:head/h:meta[@http-equiv="Content-Type"]')
def merge_multiple_html_heads_and_bodies(root, log=None):
heads, bodies = xpath(root, '//h:head'), xpath(root, '//h:body')
if not (len(heads) > 1 or len(bodies) > 1): return root
for child in root: root.remove(child)
head = root.makeelement(XHTML('head'))
body = root.makeelement(XHTML('body'))
for h in heads:
for x in h:
head.append(x)
for b in bodies:
for x in b:
body.append(x)
map(root.append, (head, body))
if log is not None:
log.warn('Merging multiple <head> and <body> sections')
return root
def _html5_parse(data):
import html5lib
data = html5lib.parse(data, treebuilder='lxml').getroot()
html_ns = [ns for ns, val in data.nsmap.iteritems() if (val == XHTML_NS and
ns is not None)]
if html_ns:
# html5lib causes the XHTML namespace to not
# be set as the default namespace
nsmap = dict(data.nsmap)
nsmap[None] = XHTML_NS
for x in html_ns:
nsmap.pop(x)
nroot = etree.Element(data.tag, nsmap=nsmap,
attrib=dict(data.attrib))
nroot.text = data.text
nroot.tail = data.tail
for child in data:
nroot.append(child)
data = nroot
return data
def _html4_parse(data, prefer_soup=False):
if prefer_soup:
from calibre.utils.soupparser import fromstring
data = fromstring(data)
else:
data = html.fromstring(data)
data.attrib.pop('xmlns', None)
for elem in data.iter(tag=etree.Comment):
if elem.text:
elem.text = elem.text.strip('-')
data = etree.tostring(data, encoding=unicode)
# Setting huge_tree=True causes crashes in windows with large files
parser = etree.XMLParser(no_network=True)
try:
data = etree.fromstring(data, parser=parser)
except etree.XMLSyntaxError:
data = etree.fromstring(data, parser=RECOVER_PARSER)
return data
def clean_word_doc(data, log):
prefixes = []
for match in re.finditer(r'xmlns:(\S+?)=".*?microsoft.*?"', data):
prefixes.append(match.group(1))
if prefixes:
log.warn('Found microsoft markup, cleaning...')
# Remove empty tags as they are not rendered by browsers
# but can become renderable HTML tags like <p/> if the
# document is parsed by an HTML parser
pat = re.compile(
r'<(%s):([a-zA-Z0-9]+)[^>/]*?></\1:\2>'%('|'.join(prefixes)),
re.DOTALL)
data = pat.sub('', data)
pat = re.compile(
r'<(%s):([a-zA-Z0-9]+)[^>/]*?/>'%('|'.join(prefixes)))
data = pat.sub('', data)
return data
def parse_html(data, log=None, decoder=None, preprocessor=None,
filename='<string>', non_html_file_tags=frozenset()):
if log is None:
from calibre.utils.logging import default_log
log = default_log
filename = force_unicode(filename, enc=filesystem_encoding)
if not isinstance(data, unicode):
if decoder is not None:
data = decoder(data)
else:
data = xml_to_unicode(data)[0]
data = strip_encoding_declarations(data)
if preprocessor is not None:
data = preprocessor(data)
# There could be null bytes in data if it had &#0; entities in it
data = data.replace('\0', '')
# Remove DOCTYPE declaration as it messes up parsing
# In particular, it causes tostring to insert xmlns
# declarations, which messes up the coercing logic
idx = data.find('<html')
if idx == -1:
idx = data.find('<HTML')
if idx > -1:
pre = data[:idx]
data = data[idx:]
if '<!DOCTYPE' in pre: # Handle user defined entities
user_entities = {}
for match in re.finditer(r'<!ENTITY\s+(\S+)\s+([^>]+)', pre):
val = match.group(2)
if val.startswith('"') and val.endswith('"'):
val = val[1:-1]
user_entities[match.group(1)] = val
if user_entities:
pat = re.compile(r'&(%s);'%('|'.join(user_entities.keys())))
data = pat.sub(lambda m:user_entities[m.group(1)], data)
data = clean_word_doc(data, log)
# Setting huge_tree=True causes crashes in windows with large files
parser = etree.XMLParser(no_network=True)
# Try with more & more drastic measures to parse
try:
data = etree.fromstring(data, parser=parser)
except etree.XMLSyntaxError:
log.debug('Initial parse failed, using more'
' forgiving parsers')
data = xml_replace_entities(data)
try:
data = etree.fromstring(data, parser=parser)
except etree.XMLSyntaxError:
log.debug('Parsing %s as HTML' % filename)
try:
data = _html5_parse(data)
except:
log.exception(
'HTML 5 parsing failed, falling back to older parsers')
data = _html4_parse(data)
if data.tag == 'HTML':
# Lower case all tag and attribute names
data.tag = data.tag.lower()
for x in data.iterdescendants():
try:
x.tag = x.tag.lower()
for key, val in list(x.attrib.iteritems()):
del x.attrib[key]
key = key.lower()
x.attrib[key] = val
except:
pass
if barename(data.tag) != 'html':
if barename(data.tag) in non_html_file_tags:
raise NotHTML(data.tag)
log.warn('File %r does not appear to be (X)HTML'%filename)
nroot = etree.fromstring('<html></html>')
has_body = False
for child in list(data):
if isinstance(child.tag, (unicode, str)) and barename(child.tag) == 'body':
has_body = True
break
parent = nroot
if not has_body:
log.warn('File %r appears to be a HTML fragment'%filename)
nroot = etree.fromstring('<html><body/></html>')
parent = nroot[0]
for child in list(data.iter()):
oparent = child.getparent()
if oparent is not None:
oparent.remove(child)
parent.append(child)
data = nroot
# Force into the XHTML namespace
if not namespace(data.tag):
log.warn('Forcing', filename, 'into XHTML namespace')
data.attrib['xmlns'] = XHTML_NS
data = etree.tostring(data, encoding=unicode)
try:
data = etree.fromstring(data, parser=parser)
except:
data = data.replace(':=', '=').replace(':>', '>')
data = data.replace('<http:/>', '')
try:
data = etree.fromstring(data, parser=parser)
except etree.XMLSyntaxError:
log.warn('Stripping comments from %s'%
filename)
data = re.compile(r'<!--.*?-->', re.DOTALL).sub('',
data)
data = data.replace(
"<?xml version='1.0' encoding='utf-8'?><o:p></o:p>",
'')
data = data.replace("<?xml version='1.0' encoding='utf-8'??>", '')
try:
data = etree.fromstring(data,
parser=RECOVER_PARSER)
except etree.XMLSyntaxError:
log.warn('Stripping meta tags from %s'% filename)
data = re.sub(r'<meta\s+[^>]+?>', '', data)
data = etree.fromstring(data, parser=RECOVER_PARSER)
elif namespace(data.tag) != XHTML_NS:
# OEB_DOC_NS, but possibly others
ns = namespace(data.tag)
attrib = dict(data.attrib)
nroot = etree.Element(XHTML('html'),
nsmap={None: XHTML_NS}, attrib=attrib)
for elem in data.iterdescendants():
if isinstance(elem.tag, basestring) and \
namespace(elem.tag) == ns:
elem.tag = XHTML(barename(elem.tag))
for elem in data:
nroot.append(elem)
data = nroot
data = merge_multiple_html_heads_and_bodies(data, log)
# Ensure has a <head/>
head = xpath(data, '/h:html/h:head')
head = head[0] if head else None
if head is None:
log.warn('File %s missing <head/> element' % filename)
head = etree.Element(XHTML('head'))
data.insert(0, head)
title = etree.SubElement(head, XHTML('title'))
title.text = _('Unknown')
elif not xpath(data, '/h:html/h:head/h:title'):
log.warn('File %s missing <title/> element' % filename)
title = etree.SubElement(head, XHTML('title'))
title.text = _('Unknown')
# Remove any encoding-specifying <meta/> elements
for meta in META_XP(data):
meta.getparent().remove(meta)
etree.SubElement(head, XHTML('meta'),
attrib={'http-equiv': 'Content-Type',
'content': '%s; charset=utf-8' % XHTML_NS})
# Ensure has a <body/>
if not xpath(data, '/h:html/h:body'):
body = xpath(data, '//h:body')
if body:
body = body[0]
body.getparent().remove(body)
data.append(body)
else:
log.warn('File %s missing <body/> element' % filename)
etree.SubElement(data, XHTML('body'))
# Remove microsoft office markup
r = [x for x in data.iterdescendants(etree.Element) if 'microsoft-com' in x.tag]
for x in r:
x.tag = XHTML('span')
# Remove lang redefinition inserted by the amazing Microsoft Word!
body = xpath(data, '/h:html/h:body')[0]
for key in list(body.attrib.keys()):
if key == 'lang' or key.endswith('}lang'):
body.attrib.pop(key)
def remove_elem(a):
p = a.getparent()
idx = p.index(a) -1
p.remove(a)
if a.tail:
if idx <= 0:
if p.text is None:
p.text = ''
p.text += a.tail
else:
if p[idx].tail is None:
p[idx].tail = ''
p[idx].tail += a.tail
# Remove hyperlinks with no content as they cause rendering
# artifacts in browser based renderers
# Also remove empty <b>, <u> and <i> tags
for a in xpath(data, '//h:a[@href]|//h:i|//h:b|//h:u'):
if a.get('id', None) is None and a.get('name', None) is None \
and len(a) == 0 and not a.text:
remove_elem(a)
# Convert <br>s with content into paragraphs as ADE can't handle
# them
for br in xpath(data, '//h:br'):
if len(br) > 0 or br.text:
br.tag = XHTML('div')
# Remove any stray text in the <head> section and format it nicely
data.text = '\n '
head = xpath(data, '//h:head')
if head:
head = head[0]
head.text = '\n '
head.tail = '\n '
for child in head:
child.tail = '\n '
child.tail = '\n '
return data
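
A usage sketch for the new helper (run inside a calibre environment; the fragment string is made up): parse_html hands back a complete XHTML tree with <head> and <body>, and raises NotHTML only for root tags listed in non_html_file_tags, which is how the _parse_xhtml caller shown earlier falls back to XML parsing for NCX files.

from calibre.ebooks.oeb.parse_utils import parse_html, NotHTML

raw = b'<p>A bare fragment, not even wrapped in an html element'
try:
    root = parse_html(raw, filename='fragment.html',
            non_html_file_tags={'ncx'})
except NotHTML as err:
    print('Not HTML, root tag was ' + err.root_tag)
else:
    print(root.tag)  # {http://www.w3.org/1999/xhtml}html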

View File

@@ -19,16 +19,15 @@ from calibre.ebooks.oeb.base import OPF1_NS, OPF2_NS, OPF2_NSMAP, DC11_NS, \
 from calibre.ebooks.oeb.base import OEB_DOCS, OEB_STYLES, OEB_IMAGES, \
                                     PAGE_MAP_MIME, JPEG_MIME, NCX_MIME, SVG_MIME
 from calibre.ebooks.oeb.base import XMLDECL_RE, COLLAPSE_RE, \
-                                    ENTITY_RE, MS_COVER_TYPE, iterlinks
+                                    MS_COVER_TYPE, iterlinks
 from calibre.ebooks.oeb.base import namespace, barename, XPath, xpath, \
                                     urlnormalize, BINARY_MIME, \
                                     OEBError, OEBBook, DirContainer
 from calibre.ebooks.oeb.writer import OEBWriter
-from calibre.ebooks.oeb.entitydefs import ENTITYDEFS
 from calibre.utils.localization import get_lang
 from calibre.ptempfile import TemporaryDirectory
 from calibre.constants import __appname__, __version__
-from calibre import guess_type
+from calibre import guess_type, xml_replace_entities
 
 __all__ = ['OEBReader']
@@ -107,8 +106,7 @@ class OEBReader(object):
         try:
             opf = etree.fromstring(data)
         except etree.XMLSyntaxError:
-            repl = lambda m: ENTITYDEFS.get(m.group(1), m.group(0))
-            data = ENTITY_RE.sub(repl, data)
+            data = xml_replace_entities(data, encoding=None)
             try:
                 opf = etree.fromstring(data)
                 self.logger.warn('OPF contains invalid HTML named entities')
@@ -371,8 +369,15 @@ class OEBReader(object):
             else :
                 description = None
 
+            index_image = xpath(child,
+                    'descendant::calibre:meta[@name = "toc_thumbnail"]')
+            toc_thumbnail = (index_image[0].text if index_image else None)
+            if not toc_thumbnail or not toc_thumbnail.strip():
+                toc_thumbnail = None
+
             node = toc.add(title, href, id=id, klass=klass,
-                    play_order=po, description=description, author=author)
+                    play_order=po, description=description, author=author,
+                    toc_thumbnail=toc_thumbnail)
 
             self._toc_from_navpoint(item, node, child)
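
For illustration, the kind of NCX navPoint the new lookup matches (hrefs are made up; the assumption here is that CALIBRE_NS, the calibre metadata namespace constant from calibre.ebooks.oeb.base, is the namespace bound to the calibre: prefix in generated NCX files):

from lxml import etree
from calibre.ebooks.oeb.base import CALIBRE_NS

child = etree.fromstring('''
<navPoint xmlns="http://www.daisy.org/z3986/2005/ncx/"
          xmlns:calibre="%s">
  <navLabel><text>First article</text></navLabel>
  <content src="feed_0/article_0/index.html"/>
  <calibre:meta name="toc_thumbnail">images/thumb_0.jpg</calibre:meta>
</navPoint>''' % CALIBRE_NS)
print(child.xpath('descendant::calibre:meta[@name = "toc_thumbnail"]/text()',
        namespaces={'calibre': CALIBRE_NS}))  # ['images/thumb_0.jpg']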

View File

@@ -56,8 +56,11 @@ def render_html(mi, css, vertical, widget, all_fields=False): # {{{
     </body>
    <html>
    '''%(f, c, css)
+    fm = getattr(mi, 'field_metadata', field_metadata)
+    fl = dict(get_field_list(fm))
+    show_comments = (all_fields or fl.get('comments', True))
    comments = u''
-    if mi.comments:
+    if mi.comments and show_comments:
        comments = comments_to_html(force_unicode(mi.comments))
    right_pane = u'<div id="comments" class="comments">%s</div>'%comments

View File

@@ -429,7 +429,7 @@ def populate_metadata_page(layout, db, book_id, bulk=False, two_column=False, pa
     # The fields named here must be first in the widget list
     tweak_cols = tweaks['metadata_edit_custom_column_order']
     comments_in_tweak = 0
-    for key in tweak_cols:
+    for key in (tweak_cols or ()):
         # Add the key if it really exists in the database
         if key in cols_to_display:
             cols.append(key)

View File

@@ -441,7 +441,7 @@ class Scheduler(QObject):
         self.news_menu.addAction(self.cac)
         self.news_menu.addSeparator()
         self.all_action = self.news_menu.addAction(
-                _('Download all scheduled new sources'),
+                _('Download all scheduled news sources'),
                 self.download_all_scheduled)
 
         self.timer = QTimer(self)

View File

@@ -758,11 +758,12 @@ class EbookViewer(MainWindow, Ui_EbookViewer):
             self.set_page_number(frac)
 
     def next_document(self):
-        if self.current_index < len(self.iterator.spine) - 1:
+        if (hasattr(self, 'current_index') and self.current_index <
+                len(self.iterator.spine) - 1):
             self.load_path(self.iterator.spine[self.current_index+1])
 
     def previous_document(self):
-        if self.current_index > 0:
+        if hasattr(self, 'current_index') and self.current_index > 0:
             self.load_path(self.iterator.spine[self.current_index-1], pos=1.0)
 
     def keyPressEvent(self, event):

The diffs for the remaining changed files were suppressed because they are too large. They are mostly regenerated translation catalogues, including the new file src/calibre/translations/ku.po (19163 lines); some files were not shown because too many files changed in this diff.