Port use of renderContents and BeautifulStoneSoup

2025-07-09 03:04:10 -04:00 · 2019-03-23 13:31:06 +05:30 · 2019-03-23 13:31:06 +05:30 · 256c7563b6
commit 256c7563b6
parent c89b656df4
27 changed files with 116 additions and 187 deletions
--- a/recipes/adventure_zone_pl.recipe
+++ b/recipes/adventure_zone_pl.recipe
@ -28,10 +28,10 @@ class Adventure_zone(BasicNewsRecipe):
    def skip_ad_pages(self, soup):
        skip_tag = soup.body.find(attrs={'class':'subject'})
        skip_tag = skip_tag.findAll(name='a', href=True)
-        title = soup.title.renderContents().lower()
+        title = soup.title.renderContents().decode('utf-8').lower()
        if self._is_linked_text(title):
            for r in skip_tag:
-                word = r.renderContents()
+                word = r.renderContents().decode('utf-8')
                if not word:
                    continue
                word = word.lower()
--- a/recipes/berlin_policy_journal.recipe
+++ b/recipes/berlin_policy_journal.recipe
@ -104,7 +104,7 @@ class BerlinPolicyJournal(BasicNewsRecipe):
        div = soup.find('div', {'class': 'meta-info'})
        authors = ''
        for entry in div.findAll('span', {'class': 'entry-author'}):
-            authors = authors + entry.a.span.renderContents().strip() + ', '
-        date = div.find('time').renderContents().strip()
+            authors = authors + entry.a.span.renderContents().decode('utf-8').strip() + ', '
+        date = div.find('time').renderContents().decode('utf-8').strip()
        div.replaceWith('<div>' + date + ' | ' + authors[:-2] + '<br/></div>')
        return soup
--- a/recipes/bild_de.recipe
+++ b/recipes/bild_de.recipe
@ -83,5 +83,5 @@ class AdvancedUserRecipe1303841067(BasicNewsRecipe):
                br.replaceWith(' ')
        # remove all links
        for a in soup.findAll('a'):
-            a.replaceWith(a.renderContents())
+            a.replaceWith(a.renderContents().decode('utf-8'))
        return soup
--- a/recipes/calgary_herald.recipe
+++ b/recipes/calgary_herald.recipe
@ -8,7 +8,7 @@ www.canada.com
 '''
 import re
 from calibre.web.feeds.recipes import BasicNewsRecipe
-from calibre.ebooks.BeautifulSoup import BeautifulStoneSoup, Tag
+from calibre.ebooks.BeautifulSoup import Tag


 def new_tag(soup, name, attrs=()):
@ -183,14 +183,6 @@ class CanWestPaper(BasicNewsRecipe):
        return fixed

    def massageNCXText(self, description):
-        # Kindle TOC descriptions won't render certain characters
-        if description:
-            massaged = unicode(BeautifulStoneSoup(
-                description, convertEntities=BeautifulStoneSoup.HTML_ENTITIES))
-            # Replace '&' with '&'
-            massaged = re.sub("&", "&", massaged)
-            return self.fixChars(massaged)
-        else:
        return description

    def populate_article_metadata(self, article, soup, first):
--- a/recipes/edmonton_journal.recipe
+++ b/recipes/edmonton_journal.recipe
@ -8,7 +8,7 @@ www.canada.com
 '''
 import re
 from calibre.web.feeds.recipes import BasicNewsRecipe
-from calibre.ebooks.BeautifulSoup import BeautifulStoneSoup, Tag
+from calibre.ebooks.BeautifulSoup import Tag


 def new_tag(soup, name, attrs=()):
@ -183,14 +183,6 @@ class CanWestPaper(BasicNewsRecipe):
        return fixed

    def massageNCXText(self, description):
-        # Kindle TOC descriptions won't render certain characters
-        if description:
-            massaged = unicode(BeautifulStoneSoup(
-                description, convertEntities=BeautifulStoneSoup.HTML_ENTITIES))
-            # Replace '&' with '&'
-            massaged = re.sub("&", "&", massaged)
-            return self.fixChars(massaged)
-        else:
        return description

    def populate_article_metadata(self, article, soup, first):
--- a/recipes/ekathemerini.recipe
+++ b/recipes/ekathemerini.recipe
@ -1,5 +1,6 @@
 from calibre.web.feeds.recipes import BasicNewsRecipe
-from calibre.ebooks.BeautifulSoup import BeautifulSoup, BeautifulStoneSoup
+from calibre.ebooks.BeautifulSoup import BeautifulSoup
+from lxml import etree


 class Ekathimerini(BasicNewsRecipe):
@ -41,12 +42,10 @@ class Ekathimerini(BasicNewsRecipe):

    def parse_index(self):
        idx_contents = self.browser.open(self.rss_url).read()
-        idx = BeautifulStoneSoup(
-            idx_contents, convertEntities=BeautifulStoneSoup.XML_ENTITIES)
+        idx = etree.fromstring(idx_contents)

-        cats = list(set([self.tag_to_string(subcat)
-                         for subcat in idx.findAll('subcat')]))
-        cats.sort()
+        cats = sorted({self.tag_to_string(subcat)
+                         for subcat in idx.xpath('//*[local-name()="subcat"]')})

        feeds = [(u'News', list(self.find_articles(idx, u'')))]

--- a/recipes/fokkeensukke.recipe
+++ b/recipes/fokkeensukke.recipe
@ -52,29 +52,29 @@ class FokkeEnSukkeRecipe(BasicNewsRecipe):
            # If there's only one, there is just a link with the dayname.
            # If there are two, there are three links in sequence: <a>dayname</a> <a>1</a> <a>2</a>.
            # In that case we're interested in the last two.
-            if links[i].renderContents() in dayNames:
+            if links[i].renderContents().decode('utf-8') in dayNames:
                # If the link is not in daynames, we processed it already, but if it is, let's see
                # if the next one has '1' as content
-                if (i + 1 <= maxIndex) and (links[i + 1].renderContents() == '1'):
+                if (i + 1 <= maxIndex) and (links[i + 1].renderContents().decode('utf-8') == '1'):
                    # Got you! Add it to the list
-                    article = {'title': links[i].renderContents(
+                    article = {'title': links[i].renderContents().decode('utf-8'
                        )+ ' 1', 'date': u'', 'url': self.INDEX + links[i + 1]['href'], 'description': ''}
                    articles.append(article)
                    # If there is a '1', there should be a '2' as well, but
                    # better save than sorry
-                    if (i + 2 <= maxIndex) and (links[i + 2].renderContents() == '2'):
+                    if (i + 2 <= maxIndex) and (links[i + 2].renderContents().decode('utf-8') == '2'):
                        # Got you! Add it to the list
                        article = {'title': links[i].renderContents(
-                        ) + ' 2', 'date': u'', 'url': self.INDEX + links[i + 2]['href'], 'description': ''}
+                        ).decode('utf-8') + ' 2', 'date': u'', 'url': self.INDEX + links[i + 2]['href'], 'description': ''}
                        articles.append(article)
                else:
                    # There is only one cartoon for this day. Add it to the
                    # list.
                    article = {'title': links[i].renderContents(
-                    ), 'date': u'', 'url': self.INDEX + links[i]['href'], 'description': ''}
+                    ).decode('utf-8'), 'date': u'', 'url': self.INDEX + links[i]['href'], 'description': ''}
                    articles.append(article)
        # Might as well use the weeknumber as title
-        week = index.find('span', attrs={'class': 'week'}).renderContents()
+        week = index.find('span', attrs={'class': 'week'}).renderContents().decode('utf-8')

        return [[week, articles]]

--- a/recipes/gildia_pl.recipe
+++ b/recipes/gildia_pl.recipe
@ -46,7 +46,7 @@ class Gildia(BasicNewsRecipe):

        words = ('recenzj', 'zapowied', 'fragmen',
                 'relacj', 'wywiad', 'nominacj')
-        document_title = soup.title.renderContents().lower()
+        document_title = soup.title.renderContents().decode('utf-8').lower()
        for word in words:
            if word in document_title:
                for link in content.findAll(name='a'):
@ -57,7 +57,7 @@ class Gildia(BasicNewsRecipe):
                return self.index_to_soup(tag['href'], raw=True)

    def preprocess_html(self, soup):
-        title = soup.title.renderContents().lower()
+        title = soup.title.renderContents().decode('utf-8').lower()
        for a in soup('a', href=True):
            if not a['href'].startswith('http'):
                if '/gry/' in a['href']:
--- a/recipes/handelsblatt.recipe
+++ b/recipes/handelsblatt.recipe
@ -129,7 +129,7 @@ class Handelsblatt(BasicNewsRecipe):
    def postprocess_html(self, soup, first_fetch):
        # convert lists of author(s) and date(s) into simple text
        for cap in soup.findAll('div', {'class': re.compile('vhb-article-caption')}):
-            cap.replaceWith(cap.renderContents().strip() + ' ')
+            cap.replaceWith(cap.renderContents().decode('utf-8').strip() + ' ')
        for row in soup.findAll('div', {'class': 'vhb-article-author-row'}):
            for ul in row.findAll('ul'):
                entry = ''
@ -141,7 +141,7 @@ class Handelsblatt(BasicNewsRecipe):
        # remove all local hyperlinks
        for a in soup.findAll('a', {'href': True}):
            if a['href'] and a['href'][0] in ['/', '#']:
-                a.replaceWith(a.renderContents())
+                a.replaceWith(a.renderContents().decode('utf-8'))
        # make sure that all figure captions (including the source) are shown
        # without linebreaks by using the alternative text given within <img/>
        # instead of the original text (which is oddly formatted)
--- a/recipes/joop.recipe
+++ b/recipes/joop.recipe
@ -63,7 +63,7 @@ class JoopRecipe(BasicNewsRecipe):
        for section in sections:
            articles = []
            h2 = div.find(lambda tag: tag.name ==
-                          'h2' and tag.renderContents() == section)
+                          'h2' and tag.renderContents().decode('utf-8') == section)
            if h2:
                ul = h2.findNextSibling('ul', 'linklist')
                if ul:
--- a/recipes/mediapart.recipe
+++ b/recipes/mediapart.recipe
@ -65,14 +65,14 @@ class Mediapart(BasicNewsRecipe):

                # print "found fil ",title
                article_type = article.find('a', {'href': re.compile(
-                    r'.*\/type-darticles\/.*')}).renderContents()
+                    r'.*\/type-darticles\/.*')}).renderContents().decode('utf-8')
                # print "kind: ",article_type

                for s in title('span'):
-                    s.replaceWith(s.renderContents() + "\n")
+                    s.replaceWith(s.renderContents().decode('utf-8') + "\n")
                url = title.find('a', href=True)['href']

-                # article_date = self.parse_french_date(article.find("span", "article-date").renderContents())
+                # article_date = self.parse_french_date(article.find("span", "article-date").renderContents().decode('utf-8'))
                # print("################################# 9")
                # print(article_date)

--- a/recipes/montreal_gazette.recipe
+++ b/recipes/montreal_gazette.recipe
@ -8,7 +8,7 @@ www.canada.com
 '''
 import re
 from calibre.web.feeds.recipes import BasicNewsRecipe
-from calibre.ebooks.BeautifulSoup import BeautifulStoneSoup, Tag
+from calibre.ebooks.BeautifulSoup import Tag


 def new_tag(soup, name, attrs=()):
@ -183,14 +183,6 @@ class CanWestPaper(BasicNewsRecipe):
        return fixed

    def massageNCXText(self, description):
-        # Kindle TOC descriptions won't render certain characters
-        if description:
-            massaged = unicode(BeautifulStoneSoup(
-                description, convertEntities=BeautifulStoneSoup.HTML_ENTITIES))
-            # Replace '&' with '&'
-            massaged = re.sub("&", "&", massaged)
-            return self.fixChars(massaged)
-        else:
        return description

    def populate_article_metadata(self, article, soup, first):
--- a/recipes/ncrnext.recipe
+++ b/recipes/ncrnext.recipe
@ -76,7 +76,7 @@ class NrcNextRecipe(BasicNewsRecipe):
                    # In this feed/page articles can be written by more than one author.
                    # It is nice to see their names in the titles.
                    flag = post.find('h2', attrs={'class': 'vlag'})
-                    author = flag.contents[0].renderContents()
+                    author = flag.contents[0].renderContents().decode('utf-8')
                    completeTitle = u''.join([author, u': ', title])
                else:
                    completeTitle = title
--- a/recipes/ottawa_citizen.recipe
+++ b/recipes/ottawa_citizen.recipe
@ -8,7 +8,7 @@ www.canada.com
 '''
 import re
 from calibre.web.feeds.recipes import BasicNewsRecipe
-from calibre.ebooks.BeautifulSoup import BeautifulStoneSoup, Tag
+from calibre.ebooks.BeautifulSoup import Tag


 def new_tag(soup, name, attrs=()):
@ -183,14 +183,6 @@ class CanWestPaper(BasicNewsRecipe):
        return fixed

    def massageNCXText(self, description):
-        # Kindle TOC descriptions won't render certain characters
-        if description:
-            massaged = unicode(BeautifulStoneSoup(
-                description, convertEntities=BeautifulStoneSoup.HTML_ENTITIES))
-            # Replace '&' with '&'
-            massaged = re.sub("&", "&", massaged)
-            return self.fixChars(massaged)
-        else:
        return description

    def populate_article_metadata(self, article, soup, first):
--- a/recipes/regina_leader_post.recipe
+++ b/recipes/regina_leader_post.recipe
@ -9,7 +9,6 @@ www.canada.com

 import re
 from calibre.web.feeds.news import BasicNewsRecipe
-from calibre.ebooks.BeautifulSoup import BeautifulStoneSoup


 class CanWestPaper(BasicNewsRecipe):
@ -144,14 +143,6 @@ class CanWestPaper(BasicNewsRecipe):
        return fixed

    def massageNCXText(self, description):
-        # Kindle TOC descriptions won't render certain characters
-        if description:
-            massaged = unicode(BeautifulStoneSoup(
-                description, convertEntities=BeautifulStoneSoup.HTML_ENTITIES))
-            # Replace '&' with '&'
-            massaged = re.sub("&", "&", massaged)
-            return self.fixChars(massaged)
-        else:
        return description

    def populate_article_metadata(self, article, soup, first):
--- a/recipes/saskatoon_star_phoenix.recipe
+++ b/recipes/saskatoon_star_phoenix.recipe
@ -9,7 +9,6 @@ www.canada.com

 import re
 from calibre.web.feeds.news import BasicNewsRecipe
-from calibre.ebooks.BeautifulSoup import BeautifulStoneSoup


 class CanWestPaper(BasicNewsRecipe):
@ -144,14 +143,6 @@ class CanWestPaper(BasicNewsRecipe):
        return fixed

    def massageNCXText(self, description):
-        # Kindle TOC descriptions won't render certain characters
-        if description:
-            massaged = unicode(BeautifulStoneSoup(
-                description, convertEntities=BeautifulStoneSoup.HTML_ENTITIES))
-            # Replace '&' with '&'
-            massaged = re.sub("&", "&", massaged)
-            return self.fixChars(massaged)
-        else:
        return description

    def populate_article_metadata(self, article, soup, first):
--- a/recipes/vancouver_province.recipe
+++ b/recipes/vancouver_province.recipe
@ -8,7 +8,7 @@ www.canada.com
 '''
 import re
 from calibre.web.feeds.recipes import BasicNewsRecipe
-from calibre.ebooks.BeautifulSoup import BeautifulStoneSoup, Tag
+from calibre.ebooks.BeautifulSoup import Tag


 def new_tag(soup, name, attrs=()):
@ -196,14 +196,6 @@ class CanWestPaper(BasicNewsRecipe):
        return fixed

    def massageNCXText(self, description):
-        # Kindle TOC descriptions won't render certain characters
-        if description:
-            massaged = unicode(BeautifulStoneSoup(
-                description, convertEntities=BeautifulStoneSoup.HTML_ENTITIES))
-            # Replace '&' with '&'
-            massaged = re.sub("&", "&", massaged)
-            return self.fixChars(massaged)
-        else:
        return description

    def populate_article_metadata(self, article, soup, first):
--- a/recipes/vancouver_sun.recipe
+++ b/recipes/vancouver_sun.recipe
@ -8,7 +8,7 @@ www.canada.com
 '''
 import re
 from calibre.web.feeds.recipes import BasicNewsRecipe
-from calibre.ebooks.BeautifulSoup import BeautifulStoneSoup, Tag
+from calibre.ebooks.BeautifulSoup import Tag


 def new_tag(soup, name, attrs=()):
@ -184,14 +184,6 @@ class CanWestPaper(BasicNewsRecipe):
        return fixed

    def massageNCXText(self, description):
-        # Kindle TOC descriptions won't render certain characters
-        if description:
-            massaged = unicode(BeautifulStoneSoup(
-                description, convertEntities=BeautifulStoneSoup.HTML_ENTITIES))
-            # Replace '&' with '&'
-            massaged = re.sub("&", "&", massaged)
-            return self.fixChars(massaged)
-        else:
        return description

    def populate_article_metadata(self, article, soup, first):
--- a/recipes/vic_times.recipe
+++ b/recipes/vic_times.recipe
@ -9,7 +9,7 @@ www.canada.com
 import re
 from calibre.web.feeds.news import BasicNewsRecipe

-from calibre.ebooks.BeautifulSoup import Tag, BeautifulStoneSoup
+from calibre.ebooks.BeautifulSoup import Tag


 def new_tag(soup, name, attrs=()):
@ -147,14 +147,6 @@ class TimesColonist(BasicNewsRecipe):
        return fixed

    def massageNCXText(self, description):
-        # Kindle TOC descriptions won't render certain characters
-        if description:
-            massaged = unicode(BeautifulStoneSoup(
-                description, convertEntities=BeautifulStoneSoup.HTML_ENTITIES))
-            # Replace '&' with '&'
-            massaged = re.sub("&", "&", massaged)
-            return self.fixChars(massaged)
-        else:
        return description

    def populate_article_metadata(self, article, soup, first):
--- a/recipes/windsor_star.recipe
+++ b/recipes/windsor_star.recipe
@ -9,7 +9,6 @@ www.canada.com

 import re
 from calibre.web.feeds.news import BasicNewsRecipe
-from calibre.ebooks.BeautifulSoup import BeautifulStoneSoup


 class CanWestPaper(BasicNewsRecipe):
@ -144,14 +143,6 @@ class CanWestPaper(BasicNewsRecipe):
        return fixed

    def massageNCXText(self, description):
-        # Kindle TOC descriptions won't render certain characters
-        if description:
-            massaged = unicode(BeautifulStoneSoup(
-                description, convertEntities=BeautifulStoneSoup.HTML_ENTITIES))
-            # Replace '&' with '&'
-            massaged = re.sub("&", "&", massaged)
-            return self.fixChars(massaged)
-        else:
        return description

    def populate_article_metadata(self, article, soup, first):
--- a/src/calibre/ebooks/BeautifulSoup.py
+++ b/src/calibre/ebooks/BeautifulSoup.py
@ -4,6 +4,7 @@

 from __future__ import absolute_import, division, print_function, unicode_literals

+import bs4
 from bs4 import (  # noqa
    CData, Comment, Declaration, NavigableString, ProcessingInstruction, Tag,
    __version__
@ -27,3 +28,7 @@ def parse_html(markup):

 def BeautifulSoup(markup='', *a, **kw):
    return parse_html(markup)
+
+
+def BeautifulStoneSoup(markup='', *a, **kw):
+    return bs4.BeautifulSoup(markup, 'xml')
--- a/src/calibre/ebooks/chm/metadata.py
+++ b/src/calibre/ebooks/chm/metadata.py
@ -39,7 +39,7 @@ def _metadata_from_table(soup, searchfor):
    td = td.parent
    # there appears to be multiple ways of structuring the metadata
    # on the home page. cue some nasty special-case hacks...
-    if re.match(r'^\s*'+searchfor+r'\s*$', td.renderContents(None), flags=re.I):
+    if re.match(r'^\s*'+searchfor+r'\s*$', td.decode_contents(), flags=re.I):
        meta = _detag(td.findNextSibling('td'))
        return re.sub('^:', '', meta).strip()
    else:
@ -52,7 +52,7 @@ def _metadata_from_span(soup, searchfor):
    if span is None:
        return None
    # this metadata might need some cleaning up still :/
-    return _detag(span.renderContents(None).strip())
+    return _detag(span.decode_contents().strip())


 def _get_authors(soup):
--- a/src/calibre/ebooks/lrf/lrs/convert_from.py
+++ b/src/calibre/ebooks/lrf/lrs/convert_from.py
@ -5,35 +5,31 @@ __copyright__ = '2008, Kovid Goyal <kovid at kovidgoyal.net>'
 Compile a LRS file into a LRF file.
 '''

-import sys, os, logging
+import logging
+import os
+import sys

 from calibre import setup_cli_handlers
-from calibre.utils.config import OptionParser
-from calibre.ebooks.BeautifulSoup import BeautifulStoneSoup, NavigableString, \
-                                           CData, Tag
-from calibre.ebooks.lrf.pylrs.pylrs import Book, PageStyle, TextStyle, \
-            BlockStyle, ImageStream, Font, StyleDefault, BookSetting, Header, \
-            Image, ImageBlock, Page, TextBlock, Canvas, Paragraph, CR, Span, \
-            Italic, Sup, Sub, Bold, EmpLine, JumpButton, CharButton, Plot, \
-            DropCaps, Footer, RuledLine
+from calibre.ebooks.BeautifulSoup import (
+    BeautifulStoneSoup, CData, NavigableString, Tag
+)
 from calibre.ebooks.chardet import xml_to_unicode
+from calibre.ebooks.lrf.pylrs.pylrs import (
+    CR, BlockStyle, Bold, Book, BookSetting, Canvas, CharButton, DropCaps, EmpLine,
+    Font, Footer, Header, Image, ImageBlock, ImageStream, Italic, JumpButton, Page,
+    PageStyle, Paragraph, Plot, RuledLine, Span, StyleDefault, Sub, Sup, TextBlock,
+    TextStyle
+)
+from calibre.utils.config import OptionParser
 from polyglot.builtins import string_or_bytes


 class LrsParser(object):

-    SELF_CLOSING_TAGS = [i.lower() for i in ['CR', 'Plot', 'NoBR', 'Space',
-                         'PutObj', 'RuledLine',
-                         'Plot', 'SetDefault', 'BookSetting', 'RegistFont',
-                         'PageStyle', 'TextStyle', 'BlockStyle', 'JumpTo',
-                         'ImageStream', 'Image']]
-
    def __init__(self, stream, logger):
        self.logger = logger
        src = stream.read()
-        self.soup = BeautifulStoneSoup(xml_to_unicode(src)[0],
-                       convertEntities=BeautifulStoneSoup.XML_ENTITIES,
-                       selfClosingTags=self.SELF_CLOSING_TAGS)
+        self.soup = BeautifulStoneSoup(xml_to_unicode(src)[0])
        self.objects = {}
        for obj in self.soup.findAll(objid=True):
            self.objects[obj['objid']] = obj
--- a/src/calibre/ebooks/metadata/epub.py
+++ b/src/calibre/ebooks/metadata/epub.py
@ -1,22 +1,29 @@
 #!/usr/bin/env python2
-from __future__ import with_statement
-from __future__ import print_function
+from __future__ import print_function, with_statement
+
 __license__   = 'GPL v3'
 __copyright__ = '2008, Kovid Goyal <kovid at kovidgoyal.net>'

 '''Read meta information from epub files'''

-import io, os, re, posixpath
+
+import io
+import os
+import posixpath
+import re
 from contextlib import closing

-from calibre.utils.zipfile import ZipFile, BadZipfile, safe_replace
-from calibre.utils.localunzip import LocalZipFile
-from calibre.ebooks.BeautifulSoup import BeautifulStoneSoup
-from calibre.ebooks.metadata.opf import get_metadata as get_metadata_from_opf, set_metadata as set_metadata_opf
-from calibre.ebooks.metadata.opf2 import OPF
-from calibre.ptempfile import TemporaryDirectory
+from lxml import etree
+
 from calibre import CurrentDir, walk
 from calibre.constants import isosx
+from calibre.ebooks.metadata.opf import (
+    get_metadata as get_metadata_from_opf, set_metadata as set_metadata_opf
+)
+from calibre.ebooks.metadata.opf2 import OPF
+from calibre.ptempfile import TemporaryDirectory
+from calibre.utils.localunzip import LocalZipFile
+from calibre.utils.zipfile import BadZipfile, ZipFile, safe_replace


 class EPubException(Exception):
@ -36,20 +43,17 @@ class Container(dict):
    def __init__(self, stream=None):
        if not stream:
            return
-        soup = BeautifulStoneSoup(stream.read())
-        container = soup.find(name=re.compile(r'container$', re.I))
-        if not container:
-            raise OCFException("<container> element missing")
+        container = etree.fromstring(stream.read())
        if container.get('version', None) != '1.0':
            raise EPubException("unsupported version of OCF")
-        rootfiles = container.find(re.compile(r'rootfiles$', re.I))
+        rootfiles = container.xpath('./*[local-name()="rootfiles"]')
        if not rootfiles:
            raise EPubException("<rootfiles/> element missing")
-        for rootfile in rootfiles.findAll(re.compile(r'rootfile$', re.I)):
-            try:
-                self[rootfile['media-type']] = rootfile['full-path']
-            except KeyError:
+        for rootfile in rootfiles[0].xpath('./*[local-name()="rootfile"]'):
+            mt, fp = rootfile.get('media-type'), rootfile.get('full-path')
+            if not mt or not fp:
                raise EPubException("<rootfile/> element malformed")
+            self[mt] = fp


 class OCF(object):
--- a/src/calibre/ebooks/oeb/transforms/jacket.py
+++ b/src/calibre/ebooks/oeb/transforms/jacket.py
@ -340,8 +340,7 @@ def render_jacket(mi, output_profile,
            if hr_tag is not None:
                hr_tag.extract()

-        return strip_encoding_declarations(
-                soup.renderContents('utf-8').decode('utf-8'))
+        return strip_encoding_declarations(soup.decode_contents())

    from calibre.ebooks.oeb.base import RECOVER_PARSER

--- a/src/calibre/library/catalogs/epub_mobi_builder.py
+++ b/src/calibre/library/catalogs/epub_mobi_builder.py
@ -9,7 +9,7 @@ from copy import deepcopy
 from xml.sax.saxutils import escape

 from calibre import (
-    prepare_string_for_xml, strftime, force_unicode, isbytestring, replace_entities, as_unicode)
+    prepare_string_for_xml, strftime, force_unicode, isbytestring, replace_entities, as_unicode, xml_replace_entities)
 from calibre.constants import isosx, cache_dir
 from calibre.customize.conversion import DummyReporter
 from calibre.customize.ui import output_profiles
@ -29,6 +29,9 @@ from calibre.utils.localization import get_lang, lang_as_iso639_1
 from polyglot.builtins import unicode_type


+NBSP = u'\u00a0'
+
+
 class Formatter(TemplateFormatter):

    def get_value(self, key, args, kwargs):
@ -112,7 +115,7 @@ class CatalogBuilder(object):
        if self.generate_for_kindle_mobi:
            return '&#x25b7;'
        else:
-            return '&nbsp;'
+            return NBSP

    def __init__(self, db, _opts, plugin,
                    report_progress=DummyReporter(),
@ -1326,7 +1329,7 @@ class CatalogBuilder(object):
        """
        # Kindle TOC descriptions won't render certain characters
        # Fix up
-        massaged = unicode_type(BeautifulStoneSoup(description, convertEntities=BeautifulStoneSoup.HTML_ENTITIES))
+        massaged = xml_replace_entities(unicode_type(description))

        # Replace '&' with '&#38;'
        massaged = re.sub("&", "&#38;", massaged)
@ -1354,7 +1357,7 @@ class CatalogBuilder(object):
        if self.opts.fmt == 'mobi':
            codeTag = soup.new_tag("code")
            if prefix_char is None:
-                codeTag.insert(0, NavigableString('&nbsp;'))
+                codeTag.insert(0, NavigableString(NBSP))
            else:
                codeTag.insert(0, NavigableString(prefix_char))
            return codeTag
@ -1362,7 +1365,7 @@ class CatalogBuilder(object):
            spanTag = soup.new_tag("span")
            spanTag['class'] = "prefix"
            if prefix_char is None:
-                prefix_char = "&nbsp;"
+                prefix_char = NBSP
            spanTag.insert(0, NavigableString(prefix_char))
            return spanTag

@ -2711,7 +2714,7 @@ class CatalogBuilder(object):
                if i < len(book['genres']) - 1:
                    genresTag.insert(gtc, NavigableString(' &middot; '))
                    gtc += 1
-            genres = genresTag.renderContents()
+            genres = genresTag.decode_contents()

        # Formats
        formats = []
@ -2793,7 +2796,7 @@ class CatalogBuilder(object):
        if publisher == ' ':
            publisherTag = body.find('td', attrs={'class': 'publisher'})
            if publisherTag:
-                publisherTag.contents[0].replaceWith('&nbsp;')
+                publisherTag.contents[0].replaceWith(NBSP)

        if not genres:
            genresTag = body.find('p', attrs={'class': 'genres'})
@ -2808,12 +2811,12 @@ class CatalogBuilder(object):
        if note_content == '':
            tdTag = body.find('td', attrs={'class': 'notes'})
            if tdTag:
-                tdTag.contents[0].replaceWith('&nbsp;')
+                tdTag.contents[0].replaceWith(NBSP)

        emptyTags = body.findAll('td', attrs={'class': 'empty'})
        for mt in emptyTags:
            newEmptyTag = soup.new_tag('td')
-            newEmptyTag.insert(0, '\xa0')
+            newEmptyTag.insert(0, NBSP)
            mt.replaceWith(newEmptyTag)

        return soup
@ -2974,7 +2977,7 @@ class CatalogBuilder(object):
            <ncx xmlns="http://www.daisy.org/z3986/2005/ncx/" xmlns:calibre="http://calibre.kovidgoyal.net/2009/metadata" version="2005-1" xml:lang="en">
            </ncx>
        '''
-        soup = BeautifulStoneSoup(header, selfClosingTags=['content', 'calibre:meta-img'])
+        soup = BeautifulStoneSoup(header)

        ncx = soup.find('ncx')
        navMapTag = soup.new_tag('navMap')
@ -4033,7 +4036,7 @@ class CatalogBuilder(object):
            </package>
            '''.replace('LANG', lang)
        # Add the supplied metadata tags
-        soup = BeautifulStoneSoup(header, selfClosingTags=['item', 'itemref', 'meta', 'reference'])
+        soup = BeautifulStoneSoup(header)
        metadata = soup.find('metadata')
        mtc = 0

@ -4171,8 +4174,11 @@ class CatalogBuilder(object):
            guide.insert(0, referenceTag)

        # Write the OPF file
-        outfile = open("%s/%s.opf" % (self.catalog_path, self.opts.basename), 'w')
-        outfile.write(soup.prettify())
+        output = soup.prettify(encoding='utf-8')
+        if isinstance(output, unicode_type):
+            output = output.encode('utf-8')
+        with lopen("%s/%s.opf" % (self.catalog_path, self.opts.basename), 'wb') as outfile:
+            outfile.write(output)

    def generate_rating_string(self, book):
        """ Generate rating string for Descriptions.
@ -4657,7 +4663,7 @@ class CatalogBuilder(object):
            elem.extract()

        # Reconstruct comments w/o <div>s
-        comments = soup.renderContents(None)
+        comments = soup.decode_contents()

        # Convert \n\n to <p>s
        if re.search('\n\n', comments):
@ -4669,7 +4675,7 @@ class CatalogBuilder(object):
                pTag.insert(0, p)
                soup.insert(tsc, pTag)
                tsc += 1
-            comments = soup.renderContents(None)
+            comments = soup.decode_contents()

        # Convert solo returns to <br />
        comments = re.sub('[\r\n]', '<br />', comments)
@ -4726,7 +4732,7 @@ class CatalogBuilder(object):
            result.insert(rtc, elem)
            rtc += 1

-        return result.renderContents(encoding=None)
+        return result.decode_contents()

    def merge_comments(self, record):
        """ Merge comments with custom column content.
@ -4954,6 +4960,9 @@ class CatalogBuilder(object):
        """

        self.update_progress_full_step(_("Saving NCX"))
+        ncx = self.ncx_soup.prettify(encoding='utf-8')
+        if isinstance(ncx, unicode_type):
+            ncx = ncx.encode('utf-8')

-        outfile = open("%s/%s.ncx" % (self.catalog_path, self.opts.basename), 'w')
-        outfile.write(self.ncx_soup.prettify())
+        with lopen("%s/%s.ncx" % (self.catalog_path, self.opts.basename), 'wb') as outfile:
+            outfile.write(ncx)
--- a/src/calibre/library/comments.py
+++ b/src/calibre/library/comments.py
@ -131,7 +131,7 @@ def comments_to_html(comments):
    for t in result.findAll(text=True):
        t.replaceWith(prepare_string_for_xml(unicode_type(t)))

-    return result.renderContents(encoding=None)
+    return result.decode_contents()


 def markdown(val):