Port use of renderContents and BeautifulStoneSoup

This commit is contained in:
Kovid Goyal 2019-03-23 13:31:06 +05:30
parent c89b656df4
commit 256c7563b6
No known key found for this signature in database
GPG Key ID: 06BC317B515ACE7C
27 changed files with 116 additions and 187 deletions

View File

@ -28,10 +28,10 @@ class Adventure_zone(BasicNewsRecipe):
def skip_ad_pages(self, soup):
skip_tag = soup.body.find(attrs={'class':'subject'})
skip_tag = skip_tag.findAll(name='a', href=True)
title = soup.title.renderContents().lower()
title = soup.title.renderContents().decode('utf-8').lower()
if self._is_linked_text(title):
for r in skip_tag:
word = r.renderContents()
word = r.renderContents().decode('utf-8')
if not word:
continue
word = word.lower()

View File

@ -104,7 +104,7 @@ class BerlinPolicyJournal(BasicNewsRecipe):
div = soup.find('div', {'class': 'meta-info'})
authors = ''
for entry in div.findAll('span', {'class': 'entry-author'}):
authors = authors + entry.a.span.renderContents().strip() + ', '
date = div.find('time').renderContents().strip()
authors = authors + entry.a.span.renderContents().decode('utf-8').strip() + ', '
date = div.find('time').renderContents().decode('utf-8').strip()
div.replaceWith('<div>' + date + ' | ' + authors[:-2] + '<br/></div>')
return soup

View File

@ -83,5 +83,5 @@ class AdvancedUserRecipe1303841067(BasicNewsRecipe):
br.replaceWith(' ')
# remove all links
for a in soup.findAll('a'):
a.replaceWith(a.renderContents())
a.replaceWith(a.renderContents().decode('utf-8'))
return soup

View File

@ -8,7 +8,7 @@ www.canada.com
'''
import re
from calibre.web.feeds.recipes import BasicNewsRecipe
from calibre.ebooks.BeautifulSoup import BeautifulStoneSoup, Tag
from calibre.ebooks.BeautifulSoup import Tag
def new_tag(soup, name, attrs=()):
@ -183,14 +183,6 @@ class CanWestPaper(BasicNewsRecipe):
return fixed
def massageNCXText(self, description):
# Kindle TOC descriptions won't render certain characters
if description:
massaged = unicode(BeautifulStoneSoup(
description, convertEntities=BeautifulStoneSoup.HTML_ENTITIES))
# Replace '&' with '&#38;'
massaged = re.sub("&", "&#38;", massaged)
return self.fixChars(massaged)
else:
return description
def populate_article_metadata(self, article, soup, first):

View File

@ -8,7 +8,7 @@ www.canada.com
'''
import re
from calibre.web.feeds.recipes import BasicNewsRecipe
from calibre.ebooks.BeautifulSoup import BeautifulStoneSoup, Tag
from calibre.ebooks.BeautifulSoup import Tag
def new_tag(soup, name, attrs=()):
@ -183,14 +183,6 @@ class CanWestPaper(BasicNewsRecipe):
return fixed
def massageNCXText(self, description):
# Kindle TOC descriptions won't render certain characters
if description:
massaged = unicode(BeautifulStoneSoup(
description, convertEntities=BeautifulStoneSoup.HTML_ENTITIES))
# Replace '&' with '&#38;'
massaged = re.sub("&", "&#38;", massaged)
return self.fixChars(massaged)
else:
return description
def populate_article_metadata(self, article, soup, first):

View File

@ -1,5 +1,6 @@
from calibre.web.feeds.recipes import BasicNewsRecipe
from calibre.ebooks.BeautifulSoup import BeautifulSoup, BeautifulStoneSoup
from calibre.ebooks.BeautifulSoup import BeautifulSoup
from lxml import etree
class Ekathimerini(BasicNewsRecipe):
@ -41,12 +42,10 @@ class Ekathimerini(BasicNewsRecipe):
def parse_index(self):
idx_contents = self.browser.open(self.rss_url).read()
idx = BeautifulStoneSoup(
idx_contents, convertEntities=BeautifulStoneSoup.XML_ENTITIES)
idx = etree.fromstring(idx_contents)
cats = list(set([self.tag_to_string(subcat)
for subcat in idx.findAll('subcat')]))
cats.sort()
cats = sorted({self.tag_to_string(subcat)
for subcat in idx.xpath('//*[local-name()="subcat"]')})
feeds = [(u'News', list(self.find_articles(idx, u'')))]

View File

@ -52,29 +52,29 @@ class FokkeEnSukkeRecipe(BasicNewsRecipe):
# If there's only one, there is just a link with the dayname.
# If there are two, there are three links in sequence: <a>dayname</a> <a>1</a> <a>2</a>.
# In that case we're interested in the last two.
if links[i].renderContents() in dayNames:
if links[i].renderContents().decode('utf-8') in dayNames:
# If the link is not in daynames, we processed it already, but if it is, let's see
# if the next one has '1' as content
if (i + 1 <= maxIndex) and (links[i + 1].renderContents() == '1'):
if (i + 1 <= maxIndex) and (links[i + 1].renderContents().decode('utf-8') == '1'):
# Got you! Add it to the list
article = {'title': links[i].renderContents(
article = {'title': links[i].renderContents().decode('utf-8'
)+ ' 1', 'date': u'', 'url': self.INDEX + links[i + 1]['href'], 'description': ''}
articles.append(article)
# If there is a '1', there should be a '2' as well, but
# better save than sorry
if (i + 2 <= maxIndex) and (links[i + 2].renderContents() == '2'):
if (i + 2 <= maxIndex) and (links[i + 2].renderContents().decode('utf-8') == '2'):
# Got you! Add it to the list
article = {'title': links[i].renderContents(
) + ' 2', 'date': u'', 'url': self.INDEX + links[i + 2]['href'], 'description': ''}
).decode('utf-8') + ' 2', 'date': u'', 'url': self.INDEX + links[i + 2]['href'], 'description': ''}
articles.append(article)
else:
# There is only one cartoon for this day. Add it to the
# list.
article = {'title': links[i].renderContents(
), 'date': u'', 'url': self.INDEX + links[i]['href'], 'description': ''}
).decode('utf-8'), 'date': u'', 'url': self.INDEX + links[i]['href'], 'description': ''}
articles.append(article)
# Might as well use the weeknumber as title
week = index.find('span', attrs={'class': 'week'}).renderContents()
week = index.find('span', attrs={'class': 'week'}).renderContents().decode('utf-8')
return [[week, articles]]

View File

@ -46,7 +46,7 @@ class Gildia(BasicNewsRecipe):
words = ('recenzj', 'zapowied', 'fragmen',
'relacj', 'wywiad', 'nominacj')
document_title = soup.title.renderContents().lower()
document_title = soup.title.renderContents().decode('utf-8').lower()
for word in words:
if word in document_title:
for link in content.findAll(name='a'):
@ -57,7 +57,7 @@ class Gildia(BasicNewsRecipe):
return self.index_to_soup(tag['href'], raw=True)
def preprocess_html(self, soup):
title = soup.title.renderContents().lower()
title = soup.title.renderContents().decode('utf-8').lower()
for a in soup('a', href=True):
if not a['href'].startswith('http'):
if '/gry/' in a['href']:

View File

@ -129,7 +129,7 @@ class Handelsblatt(BasicNewsRecipe):
def postprocess_html(self, soup, first_fetch):
# convert lists of author(s) and date(s) into simple text
for cap in soup.findAll('div', {'class': re.compile('vhb-article-caption')}):
cap.replaceWith(cap.renderContents().strip() + ' ')
cap.replaceWith(cap.renderContents().decode('utf-8').strip() + ' ')
for row in soup.findAll('div', {'class': 'vhb-article-author-row'}):
for ul in row.findAll('ul'):
entry = ''
@ -141,7 +141,7 @@ class Handelsblatt(BasicNewsRecipe):
# remove all local hyperlinks
for a in soup.findAll('a', {'href': True}):
if a['href'] and a['href'][0] in ['/', '#']:
a.replaceWith(a.renderContents())
a.replaceWith(a.renderContents().decode('utf-8'))
# make sure that all figure captions (including the source) are shown
# without linebreaks by using the alternative text given within <img/>
# instead of the original text (which is oddly formatted)

View File

@ -63,7 +63,7 @@ class JoopRecipe(BasicNewsRecipe):
for section in sections:
articles = []
h2 = div.find(lambda tag: tag.name ==
'h2' and tag.renderContents() == section)
'h2' and tag.renderContents().decode('utf-8') == section)
if h2:
ul = h2.findNextSibling('ul', 'linklist')
if ul:

View File

@ -65,14 +65,14 @@ class Mediapart(BasicNewsRecipe):
# print "found fil ",title
article_type = article.find('a', {'href': re.compile(
r'.*\/type-darticles\/.*')}).renderContents()
r'.*\/type-darticles\/.*')}).renderContents().decode('utf-8')
# print "kind: ",article_type
for s in title('span'):
s.replaceWith(s.renderContents() + "\n")
s.replaceWith(s.renderContents().decode('utf-8') + "\n")
url = title.find('a', href=True)['href']
# article_date = self.parse_french_date(article.find("span", "article-date").renderContents())
# article_date = self.parse_french_date(article.find("span", "article-date").renderContents().decode('utf-8'))
# print("################################# 9")
# print(article_date)

View File

@ -8,7 +8,7 @@ www.canada.com
'''
import re
from calibre.web.feeds.recipes import BasicNewsRecipe
from calibre.ebooks.BeautifulSoup import BeautifulStoneSoup, Tag
from calibre.ebooks.BeautifulSoup import Tag
def new_tag(soup, name, attrs=()):
@ -183,14 +183,6 @@ class CanWestPaper(BasicNewsRecipe):
return fixed
def massageNCXText(self, description):
# Kindle TOC descriptions won't render certain characters
if description:
massaged = unicode(BeautifulStoneSoup(
description, convertEntities=BeautifulStoneSoup.HTML_ENTITIES))
# Replace '&' with '&#38;'
massaged = re.sub("&", "&#38;", massaged)
return self.fixChars(massaged)
else:
return description
def populate_article_metadata(self, article, soup, first):

View File

@ -76,7 +76,7 @@ class NrcNextRecipe(BasicNewsRecipe):
# In this feed/page articles can be written by more than one author.
# It is nice to see their names in the titles.
flag = post.find('h2', attrs={'class': 'vlag'})
author = flag.contents[0].renderContents()
author = flag.contents[0].renderContents().decode('utf-8')
completeTitle = u''.join([author, u': ', title])
else:
completeTitle = title

View File

@ -8,7 +8,7 @@ www.canada.com
'''
import re
from calibre.web.feeds.recipes import BasicNewsRecipe
from calibre.ebooks.BeautifulSoup import BeautifulStoneSoup, Tag
from calibre.ebooks.BeautifulSoup import Tag
def new_tag(soup, name, attrs=()):
@ -183,14 +183,6 @@ class CanWestPaper(BasicNewsRecipe):
return fixed
def massageNCXText(self, description):
# Kindle TOC descriptions won't render certain characters
if description:
massaged = unicode(BeautifulStoneSoup(
description, convertEntities=BeautifulStoneSoup.HTML_ENTITIES))
# Replace '&' with '&#38;'
massaged = re.sub("&", "&#38;", massaged)
return self.fixChars(massaged)
else:
return description
def populate_article_metadata(self, article, soup, first):

View File

@ -9,7 +9,6 @@ www.canada.com
import re
from calibre.web.feeds.news import BasicNewsRecipe
from calibre.ebooks.BeautifulSoup import BeautifulStoneSoup
class CanWestPaper(BasicNewsRecipe):
@ -144,14 +143,6 @@ class CanWestPaper(BasicNewsRecipe):
return fixed
def massageNCXText(self, description):
# Kindle TOC descriptions won't render certain characters
if description:
massaged = unicode(BeautifulStoneSoup(
description, convertEntities=BeautifulStoneSoup.HTML_ENTITIES))
# Replace '&' with '&#38;'
massaged = re.sub("&", "&#38;", massaged)
return self.fixChars(massaged)
else:
return description
def populate_article_metadata(self, article, soup, first):

View File

@ -9,7 +9,6 @@ www.canada.com
import re
from calibre.web.feeds.news import BasicNewsRecipe
from calibre.ebooks.BeautifulSoup import BeautifulStoneSoup
class CanWestPaper(BasicNewsRecipe):
@ -144,14 +143,6 @@ class CanWestPaper(BasicNewsRecipe):
return fixed
def massageNCXText(self, description):
# Kindle TOC descriptions won't render certain characters
if description:
massaged = unicode(BeautifulStoneSoup(
description, convertEntities=BeautifulStoneSoup.HTML_ENTITIES))
# Replace '&' with '&#38;'
massaged = re.sub("&", "&#38;", massaged)
return self.fixChars(massaged)
else:
return description
def populate_article_metadata(self, article, soup, first):

View File

@ -8,7 +8,7 @@ www.canada.com
'''
import re
from calibre.web.feeds.recipes import BasicNewsRecipe
from calibre.ebooks.BeautifulSoup import BeautifulStoneSoup, Tag
from calibre.ebooks.BeautifulSoup import Tag
def new_tag(soup, name, attrs=()):
@ -196,14 +196,6 @@ class CanWestPaper(BasicNewsRecipe):
return fixed
def massageNCXText(self, description):
# Kindle TOC descriptions won't render certain characters
if description:
massaged = unicode(BeautifulStoneSoup(
description, convertEntities=BeautifulStoneSoup.HTML_ENTITIES))
# Replace '&' with '&#38;'
massaged = re.sub("&", "&#38;", massaged)
return self.fixChars(massaged)
else:
return description
def populate_article_metadata(self, article, soup, first):

View File

@ -8,7 +8,7 @@ www.canada.com
'''
import re
from calibre.web.feeds.recipes import BasicNewsRecipe
from calibre.ebooks.BeautifulSoup import BeautifulStoneSoup, Tag
from calibre.ebooks.BeautifulSoup import Tag
def new_tag(soup, name, attrs=()):
@ -184,14 +184,6 @@ class CanWestPaper(BasicNewsRecipe):
return fixed
def massageNCXText(self, description):
# Kindle TOC descriptions won't render certain characters
if description:
massaged = unicode(BeautifulStoneSoup(
description, convertEntities=BeautifulStoneSoup.HTML_ENTITIES))
# Replace '&' with '&#38;'
massaged = re.sub("&", "&#38;", massaged)
return self.fixChars(massaged)
else:
return description
def populate_article_metadata(self, article, soup, first):

View File

@ -9,7 +9,7 @@ www.canada.com
import re
from calibre.web.feeds.news import BasicNewsRecipe
from calibre.ebooks.BeautifulSoup import Tag, BeautifulStoneSoup
from calibre.ebooks.BeautifulSoup import Tag
def new_tag(soup, name, attrs=()):
@ -147,14 +147,6 @@ class TimesColonist(BasicNewsRecipe):
return fixed
def massageNCXText(self, description):
# Kindle TOC descriptions won't render certain characters
if description:
massaged = unicode(BeautifulStoneSoup(
description, convertEntities=BeautifulStoneSoup.HTML_ENTITIES))
# Replace '&' with '&#38;'
massaged = re.sub("&", "&#38;", massaged)
return self.fixChars(massaged)
else:
return description
def populate_article_metadata(self, article, soup, first):

View File

@ -9,7 +9,6 @@ www.canada.com
import re
from calibre.web.feeds.news import BasicNewsRecipe
from calibre.ebooks.BeautifulSoup import BeautifulStoneSoup
class CanWestPaper(BasicNewsRecipe):
@ -144,14 +143,6 @@ class CanWestPaper(BasicNewsRecipe):
return fixed
def massageNCXText(self, description):
# Kindle TOC descriptions won't render certain characters
if description:
massaged = unicode(BeautifulStoneSoup(
description, convertEntities=BeautifulStoneSoup.HTML_ENTITIES))
# Replace '&' with '&#38;'
massaged = re.sub("&", "&#38;", massaged)
return self.fixChars(massaged)
else:
return description
def populate_article_metadata(self, article, soup, first):

View File

@ -4,6 +4,7 @@
from __future__ import absolute_import, division, print_function, unicode_literals
import bs4
from bs4 import ( # noqa
CData, Comment, Declaration, NavigableString, ProcessingInstruction, Tag,
__version__
@ -27,3 +28,7 @@ def parse_html(markup):
def BeautifulSoup(markup='', *a, **kw):
return parse_html(markup)
def BeautifulStoneSoup(markup='', *a, **kw):
return bs4.BeautifulSoup(markup, 'xml')

View File

@ -39,7 +39,7 @@ def _metadata_from_table(soup, searchfor):
td = td.parent
# there appears to be multiple ways of structuring the metadata
# on the home page. cue some nasty special-case hacks...
if re.match(r'^\s*'+searchfor+r'\s*$', td.renderContents(None), flags=re.I):
if re.match(r'^\s*'+searchfor+r'\s*$', td.decode_contents(), flags=re.I):
meta = _detag(td.findNextSibling('td'))
return re.sub('^:', '', meta).strip()
else:
@ -52,7 +52,7 @@ def _metadata_from_span(soup, searchfor):
if span is None:
return None
# this metadata might need some cleaning up still :/
return _detag(span.renderContents(None).strip())
return _detag(span.decode_contents().strip())
def _get_authors(soup):

View File

@ -5,35 +5,31 @@ __copyright__ = '2008, Kovid Goyal <kovid at kovidgoyal.net>'
Compile a LRS file into a LRF file.
'''
import sys, os, logging
import logging
import os
import sys
from calibre import setup_cli_handlers
from calibre.utils.config import OptionParser
from calibre.ebooks.BeautifulSoup import BeautifulStoneSoup, NavigableString, \
CData, Tag
from calibre.ebooks.lrf.pylrs.pylrs import Book, PageStyle, TextStyle, \
BlockStyle, ImageStream, Font, StyleDefault, BookSetting, Header, \
Image, ImageBlock, Page, TextBlock, Canvas, Paragraph, CR, Span, \
Italic, Sup, Sub, Bold, EmpLine, JumpButton, CharButton, Plot, \
DropCaps, Footer, RuledLine
from calibre.ebooks.BeautifulSoup import (
BeautifulStoneSoup, CData, NavigableString, Tag
)
from calibre.ebooks.chardet import xml_to_unicode
from calibre.ebooks.lrf.pylrs.pylrs import (
CR, BlockStyle, Bold, Book, BookSetting, Canvas, CharButton, DropCaps, EmpLine,
Font, Footer, Header, Image, ImageBlock, ImageStream, Italic, JumpButton, Page,
PageStyle, Paragraph, Plot, RuledLine, Span, StyleDefault, Sub, Sup, TextBlock,
TextStyle
)
from calibre.utils.config import OptionParser
from polyglot.builtins import string_or_bytes
class LrsParser(object):
SELF_CLOSING_TAGS = [i.lower() for i in ['CR', 'Plot', 'NoBR', 'Space',
'PutObj', 'RuledLine',
'Plot', 'SetDefault', 'BookSetting', 'RegistFont',
'PageStyle', 'TextStyle', 'BlockStyle', 'JumpTo',
'ImageStream', 'Image']]
def __init__(self, stream, logger):
self.logger = logger
src = stream.read()
self.soup = BeautifulStoneSoup(xml_to_unicode(src)[0],
convertEntities=BeautifulStoneSoup.XML_ENTITIES,
selfClosingTags=self.SELF_CLOSING_TAGS)
self.soup = BeautifulStoneSoup(xml_to_unicode(src)[0])
self.objects = {}
for obj in self.soup.findAll(objid=True):
self.objects[obj['objid']] = obj

View File

@ -1,22 +1,29 @@
#!/usr/bin/env python2
from __future__ import with_statement
from __future__ import print_function
from __future__ import print_function, with_statement
__license__ = 'GPL v3'
__copyright__ = '2008, Kovid Goyal <kovid at kovidgoyal.net>'
'''Read meta information from epub files'''
import io, os, re, posixpath
import io
import os
import posixpath
import re
from contextlib import closing
from calibre.utils.zipfile import ZipFile, BadZipfile, safe_replace
from calibre.utils.localunzip import LocalZipFile
from calibre.ebooks.BeautifulSoup import BeautifulStoneSoup
from calibre.ebooks.metadata.opf import get_metadata as get_metadata_from_opf, set_metadata as set_metadata_opf
from calibre.ebooks.metadata.opf2 import OPF
from calibre.ptempfile import TemporaryDirectory
from lxml import etree
from calibre import CurrentDir, walk
from calibre.constants import isosx
from calibre.ebooks.metadata.opf import (
get_metadata as get_metadata_from_opf, set_metadata as set_metadata_opf
)
from calibre.ebooks.metadata.opf2 import OPF
from calibre.ptempfile import TemporaryDirectory
from calibre.utils.localunzip import LocalZipFile
from calibre.utils.zipfile import BadZipfile, ZipFile, safe_replace
class EPubException(Exception):
@ -36,20 +43,17 @@ class Container(dict):
def __init__(self, stream=None):
if not stream:
return
soup = BeautifulStoneSoup(stream.read())
container = soup.find(name=re.compile(r'container$', re.I))
if not container:
raise OCFException("<container> element missing")
container = etree.fromstring(stream.read())
if container.get('version', None) != '1.0':
raise EPubException("unsupported version of OCF")
rootfiles = container.find(re.compile(r'rootfiles$', re.I))
rootfiles = container.xpath('./*[local-name()="rootfiles"]')
if not rootfiles:
raise EPubException("<rootfiles/> element missing")
for rootfile in rootfiles.findAll(re.compile(r'rootfile$', re.I)):
try:
self[rootfile['media-type']] = rootfile['full-path']
except KeyError:
for rootfile in rootfiles[0].xpath('./*[local-name()="rootfile"]'):
mt, fp = rootfile.get('media-type'), rootfile.get('full-path')
if not mt or not fp:
raise EPubException("<rootfile/> element malformed")
self[mt] = fp
class OCF(object):

View File

@ -340,8 +340,7 @@ def render_jacket(mi, output_profile,
if hr_tag is not None:
hr_tag.extract()
return strip_encoding_declarations(
soup.renderContents('utf-8').decode('utf-8'))
return strip_encoding_declarations(soup.decode_contents())
from calibre.ebooks.oeb.base import RECOVER_PARSER

View File

@ -9,7 +9,7 @@ from copy import deepcopy
from xml.sax.saxutils import escape
from calibre import (
prepare_string_for_xml, strftime, force_unicode, isbytestring, replace_entities, as_unicode)
prepare_string_for_xml, strftime, force_unicode, isbytestring, replace_entities, as_unicode, xml_replace_entities)
from calibre.constants import isosx, cache_dir
from calibre.customize.conversion import DummyReporter
from calibre.customize.ui import output_profiles
@ -29,6 +29,9 @@ from calibre.utils.localization import get_lang, lang_as_iso639_1
from polyglot.builtins import unicode_type
NBSP = u'\u00a0'
class Formatter(TemplateFormatter):
def get_value(self, key, args, kwargs):
@ -112,7 +115,7 @@ class CatalogBuilder(object):
if self.generate_for_kindle_mobi:
return '&#x25b7;'
else:
return '&nbsp;'
return NBSP
def __init__(self, db, _opts, plugin,
report_progress=DummyReporter(),
@ -1326,7 +1329,7 @@ class CatalogBuilder(object):
"""
# Kindle TOC descriptions won't render certain characters
# Fix up
massaged = unicode_type(BeautifulStoneSoup(description, convertEntities=BeautifulStoneSoup.HTML_ENTITIES))
massaged = xml_replace_entities(unicode_type(description))
# Replace '&' with '&#38;'
massaged = re.sub("&", "&#38;", massaged)
@ -1354,7 +1357,7 @@ class CatalogBuilder(object):
if self.opts.fmt == 'mobi':
codeTag = soup.new_tag("code")
if prefix_char is None:
codeTag.insert(0, NavigableString('&nbsp;'))
codeTag.insert(0, NavigableString(NBSP))
else:
codeTag.insert(0, NavigableString(prefix_char))
return codeTag
@ -1362,7 +1365,7 @@ class CatalogBuilder(object):
spanTag = soup.new_tag("span")
spanTag['class'] = "prefix"
if prefix_char is None:
prefix_char = "&nbsp;"
prefix_char = NBSP
spanTag.insert(0, NavigableString(prefix_char))
return spanTag
@ -2711,7 +2714,7 @@ class CatalogBuilder(object):
if i < len(book['genres']) - 1:
genresTag.insert(gtc, NavigableString(' &middot; '))
gtc += 1
genres = genresTag.renderContents()
genres = genresTag.decode_contents()
# Formats
formats = []
@ -2793,7 +2796,7 @@ class CatalogBuilder(object):
if publisher == ' ':
publisherTag = body.find('td', attrs={'class': 'publisher'})
if publisherTag:
publisherTag.contents[0].replaceWith('&nbsp;')
publisherTag.contents[0].replaceWith(NBSP)
if not genres:
genresTag = body.find('p', attrs={'class': 'genres'})
@ -2808,12 +2811,12 @@ class CatalogBuilder(object):
if note_content == '':
tdTag = body.find('td', attrs={'class': 'notes'})
if tdTag:
tdTag.contents[0].replaceWith('&nbsp;')
tdTag.contents[0].replaceWith(NBSP)
emptyTags = body.findAll('td', attrs={'class': 'empty'})
for mt in emptyTags:
newEmptyTag = soup.new_tag('td')
newEmptyTag.insert(0, '\xa0')
newEmptyTag.insert(0, NBSP)
mt.replaceWith(newEmptyTag)
return soup
@ -2974,7 +2977,7 @@ class CatalogBuilder(object):
<ncx xmlns="http://www.daisy.org/z3986/2005/ncx/" xmlns:calibre="http://calibre.kovidgoyal.net/2009/metadata" version="2005-1" xml:lang="en">
</ncx>
'''
soup = BeautifulStoneSoup(header, selfClosingTags=['content', 'calibre:meta-img'])
soup = BeautifulStoneSoup(header)
ncx = soup.find('ncx')
navMapTag = soup.new_tag('navMap')
@ -4033,7 +4036,7 @@ class CatalogBuilder(object):
</package>
'''.replace('LANG', lang)
# Add the supplied metadata tags
soup = BeautifulStoneSoup(header, selfClosingTags=['item', 'itemref', 'meta', 'reference'])
soup = BeautifulStoneSoup(header)
metadata = soup.find('metadata')
mtc = 0
@ -4171,8 +4174,11 @@ class CatalogBuilder(object):
guide.insert(0, referenceTag)
# Write the OPF file
outfile = open("%s/%s.opf" % (self.catalog_path, self.opts.basename), 'w')
outfile.write(soup.prettify())
output = soup.prettify(encoding='utf-8')
if isinstance(output, unicode_type):
output = output.encode('utf-8')
with lopen("%s/%s.opf" % (self.catalog_path, self.opts.basename), 'wb') as outfile:
outfile.write(output)
def generate_rating_string(self, book):
""" Generate rating string for Descriptions.
@ -4657,7 +4663,7 @@ class CatalogBuilder(object):
elem.extract()
# Reconstruct comments w/o <div>s
comments = soup.renderContents(None)
comments = soup.decode_contents()
# Convert \n\n to <p>s
if re.search('\n\n', comments):
@ -4669,7 +4675,7 @@ class CatalogBuilder(object):
pTag.insert(0, p)
soup.insert(tsc, pTag)
tsc += 1
comments = soup.renderContents(None)
comments = soup.decode_contents()
# Convert solo returns to <br />
comments = re.sub('[\r\n]', '<br />', comments)
@ -4726,7 +4732,7 @@ class CatalogBuilder(object):
result.insert(rtc, elem)
rtc += 1
return result.renderContents(encoding=None)
return result.decode_contents()
def merge_comments(self, record):
""" Merge comments with custom column content.
@ -4954,6 +4960,9 @@ class CatalogBuilder(object):
"""
self.update_progress_full_step(_("Saving NCX"))
ncx = self.ncx_soup.prettify(encoding='utf-8')
if isinstance(ncx, unicode_type):
ncx = ncx.encode('utf-8')
outfile = open("%s/%s.ncx" % (self.catalog_path, self.opts.basename), 'w')
outfile.write(self.ncx_soup.prettify())
with lopen("%s/%s.ncx" % (self.catalog_path, self.opts.basename), 'wb') as outfile:
outfile.write(ncx)

View File

@ -131,7 +131,7 @@ def comments_to_html(comments):
for t in result.findAll(text=True):
t.replaceWith(prepare_string_for_xml(unicode_type(t)))
return result.renderContents(encoding=None)
return result.decode_contents()
def markdown(val):