diff --git a/recipes/adventure_zone_pl.recipe b/recipes/adventure_zone_pl.recipe index ee5fd065ae..afb8e080d9 100644 --- a/recipes/adventure_zone_pl.recipe +++ b/recipes/adventure_zone_pl.recipe @@ -28,10 +28,10 @@ class Adventure_zone(BasicNewsRecipe): def skip_ad_pages(self, soup): skip_tag = soup.body.find(attrs={'class':'subject'}) skip_tag = skip_tag.findAll(name='a', href=True) - title = soup.title.renderContents().lower() + title = soup.title.renderContents().decode('utf-8').lower() if self._is_linked_text(title): for r in skip_tag: - word = r.renderContents() + word = r.renderContents().decode('utf-8') if not word: continue word = word.lower() diff --git a/recipes/berlin_policy_journal.recipe b/recipes/berlin_policy_journal.recipe index 6f9a2146b6..9e832c90a5 100644 --- a/recipes/berlin_policy_journal.recipe +++ b/recipes/berlin_policy_journal.recipe @@ -104,7 +104,7 @@ class BerlinPolicyJournal(BasicNewsRecipe): div = soup.find('div', {'class': 'meta-info'}) authors = '' for entry in div.findAll('span', {'class': 'entry-author'}): - authors = authors + entry.a.span.renderContents().strip() + ', ' - date = div.find('time').renderContents().strip() + authors = authors + entry.a.span.renderContents().decode('utf-8').strip() + ', ' + date = div.find('time').renderContents().decode('utf-8').strip() div.replaceWith('
' + date + ' | ' + authors[:-2] + '
') return soup diff --git a/recipes/bild_de.recipe b/recipes/bild_de.recipe index 4ecf47b119..ad2ccd9e32 100644 --- a/recipes/bild_de.recipe +++ b/recipes/bild_de.recipe @@ -83,5 +83,5 @@ class AdvancedUserRecipe1303841067(BasicNewsRecipe): br.replaceWith(' ') # remove all links for a in soup.findAll('a'): - a.replaceWith(a.renderContents()) + a.replaceWith(a.renderContents().decode('utf-8')) return soup diff --git a/recipes/calgary_herald.recipe b/recipes/calgary_herald.recipe index a0e3be0642..a6c9dd74ff 100644 --- a/recipes/calgary_herald.recipe +++ b/recipes/calgary_herald.recipe @@ -8,7 +8,7 @@ www.canada.com ''' import re from calibre.web.feeds.recipes import BasicNewsRecipe -from calibre.ebooks.BeautifulSoup import BeautifulStoneSoup, Tag +from calibre.ebooks.BeautifulSoup import Tag def new_tag(soup, name, attrs=()): @@ -183,15 +183,7 @@ class CanWestPaper(BasicNewsRecipe): return fixed def massageNCXText(self, description): - # Kindle TOC descriptions won't render certain characters - if description: - massaged = unicode(BeautifulStoneSoup( - description, convertEntities=BeautifulStoneSoup.HTML_ENTITIES)) - # Replace '&' with '&' - massaged = re.sub("&", "&", massaged) - return self.fixChars(massaged) - else: - return description + return description def populate_article_metadata(self, article, soup, first): if first: diff --git a/recipes/edmonton_journal.recipe b/recipes/edmonton_journal.recipe index 5e6b671dd3..40dc953feb 100644 --- a/recipes/edmonton_journal.recipe +++ b/recipes/edmonton_journal.recipe @@ -8,7 +8,7 @@ www.canada.com ''' import re from calibre.web.feeds.recipes import BasicNewsRecipe -from calibre.ebooks.BeautifulSoup import BeautifulStoneSoup, Tag +from calibre.ebooks.BeautifulSoup import Tag def new_tag(soup, name, attrs=()): @@ -183,15 +183,7 @@ class CanWestPaper(BasicNewsRecipe): return fixed def massageNCXText(self, description): - # Kindle TOC descriptions won't render certain characters - if description: - massaged = unicode(BeautifulStoneSoup( - description, convertEntities=BeautifulStoneSoup.HTML_ENTITIES)) - # Replace '&' with '&' - massaged = re.sub("&", "&", massaged) - return self.fixChars(massaged) - else: - return description + return description def populate_article_metadata(self, article, soup, first): if first: diff --git a/recipes/ekathemerini.recipe b/recipes/ekathemerini.recipe index 191d653343..094b1953dc 100644 --- a/recipes/ekathemerini.recipe +++ b/recipes/ekathemerini.recipe @@ -1,5 +1,6 @@ from calibre.web.feeds.recipes import BasicNewsRecipe -from calibre.ebooks.BeautifulSoup import BeautifulSoup, BeautifulStoneSoup +from calibre.ebooks.BeautifulSoup import BeautifulSoup +from lxml import etree class Ekathimerini(BasicNewsRecipe): @@ -41,12 +42,10 @@ class Ekathimerini(BasicNewsRecipe): def parse_index(self): idx_contents = self.browser.open(self.rss_url).read() - idx = BeautifulStoneSoup( - idx_contents, convertEntities=BeautifulStoneSoup.XML_ENTITIES) + idx = etree.fromstring(idx_contents) - cats = list(set([self.tag_to_string(subcat) - for subcat in idx.findAll('subcat')])) - cats.sort() + cats = sorted({self.tag_to_string(subcat) + for subcat in idx.xpath('//*[local-name()="subcat"]')}) feeds = [(u'News', list(self.find_articles(idx, u'')))] diff --git a/recipes/fokkeensukke.recipe b/recipes/fokkeensukke.recipe index d8fac65ff1..47e695b68e 100644 --- a/recipes/fokkeensukke.recipe +++ b/recipes/fokkeensukke.recipe @@ -52,29 +52,29 @@ class FokkeEnSukkeRecipe(BasicNewsRecipe): # If there's only one, there is just a link with the dayname. # If there are two, there are three links in sequence: dayname 1 2. # In that case we're interested in the last two. - if links[i].renderContents() in dayNames: + if links[i].renderContents().decode('utf-8') in dayNames: # If the link is not in daynames, we processed it already, but if it is, let's see # if the next one has '1' as content - if (i + 1 <= maxIndex) and (links[i + 1].renderContents() == '1'): + if (i + 1 <= maxIndex) and (links[i + 1].renderContents().decode('utf-8') == '1'): # Got you! Add it to the list - article = {'title': links[i].renderContents( - ) + ' 1', 'date': u'', 'url': self.INDEX + links[i + 1]['href'], 'description': ''} + article = {'title': links[i].renderContents().decode('utf-8' + )+ ' 1', 'date': u'', 'url': self.INDEX + links[i + 1]['href'], 'description': ''} articles.append(article) # If there is a '1', there should be a '2' as well, but # better save than sorry - if (i + 2 <= maxIndex) and (links[i + 2].renderContents() == '2'): + if (i + 2 <= maxIndex) and (links[i + 2].renderContents().decode('utf-8') == '2'): # Got you! Add it to the list article = {'title': links[i].renderContents( - ) + ' 2', 'date': u'', 'url': self.INDEX + links[i + 2]['href'], 'description': ''} + ).decode('utf-8') + ' 2', 'date': u'', 'url': self.INDEX + links[i + 2]['href'], 'description': ''} articles.append(article) else: # There is only one cartoon for this day. Add it to the # list. article = {'title': links[i].renderContents( - ), 'date': u'', 'url': self.INDEX + links[i]['href'], 'description': ''} + ).decode('utf-8'), 'date': u'', 'url': self.INDEX + links[i]['href'], 'description': ''} articles.append(article) # Might as well use the weeknumber as title - week = index.find('span', attrs={'class': 'week'}).renderContents() + week = index.find('span', attrs={'class': 'week'}).renderContents().decode('utf-8') return [[week, articles]] diff --git a/recipes/gildia_pl.recipe b/recipes/gildia_pl.recipe index 4cdf03ad28..4a6325edf5 100644 --- a/recipes/gildia_pl.recipe +++ b/recipes/gildia_pl.recipe @@ -46,7 +46,7 @@ class Gildia(BasicNewsRecipe): words = ('recenzj', 'zapowied', 'fragmen', 'relacj', 'wywiad', 'nominacj') - document_title = soup.title.renderContents().lower() + document_title = soup.title.renderContents().decode('utf-8').lower() for word in words: if word in document_title: for link in content.findAll(name='a'): @@ -57,7 +57,7 @@ class Gildia(BasicNewsRecipe): return self.index_to_soup(tag['href'], raw=True) def preprocess_html(self, soup): - title = soup.title.renderContents().lower() + title = soup.title.renderContents().decode('utf-8').lower() for a in soup('a', href=True): if not a['href'].startswith('http'): if '/gry/' in a['href']: diff --git a/recipes/handelsblatt.recipe b/recipes/handelsblatt.recipe index 516e6891cb..90dfb7b5d3 100644 --- a/recipes/handelsblatt.recipe +++ b/recipes/handelsblatt.recipe @@ -129,7 +129,7 @@ class Handelsblatt(BasicNewsRecipe): def postprocess_html(self, soup, first_fetch): # convert lists of author(s) and date(s) into simple text for cap in soup.findAll('div', {'class': re.compile('vhb-article-caption')}): - cap.replaceWith(cap.renderContents().strip() + ' ') + cap.replaceWith(cap.renderContents().decode('utf-8').strip() + ' ') for row in soup.findAll('div', {'class': 'vhb-article-author-row'}): for ul in row.findAll('ul'): entry = '' @@ -141,7 +141,7 @@ class Handelsblatt(BasicNewsRecipe): # remove all local hyperlinks for a in soup.findAll('a', {'href': True}): if a['href'] and a['href'][0] in ['/', '#']: - a.replaceWith(a.renderContents()) + a.replaceWith(a.renderContents().decode('utf-8')) # make sure that all figure captions (including the source) are shown # without linebreaks by using the alternative text given within # instead of the original text (which is oddly formatted) diff --git a/recipes/joop.recipe b/recipes/joop.recipe index 4cbdf39eca..09f967680a 100644 --- a/recipes/joop.recipe +++ b/recipes/joop.recipe @@ -63,7 +63,7 @@ class JoopRecipe(BasicNewsRecipe): for section in sections: articles = [] h2 = div.find(lambda tag: tag.name == - 'h2' and tag.renderContents() == section) + 'h2' and tag.renderContents().decode('utf-8') == section) if h2: ul = h2.findNextSibling('ul', 'linklist') if ul: diff --git a/recipes/mediapart.recipe b/recipes/mediapart.recipe index 727b0bbd7a..f9ca833f58 100644 --- a/recipes/mediapart.recipe +++ b/recipes/mediapart.recipe @@ -65,14 +65,14 @@ class Mediapart(BasicNewsRecipe): # print "found fil ",title article_type = article.find('a', {'href': re.compile( - r'.*\/type-darticles\/.*')}).renderContents() + r'.*\/type-darticles\/.*')}).renderContents().decode('utf-8') # print "kind: ",article_type for s in title('span'): - s.replaceWith(s.renderContents() + "\n") + s.replaceWith(s.renderContents().decode('utf-8') + "\n") url = title.find('a', href=True)['href'] - # article_date = self.parse_french_date(article.find("span", "article-date").renderContents()) + # article_date = self.parse_french_date(article.find("span", "article-date").renderContents().decode('utf-8')) # print("################################# 9") # print(article_date) diff --git a/recipes/montreal_gazette.recipe b/recipes/montreal_gazette.recipe index 456f082dfd..f5e7d6dccb 100644 --- a/recipes/montreal_gazette.recipe +++ b/recipes/montreal_gazette.recipe @@ -8,7 +8,7 @@ www.canada.com ''' import re from calibre.web.feeds.recipes import BasicNewsRecipe -from calibre.ebooks.BeautifulSoup import BeautifulStoneSoup, Tag +from calibre.ebooks.BeautifulSoup import Tag def new_tag(soup, name, attrs=()): @@ -183,15 +183,7 @@ class CanWestPaper(BasicNewsRecipe): return fixed def massageNCXText(self, description): - # Kindle TOC descriptions won't render certain characters - if description: - massaged = unicode(BeautifulStoneSoup( - description, convertEntities=BeautifulStoneSoup.HTML_ENTITIES)) - # Replace '&' with '&' - massaged = re.sub("&", "&", massaged) - return self.fixChars(massaged) - else: - return description + return description def populate_article_metadata(self, article, soup, first): if first: diff --git a/recipes/ncrnext.recipe b/recipes/ncrnext.recipe index a28879028f..ec3087d9b4 100644 --- a/recipes/ncrnext.recipe +++ b/recipes/ncrnext.recipe @@ -76,7 +76,7 @@ class NrcNextRecipe(BasicNewsRecipe): # In this feed/page articles can be written by more than one author. # It is nice to see their names in the titles. flag = post.find('h2', attrs={'class': 'vlag'}) - author = flag.contents[0].renderContents() + author = flag.contents[0].renderContents().decode('utf-8') completeTitle = u''.join([author, u': ', title]) else: completeTitle = title diff --git a/recipes/ottawa_citizen.recipe b/recipes/ottawa_citizen.recipe index ac5f4ca441..43762f067d 100644 --- a/recipes/ottawa_citizen.recipe +++ b/recipes/ottawa_citizen.recipe @@ -8,7 +8,7 @@ www.canada.com ''' import re from calibre.web.feeds.recipes import BasicNewsRecipe -from calibre.ebooks.BeautifulSoup import BeautifulStoneSoup, Tag +from calibre.ebooks.BeautifulSoup import Tag def new_tag(soup, name, attrs=()): @@ -183,15 +183,7 @@ class CanWestPaper(BasicNewsRecipe): return fixed def massageNCXText(self, description): - # Kindle TOC descriptions won't render certain characters - if description: - massaged = unicode(BeautifulStoneSoup( - description, convertEntities=BeautifulStoneSoup.HTML_ENTITIES)) - # Replace '&' with '&' - massaged = re.sub("&", "&", massaged) - return self.fixChars(massaged) - else: - return description + return description def populate_article_metadata(self, article, soup, first): if first: diff --git a/recipes/regina_leader_post.recipe b/recipes/regina_leader_post.recipe index 1b928de984..122e6a7cdd 100644 --- a/recipes/regina_leader_post.recipe +++ b/recipes/regina_leader_post.recipe @@ -9,7 +9,6 @@ www.canada.com import re from calibre.web.feeds.news import BasicNewsRecipe -from calibre.ebooks.BeautifulSoup import BeautifulStoneSoup class CanWestPaper(BasicNewsRecipe): @@ -144,15 +143,7 @@ class CanWestPaper(BasicNewsRecipe): return fixed def massageNCXText(self, description): - # Kindle TOC descriptions won't render certain characters - if description: - massaged = unicode(BeautifulStoneSoup( - description, convertEntities=BeautifulStoneSoup.HTML_ENTITIES)) - # Replace '&' with '&' - massaged = re.sub("&", "&", massaged) - return self.fixChars(massaged) - else: - return description + return description def populate_article_metadata(self, article, soup, first): if first: diff --git a/recipes/saskatoon_star_phoenix.recipe b/recipes/saskatoon_star_phoenix.recipe index c78fef6160..23602e93bd 100644 --- a/recipes/saskatoon_star_phoenix.recipe +++ b/recipes/saskatoon_star_phoenix.recipe @@ -9,7 +9,6 @@ www.canada.com import re from calibre.web.feeds.news import BasicNewsRecipe -from calibre.ebooks.BeautifulSoup import BeautifulStoneSoup class CanWestPaper(BasicNewsRecipe): @@ -144,15 +143,7 @@ class CanWestPaper(BasicNewsRecipe): return fixed def massageNCXText(self, description): - # Kindle TOC descriptions won't render certain characters - if description: - massaged = unicode(BeautifulStoneSoup( - description, convertEntities=BeautifulStoneSoup.HTML_ENTITIES)) - # Replace '&' with '&' - massaged = re.sub("&", "&", massaged) - return self.fixChars(massaged) - else: - return description + return description def populate_article_metadata(self, article, soup, first): if first: diff --git a/recipes/vancouver_province.recipe b/recipes/vancouver_province.recipe index 39ce9681e6..78082c1046 100644 --- a/recipes/vancouver_province.recipe +++ b/recipes/vancouver_province.recipe @@ -8,7 +8,7 @@ www.canada.com ''' import re from calibre.web.feeds.recipes import BasicNewsRecipe -from calibre.ebooks.BeautifulSoup import BeautifulStoneSoup, Tag +from calibre.ebooks.BeautifulSoup import Tag def new_tag(soup, name, attrs=()): @@ -196,15 +196,7 @@ class CanWestPaper(BasicNewsRecipe): return fixed def massageNCXText(self, description): - # Kindle TOC descriptions won't render certain characters - if description: - massaged = unicode(BeautifulStoneSoup( - description, convertEntities=BeautifulStoneSoup.HTML_ENTITIES)) - # Replace '&' with '&' - massaged = re.sub("&", "&", massaged) - return self.fixChars(massaged) - else: - return description + return description def populate_article_metadata(self, article, soup, first): if first: diff --git a/recipes/vancouver_sun.recipe b/recipes/vancouver_sun.recipe index 45e2b8f9b5..92acc06e90 100644 --- a/recipes/vancouver_sun.recipe +++ b/recipes/vancouver_sun.recipe @@ -8,7 +8,7 @@ www.canada.com ''' import re from calibre.web.feeds.recipes import BasicNewsRecipe -from calibre.ebooks.BeautifulSoup import BeautifulStoneSoup, Tag +from calibre.ebooks.BeautifulSoup import Tag def new_tag(soup, name, attrs=()): @@ -184,15 +184,7 @@ class CanWestPaper(BasicNewsRecipe): return fixed def massageNCXText(self, description): - # Kindle TOC descriptions won't render certain characters - if description: - massaged = unicode(BeautifulStoneSoup( - description, convertEntities=BeautifulStoneSoup.HTML_ENTITIES)) - # Replace '&' with '&' - massaged = re.sub("&", "&", massaged) - return self.fixChars(massaged) - else: - return description + return description def populate_article_metadata(self, article, soup, first): if first: diff --git a/recipes/vic_times.recipe b/recipes/vic_times.recipe index 148985c43e..7e25596c0a 100644 --- a/recipes/vic_times.recipe +++ b/recipes/vic_times.recipe @@ -9,7 +9,7 @@ www.canada.com import re from calibre.web.feeds.news import BasicNewsRecipe -from calibre.ebooks.BeautifulSoup import Tag, BeautifulStoneSoup +from calibre.ebooks.BeautifulSoup import Tag def new_tag(soup, name, attrs=()): @@ -147,15 +147,7 @@ class TimesColonist(BasicNewsRecipe): return fixed def massageNCXText(self, description): - # Kindle TOC descriptions won't render certain characters - if description: - massaged = unicode(BeautifulStoneSoup( - description, convertEntities=BeautifulStoneSoup.HTML_ENTITIES)) - # Replace '&' with '&' - massaged = re.sub("&", "&", massaged) - return self.fixChars(massaged) - else: - return description + return description def populate_article_metadata(self, article, soup, first): if first: diff --git a/recipes/windsor_star.recipe b/recipes/windsor_star.recipe index b1837120e5..e02c4f507b 100644 --- a/recipes/windsor_star.recipe +++ b/recipes/windsor_star.recipe @@ -9,7 +9,6 @@ www.canada.com import re from calibre.web.feeds.news import BasicNewsRecipe -from calibre.ebooks.BeautifulSoup import BeautifulStoneSoup class CanWestPaper(BasicNewsRecipe): @@ -144,15 +143,7 @@ class CanWestPaper(BasicNewsRecipe): return fixed def massageNCXText(self, description): - # Kindle TOC descriptions won't render certain characters - if description: - massaged = unicode(BeautifulStoneSoup( - description, convertEntities=BeautifulStoneSoup.HTML_ENTITIES)) - # Replace '&' with '&' - massaged = re.sub("&", "&", massaged) - return self.fixChars(massaged) - else: - return description + return description def populate_article_metadata(self, article, soup, first): if first: diff --git a/src/calibre/ebooks/BeautifulSoup.py b/src/calibre/ebooks/BeautifulSoup.py index 5c15536efb..b0913ba2cf 100644 --- a/src/calibre/ebooks/BeautifulSoup.py +++ b/src/calibre/ebooks/BeautifulSoup.py @@ -4,6 +4,7 @@ from __future__ import absolute_import, division, print_function, unicode_literals +import bs4 from bs4 import ( # noqa CData, Comment, Declaration, NavigableString, ProcessingInstruction, Tag, __version__ @@ -27,3 +28,7 @@ def parse_html(markup): def BeautifulSoup(markup='', *a, **kw): return parse_html(markup) + + +def BeautifulStoneSoup(markup='', *a, **kw): + return bs4.BeautifulSoup(markup, 'xml') diff --git a/src/calibre/ebooks/chm/metadata.py b/src/calibre/ebooks/chm/metadata.py index 5169765cfe..d91e7b2d63 100644 --- a/src/calibre/ebooks/chm/metadata.py +++ b/src/calibre/ebooks/chm/metadata.py @@ -39,7 +39,7 @@ def _metadata_from_table(soup, searchfor): td = td.parent # there appears to be multiple ways of structuring the metadata # on the home page. cue some nasty special-case hacks... - if re.match(r'^\s*'+searchfor+r'\s*$', td.renderContents(None), flags=re.I): + if re.match(r'^\s*'+searchfor+r'\s*$', td.decode_contents(), flags=re.I): meta = _detag(td.findNextSibling('td')) return re.sub('^:', '', meta).strip() else: @@ -52,7 +52,7 @@ def _metadata_from_span(soup, searchfor): if span is None: return None # this metadata might need some cleaning up still :/ - return _detag(span.renderContents(None).strip()) + return _detag(span.decode_contents().strip()) def _get_authors(soup): diff --git a/src/calibre/ebooks/lrf/lrs/convert_from.py b/src/calibre/ebooks/lrf/lrs/convert_from.py index ed0088f1e5..c412ac4814 100644 --- a/src/calibre/ebooks/lrf/lrs/convert_from.py +++ b/src/calibre/ebooks/lrf/lrs/convert_from.py @@ -5,35 +5,31 @@ __copyright__ = '2008, Kovid Goyal ' Compile a LRS file into a LRF file. ''' -import sys, os, logging +import logging +import os +import sys from calibre import setup_cli_handlers -from calibre.utils.config import OptionParser -from calibre.ebooks.BeautifulSoup import BeautifulStoneSoup, NavigableString, \ - CData, Tag -from calibre.ebooks.lrf.pylrs.pylrs import Book, PageStyle, TextStyle, \ - BlockStyle, ImageStream, Font, StyleDefault, BookSetting, Header, \ - Image, ImageBlock, Page, TextBlock, Canvas, Paragraph, CR, Span, \ - Italic, Sup, Sub, Bold, EmpLine, JumpButton, CharButton, Plot, \ - DropCaps, Footer, RuledLine +from calibre.ebooks.BeautifulSoup import ( + BeautifulStoneSoup, CData, NavigableString, Tag +) from calibre.ebooks.chardet import xml_to_unicode +from calibre.ebooks.lrf.pylrs.pylrs import ( + CR, BlockStyle, Bold, Book, BookSetting, Canvas, CharButton, DropCaps, EmpLine, + Font, Footer, Header, Image, ImageBlock, ImageStream, Italic, JumpButton, Page, + PageStyle, Paragraph, Plot, RuledLine, Span, StyleDefault, Sub, Sup, TextBlock, + TextStyle +) +from calibre.utils.config import OptionParser from polyglot.builtins import string_or_bytes class LrsParser(object): - SELF_CLOSING_TAGS = [i.lower() for i in ['CR', 'Plot', 'NoBR', 'Space', - 'PutObj', 'RuledLine', - 'Plot', 'SetDefault', 'BookSetting', 'RegistFont', - 'PageStyle', 'TextStyle', 'BlockStyle', 'JumpTo', - 'ImageStream', 'Image']] - def __init__(self, stream, logger): self.logger = logger src = stream.read() - self.soup = BeautifulStoneSoup(xml_to_unicode(src)[0], - convertEntities=BeautifulStoneSoup.XML_ENTITIES, - selfClosingTags=self.SELF_CLOSING_TAGS) + self.soup = BeautifulStoneSoup(xml_to_unicode(src)[0]) self.objects = {} for obj in self.soup.findAll(objid=True): self.objects[obj['objid']] = obj diff --git a/src/calibre/ebooks/metadata/epub.py b/src/calibre/ebooks/metadata/epub.py index dfbf875dc5..39fd5390fa 100644 --- a/src/calibre/ebooks/metadata/epub.py +++ b/src/calibre/ebooks/metadata/epub.py @@ -1,22 +1,29 @@ #!/usr/bin/env python2 -from __future__ import with_statement -from __future__ import print_function +from __future__ import print_function, with_statement + __license__ = 'GPL v3' __copyright__ = '2008, Kovid Goyal ' '''Read meta information from epub files''' -import io, os, re, posixpath + +import io +import os +import posixpath +import re from contextlib import closing -from calibre.utils.zipfile import ZipFile, BadZipfile, safe_replace -from calibre.utils.localunzip import LocalZipFile -from calibre.ebooks.BeautifulSoup import BeautifulStoneSoup -from calibre.ebooks.metadata.opf import get_metadata as get_metadata_from_opf, set_metadata as set_metadata_opf -from calibre.ebooks.metadata.opf2 import OPF -from calibre.ptempfile import TemporaryDirectory +from lxml import etree + from calibre import CurrentDir, walk from calibre.constants import isosx +from calibre.ebooks.metadata.opf import ( + get_metadata as get_metadata_from_opf, set_metadata as set_metadata_opf +) +from calibre.ebooks.metadata.opf2 import OPF +from calibre.ptempfile import TemporaryDirectory +from calibre.utils.localunzip import LocalZipFile +from calibre.utils.zipfile import BadZipfile, ZipFile, safe_replace class EPubException(Exception): @@ -36,20 +43,17 @@ class Container(dict): def __init__(self, stream=None): if not stream: return - soup = BeautifulStoneSoup(stream.read()) - container = soup.find(name=re.compile(r'container$', re.I)) - if not container: - raise OCFException(" element missing") + container = etree.fromstring(stream.read()) if container.get('version', None) != '1.0': raise EPubException("unsupported version of OCF") - rootfiles = container.find(re.compile(r'rootfiles$', re.I)) + rootfiles = container.xpath('./*[local-name()="rootfiles"]') if not rootfiles: raise EPubException(" element missing") - for rootfile in rootfiles.findAll(re.compile(r'rootfile$', re.I)): - try: - self[rootfile['media-type']] = rootfile['full-path'] - except KeyError: + for rootfile in rootfiles[0].xpath('./*[local-name()="rootfile"]'): + mt, fp = rootfile.get('media-type'), rootfile.get('full-path') + if not mt or not fp: raise EPubException(" element malformed") + self[mt] = fp class OCF(object): diff --git a/src/calibre/ebooks/oeb/transforms/jacket.py b/src/calibre/ebooks/oeb/transforms/jacket.py index 27f08536b4..c631420bff 100644 --- a/src/calibre/ebooks/oeb/transforms/jacket.py +++ b/src/calibre/ebooks/oeb/transforms/jacket.py @@ -340,8 +340,7 @@ def render_jacket(mi, output_profile, if hr_tag is not None: hr_tag.extract() - return strip_encoding_declarations( - soup.renderContents('utf-8').decode('utf-8')) + return strip_encoding_declarations(soup.decode_contents()) from calibre.ebooks.oeb.base import RECOVER_PARSER diff --git a/src/calibre/library/catalogs/epub_mobi_builder.py b/src/calibre/library/catalogs/epub_mobi_builder.py index 914404a346..3aa4e2d149 100644 --- a/src/calibre/library/catalogs/epub_mobi_builder.py +++ b/src/calibre/library/catalogs/epub_mobi_builder.py @@ -9,7 +9,7 @@ from copy import deepcopy from xml.sax.saxutils import escape from calibre import ( - prepare_string_for_xml, strftime, force_unicode, isbytestring, replace_entities, as_unicode) + prepare_string_for_xml, strftime, force_unicode, isbytestring, replace_entities, as_unicode, xml_replace_entities) from calibre.constants import isosx, cache_dir from calibre.customize.conversion import DummyReporter from calibre.customize.ui import output_profiles @@ -29,6 +29,9 @@ from calibre.utils.localization import get_lang, lang_as_iso639_1 from polyglot.builtins import unicode_type +NBSP = u'\u00a0' + + class Formatter(TemplateFormatter): def get_value(self, key, args, kwargs): @@ -112,7 +115,7 @@ class CatalogBuilder(object): if self.generate_for_kindle_mobi: return '▷' else: - return ' ' + return NBSP def __init__(self, db, _opts, plugin, report_progress=DummyReporter(), @@ -1326,7 +1329,7 @@ class CatalogBuilder(object): """ # Kindle TOC descriptions won't render certain characters # Fix up - massaged = unicode_type(BeautifulStoneSoup(description, convertEntities=BeautifulStoneSoup.HTML_ENTITIES)) + massaged = xml_replace_entities(unicode_type(description)) # Replace '&' with '&' massaged = re.sub("&", "&", massaged) @@ -1354,7 +1357,7 @@ class CatalogBuilder(object): if self.opts.fmt == 'mobi': codeTag = soup.new_tag("code") if prefix_char is None: - codeTag.insert(0, NavigableString(' ')) + codeTag.insert(0, NavigableString(NBSP)) else: codeTag.insert(0, NavigableString(prefix_char)) return codeTag @@ -1362,7 +1365,7 @@ class CatalogBuilder(object): spanTag = soup.new_tag("span") spanTag['class'] = "prefix" if prefix_char is None: - prefix_char = " " + prefix_char = NBSP spanTag.insert(0, NavigableString(prefix_char)) return spanTag @@ -2711,7 +2714,7 @@ class CatalogBuilder(object): if i < len(book['genres']) - 1: genresTag.insert(gtc, NavigableString(' · ')) gtc += 1 - genres = genresTag.renderContents() + genres = genresTag.decode_contents() # Formats formats = [] @@ -2793,7 +2796,7 @@ class CatalogBuilder(object): if publisher == ' ': publisherTag = body.find('td', attrs={'class': 'publisher'}) if publisherTag: - publisherTag.contents[0].replaceWith(' ') + publisherTag.contents[0].replaceWith(NBSP) if not genres: genresTag = body.find('p', attrs={'class': 'genres'}) @@ -2808,12 +2811,12 @@ class CatalogBuilder(object): if note_content == '': tdTag = body.find('td', attrs={'class': 'notes'}) if tdTag: - tdTag.contents[0].replaceWith(' ') + tdTag.contents[0].replaceWith(NBSP) emptyTags = body.findAll('td', attrs={'class': 'empty'}) for mt in emptyTags: newEmptyTag = soup.new_tag('td') - newEmptyTag.insert(0, '\xa0') + newEmptyTag.insert(0, NBSP) mt.replaceWith(newEmptyTag) return soup @@ -2974,7 +2977,7 @@ class CatalogBuilder(object): ''' - soup = BeautifulStoneSoup(header, selfClosingTags=['content', 'calibre:meta-img']) + soup = BeautifulStoneSoup(header) ncx = soup.find('ncx') navMapTag = soup.new_tag('navMap') @@ -4033,7 +4036,7 @@ class CatalogBuilder(object): '''.replace('LANG', lang) # Add the supplied metadata tags - soup = BeautifulStoneSoup(header, selfClosingTags=['item', 'itemref', 'meta', 'reference']) + soup = BeautifulStoneSoup(header) metadata = soup.find('metadata') mtc = 0 @@ -4171,8 +4174,11 @@ class CatalogBuilder(object): guide.insert(0, referenceTag) # Write the OPF file - outfile = open("%s/%s.opf" % (self.catalog_path, self.opts.basename), 'w') - outfile.write(soup.prettify()) + output = soup.prettify(encoding='utf-8') + if isinstance(output, unicode_type): + output = output.encode('utf-8') + with lopen("%s/%s.opf" % (self.catalog_path, self.opts.basename), 'wb') as outfile: + outfile.write(output) def generate_rating_string(self, book): """ Generate rating string for Descriptions. @@ -4657,7 +4663,7 @@ class CatalogBuilder(object): elem.extract() # Reconstruct comments w/o
s - comments = soup.renderContents(None) + comments = soup.decode_contents() # Convert \n\n to

s if re.search('\n\n', comments): @@ -4669,7 +4675,7 @@ class CatalogBuilder(object): pTag.insert(0, p) soup.insert(tsc, pTag) tsc += 1 - comments = soup.renderContents(None) + comments = soup.decode_contents() # Convert solo returns to
comments = re.sub('[\r\n]', '
', comments) @@ -4726,7 +4732,7 @@ class CatalogBuilder(object): result.insert(rtc, elem) rtc += 1 - return result.renderContents(encoding=None) + return result.decode_contents() def merge_comments(self, record): """ Merge comments with custom column content. @@ -4954,6 +4960,9 @@ class CatalogBuilder(object): """ self.update_progress_full_step(_("Saving NCX")) + ncx = self.ncx_soup.prettify(encoding='utf-8') + if isinstance(ncx, unicode_type): + ncx = ncx.encode('utf-8') - outfile = open("%s/%s.ncx" % (self.catalog_path, self.opts.basename), 'w') - outfile.write(self.ncx_soup.prettify()) + with lopen("%s/%s.ncx" % (self.catalog_path, self.opts.basename), 'wb') as outfile: + outfile.write(ncx) diff --git a/src/calibre/library/comments.py b/src/calibre/library/comments.py index ec4279363f..5a176cb53c 100644 --- a/src/calibre/library/comments.py +++ b/src/calibre/library/comments.py @@ -131,7 +131,7 @@ def comments_to_html(comments): for t in result.findAll(text=True): t.replaceWith(prepare_string_for_xml(unicode_type(t))) - return result.renderContents(encoding=None) + return result.decode_contents() def markdown(val):