Port use of renderContents and BeautifulStoneSoup

This commit is contained in:
Kovid Goyal 2019-03-23 13:31:06 +05:30
parent c89b656df4
commit 256c7563b6
No known key found for this signature in database
GPG Key ID: 06BC317B515ACE7C
27 changed files with 116 additions and 187 deletions

View File

@ -28,10 +28,10 @@ class Adventure_zone(BasicNewsRecipe):
def skip_ad_pages(self, soup): def skip_ad_pages(self, soup):
skip_tag = soup.body.find(attrs={'class':'subject'}) skip_tag = soup.body.find(attrs={'class':'subject'})
skip_tag = skip_tag.findAll(name='a', href=True) skip_tag = skip_tag.findAll(name='a', href=True)
title = soup.title.renderContents().lower() title = soup.title.renderContents().decode('utf-8').lower()
if self._is_linked_text(title): if self._is_linked_text(title):
for r in skip_tag: for r in skip_tag:
word = r.renderContents() word = r.renderContents().decode('utf-8')
if not word: if not word:
continue continue
word = word.lower() word = word.lower()

View File

@ -104,7 +104,7 @@ class BerlinPolicyJournal(BasicNewsRecipe):
div = soup.find('div', {'class': 'meta-info'}) div = soup.find('div', {'class': 'meta-info'})
authors = '' authors = ''
for entry in div.findAll('span', {'class': 'entry-author'}): for entry in div.findAll('span', {'class': 'entry-author'}):
authors = authors + entry.a.span.renderContents().strip() + ', ' authors = authors + entry.a.span.renderContents().decode('utf-8').strip() + ', '
date = div.find('time').renderContents().strip() date = div.find('time').renderContents().decode('utf-8').strip()
div.replaceWith('<div>' + date + ' | ' + authors[:-2] + '<br/></div>') div.replaceWith('<div>' + date + ' | ' + authors[:-2] + '<br/></div>')
return soup return soup

View File

@ -83,5 +83,5 @@ class AdvancedUserRecipe1303841067(BasicNewsRecipe):
br.replaceWith(' ') br.replaceWith(' ')
# remove all links # remove all links
for a in soup.findAll('a'): for a in soup.findAll('a'):
a.replaceWith(a.renderContents()) a.replaceWith(a.renderContents().decode('utf-8'))
return soup return soup

View File

@ -8,7 +8,7 @@ www.canada.com
''' '''
import re import re
from calibre.web.feeds.recipes import BasicNewsRecipe from calibre.web.feeds.recipes import BasicNewsRecipe
from calibre.ebooks.BeautifulSoup import BeautifulStoneSoup, Tag from calibre.ebooks.BeautifulSoup import Tag
def new_tag(soup, name, attrs=()): def new_tag(soup, name, attrs=()):
@ -183,15 +183,7 @@ class CanWestPaper(BasicNewsRecipe):
return fixed return fixed
def massageNCXText(self, description): def massageNCXText(self, description):
# Kindle TOC descriptions won't render certain characters return description
if description:
massaged = unicode(BeautifulStoneSoup(
description, convertEntities=BeautifulStoneSoup.HTML_ENTITIES))
# Replace '&' with '&#38;'
massaged = re.sub("&", "&#38;", massaged)
return self.fixChars(massaged)
else:
return description
def populate_article_metadata(self, article, soup, first): def populate_article_metadata(self, article, soup, first):
if first: if first:

View File

@ -8,7 +8,7 @@ www.canada.com
''' '''
import re import re
from calibre.web.feeds.recipes import BasicNewsRecipe from calibre.web.feeds.recipes import BasicNewsRecipe
from calibre.ebooks.BeautifulSoup import BeautifulStoneSoup, Tag from calibre.ebooks.BeautifulSoup import Tag
def new_tag(soup, name, attrs=()): def new_tag(soup, name, attrs=()):
@ -183,15 +183,7 @@ class CanWestPaper(BasicNewsRecipe):
return fixed return fixed
def massageNCXText(self, description): def massageNCXText(self, description):
# Kindle TOC descriptions won't render certain characters return description
if description:
massaged = unicode(BeautifulStoneSoup(
description, convertEntities=BeautifulStoneSoup.HTML_ENTITIES))
# Replace '&' with '&#38;'
massaged = re.sub("&", "&#38;", massaged)
return self.fixChars(massaged)
else:
return description
def populate_article_metadata(self, article, soup, first): def populate_article_metadata(self, article, soup, first):
if first: if first:

View File

@ -1,5 +1,6 @@
from calibre.web.feeds.recipes import BasicNewsRecipe from calibre.web.feeds.recipes import BasicNewsRecipe
from calibre.ebooks.BeautifulSoup import BeautifulSoup, BeautifulStoneSoup from calibre.ebooks.BeautifulSoup import BeautifulSoup
from lxml import etree
class Ekathimerini(BasicNewsRecipe): class Ekathimerini(BasicNewsRecipe):
@ -41,12 +42,10 @@ class Ekathimerini(BasicNewsRecipe):
def parse_index(self): def parse_index(self):
idx_contents = self.browser.open(self.rss_url).read() idx_contents = self.browser.open(self.rss_url).read()
idx = BeautifulStoneSoup( idx = etree.fromstring(idx_contents)
idx_contents, convertEntities=BeautifulStoneSoup.XML_ENTITIES)
cats = list(set([self.tag_to_string(subcat) cats = sorted({self.tag_to_string(subcat)
for subcat in idx.findAll('subcat')])) for subcat in idx.xpath('//*[local-name()="subcat"]')})
cats.sort()
feeds = [(u'News', list(self.find_articles(idx, u'')))] feeds = [(u'News', list(self.find_articles(idx, u'')))]

View File

@ -52,29 +52,29 @@ class FokkeEnSukkeRecipe(BasicNewsRecipe):
# If there's only one, there is just a link with the dayname. # If there's only one, there is just a link with the dayname.
# If there are two, there are three links in sequence: <a>dayname</a> <a>1</a> <a>2</a>. # If there are two, there are three links in sequence: <a>dayname</a> <a>1</a> <a>2</a>.
# In that case we're interested in the last two. # In that case we're interested in the last two.
if links[i].renderContents() in dayNames: if links[i].renderContents().decode('utf-8') in dayNames:
# If the link is not in daynames, we processed it already, but if it is, let's see # If the link is not in daynames, we processed it already, but if it is, let's see
# if the next one has '1' as content # if the next one has '1' as content
if (i + 1 <= maxIndex) and (links[i + 1].renderContents() == '1'): if (i + 1 <= maxIndex) and (links[i + 1].renderContents().decode('utf-8') == '1'):
# Got you! Add it to the list # Got you! Add it to the list
article = {'title': links[i].renderContents( article = {'title': links[i].renderContents().decode('utf-8'
) + ' 1', 'date': u'', 'url': self.INDEX + links[i + 1]['href'], 'description': ''} )+ ' 1', 'date': u'', 'url': self.INDEX + links[i + 1]['href'], 'description': ''}
articles.append(article) articles.append(article)
# If there is a '1', there should be a '2' as well, but # If there is a '1', there should be a '2' as well, but
# better save than sorry # better save than sorry
if (i + 2 <= maxIndex) and (links[i + 2].renderContents() == '2'): if (i + 2 <= maxIndex) and (links[i + 2].renderContents().decode('utf-8') == '2'):
# Got you! Add it to the list # Got you! Add it to the list
article = {'title': links[i].renderContents( article = {'title': links[i].renderContents(
) + ' 2', 'date': u'', 'url': self.INDEX + links[i + 2]['href'], 'description': ''} ).decode('utf-8') + ' 2', 'date': u'', 'url': self.INDEX + links[i + 2]['href'], 'description': ''}
articles.append(article) articles.append(article)
else: else:
# There is only one cartoon for this day. Add it to the # There is only one cartoon for this day. Add it to the
# list. # list.
article = {'title': links[i].renderContents( article = {'title': links[i].renderContents(
), 'date': u'', 'url': self.INDEX + links[i]['href'], 'description': ''} ).decode('utf-8'), 'date': u'', 'url': self.INDEX + links[i]['href'], 'description': ''}
articles.append(article) articles.append(article)
# Might as well use the weeknumber as title # Might as well use the weeknumber as title
week = index.find('span', attrs={'class': 'week'}).renderContents() week = index.find('span', attrs={'class': 'week'}).renderContents().decode('utf-8')
return [[week, articles]] return [[week, articles]]

View File

@ -46,7 +46,7 @@ class Gildia(BasicNewsRecipe):
words = ('recenzj', 'zapowied', 'fragmen', words = ('recenzj', 'zapowied', 'fragmen',
'relacj', 'wywiad', 'nominacj') 'relacj', 'wywiad', 'nominacj')
document_title = soup.title.renderContents().lower() document_title = soup.title.renderContents().decode('utf-8').lower()
for word in words: for word in words:
if word in document_title: if word in document_title:
for link in content.findAll(name='a'): for link in content.findAll(name='a'):
@ -57,7 +57,7 @@ class Gildia(BasicNewsRecipe):
return self.index_to_soup(tag['href'], raw=True) return self.index_to_soup(tag['href'], raw=True)
def preprocess_html(self, soup): def preprocess_html(self, soup):
title = soup.title.renderContents().lower() title = soup.title.renderContents().decode('utf-8').lower()
for a in soup('a', href=True): for a in soup('a', href=True):
if not a['href'].startswith('http'): if not a['href'].startswith('http'):
if '/gry/' in a['href']: if '/gry/' in a['href']:

View File

@ -129,7 +129,7 @@ class Handelsblatt(BasicNewsRecipe):
def postprocess_html(self, soup, first_fetch): def postprocess_html(self, soup, first_fetch):
# convert lists of author(s) and date(s) into simple text # convert lists of author(s) and date(s) into simple text
for cap in soup.findAll('div', {'class': re.compile('vhb-article-caption')}): for cap in soup.findAll('div', {'class': re.compile('vhb-article-caption')}):
cap.replaceWith(cap.renderContents().strip() + ' ') cap.replaceWith(cap.renderContents().decode('utf-8').strip() + ' ')
for row in soup.findAll('div', {'class': 'vhb-article-author-row'}): for row in soup.findAll('div', {'class': 'vhb-article-author-row'}):
for ul in row.findAll('ul'): for ul in row.findAll('ul'):
entry = '' entry = ''
@ -141,7 +141,7 @@ class Handelsblatt(BasicNewsRecipe):
# remove all local hyperlinks # remove all local hyperlinks
for a in soup.findAll('a', {'href': True}): for a in soup.findAll('a', {'href': True}):
if a['href'] and a['href'][0] in ['/', '#']: if a['href'] and a['href'][0] in ['/', '#']:
a.replaceWith(a.renderContents()) a.replaceWith(a.renderContents().decode('utf-8'))
# make sure that all figure captions (including the source) are shown # make sure that all figure captions (including the source) are shown
# without linebreaks by using the alternative text given within <img/> # without linebreaks by using the alternative text given within <img/>
# instead of the original text (which is oddly formatted) # instead of the original text (which is oddly formatted)

View File

@ -63,7 +63,7 @@ class JoopRecipe(BasicNewsRecipe):
for section in sections: for section in sections:
articles = [] articles = []
h2 = div.find(lambda tag: tag.name == h2 = div.find(lambda tag: tag.name ==
'h2' and tag.renderContents() == section) 'h2' and tag.renderContents().decode('utf-8') == section)
if h2: if h2:
ul = h2.findNextSibling('ul', 'linklist') ul = h2.findNextSibling('ul', 'linklist')
if ul: if ul:

View File

@ -65,14 +65,14 @@ class Mediapart(BasicNewsRecipe):
# print "found fil ",title # print "found fil ",title
article_type = article.find('a', {'href': re.compile( article_type = article.find('a', {'href': re.compile(
r'.*\/type-darticles\/.*')}).renderContents() r'.*\/type-darticles\/.*')}).renderContents().decode('utf-8')
# print "kind: ",article_type # print "kind: ",article_type
for s in title('span'): for s in title('span'):
s.replaceWith(s.renderContents() + "\n") s.replaceWith(s.renderContents().decode('utf-8') + "\n")
url = title.find('a', href=True)['href'] url = title.find('a', href=True)['href']
# article_date = self.parse_french_date(article.find("span", "article-date").renderContents()) # article_date = self.parse_french_date(article.find("span", "article-date").renderContents().decode('utf-8'))
# print("################################# 9") # print("################################# 9")
# print(article_date) # print(article_date)

View File

@ -8,7 +8,7 @@ www.canada.com
''' '''
import re import re
from calibre.web.feeds.recipes import BasicNewsRecipe from calibre.web.feeds.recipes import BasicNewsRecipe
from calibre.ebooks.BeautifulSoup import BeautifulStoneSoup, Tag from calibre.ebooks.BeautifulSoup import Tag
def new_tag(soup, name, attrs=()): def new_tag(soup, name, attrs=()):
@ -183,15 +183,7 @@ class CanWestPaper(BasicNewsRecipe):
return fixed return fixed
def massageNCXText(self, description): def massageNCXText(self, description):
# Kindle TOC descriptions won't render certain characters return description
if description:
massaged = unicode(BeautifulStoneSoup(
description, convertEntities=BeautifulStoneSoup.HTML_ENTITIES))
# Replace '&' with '&#38;'
massaged = re.sub("&", "&#38;", massaged)
return self.fixChars(massaged)
else:
return description
def populate_article_metadata(self, article, soup, first): def populate_article_metadata(self, article, soup, first):
if first: if first:

View File

@ -76,7 +76,7 @@ class NrcNextRecipe(BasicNewsRecipe):
# In this feed/page articles can be written by more than one author. # In this feed/page articles can be written by more than one author.
# It is nice to see their names in the titles. # It is nice to see their names in the titles.
flag = post.find('h2', attrs={'class': 'vlag'}) flag = post.find('h2', attrs={'class': 'vlag'})
author = flag.contents[0].renderContents() author = flag.contents[0].renderContents().decode('utf-8')
completeTitle = u''.join([author, u': ', title]) completeTitle = u''.join([author, u': ', title])
else: else:
completeTitle = title completeTitle = title

View File

@ -8,7 +8,7 @@ www.canada.com
''' '''
import re import re
from calibre.web.feeds.recipes import BasicNewsRecipe from calibre.web.feeds.recipes import BasicNewsRecipe
from calibre.ebooks.BeautifulSoup import BeautifulStoneSoup, Tag from calibre.ebooks.BeautifulSoup import Tag
def new_tag(soup, name, attrs=()): def new_tag(soup, name, attrs=()):
@ -183,15 +183,7 @@ class CanWestPaper(BasicNewsRecipe):
return fixed return fixed
def massageNCXText(self, description): def massageNCXText(self, description):
# Kindle TOC descriptions won't render certain characters return description
if description:
massaged = unicode(BeautifulStoneSoup(
description, convertEntities=BeautifulStoneSoup.HTML_ENTITIES))
# Replace '&' with '&#38;'
massaged = re.sub("&", "&#38;", massaged)
return self.fixChars(massaged)
else:
return description
def populate_article_metadata(self, article, soup, first): def populate_article_metadata(self, article, soup, first):
if first: if first:

View File

@ -9,7 +9,6 @@ www.canada.com
import re import re
from calibre.web.feeds.news import BasicNewsRecipe from calibre.web.feeds.news import BasicNewsRecipe
from calibre.ebooks.BeautifulSoup import BeautifulStoneSoup
class CanWestPaper(BasicNewsRecipe): class CanWestPaper(BasicNewsRecipe):
@ -144,15 +143,7 @@ class CanWestPaper(BasicNewsRecipe):
return fixed return fixed
def massageNCXText(self, description): def massageNCXText(self, description):
# Kindle TOC descriptions won't render certain characters return description
if description:
massaged = unicode(BeautifulStoneSoup(
description, convertEntities=BeautifulStoneSoup.HTML_ENTITIES))
# Replace '&' with '&#38;'
massaged = re.sub("&", "&#38;", massaged)
return self.fixChars(massaged)
else:
return description
def populate_article_metadata(self, article, soup, first): def populate_article_metadata(self, article, soup, first):
if first: if first:

View File

@ -9,7 +9,6 @@ www.canada.com
import re import re
from calibre.web.feeds.news import BasicNewsRecipe from calibre.web.feeds.news import BasicNewsRecipe
from calibre.ebooks.BeautifulSoup import BeautifulStoneSoup
class CanWestPaper(BasicNewsRecipe): class CanWestPaper(BasicNewsRecipe):
@ -144,15 +143,7 @@ class CanWestPaper(BasicNewsRecipe):
return fixed return fixed
def massageNCXText(self, description): def massageNCXText(self, description):
# Kindle TOC descriptions won't render certain characters return description
if description:
massaged = unicode(BeautifulStoneSoup(
description, convertEntities=BeautifulStoneSoup.HTML_ENTITIES))
# Replace '&' with '&#38;'
massaged = re.sub("&", "&#38;", massaged)
return self.fixChars(massaged)
else:
return description
def populate_article_metadata(self, article, soup, first): def populate_article_metadata(self, article, soup, first):
if first: if first:

View File

@ -8,7 +8,7 @@ www.canada.com
''' '''
import re import re
from calibre.web.feeds.recipes import BasicNewsRecipe from calibre.web.feeds.recipes import BasicNewsRecipe
from calibre.ebooks.BeautifulSoup import BeautifulStoneSoup, Tag from calibre.ebooks.BeautifulSoup import Tag
def new_tag(soup, name, attrs=()): def new_tag(soup, name, attrs=()):
@ -196,15 +196,7 @@ class CanWestPaper(BasicNewsRecipe):
return fixed return fixed
def massageNCXText(self, description): def massageNCXText(self, description):
# Kindle TOC descriptions won't render certain characters return description
if description:
massaged = unicode(BeautifulStoneSoup(
description, convertEntities=BeautifulStoneSoup.HTML_ENTITIES))
# Replace '&' with '&#38;'
massaged = re.sub("&", "&#38;", massaged)
return self.fixChars(massaged)
else:
return description
def populate_article_metadata(self, article, soup, first): def populate_article_metadata(self, article, soup, first):
if first: if first:

View File

@ -8,7 +8,7 @@ www.canada.com
''' '''
import re import re
from calibre.web.feeds.recipes import BasicNewsRecipe from calibre.web.feeds.recipes import BasicNewsRecipe
from calibre.ebooks.BeautifulSoup import BeautifulStoneSoup, Tag from calibre.ebooks.BeautifulSoup import Tag
def new_tag(soup, name, attrs=()): def new_tag(soup, name, attrs=()):
@ -184,15 +184,7 @@ class CanWestPaper(BasicNewsRecipe):
return fixed return fixed
def massageNCXText(self, description): def massageNCXText(self, description):
# Kindle TOC descriptions won't render certain characters return description
if description:
massaged = unicode(BeautifulStoneSoup(
description, convertEntities=BeautifulStoneSoup.HTML_ENTITIES))
# Replace '&' with '&#38;'
massaged = re.sub("&", "&#38;", massaged)
return self.fixChars(massaged)
else:
return description
def populate_article_metadata(self, article, soup, first): def populate_article_metadata(self, article, soup, first):
if first: if first:

View File

@ -9,7 +9,7 @@ www.canada.com
import re import re
from calibre.web.feeds.news import BasicNewsRecipe from calibre.web.feeds.news import BasicNewsRecipe
from calibre.ebooks.BeautifulSoup import Tag, BeautifulStoneSoup from calibre.ebooks.BeautifulSoup import Tag
def new_tag(soup, name, attrs=()): def new_tag(soup, name, attrs=()):
@ -147,15 +147,7 @@ class TimesColonist(BasicNewsRecipe):
return fixed return fixed
def massageNCXText(self, description): def massageNCXText(self, description):
# Kindle TOC descriptions won't render certain characters return description
if description:
massaged = unicode(BeautifulStoneSoup(
description, convertEntities=BeautifulStoneSoup.HTML_ENTITIES))
# Replace '&' with '&#38;'
massaged = re.sub("&", "&#38;", massaged)
return self.fixChars(massaged)
else:
return description
def populate_article_metadata(self, article, soup, first): def populate_article_metadata(self, article, soup, first):
if first: if first:

View File

@ -9,7 +9,6 @@ www.canada.com
import re import re
from calibre.web.feeds.news import BasicNewsRecipe from calibre.web.feeds.news import BasicNewsRecipe
from calibre.ebooks.BeautifulSoup import BeautifulStoneSoup
class CanWestPaper(BasicNewsRecipe): class CanWestPaper(BasicNewsRecipe):
@ -144,15 +143,7 @@ class CanWestPaper(BasicNewsRecipe):
return fixed return fixed
def massageNCXText(self, description): def massageNCXText(self, description):
# Kindle TOC descriptions won't render certain characters return description
if description:
massaged = unicode(BeautifulStoneSoup(
description, convertEntities=BeautifulStoneSoup.HTML_ENTITIES))
# Replace '&' with '&#38;'
massaged = re.sub("&", "&#38;", massaged)
return self.fixChars(massaged)
else:
return description
def populate_article_metadata(self, article, soup, first): def populate_article_metadata(self, article, soup, first):
if first: if first:

View File

@ -4,6 +4,7 @@
from __future__ import absolute_import, division, print_function, unicode_literals from __future__ import absolute_import, division, print_function, unicode_literals
import bs4
from bs4 import ( # noqa from bs4 import ( # noqa
CData, Comment, Declaration, NavigableString, ProcessingInstruction, Tag, CData, Comment, Declaration, NavigableString, ProcessingInstruction, Tag,
__version__ __version__
@ -27,3 +28,7 @@ def parse_html(markup):
def BeautifulSoup(markup='', *a, **kw): def BeautifulSoup(markup='', *a, **kw):
return parse_html(markup) return parse_html(markup)
def BeautifulStoneSoup(markup='', *a, **kw):
return bs4.BeautifulSoup(markup, 'xml')

View File

@ -39,7 +39,7 @@ def _metadata_from_table(soup, searchfor):
td = td.parent td = td.parent
# there appears to be multiple ways of structuring the metadata # there appears to be multiple ways of structuring the metadata
# on the home page. cue some nasty special-case hacks... # on the home page. cue some nasty special-case hacks...
if re.match(r'^\s*'+searchfor+r'\s*$', td.renderContents(None), flags=re.I): if re.match(r'^\s*'+searchfor+r'\s*$', td.decode_contents(), flags=re.I):
meta = _detag(td.findNextSibling('td')) meta = _detag(td.findNextSibling('td'))
return re.sub('^:', '', meta).strip() return re.sub('^:', '', meta).strip()
else: else:
@ -52,7 +52,7 @@ def _metadata_from_span(soup, searchfor):
if span is None: if span is None:
return None return None
# this metadata might need some cleaning up still :/ # this metadata might need some cleaning up still :/
return _detag(span.renderContents(None).strip()) return _detag(span.decode_contents().strip())
def _get_authors(soup): def _get_authors(soup):

View File

@ -5,35 +5,31 @@ __copyright__ = '2008, Kovid Goyal <kovid at kovidgoyal.net>'
Compile a LRS file into a LRF file. Compile a LRS file into a LRF file.
''' '''
import sys, os, logging import logging
import os
import sys
from calibre import setup_cli_handlers from calibre import setup_cli_handlers
from calibre.utils.config import OptionParser from calibre.ebooks.BeautifulSoup import (
from calibre.ebooks.BeautifulSoup import BeautifulStoneSoup, NavigableString, \ BeautifulStoneSoup, CData, NavigableString, Tag
CData, Tag )
from calibre.ebooks.lrf.pylrs.pylrs import Book, PageStyle, TextStyle, \
BlockStyle, ImageStream, Font, StyleDefault, BookSetting, Header, \
Image, ImageBlock, Page, TextBlock, Canvas, Paragraph, CR, Span, \
Italic, Sup, Sub, Bold, EmpLine, JumpButton, CharButton, Plot, \
DropCaps, Footer, RuledLine
from calibre.ebooks.chardet import xml_to_unicode from calibre.ebooks.chardet import xml_to_unicode
from calibre.ebooks.lrf.pylrs.pylrs import (
CR, BlockStyle, Bold, Book, BookSetting, Canvas, CharButton, DropCaps, EmpLine,
Font, Footer, Header, Image, ImageBlock, ImageStream, Italic, JumpButton, Page,
PageStyle, Paragraph, Plot, RuledLine, Span, StyleDefault, Sub, Sup, TextBlock,
TextStyle
)
from calibre.utils.config import OptionParser
from polyglot.builtins import string_or_bytes from polyglot.builtins import string_or_bytes
class LrsParser(object): class LrsParser(object):
SELF_CLOSING_TAGS = [i.lower() for i in ['CR', 'Plot', 'NoBR', 'Space',
'PutObj', 'RuledLine',
'Plot', 'SetDefault', 'BookSetting', 'RegistFont',
'PageStyle', 'TextStyle', 'BlockStyle', 'JumpTo',
'ImageStream', 'Image']]
def __init__(self, stream, logger): def __init__(self, stream, logger):
self.logger = logger self.logger = logger
src = stream.read() src = stream.read()
self.soup = BeautifulStoneSoup(xml_to_unicode(src)[0], self.soup = BeautifulStoneSoup(xml_to_unicode(src)[0])
convertEntities=BeautifulStoneSoup.XML_ENTITIES,
selfClosingTags=self.SELF_CLOSING_TAGS)
self.objects = {} self.objects = {}
for obj in self.soup.findAll(objid=True): for obj in self.soup.findAll(objid=True):
self.objects[obj['objid']] = obj self.objects[obj['objid']] = obj

View File

@ -1,22 +1,29 @@
#!/usr/bin/env python2 #!/usr/bin/env python2
from __future__ import with_statement from __future__ import print_function, with_statement
from __future__ import print_function
__license__ = 'GPL v3' __license__ = 'GPL v3'
__copyright__ = '2008, Kovid Goyal <kovid at kovidgoyal.net>' __copyright__ = '2008, Kovid Goyal <kovid at kovidgoyal.net>'
'''Read meta information from epub files''' '''Read meta information from epub files'''
import io, os, re, posixpath
import io
import os
import posixpath
import re
from contextlib import closing from contextlib import closing
from calibre.utils.zipfile import ZipFile, BadZipfile, safe_replace from lxml import etree
from calibre.utils.localunzip import LocalZipFile
from calibre.ebooks.BeautifulSoup import BeautifulStoneSoup
from calibre.ebooks.metadata.opf import get_metadata as get_metadata_from_opf, set_metadata as set_metadata_opf
from calibre.ebooks.metadata.opf2 import OPF
from calibre.ptempfile import TemporaryDirectory
from calibre import CurrentDir, walk from calibre import CurrentDir, walk
from calibre.constants import isosx from calibre.constants import isosx
from calibre.ebooks.metadata.opf import (
get_metadata as get_metadata_from_opf, set_metadata as set_metadata_opf
)
from calibre.ebooks.metadata.opf2 import OPF
from calibre.ptempfile import TemporaryDirectory
from calibre.utils.localunzip import LocalZipFile
from calibre.utils.zipfile import BadZipfile, ZipFile, safe_replace
class EPubException(Exception): class EPubException(Exception):
@ -36,20 +43,17 @@ class Container(dict):
def __init__(self, stream=None): def __init__(self, stream=None):
if not stream: if not stream:
return return
soup = BeautifulStoneSoup(stream.read()) container = etree.fromstring(stream.read())
container = soup.find(name=re.compile(r'container$', re.I))
if not container:
raise OCFException("<container> element missing")
if container.get('version', None) != '1.0': if container.get('version', None) != '1.0':
raise EPubException("unsupported version of OCF") raise EPubException("unsupported version of OCF")
rootfiles = container.find(re.compile(r'rootfiles$', re.I)) rootfiles = container.xpath('./*[local-name()="rootfiles"]')
if not rootfiles: if not rootfiles:
raise EPubException("<rootfiles/> element missing") raise EPubException("<rootfiles/> element missing")
for rootfile in rootfiles.findAll(re.compile(r'rootfile$', re.I)): for rootfile in rootfiles[0].xpath('./*[local-name()="rootfile"]'):
try: mt, fp = rootfile.get('media-type'), rootfile.get('full-path')
self[rootfile['media-type']] = rootfile['full-path'] if not mt or not fp:
except KeyError:
raise EPubException("<rootfile/> element malformed") raise EPubException("<rootfile/> element malformed")
self[mt] = fp
class OCF(object): class OCF(object):

View File

@ -340,8 +340,7 @@ def render_jacket(mi, output_profile,
if hr_tag is not None: if hr_tag is not None:
hr_tag.extract() hr_tag.extract()
return strip_encoding_declarations( return strip_encoding_declarations(soup.decode_contents())
soup.renderContents('utf-8').decode('utf-8'))
from calibre.ebooks.oeb.base import RECOVER_PARSER from calibre.ebooks.oeb.base import RECOVER_PARSER

View File

@ -9,7 +9,7 @@ from copy import deepcopy
from xml.sax.saxutils import escape from xml.sax.saxutils import escape
from calibre import ( from calibre import (
prepare_string_for_xml, strftime, force_unicode, isbytestring, replace_entities, as_unicode) prepare_string_for_xml, strftime, force_unicode, isbytestring, replace_entities, as_unicode, xml_replace_entities)
from calibre.constants import isosx, cache_dir from calibre.constants import isosx, cache_dir
from calibre.customize.conversion import DummyReporter from calibre.customize.conversion import DummyReporter
from calibre.customize.ui import output_profiles from calibre.customize.ui import output_profiles
@ -29,6 +29,9 @@ from calibre.utils.localization import get_lang, lang_as_iso639_1
from polyglot.builtins import unicode_type from polyglot.builtins import unicode_type
NBSP = u'\u00a0'
class Formatter(TemplateFormatter): class Formatter(TemplateFormatter):
def get_value(self, key, args, kwargs): def get_value(self, key, args, kwargs):
@ -112,7 +115,7 @@ class CatalogBuilder(object):
if self.generate_for_kindle_mobi: if self.generate_for_kindle_mobi:
return '&#x25b7;' return '&#x25b7;'
else: else:
return '&nbsp;' return NBSP
def __init__(self, db, _opts, plugin, def __init__(self, db, _opts, plugin,
report_progress=DummyReporter(), report_progress=DummyReporter(),
@ -1326,7 +1329,7 @@ class CatalogBuilder(object):
""" """
# Kindle TOC descriptions won't render certain characters # Kindle TOC descriptions won't render certain characters
# Fix up # Fix up
massaged = unicode_type(BeautifulStoneSoup(description, convertEntities=BeautifulStoneSoup.HTML_ENTITIES)) massaged = xml_replace_entities(unicode_type(description))
# Replace '&' with '&#38;' # Replace '&' with '&#38;'
massaged = re.sub("&", "&#38;", massaged) massaged = re.sub("&", "&#38;", massaged)
@ -1354,7 +1357,7 @@ class CatalogBuilder(object):
if self.opts.fmt == 'mobi': if self.opts.fmt == 'mobi':
codeTag = soup.new_tag("code") codeTag = soup.new_tag("code")
if prefix_char is None: if prefix_char is None:
codeTag.insert(0, NavigableString('&nbsp;')) codeTag.insert(0, NavigableString(NBSP))
else: else:
codeTag.insert(0, NavigableString(prefix_char)) codeTag.insert(0, NavigableString(prefix_char))
return codeTag return codeTag
@ -1362,7 +1365,7 @@ class CatalogBuilder(object):
spanTag = soup.new_tag("span") spanTag = soup.new_tag("span")
spanTag['class'] = "prefix" spanTag['class'] = "prefix"
if prefix_char is None: if prefix_char is None:
prefix_char = "&nbsp;" prefix_char = NBSP
spanTag.insert(0, NavigableString(prefix_char)) spanTag.insert(0, NavigableString(prefix_char))
return spanTag return spanTag
@ -2711,7 +2714,7 @@ class CatalogBuilder(object):
if i < len(book['genres']) - 1: if i < len(book['genres']) - 1:
genresTag.insert(gtc, NavigableString(' &middot; ')) genresTag.insert(gtc, NavigableString(' &middot; '))
gtc += 1 gtc += 1
genres = genresTag.renderContents() genres = genresTag.decode_contents()
# Formats # Formats
formats = [] formats = []
@ -2793,7 +2796,7 @@ class CatalogBuilder(object):
if publisher == ' ': if publisher == ' ':
publisherTag = body.find('td', attrs={'class': 'publisher'}) publisherTag = body.find('td', attrs={'class': 'publisher'})
if publisherTag: if publisherTag:
publisherTag.contents[0].replaceWith('&nbsp;') publisherTag.contents[0].replaceWith(NBSP)
if not genres: if not genres:
genresTag = body.find('p', attrs={'class': 'genres'}) genresTag = body.find('p', attrs={'class': 'genres'})
@ -2808,12 +2811,12 @@ class CatalogBuilder(object):
if note_content == '': if note_content == '':
tdTag = body.find('td', attrs={'class': 'notes'}) tdTag = body.find('td', attrs={'class': 'notes'})
if tdTag: if tdTag:
tdTag.contents[0].replaceWith('&nbsp;') tdTag.contents[0].replaceWith(NBSP)
emptyTags = body.findAll('td', attrs={'class': 'empty'}) emptyTags = body.findAll('td', attrs={'class': 'empty'})
for mt in emptyTags: for mt in emptyTags:
newEmptyTag = soup.new_tag('td') newEmptyTag = soup.new_tag('td')
newEmptyTag.insert(0, '\xa0') newEmptyTag.insert(0, NBSP)
mt.replaceWith(newEmptyTag) mt.replaceWith(newEmptyTag)
return soup return soup
@ -2974,7 +2977,7 @@ class CatalogBuilder(object):
<ncx xmlns="http://www.daisy.org/z3986/2005/ncx/" xmlns:calibre="http://calibre.kovidgoyal.net/2009/metadata" version="2005-1" xml:lang="en"> <ncx xmlns="http://www.daisy.org/z3986/2005/ncx/" xmlns:calibre="http://calibre.kovidgoyal.net/2009/metadata" version="2005-1" xml:lang="en">
</ncx> </ncx>
''' '''
soup = BeautifulStoneSoup(header, selfClosingTags=['content', 'calibre:meta-img']) soup = BeautifulStoneSoup(header)
ncx = soup.find('ncx') ncx = soup.find('ncx')
navMapTag = soup.new_tag('navMap') navMapTag = soup.new_tag('navMap')
@ -4033,7 +4036,7 @@ class CatalogBuilder(object):
</package> </package>
'''.replace('LANG', lang) '''.replace('LANG', lang)
# Add the supplied metadata tags # Add the supplied metadata tags
soup = BeautifulStoneSoup(header, selfClosingTags=['item', 'itemref', 'meta', 'reference']) soup = BeautifulStoneSoup(header)
metadata = soup.find('metadata') metadata = soup.find('metadata')
mtc = 0 mtc = 0
@ -4171,8 +4174,11 @@ class CatalogBuilder(object):
guide.insert(0, referenceTag) guide.insert(0, referenceTag)
# Write the OPF file # Write the OPF file
outfile = open("%s/%s.opf" % (self.catalog_path, self.opts.basename), 'w') output = soup.prettify(encoding='utf-8')
outfile.write(soup.prettify()) if isinstance(output, unicode_type):
output = output.encode('utf-8')
with lopen("%s/%s.opf" % (self.catalog_path, self.opts.basename), 'wb') as outfile:
outfile.write(output)
def generate_rating_string(self, book): def generate_rating_string(self, book):
""" Generate rating string for Descriptions. """ Generate rating string for Descriptions.
@ -4657,7 +4663,7 @@ class CatalogBuilder(object):
elem.extract() elem.extract()
# Reconstruct comments w/o <div>s # Reconstruct comments w/o <div>s
comments = soup.renderContents(None) comments = soup.decode_contents()
# Convert \n\n to <p>s # Convert \n\n to <p>s
if re.search('\n\n', comments): if re.search('\n\n', comments):
@ -4669,7 +4675,7 @@ class CatalogBuilder(object):
pTag.insert(0, p) pTag.insert(0, p)
soup.insert(tsc, pTag) soup.insert(tsc, pTag)
tsc += 1 tsc += 1
comments = soup.renderContents(None) comments = soup.decode_contents()
# Convert solo returns to <br /> # Convert solo returns to <br />
comments = re.sub('[\r\n]', '<br />', comments) comments = re.sub('[\r\n]', '<br />', comments)
@ -4726,7 +4732,7 @@ class CatalogBuilder(object):
result.insert(rtc, elem) result.insert(rtc, elem)
rtc += 1 rtc += 1
return result.renderContents(encoding=None) return result.decode_contents()
def merge_comments(self, record): def merge_comments(self, record):
""" Merge comments with custom column content. """ Merge comments with custom column content.
@ -4954,6 +4960,9 @@ class CatalogBuilder(object):
""" """
self.update_progress_full_step(_("Saving NCX")) self.update_progress_full_step(_("Saving NCX"))
ncx = self.ncx_soup.prettify(encoding='utf-8')
if isinstance(ncx, unicode_type):
ncx = ncx.encode('utf-8')
outfile = open("%s/%s.ncx" % (self.catalog_path, self.opts.basename), 'w') with lopen("%s/%s.ncx" % (self.catalog_path, self.opts.basename), 'wb') as outfile:
outfile.write(self.ncx_soup.prettify()) outfile.write(ncx)

View File

@ -131,7 +131,7 @@ def comments_to_html(comments):
for t in result.findAll(text=True): for t in result.findAll(text=True):
t.replaceWith(prepare_string_for_xml(unicode_type(t))) t.replaceWith(prepare_string_for_xml(unicode_type(t)))
return result.renderContents(encoding=None) return result.decode_contents()
def markdown(val): def markdown(val):