mirror of
https://github.com/kovidgoyal/calibre.git
synced 2025-07-09 03:04:10 -04:00
Port use of renderContents and BeautifulStoneSoup
This commit is contained in:
parent
c89b656df4
commit
256c7563b6
@ -28,10 +28,10 @@ class Adventure_zone(BasicNewsRecipe):
|
||||
def skip_ad_pages(self, soup):
|
||||
skip_tag = soup.body.find(attrs={'class':'subject'})
|
||||
skip_tag = skip_tag.findAll(name='a', href=True)
|
||||
title = soup.title.renderContents().lower()
|
||||
title = soup.title.renderContents().decode('utf-8').lower()
|
||||
if self._is_linked_text(title):
|
||||
for r in skip_tag:
|
||||
word = r.renderContents()
|
||||
word = r.renderContents().decode('utf-8')
|
||||
if not word:
|
||||
continue
|
||||
word = word.lower()
|
||||
|
@ -104,7 +104,7 @@ class BerlinPolicyJournal(BasicNewsRecipe):
|
||||
div = soup.find('div', {'class': 'meta-info'})
|
||||
authors = ''
|
||||
for entry in div.findAll('span', {'class': 'entry-author'}):
|
||||
authors = authors + entry.a.span.renderContents().strip() + ', '
|
||||
date = div.find('time').renderContents().strip()
|
||||
authors = authors + entry.a.span.renderContents().decode('utf-8').strip() + ', '
|
||||
date = div.find('time').renderContents().decode('utf-8').strip()
|
||||
div.replaceWith('<div>' + date + ' | ' + authors[:-2] + '<br/></div>')
|
||||
return soup
|
||||
|
@ -83,5 +83,5 @@ class AdvancedUserRecipe1303841067(BasicNewsRecipe):
|
||||
br.replaceWith(' ')
|
||||
# remove all links
|
||||
for a in soup.findAll('a'):
|
||||
a.replaceWith(a.renderContents())
|
||||
a.replaceWith(a.renderContents().decode('utf-8'))
|
||||
return soup
|
||||
|
@ -8,7 +8,7 @@ www.canada.com
|
||||
'''
|
||||
import re
|
||||
from calibre.web.feeds.recipes import BasicNewsRecipe
|
||||
from calibre.ebooks.BeautifulSoup import BeautifulStoneSoup, Tag
|
||||
from calibre.ebooks.BeautifulSoup import Tag
|
||||
|
||||
|
||||
def new_tag(soup, name, attrs=()):
|
||||
@ -183,14 +183,6 @@ class CanWestPaper(BasicNewsRecipe):
|
||||
return fixed
|
||||
|
||||
def massageNCXText(self, description):
|
||||
# Kindle TOC descriptions won't render certain characters
|
||||
if description:
|
||||
massaged = unicode(BeautifulStoneSoup(
|
||||
description, convertEntities=BeautifulStoneSoup.HTML_ENTITIES))
|
||||
# Replace '&' with '&#38;'
|
||||
massaged = re.sub("&", "&", massaged)
|
||||
return self.fixChars(massaged)
|
||||
else:
|
||||
return description
|
||||
|
||||
def populate_article_metadata(self, article, soup, first):
|
||||
|
@ -8,7 +8,7 @@ www.canada.com
|
||||
'''
|
||||
import re
|
||||
from calibre.web.feeds.recipes import BasicNewsRecipe
|
||||
from calibre.ebooks.BeautifulSoup import BeautifulStoneSoup, Tag
|
||||
from calibre.ebooks.BeautifulSoup import Tag
|
||||
|
||||
|
||||
def new_tag(soup, name, attrs=()):
|
||||
@ -183,14 +183,6 @@ class CanWestPaper(BasicNewsRecipe):
|
||||
return fixed
|
||||
|
||||
def massageNCXText(self, description):
|
||||
# Kindle TOC descriptions won't render certain characters
|
||||
if description:
|
||||
massaged = unicode(BeautifulStoneSoup(
|
||||
description, convertEntities=BeautifulStoneSoup.HTML_ENTITIES))
|
||||
# Replace '&' with '&#38;'
|
||||
massaged = re.sub("&", "&", massaged)
|
||||
return self.fixChars(massaged)
|
||||
else:
|
||||
return description
|
||||
|
||||
def populate_article_metadata(self, article, soup, first):
|
||||
|
@ -1,5 +1,6 @@
|
||||
from calibre.web.feeds.recipes import BasicNewsRecipe
|
||||
from calibre.ebooks.BeautifulSoup import BeautifulSoup, BeautifulStoneSoup
|
||||
from calibre.ebooks.BeautifulSoup import BeautifulSoup
|
||||
from lxml import etree
|
||||
|
||||
|
||||
class Ekathimerini(BasicNewsRecipe):
|
||||
@ -41,12 +42,10 @@ class Ekathimerini(BasicNewsRecipe):
|
||||
|
||||
def parse_index(self):
|
||||
idx_contents = self.browser.open(self.rss_url).read()
|
||||
idx = BeautifulStoneSoup(
|
||||
idx_contents, convertEntities=BeautifulStoneSoup.XML_ENTITIES)
|
||||
idx = etree.fromstring(idx_contents)
|
||||
|
||||
cats = list(set([self.tag_to_string(subcat)
|
||||
for subcat in idx.findAll('subcat')]))
|
||||
cats.sort()
|
||||
cats = sorted({self.tag_to_string(subcat)
|
||||
for subcat in idx.xpath('//*[local-name()="subcat"]')})
|
||||
|
||||
feeds = [(u'News', list(self.find_articles(idx, u'')))]
|
||||
|
||||
|
@ -52,29 +52,29 @@ class FokkeEnSukkeRecipe(BasicNewsRecipe):
|
||||
# If there's only one, there is just a link with the dayname.
|
||||
# If there are two, there are three links in sequence: <a>dayname</a> <a>1</a> <a>2</a>.
|
||||
# In that case we're interested in the last two.
|
||||
if links[i].renderContents() in dayNames:
|
||||
if links[i].renderContents().decode('utf-8') in dayNames:
|
||||
# If the link is not in daynames, we processed it already, but if it is, let's see
|
||||
# if the next one has '1' as content
|
||||
if (i + 1 <= maxIndex) and (links[i + 1].renderContents() == '1'):
|
||||
if (i + 1 <= maxIndex) and (links[i + 1].renderContents().decode('utf-8') == '1'):
|
||||
# Got you! Add it to the list
|
||||
article = {'title': links[i].renderContents(
|
||||
article = {'title': links[i].renderContents().decode('utf-8'
|
||||
)+ ' 1', 'date': u'', 'url': self.INDEX + links[i + 1]['href'], 'description': ''}
|
||||
articles.append(article)
|
||||
# If there is a '1', there should be a '2' as well, but
|
||||
# better safe than sorry
|
||||
if (i + 2 <= maxIndex) and (links[i + 2].renderContents() == '2'):
|
||||
if (i + 2 <= maxIndex) and (links[i + 2].renderContents().decode('utf-8') == '2'):
|
||||
# Got you! Add it to the list
|
||||
article = {'title': links[i].renderContents(
|
||||
) + ' 2', 'date': u'', 'url': self.INDEX + links[i + 2]['href'], 'description': ''}
|
||||
).decode('utf-8') + ' 2', 'date': u'', 'url': self.INDEX + links[i + 2]['href'], 'description': ''}
|
||||
articles.append(article)
|
||||
else:
|
||||
# There is only one cartoon for this day. Add it to the
|
||||
# list.
|
||||
article = {'title': links[i].renderContents(
|
||||
), 'date': u'', 'url': self.INDEX + links[i]['href'], 'description': ''}
|
||||
).decode('utf-8'), 'date': u'', 'url': self.INDEX + links[i]['href'], 'description': ''}
|
||||
articles.append(article)
|
||||
# Might as well use the weeknumber as title
|
||||
week = index.find('span', attrs={'class': 'week'}).renderContents()
|
||||
week = index.find('span', attrs={'class': 'week'}).renderContents().decode('utf-8')
|
||||
|
||||
return [[week, articles]]
|
||||
|
||||
|
@ -46,7 +46,7 @@ class Gildia(BasicNewsRecipe):
|
||||
|
||||
words = ('recenzj', 'zapowied', 'fragmen',
|
||||
'relacj', 'wywiad', 'nominacj')
|
||||
document_title = soup.title.renderContents().lower()
|
||||
document_title = soup.title.renderContents().decode('utf-8').lower()
|
||||
for word in words:
|
||||
if word in document_title:
|
||||
for link in content.findAll(name='a'):
|
||||
@ -57,7 +57,7 @@ class Gildia(BasicNewsRecipe):
|
||||
return self.index_to_soup(tag['href'], raw=True)
|
||||
|
||||
def preprocess_html(self, soup):
|
||||
title = soup.title.renderContents().lower()
|
||||
title = soup.title.renderContents().decode('utf-8').lower()
|
||||
for a in soup('a', href=True):
|
||||
if not a['href'].startswith('http'):
|
||||
if '/gry/' in a['href']:
|
||||
|
@ -129,7 +129,7 @@ class Handelsblatt(BasicNewsRecipe):
|
||||
def postprocess_html(self, soup, first_fetch):
|
||||
# convert lists of author(s) and date(s) into simple text
|
||||
for cap in soup.findAll('div', {'class': re.compile('vhb-article-caption')}):
|
||||
cap.replaceWith(cap.renderContents().strip() + ' ')
|
||||
cap.replaceWith(cap.renderContents().decode('utf-8').strip() + ' ')
|
||||
for row in soup.findAll('div', {'class': 'vhb-article-author-row'}):
|
||||
for ul in row.findAll('ul'):
|
||||
entry = ''
|
||||
@ -141,7 +141,7 @@ class Handelsblatt(BasicNewsRecipe):
|
||||
# remove all local hyperlinks
|
||||
for a in soup.findAll('a', {'href': True}):
|
||||
if a['href'] and a['href'][0] in ['/', '#']:
|
||||
a.replaceWith(a.renderContents())
|
||||
a.replaceWith(a.renderContents().decode('utf-8'))
|
||||
# make sure that all figure captions (including the source) are shown
|
||||
# without linebreaks by using the alternative text given within <img/>
|
||||
# instead of the original text (which is oddly formatted)
|
||||
|
@ -63,7 +63,7 @@ class JoopRecipe(BasicNewsRecipe):
|
||||
for section in sections:
|
||||
articles = []
|
||||
h2 = div.find(lambda tag: tag.name ==
|
||||
'h2' and tag.renderContents() == section)
|
||||
'h2' and tag.renderContents().decode('utf-8') == section)
|
||||
if h2:
|
||||
ul = h2.findNextSibling('ul', 'linklist')
|
||||
if ul:
|
||||
|
@ -65,14 +65,14 @@ class Mediapart(BasicNewsRecipe):
|
||||
|
||||
# print "found fil ",title
|
||||
article_type = article.find('a', {'href': re.compile(
|
||||
r'.*\/type-darticles\/.*')}).renderContents()
|
||||
r'.*\/type-darticles\/.*')}).renderContents().decode('utf-8')
|
||||
# print "kind: ",article_type
|
||||
|
||||
for s in title('span'):
|
||||
s.replaceWith(s.renderContents() + "\n")
|
||||
s.replaceWith(s.renderContents().decode('utf-8') + "\n")
|
||||
url = title.find('a', href=True)['href']
|
||||
|
||||
# article_date = self.parse_french_date(article.find("span", "article-date").renderContents())
|
||||
# article_date = self.parse_french_date(article.find("span", "article-date").renderContents().decode('utf-8'))
|
||||
# print("################################# 9")
|
||||
# print(article_date)
|
||||
|
||||
|
@ -8,7 +8,7 @@ www.canada.com
|
||||
'''
|
||||
import re
|
||||
from calibre.web.feeds.recipes import BasicNewsRecipe
|
||||
from calibre.ebooks.BeautifulSoup import BeautifulStoneSoup, Tag
|
||||
from calibre.ebooks.BeautifulSoup import Tag
|
||||
|
||||
|
||||
def new_tag(soup, name, attrs=()):
|
||||
@ -183,14 +183,6 @@ class CanWestPaper(BasicNewsRecipe):
|
||||
return fixed
|
||||
|
||||
def massageNCXText(self, description):
|
||||
# Kindle TOC descriptions won't render certain characters
|
||||
if description:
|
||||
massaged = unicode(BeautifulStoneSoup(
|
||||
description, convertEntities=BeautifulStoneSoup.HTML_ENTITIES))
|
||||
# Replace '&' with '&#38;'
|
||||
massaged = re.sub("&", "&", massaged)
|
||||
return self.fixChars(massaged)
|
||||
else:
|
||||
return description
|
||||
|
||||
def populate_article_metadata(self, article, soup, first):
|
||||
|
@ -76,7 +76,7 @@ class NrcNextRecipe(BasicNewsRecipe):
|
||||
# In this feed/page articles can be written by more than one author.
|
||||
# It is nice to see their names in the titles.
|
||||
flag = post.find('h2', attrs={'class': 'vlag'})
|
||||
author = flag.contents[0].renderContents()
|
||||
author = flag.contents[0].renderContents().decode('utf-8')
|
||||
completeTitle = u''.join([author, u': ', title])
|
||||
else:
|
||||
completeTitle = title
|
||||
|
@ -8,7 +8,7 @@ www.canada.com
|
||||
'''
|
||||
import re
|
||||
from calibre.web.feeds.recipes import BasicNewsRecipe
|
||||
from calibre.ebooks.BeautifulSoup import BeautifulStoneSoup, Tag
|
||||
from calibre.ebooks.BeautifulSoup import Tag
|
||||
|
||||
|
||||
def new_tag(soup, name, attrs=()):
|
||||
@ -183,14 +183,6 @@ class CanWestPaper(BasicNewsRecipe):
|
||||
return fixed
|
||||
|
||||
def massageNCXText(self, description):
|
||||
# Kindle TOC descriptions won't render certain characters
|
||||
if description:
|
||||
massaged = unicode(BeautifulStoneSoup(
|
||||
description, convertEntities=BeautifulStoneSoup.HTML_ENTITIES))
|
||||
# Replace '&' with '&#38;'
|
||||
massaged = re.sub("&", "&", massaged)
|
||||
return self.fixChars(massaged)
|
||||
else:
|
||||
return description
|
||||
|
||||
def populate_article_metadata(self, article, soup, first):
|
||||
|
@ -9,7 +9,6 @@ www.canada.com
|
||||
|
||||
import re
|
||||
from calibre.web.feeds.news import BasicNewsRecipe
|
||||
from calibre.ebooks.BeautifulSoup import BeautifulStoneSoup
|
||||
|
||||
|
||||
class CanWestPaper(BasicNewsRecipe):
|
||||
@ -144,14 +143,6 @@ class CanWestPaper(BasicNewsRecipe):
|
||||
return fixed
|
||||
|
||||
def massageNCXText(self, description):
|
||||
# Kindle TOC descriptions won't render certain characters
|
||||
if description:
|
||||
massaged = unicode(BeautifulStoneSoup(
|
||||
description, convertEntities=BeautifulStoneSoup.HTML_ENTITIES))
|
||||
# Replace '&' with '&#38;'
|
||||
massaged = re.sub("&", "&", massaged)
|
||||
return self.fixChars(massaged)
|
||||
else:
|
||||
return description
|
||||
|
||||
def populate_article_metadata(self, article, soup, first):
|
||||
|
@ -9,7 +9,6 @@ www.canada.com
|
||||
|
||||
import re
|
||||
from calibre.web.feeds.news import BasicNewsRecipe
|
||||
from calibre.ebooks.BeautifulSoup import BeautifulStoneSoup
|
||||
|
||||
|
||||
class CanWestPaper(BasicNewsRecipe):
|
||||
@ -144,14 +143,6 @@ class CanWestPaper(BasicNewsRecipe):
|
||||
return fixed
|
||||
|
||||
def massageNCXText(self, description):
|
||||
# Kindle TOC descriptions won't render certain characters
|
||||
if description:
|
||||
massaged = unicode(BeautifulStoneSoup(
|
||||
description, convertEntities=BeautifulStoneSoup.HTML_ENTITIES))
|
||||
# Replace '&' with '&#38;'
|
||||
massaged = re.sub("&", "&", massaged)
|
||||
return self.fixChars(massaged)
|
||||
else:
|
||||
return description
|
||||
|
||||
def populate_article_metadata(self, article, soup, first):
|
||||
|
@ -8,7 +8,7 @@ www.canada.com
|
||||
'''
|
||||
import re
|
||||
from calibre.web.feeds.recipes import BasicNewsRecipe
|
||||
from calibre.ebooks.BeautifulSoup import BeautifulStoneSoup, Tag
|
||||
from calibre.ebooks.BeautifulSoup import Tag
|
||||
|
||||
|
||||
def new_tag(soup, name, attrs=()):
|
||||
@ -196,14 +196,6 @@ class CanWestPaper(BasicNewsRecipe):
|
||||
return fixed
|
||||
|
||||
def massageNCXText(self, description):
|
||||
# Kindle TOC descriptions won't render certain characters
|
||||
if description:
|
||||
massaged = unicode(BeautifulStoneSoup(
|
||||
description, convertEntities=BeautifulStoneSoup.HTML_ENTITIES))
|
||||
# Replace '&' with '&#38;'
|
||||
massaged = re.sub("&", "&", massaged)
|
||||
return self.fixChars(massaged)
|
||||
else:
|
||||
return description
|
||||
|
||||
def populate_article_metadata(self, article, soup, first):
|
||||
|
@ -8,7 +8,7 @@ www.canada.com
|
||||
'''
|
||||
import re
|
||||
from calibre.web.feeds.recipes import BasicNewsRecipe
|
||||
from calibre.ebooks.BeautifulSoup import BeautifulStoneSoup, Tag
|
||||
from calibre.ebooks.BeautifulSoup import Tag
|
||||
|
||||
|
||||
def new_tag(soup, name, attrs=()):
|
||||
@ -184,14 +184,6 @@ class CanWestPaper(BasicNewsRecipe):
|
||||
return fixed
|
||||
|
||||
def massageNCXText(self, description):
|
||||
# Kindle TOC descriptions won't render certain characters
|
||||
if description:
|
||||
massaged = unicode(BeautifulStoneSoup(
|
||||
description, convertEntities=BeautifulStoneSoup.HTML_ENTITIES))
|
||||
# Replace '&' with '&#38;'
|
||||
massaged = re.sub("&", "&", massaged)
|
||||
return self.fixChars(massaged)
|
||||
else:
|
||||
return description
|
||||
|
||||
def populate_article_metadata(self, article, soup, first):
|
||||
|
@ -9,7 +9,7 @@ www.canada.com
|
||||
import re
|
||||
from calibre.web.feeds.news import BasicNewsRecipe
|
||||
|
||||
from calibre.ebooks.BeautifulSoup import Tag, BeautifulStoneSoup
|
||||
from calibre.ebooks.BeautifulSoup import Tag
|
||||
|
||||
|
||||
def new_tag(soup, name, attrs=()):
|
||||
@ -147,14 +147,6 @@ class TimesColonist(BasicNewsRecipe):
|
||||
return fixed
|
||||
|
||||
def massageNCXText(self, description):
|
||||
# Kindle TOC descriptions won't render certain characters
|
||||
if description:
|
||||
massaged = unicode(BeautifulStoneSoup(
|
||||
description, convertEntities=BeautifulStoneSoup.HTML_ENTITIES))
|
||||
# Replace '&' with '&#38;'
|
||||
massaged = re.sub("&", "&", massaged)
|
||||
return self.fixChars(massaged)
|
||||
else:
|
||||
return description
|
||||
|
||||
def populate_article_metadata(self, article, soup, first):
|
||||
|
@ -9,7 +9,6 @@ www.canada.com
|
||||
|
||||
import re
|
||||
from calibre.web.feeds.news import BasicNewsRecipe
|
||||
from calibre.ebooks.BeautifulSoup import BeautifulStoneSoup
|
||||
|
||||
|
||||
class CanWestPaper(BasicNewsRecipe):
|
||||
@ -144,14 +143,6 @@ class CanWestPaper(BasicNewsRecipe):
|
||||
return fixed
|
||||
|
||||
def massageNCXText(self, description):
|
||||
# Kindle TOC descriptions won't render certain characters
|
||||
if description:
|
||||
massaged = unicode(BeautifulStoneSoup(
|
||||
description, convertEntities=BeautifulStoneSoup.HTML_ENTITIES))
|
||||
# Replace '&' with '&#38;'
|
||||
massaged = re.sub("&", "&", massaged)
|
||||
return self.fixChars(massaged)
|
||||
else:
|
||||
return description
|
||||
|
||||
def populate_article_metadata(self, article, soup, first):
|
||||
|
@ -4,6 +4,7 @@
|
||||
|
||||
from __future__ import absolute_import, division, print_function, unicode_literals
|
||||
|
||||
import bs4
|
||||
from bs4 import ( # noqa
|
||||
CData, Comment, Declaration, NavigableString, ProcessingInstruction, Tag,
|
||||
__version__
|
||||
@ -27,3 +28,7 @@ def parse_html(markup):
|
||||
|
||||
def BeautifulSoup(markup='', *a, **kw):
|
||||
return parse_html(markup)
|
||||
|
||||
|
||||
def BeautifulStoneSoup(markup='', *a, **kw):
|
||||
return bs4.BeautifulSoup(markup, 'xml')
|
||||
|
@ -39,7 +39,7 @@ def _metadata_from_table(soup, searchfor):
|
||||
td = td.parent
|
||||
# there appear to be multiple ways of structuring the metadata
|
||||
# on the home page. cue some nasty special-case hacks...
|
||||
if re.match(r'^\s*'+searchfor+r'\s*$', td.renderContents(None), flags=re.I):
|
||||
if re.match(r'^\s*'+searchfor+r'\s*$', td.decode_contents(), flags=re.I):
|
||||
meta = _detag(td.findNextSibling('td'))
|
||||
return re.sub('^:', '', meta).strip()
|
||||
else:
|
||||
@ -52,7 +52,7 @@ def _metadata_from_span(soup, searchfor):
|
||||
if span is None:
|
||||
return None
|
||||
# this metadata might need some cleaning up still :/
|
||||
return _detag(span.renderContents(None).strip())
|
||||
return _detag(span.decode_contents().strip())
|
||||
|
||||
|
||||
def _get_authors(soup):
|
||||
|
@ -5,35 +5,31 @@ __copyright__ = '2008, Kovid Goyal <kovid at kovidgoyal.net>'
|
||||
Compile a LRS file into a LRF file.
|
||||
'''
|
||||
|
||||
import sys, os, logging
|
||||
import logging
|
||||
import os
|
||||
import sys
|
||||
|
||||
from calibre import setup_cli_handlers
|
||||
from calibre.utils.config import OptionParser
|
||||
from calibre.ebooks.BeautifulSoup import BeautifulStoneSoup, NavigableString, \
|
||||
CData, Tag
|
||||
from calibre.ebooks.lrf.pylrs.pylrs import Book, PageStyle, TextStyle, \
|
||||
BlockStyle, ImageStream, Font, StyleDefault, BookSetting, Header, \
|
||||
Image, ImageBlock, Page, TextBlock, Canvas, Paragraph, CR, Span, \
|
||||
Italic, Sup, Sub, Bold, EmpLine, JumpButton, CharButton, Plot, \
|
||||
DropCaps, Footer, RuledLine
|
||||
from calibre.ebooks.BeautifulSoup import (
|
||||
BeautifulStoneSoup, CData, NavigableString, Tag
|
||||
)
|
||||
from calibre.ebooks.chardet import xml_to_unicode
|
||||
from calibre.ebooks.lrf.pylrs.pylrs import (
|
||||
CR, BlockStyle, Bold, Book, BookSetting, Canvas, CharButton, DropCaps, EmpLine,
|
||||
Font, Footer, Header, Image, ImageBlock, ImageStream, Italic, JumpButton, Page,
|
||||
PageStyle, Paragraph, Plot, RuledLine, Span, StyleDefault, Sub, Sup, TextBlock,
|
||||
TextStyle
|
||||
)
|
||||
from calibre.utils.config import OptionParser
|
||||
from polyglot.builtins import string_or_bytes
|
||||
|
||||
|
||||
class LrsParser(object):
|
||||
|
||||
SELF_CLOSING_TAGS = [i.lower() for i in ['CR', 'Plot', 'NoBR', 'Space',
|
||||
'PutObj', 'RuledLine',
|
||||
'Plot', 'SetDefault', 'BookSetting', 'RegistFont',
|
||||
'PageStyle', 'TextStyle', 'BlockStyle', 'JumpTo',
|
||||
'ImageStream', 'Image']]
|
||||
|
||||
def __init__(self, stream, logger):
|
||||
self.logger = logger
|
||||
src = stream.read()
|
||||
self.soup = BeautifulStoneSoup(xml_to_unicode(src)[0],
|
||||
convertEntities=BeautifulStoneSoup.XML_ENTITIES,
|
||||
selfClosingTags=self.SELF_CLOSING_TAGS)
|
||||
self.soup = BeautifulStoneSoup(xml_to_unicode(src)[0])
|
||||
self.objects = {}
|
||||
for obj in self.soup.findAll(objid=True):
|
||||
self.objects[obj['objid']] = obj
|
||||
|
@ -1,22 +1,29 @@
|
||||
#!/usr/bin/env python2
|
||||
from __future__ import with_statement
|
||||
from __future__ import print_function
|
||||
from __future__ import print_function, with_statement
|
||||
|
||||
__license__ = 'GPL v3'
|
||||
__copyright__ = '2008, Kovid Goyal <kovid at kovidgoyal.net>'
|
||||
|
||||
'''Read meta information from epub files'''
|
||||
|
||||
import io, os, re, posixpath
|
||||
|
||||
import io
|
||||
import os
|
||||
import posixpath
|
||||
import re
|
||||
from contextlib import closing
|
||||
|
||||
from calibre.utils.zipfile import ZipFile, BadZipfile, safe_replace
|
||||
from calibre.utils.localunzip import LocalZipFile
|
||||
from calibre.ebooks.BeautifulSoup import BeautifulStoneSoup
|
||||
from calibre.ebooks.metadata.opf import get_metadata as get_metadata_from_opf, set_metadata as set_metadata_opf
|
||||
from calibre.ebooks.metadata.opf2 import OPF
|
||||
from calibre.ptempfile import TemporaryDirectory
|
||||
from lxml import etree
|
||||
|
||||
from calibre import CurrentDir, walk
|
||||
from calibre.constants import isosx
|
||||
from calibre.ebooks.metadata.opf import (
|
||||
get_metadata as get_metadata_from_opf, set_metadata as set_metadata_opf
|
||||
)
|
||||
from calibre.ebooks.metadata.opf2 import OPF
|
||||
from calibre.ptempfile import TemporaryDirectory
|
||||
from calibre.utils.localunzip import LocalZipFile
|
||||
from calibre.utils.zipfile import BadZipfile, ZipFile, safe_replace
|
||||
|
||||
|
||||
class EPubException(Exception):
|
||||
@ -36,20 +43,17 @@ class Container(dict):
|
||||
def __init__(self, stream=None):
|
||||
if not stream:
|
||||
return
|
||||
soup = BeautifulStoneSoup(stream.read())
|
||||
container = soup.find(name=re.compile(r'container$', re.I))
|
||||
if not container:
|
||||
raise OCFException("<container> element missing")
|
||||
container = etree.fromstring(stream.read())
|
||||
if container.get('version', None) != '1.0':
|
||||
raise EPubException("unsupported version of OCF")
|
||||
rootfiles = container.find(re.compile(r'rootfiles$', re.I))
|
||||
rootfiles = container.xpath('./*[local-name()="rootfiles"]')
|
||||
if not rootfiles:
|
||||
raise EPubException("<rootfiles/> element missing")
|
||||
for rootfile in rootfiles.findAll(re.compile(r'rootfile$', re.I)):
|
||||
try:
|
||||
self[rootfile['media-type']] = rootfile['full-path']
|
||||
except KeyError:
|
||||
for rootfile in rootfiles[0].xpath('./*[local-name()="rootfile"]'):
|
||||
mt, fp = rootfile.get('media-type'), rootfile.get('full-path')
|
||||
if not mt or not fp:
|
||||
raise EPubException("<rootfile/> element malformed")
|
||||
self[mt] = fp
|
||||
|
||||
|
||||
class OCF(object):
|
||||
|
@ -340,8 +340,7 @@ def render_jacket(mi, output_profile,
|
||||
if hr_tag is not None:
|
||||
hr_tag.extract()
|
||||
|
||||
return strip_encoding_declarations(
|
||||
soup.renderContents('utf-8').decode('utf-8'))
|
||||
return strip_encoding_declarations(soup.decode_contents())
|
||||
|
||||
from calibre.ebooks.oeb.base import RECOVER_PARSER
|
||||
|
||||
|
@ -9,7 +9,7 @@ from copy import deepcopy
|
||||
from xml.sax.saxutils import escape
|
||||
|
||||
from calibre import (
|
||||
prepare_string_for_xml, strftime, force_unicode, isbytestring, replace_entities, as_unicode)
|
||||
prepare_string_for_xml, strftime, force_unicode, isbytestring, replace_entities, as_unicode, xml_replace_entities)
|
||||
from calibre.constants import isosx, cache_dir
|
||||
from calibre.customize.conversion import DummyReporter
|
||||
from calibre.customize.ui import output_profiles
|
||||
@ -29,6 +29,9 @@ from calibre.utils.localization import get_lang, lang_as_iso639_1
|
||||
from polyglot.builtins import unicode_type
|
||||
|
||||
|
||||
NBSP = u'\u00a0'
|
||||
|
||||
|
||||
class Formatter(TemplateFormatter):
|
||||
|
||||
def get_value(self, key, args, kwargs):
|
||||
@ -112,7 +115,7 @@ class CatalogBuilder(object):
|
||||
if self.generate_for_kindle_mobi:
|
||||
return '▷'
|
||||
else:
|
||||
return ' '
|
||||
return NBSP
|
||||
|
||||
def __init__(self, db, _opts, plugin,
|
||||
report_progress=DummyReporter(),
|
||||
@ -1326,7 +1329,7 @@ class CatalogBuilder(object):
|
||||
"""
|
||||
# Kindle TOC descriptions won't render certain characters
|
||||
# Fix up
|
||||
massaged = unicode_type(BeautifulStoneSoup(description, convertEntities=BeautifulStoneSoup.HTML_ENTITIES))
|
||||
massaged = xml_replace_entities(unicode_type(description))
|
||||
|
||||
# Replace '&' with '&#38;'
|
||||
massaged = re.sub("&", "&", massaged)
|
||||
@ -1354,7 +1357,7 @@ class CatalogBuilder(object):
|
||||
if self.opts.fmt == 'mobi':
|
||||
codeTag = soup.new_tag("code")
|
||||
if prefix_char is None:
|
||||
codeTag.insert(0, NavigableString(' '))
|
||||
codeTag.insert(0, NavigableString(NBSP))
|
||||
else:
|
||||
codeTag.insert(0, NavigableString(prefix_char))
|
||||
return codeTag
|
||||
@ -1362,7 +1365,7 @@ class CatalogBuilder(object):
|
||||
spanTag = soup.new_tag("span")
|
||||
spanTag['class'] = "prefix"
|
||||
if prefix_char is None:
|
||||
prefix_char = " "
|
||||
prefix_char = NBSP
|
||||
spanTag.insert(0, NavigableString(prefix_char))
|
||||
return spanTag
|
||||
|
||||
@ -2711,7 +2714,7 @@ class CatalogBuilder(object):
|
||||
if i < len(book['genres']) - 1:
|
||||
genresTag.insert(gtc, NavigableString(' · '))
|
||||
gtc += 1
|
||||
genres = genresTag.renderContents()
|
||||
genres = genresTag.decode_contents()
|
||||
|
||||
# Formats
|
||||
formats = []
|
||||
@ -2793,7 +2796,7 @@ class CatalogBuilder(object):
|
||||
if publisher == ' ':
|
||||
publisherTag = body.find('td', attrs={'class': 'publisher'})
|
||||
if publisherTag:
|
||||
publisherTag.contents[0].replaceWith(' ')
|
||||
publisherTag.contents[0].replaceWith(NBSP)
|
||||
|
||||
if not genres:
|
||||
genresTag = body.find('p', attrs={'class': 'genres'})
|
||||
@ -2808,12 +2811,12 @@ class CatalogBuilder(object):
|
||||
if note_content == '':
|
||||
tdTag = body.find('td', attrs={'class': 'notes'})
|
||||
if tdTag:
|
||||
tdTag.contents[0].replaceWith(' ')
|
||||
tdTag.contents[0].replaceWith(NBSP)
|
||||
|
||||
emptyTags = body.findAll('td', attrs={'class': 'empty'})
|
||||
for mt in emptyTags:
|
||||
newEmptyTag = soup.new_tag('td')
|
||||
newEmptyTag.insert(0, '\xa0')
|
||||
newEmptyTag.insert(0, NBSP)
|
||||
mt.replaceWith(newEmptyTag)
|
||||
|
||||
return soup
|
||||
@ -2974,7 +2977,7 @@ class CatalogBuilder(object):
|
||||
<ncx xmlns="http://www.daisy.org/z3986/2005/ncx/" xmlns:calibre="http://calibre.kovidgoyal.net/2009/metadata" version="2005-1" xml:lang="en">
|
||||
</ncx>
|
||||
'''
|
||||
soup = BeautifulStoneSoup(header, selfClosingTags=['content', 'calibre:meta-img'])
|
||||
soup = BeautifulStoneSoup(header)
|
||||
|
||||
ncx = soup.find('ncx')
|
||||
navMapTag = soup.new_tag('navMap')
|
||||
@ -4033,7 +4036,7 @@ class CatalogBuilder(object):
|
||||
</package>
|
||||
'''.replace('LANG', lang)
|
||||
# Add the supplied metadata tags
|
||||
soup = BeautifulStoneSoup(header, selfClosingTags=['item', 'itemref', 'meta', 'reference'])
|
||||
soup = BeautifulStoneSoup(header)
|
||||
metadata = soup.find('metadata')
|
||||
mtc = 0
|
||||
|
||||
@ -4171,8 +4174,11 @@ class CatalogBuilder(object):
|
||||
guide.insert(0, referenceTag)
|
||||
|
||||
# Write the OPF file
|
||||
outfile = open("%s/%s.opf" % (self.catalog_path, self.opts.basename), 'w')
|
||||
outfile.write(soup.prettify())
|
||||
output = soup.prettify(encoding='utf-8')
|
||||
if isinstance(output, unicode_type):
|
||||
output = output.encode('utf-8')
|
||||
with lopen("%s/%s.opf" % (self.catalog_path, self.opts.basename), 'wb') as outfile:
|
||||
outfile.write(output)
|
||||
|
||||
def generate_rating_string(self, book):
|
||||
""" Generate rating string for Descriptions.
|
||||
@ -4657,7 +4663,7 @@ class CatalogBuilder(object):
|
||||
elem.extract()
|
||||
|
||||
# Reconstruct comments w/o <div>s
|
||||
comments = soup.renderContents(None)
|
||||
comments = soup.decode_contents()
|
||||
|
||||
# Convert \n\n to <p>s
|
||||
if re.search('\n\n', comments):
|
||||
@ -4669,7 +4675,7 @@ class CatalogBuilder(object):
|
||||
pTag.insert(0, p)
|
||||
soup.insert(tsc, pTag)
|
||||
tsc += 1
|
||||
comments = soup.renderContents(None)
|
||||
comments = soup.decode_contents()
|
||||
|
||||
# Convert solo returns to <br />
|
||||
comments = re.sub('[\r\n]', '<br />', comments)
|
||||
@ -4726,7 +4732,7 @@ class CatalogBuilder(object):
|
||||
result.insert(rtc, elem)
|
||||
rtc += 1
|
||||
|
||||
return result.renderContents(encoding=None)
|
||||
return result.decode_contents()
|
||||
|
||||
def merge_comments(self, record):
|
||||
""" Merge comments with custom column content.
|
||||
@ -4954,6 +4960,9 @@ class CatalogBuilder(object):
|
||||
"""
|
||||
|
||||
self.update_progress_full_step(_("Saving NCX"))
|
||||
ncx = self.ncx_soup.prettify(encoding='utf-8')
|
||||
if isinstance(ncx, unicode_type):
|
||||
ncx = ncx.encode('utf-8')
|
||||
|
||||
outfile = open("%s/%s.ncx" % (self.catalog_path, self.opts.basename), 'w')
|
||||
outfile.write(self.ncx_soup.prettify())
|
||||
with lopen("%s/%s.ncx" % (self.catalog_path, self.opts.basename), 'wb') as outfile:
|
||||
outfile.write(ncx)
|
||||
|
@ -131,7 +131,7 @@ def comments_to_html(comments):
|
||||
for t in result.findAll(text=True):
|
||||
t.replaceWith(prepare_string_for_xml(unicode_type(t)))
|
||||
|
||||
return result.renderContents(encoding=None)
|
||||
return result.decode_contents()
|
||||
|
||||
|
||||
def markdown(val):
|
||||
|
Loading…
x
Reference in New Issue
Block a user