When parsing for lxml via BeautifulSoup, use the calibre modified copy of BeautifulSoup (more robust). Fixes #889890 (Amazon metadata download BeautifulSoup error)

This commit is contained in:
Kovid Goyal 2011-11-14 09:28:34 +05:30
parent 8b3698ac26
commit 69c20527f6
6 changed files with 139 additions and 12 deletions

View File

@ -12,7 +12,7 @@ from urllib import urlencode
from threading import Thread from threading import Thread
from Queue import Queue, Empty from Queue import Queue, Empty
from lxml.html import soupparser, tostring from lxml.html import tostring
from calibre import as_unicode from calibre import as_unicode
from calibre.ebooks.metadata import check_isbn from calibre.ebooks.metadata import check_isbn
@ -23,6 +23,7 @@ from calibre.ebooks.metadata.book.base import Metadata
from calibre.library.comments import sanitize_comments_html from calibre.library.comments import sanitize_comments_html
from calibre.utils.date import parse_date from calibre.utils.date import parse_date
from calibre.utils.localization import canonicalize_lang from calibre.utils.localization import canonicalize_lang
from calibre.utils.soupparser import fromstring
class Worker(Thread): # Get details {{{ class Worker(Thread): # Get details {{{
@ -199,7 +200,7 @@ class Worker(Thread): # Get details {{{
return return
try: try:
root = soupparser.fromstring(clean_ascii_chars(raw)) root = fromstring(clean_ascii_chars(raw))
except: except:
msg = 'Failed to parse amazon details page: %r'%self.url msg = 'Failed to parse amazon details page: %r'%self.url
self.log.exception(msg) self.log.exception(msg)
@ -623,7 +624,7 @@ class Amazon(Source):
if found: if found:
try: try:
root = soupparser.fromstring(clean_ascii_chars(raw)) root = fromstring(clean_ascii_chars(raw))
except: except:
msg = 'Failed to parse amazon page for query: %r'%query msg = 'Failed to parse amazon page for query: %r'%query
log.exception(msg) log.exception(msg)

View File

@ -14,13 +14,13 @@ from threading import RLock
from Queue import Queue, Empty from Queue import Queue, Empty
from lxml import html from lxml import html
from lxml.html import soupparser
from calibre.ebooks.metadata import check_isbn from calibre.ebooks.metadata import check_isbn
from calibre.ebooks.metadata.sources.base import Source, Option from calibre.ebooks.metadata.sources.base import Source, Option
from calibre.ebooks.metadata.book.base import Metadata from calibre.ebooks.metadata.book.base import Metadata
from calibre.ebooks.chardet import xml_to_unicode from calibre.ebooks.chardet import xml_to_unicode
from calibre.library.comments import sanitize_comments_html from calibre.library.comments import sanitize_comments_html
from calibre.utils.soupparser import fromstring
ovrdrv_data_cache = {} ovrdrv_data_cache = {}
cache_lock = RLock() cache_lock = RLock()
@ -403,7 +403,7 @@ class OverDrive(Source):
raw = xml_to_unicode(raw, strip_encoding_pats=True, raw = xml_to_unicode(raw, strip_encoding_pats=True,
resolve_entities=True)[0] resolve_entities=True)[0]
try: try:
root = soupparser.fromstring(raw) root = fromstring(raw)
except: except:
return False return False

View File

@ -353,14 +353,14 @@ class MobiReader(object):
self.processed_html = self.remove_random_bytes(self.processed_html) self.processed_html = self.remove_random_bytes(self.processed_html)
root = html.fromstring(self.processed_html) root = html.fromstring(self.processed_html)
if root.xpath('descendant::p/descendant::p'): if root.xpath('descendant::p/descendant::p'):
from lxml.html import soupparser from calibre.utils.soupparser import fromstring
self.log.warning('Malformed markup, parsing using BeautifulSoup') self.log.warning('Malformed markup, parsing using BeautifulSoup')
try: try:
root = soupparser.fromstring(self.processed_html) root = fromstring(self.processed_html)
except Exception: except Exception:
self.log.warning('MOBI markup appears to contain random bytes. Stripping.') self.log.warning('MOBI markup appears to contain random bytes. Stripping.')
self.processed_html = self.remove_random_bytes(self.processed_html) self.processed_html = self.remove_random_bytes(self.processed_html)
root = soupparser.fromstring(self.processed_html) root = fromstring(self.processed_html)
if root.tag != 'html': if root.tag != 'html':
self.log.warn('File does not have opening <html> tag') self.log.warn('File does not have opening <html> tag')

View File

@ -894,8 +894,8 @@ class Manifest(object):
except etree.XMLSyntaxError as err: except etree.XMLSyntaxError as err:
self.oeb.logger.warn('Parsing file %r as HTML' % self.href) self.oeb.logger.warn('Parsing file %r as HTML' % self.href)
if err.args and err.args[0].startswith('Excessive depth'): if err.args and err.args[0].startswith('Excessive depth'):
from lxml.html import soupparser from calibre.utils.soupparser import fromstring
data = soupparser.fromstring(data) data = fromstring(data)
else: else:
data = html.fromstring(data) data = html.fromstring(data)
data.attrib.pop('xmlns', None) data.attrib.pop('xmlns', None)

View File

@ -8,7 +8,6 @@ __docformat__ = 'restructuredtext en'
import re, os import re, os
from lxml import html from lxml import html
from lxml.html import soupparser
from PyQt4.Qt import QApplication, QFontInfo, QSize, QWidget, QPlainTextEdit, \ from PyQt4.Qt import QApplication, QFontInfo, QSize, QWidget, QPlainTextEdit, \
QToolBar, QVBoxLayout, QAction, QIcon, Qt, QTabWidget, QUrl, \ QToolBar, QVBoxLayout, QAction, QIcon, Qt, QTabWidget, QUrl, \
@ -19,6 +18,7 @@ from PyQt4.QtWebKit import QWebView, QWebPage
from calibre.ebooks.chardet import xml_to_unicode from calibre.ebooks.chardet import xml_to_unicode
from calibre import xml_replace_entities from calibre import xml_replace_entities
from calibre.gui2 import open_url from calibre.gui2 import open_url
from calibre.utils.soupparser import fromstring
class PageAction(QAction): # {{{ class PageAction(QAction): # {{{
@ -227,7 +227,7 @@ class EditorWidget(QWebView): # {{{
try: try:
root = html.fromstring(raw) root = html.fromstring(raw)
except: except:
root = soupparser.fromstring(raw) root = fromstring(raw)
elems = [] elems = []
for body in root.xpath('//body'): for body in root.xpath('//body'):

View File

@ -0,0 +1,126 @@
__doc__ = """External interface to the BeautifulSoup HTML parser.
"""
__all__ = ["fromstring", "parse", "convert_tree"]
from lxml import etree, html
from calibre.ebooks.BeautifulSoup import \
BeautifulSoup, Tag, Comment, ProcessingInstruction, NavigableString
def fromstring(data, beautifulsoup=None, makeelement=None, **bsargs):
    """Parse a string of HTML data into an Element tree using the
    BeautifulSoup parser.

    Returns the root ``<html>`` Element of the tree.

    You can pass a different BeautifulSoup parser through the
    `beautifulsoup` keyword, and a different Element factory function
    through the `makeelement` keyword.  By default, the standard
    ``BeautifulSoup`` class and the default factory of `lxml.html` are
    used.

    Any extra keyword arguments (`bsargs`) are passed through to the
    BeautifulSoup constructor.
    """
    return _parse(data, beautifulsoup, makeelement, **bsargs)
def parse(file, beautifulsoup=None, makeelement=None, **bsargs):
    """Parse a file into an ElementTree using the BeautifulSoup parser.

    `file` may be an open file-like object (anything with a ``read``
    method) or a path to open.

    You can pass a different BeautifulSoup parser through the
    `beautifulsoup` keyword, and a different Element factory function
    through the `makeelement` keyword.  By default, the standard
    ``BeautifulSoup`` class and the default factory of `lxml.html` are
    used.
    """
    if not hasattr(file, 'read'):
        # We opened the file ourselves, so we are responsible for
        # closing it (the previous version leaked the handle).
        f = open(file)
        try:
            root = _parse(f, beautifulsoup, makeelement, **bsargs)
        finally:
            f.close()
    else:
        root = _parse(file, beautifulsoup, makeelement, **bsargs)
    return etree.ElementTree(root)
def convert_tree(beautiful_soup_tree, makeelement=None):
    """Convert a BeautifulSoup tree to a list of Element trees.

    Returns a list instead of a single root Element to support
    HTML-like soup with more than one root element.

    You can pass a different Element factory through the `makeelement`
    keyword.
    """
    if makeelement is None:
        makeelement = html.html_parser.makeelement
    root = _convert_tree(beautiful_soup_tree, makeelement)
    # Snapshot the children before detaching them from the temporary
    # root element; list(root) replaces the deprecated
    # Element.getchildren() with identical semantics.
    children = list(root)
    for child in children:
        root.remove(child)
    return children
# helpers
def _parse(source, beautifulsoup, makeelement, **bsargs):
    # Internal workhorse shared by fromstring() and parse(): run the
    # soup parser over `source` and convert the result to lxml Elements.
    bs_cls = BeautifulSoup if beautifulsoup is None else beautifulsoup
    factory = html.html_parser.makeelement if makeelement is None else makeelement
    # Have BeautifulSoup resolve HTML entities unless the caller chose
    # a different policy explicitly.
    bsargs.setdefault('convertEntities', 'html')
    soup = bs_cls(source, **bsargs)
    root = _convert_tree(soup, factory)
    # from ET: wrap the document in a html root element, if necessary.
    # If the soup already yielded a single <html> child, unwrap it;
    # otherwise relabel the synthetic wrapper element as <html>.
    if len(root) == 1 and root[0].tag == "html":
        return root[0]
    root.tag = "html"
    return root
def _convert_tree(beautiful_soup_tree, makeelement):
    # Build the root Element from the soup's top-level node, then
    # recursively copy its children across.
    attrs = dict(beautiful_soup_tree.attrs)
    root = makeelement(beautiful_soup_tree.name, attrib=attrs)
    _convert_children(root, beautiful_soup_tree, makeelement)
    return root
def _convert_children(parent, beautiful_soup_tree, makeelement):
    # Recursively copy the children of a BeautifulSoup node onto the
    # lxml Element `parent`.  `et_child` tracks the most recently
    # created sibling Element so that character data can be attached
    # either as parent.text (no sibling yet) or as that sibling's tail.
    SubElement = etree.SubElement
    et_child = None
    for child in beautiful_soup_tree:
        if isinstance(child, Tag):
            # Element node: unescape attribute values (BeautifulSoup
            # can leave entities in attributes) and recurse into it.
            # NOTE(review): child.attrs is iterated as (key, value)
            # pairs, i.e. the BeautifulSoup-3-style list of tuples.
            et_child = SubElement(parent, child.name, attrib=dict(
                [(k, unescape(v)) for (k,v) in child.attrs]))
            _convert_children(et_child, child, makeelement)
        elif type(child) is NavigableString:
            # Plain text node.  The exact-type check matters: classes
            # such as Comment appear to subclass NavigableString and
            # must fall through to the branches below -- TODO confirm
            # against the calibre BeautifulSoup class hierarchy.
            _append_text(parent, et_child, unescape(child))
        else:
            if isinstance(child, Comment):
                parent.append(etree.Comment(child))
            elif isinstance(child, ProcessingInstruction):
                # BeautifulSoup stores the PI as one string; split it
                # into (target, text) for lxml.
                parent.append(etree.ProcessingInstruction(
                    *child.split(' ', 1)))
            else: # CData
                _append_text(parent, et_child, unescape(child))
def _append_text(parent, element, text):
    # Attach character data at the current position: before the first
    # child element it belongs in parent.text; after a child element it
    # becomes that element's tail.  Existing text is extended, never
    # replaced.
    if element is not None:
        element.tail = (element.tail or '') + text
    else:
        parent.text = (parent.text or '') + text
# copied from ET's ElementSoup

# name2codepoint maps HTML entity names (e.g. 'amp') to code points;
# its module moved in Python 3, so try the new location first.
try:
    from html.entities import name2codepoint # Python 3
    name2codepoint
except ImportError:
    from htmlentitydefs import name2codepoint # Python 2

import re
# Matches a named character entity such as "&amp;".  Use a raw string
# so the "\w" escape is passed to the regex engine verbatim instead of
# relying on Python leaving unknown string escapes alone.
handle_entities = re.compile(r"&(\w+);").sub
def unescape(string):
    """Replace named HTML entities (e.g. ``&amp;nbsp;``) in `string` with
    the corresponding characters.  Unknown entities are left as-is.
    Falsy input (None, empty string) yields ``''``.
    """
    if not string:
        return ''
    # unichr does not exist on Python 3 (where chr is unicode-aware);
    # the name2codepoint import above already handles Python 3, so the
    # conversion function must too.
    try:
        _unichr = unichr
    except NameError:
        _unichr = chr
    # work around oddities in BeautifulSoup's entity handling
    def unescape_entity(m):
        try:
            return _unichr(name2codepoint[m.group(1)])
        except KeyError:
            return m.group(0) # use as is
    return handle_entities(unescape_entity, string)