From 69c20527f6399ec189c8e63459255c57cfca81c4 Mon Sep 17 00:00:00 2001 From: Kovid Goyal Date: Mon, 14 Nov 2011 09:28:34 +0530 Subject: [PATCH] When parsing for lxml via BeatifulSoup, use the calibre modified copy of BeautifulSoup (more robust). Fixes #889890 (Amazon metadata download BeautifulSoup error) --- src/calibre/ebooks/metadata/sources/amazon.py | 7 +- .../ebooks/metadata/sources/overdrive.py | 4 +- src/calibre/ebooks/mobi/reader.py | 6 +- src/calibre/ebooks/oeb/base.py | 4 +- src/calibre/gui2/comments_editor.py | 4 +- src/calibre/utils/soupparser.py | 126 ++++++++++++++++++ 6 files changed, 139 insertions(+), 12 deletions(-) create mode 100644 src/calibre/utils/soupparser.py diff --git a/src/calibre/ebooks/metadata/sources/amazon.py b/src/calibre/ebooks/metadata/sources/amazon.py index 819cd674fc..52dd109b47 100644 --- a/src/calibre/ebooks/metadata/sources/amazon.py +++ b/src/calibre/ebooks/metadata/sources/amazon.py @@ -12,7 +12,7 @@ from urllib import urlencode from threading import Thread from Queue import Queue, Empty -from lxml.html import soupparser, tostring +from lxml.html import tostring from calibre import as_unicode from calibre.ebooks.metadata import check_isbn @@ -23,6 +23,7 @@ from calibre.ebooks.metadata.book.base import Metadata from calibre.library.comments import sanitize_comments_html from calibre.utils.date import parse_date from calibre.utils.localization import canonicalize_lang +from calibre.utils.soupparser import fromstring class Worker(Thread): # Get details {{{ @@ -199,7 +200,7 @@ class Worker(Thread): # Get details {{{ return try: - root = soupparser.fromstring(clean_ascii_chars(raw)) + root = fromstring(clean_ascii_chars(raw)) except: msg = 'Failed to parse amazon details page: %r'%self.url self.log.exception(msg) @@ -623,7 +624,7 @@ class Amazon(Source): if found: try: - root = soupparser.fromstring(clean_ascii_chars(raw)) + root = fromstring(clean_ascii_chars(raw)) except: msg = 'Failed to parse amazon page for query: 
%r'%query log.exception(msg) diff --git a/src/calibre/ebooks/metadata/sources/overdrive.py b/src/calibre/ebooks/metadata/sources/overdrive.py index 2e63a2e267..1164567ff5 100755 --- a/src/calibre/ebooks/metadata/sources/overdrive.py +++ b/src/calibre/ebooks/metadata/sources/overdrive.py @@ -14,13 +14,13 @@ from threading import RLock from Queue import Queue, Empty from lxml import html -from lxml.html import soupparser from calibre.ebooks.metadata import check_isbn from calibre.ebooks.metadata.sources.base import Source, Option from calibre.ebooks.metadata.book.base import Metadata from calibre.ebooks.chardet import xml_to_unicode from calibre.library.comments import sanitize_comments_html +from calibre.utils.soupparser import fromstring ovrdrv_data_cache = {} cache_lock = RLock() @@ -403,7 +403,7 @@ class OverDrive(Source): raw = xml_to_unicode(raw, strip_encoding_pats=True, resolve_entities=True)[0] try: - root = soupparser.fromstring(raw) + root = fromstring(raw) except: return False diff --git a/src/calibre/ebooks/mobi/reader.py b/src/calibre/ebooks/mobi/reader.py index 4e3430b1dc..5d12018121 100644 --- a/src/calibre/ebooks/mobi/reader.py +++ b/src/calibre/ebooks/mobi/reader.py @@ -353,14 +353,14 @@ class MobiReader(object): self.processed_html = self.remove_random_bytes(self.processed_html) root = html.fromstring(self.processed_html) if root.xpath('descendant::p/descendant::p'): - from lxml.html import soupparser + from calibre.utils.soupparser import fromstring self.log.warning('Malformed markup, parsing using BeautifulSoup') try: - root = soupparser.fromstring(self.processed_html) + root = fromstring(self.processed_html) except Exception: self.log.warning('MOBI markup appears to contain random bytes. 
Stripping.') self.processed_html = self.remove_random_bytes(self.processed_html) - root = soupparser.fromstring(self.processed_html) + root = fromstring(self.processed_html) if root.tag != 'html': self.log.warn('File does not have opening tag') diff --git a/src/calibre/ebooks/oeb/base.py b/src/calibre/ebooks/oeb/base.py index 57720e22f2..0daf0d4e7a 100644 --- a/src/calibre/ebooks/oeb/base.py +++ b/src/calibre/ebooks/oeb/base.py @@ -894,8 +894,8 @@ class Manifest(object): except etree.XMLSyntaxError as err: self.oeb.logger.warn('Parsing file %r as HTML' % self.href) if err.args and err.args[0].startswith('Excessive depth'): - from lxml.html import soupparser - data = soupparser.fromstring(data) + from calibre.utils.soupparser import fromstring + data = fromstring(data) else: data = html.fromstring(data) data.attrib.pop('xmlns', None) diff --git a/src/calibre/gui2/comments_editor.py b/src/calibre/gui2/comments_editor.py index a594af739e..58ff55e95c 100644 --- a/src/calibre/gui2/comments_editor.py +++ b/src/calibre/gui2/comments_editor.py @@ -8,7 +8,6 @@ __docformat__ = 'restructuredtext en' import re, os from lxml import html -from lxml.html import soupparser from PyQt4.Qt import QApplication, QFontInfo, QSize, QWidget, QPlainTextEdit, \ QToolBar, QVBoxLayout, QAction, QIcon, Qt, QTabWidget, QUrl, \ @@ -19,6 +18,7 @@ from PyQt4.QtWebKit import QWebView, QWebPage from calibre.ebooks.chardet import xml_to_unicode from calibre import xml_replace_entities from calibre.gui2 import open_url +from calibre.utils.soupparser import fromstring class PageAction(QAction): # {{{ @@ -227,7 +227,7 @@ class EditorWidget(QWebView): # {{{ try: root = html.fromstring(raw) except: - root = soupparser.fromstring(raw) + root = fromstring(raw) elems = [] for body in root.xpath('//body'): diff --git a/src/calibre/utils/soupparser.py b/src/calibre/utils/soupparser.py new file mode 100644 index 0000000000..403f57baad --- /dev/null +++ b/src/calibre/utils/soupparser.py @@ -0,0 +1,126 @@ 
__doc__ = """External interface to the BeautifulSoup HTML parser.
"""

__all__ = ["fromstring", "parse", "convert_tree"]

import re

from lxml import etree, html

from calibre.ebooks.BeautifulSoup import \
    BeautifulSoup, Tag, Comment, ProcessingInstruction, NavigableString

# copied from ET's ElementSoup
try:
    from html.entities import name2codepoint  # Python 3
    name2codepoint
except ImportError:
    from htmlentitydefs import name2codepoint  # Python 2

try:
    unichr  # Python 2
except NameError:
    # Python 3: unichr was renamed to chr; without this guard unescape()
    # would raise NameError whenever the html.entities branch above is taken.
    unichr = chr

# Matches named HTML entity references, e.g. "&amp;".  Compiled once at
# import time; only the bound .sub method is kept since that is all we use.
# Raw string avoids an invalid-escape warning for \w on modern Pythons.
handle_entities = re.compile(r"&(\w+);").sub


def fromstring(data, beautifulsoup=None, makeelement=None, **bsargs):
    """Parse a string of HTML data into an Element tree using the
    BeautifulSoup parser.

    Returns the root ``<html>`` Element of the tree.

    You can pass a different BeautifulSoup parser through the
    `beautifulsoup` keyword, and a different Element factory function
    through the `makeelement` keyword.  By default, the standard
    ``BeautifulSoup`` class and the default factory of `lxml.html` are
    used.
    """
    return _parse(data, beautifulsoup, makeelement, **bsargs)

def parse(file, beautifulsoup=None, makeelement=None, **bsargs):
    """Parse a file into an ElementTree using the BeautifulSoup parser.

    `file` may be an open file-like object or a filename.

    You can pass a different BeautifulSoup parser through the
    `beautifulsoup` keyword, and a different Element factory function
    through the `makeelement` keyword.  By default, the standard
    ``BeautifulSoup`` class and the default factory of `lxml.html` are
    used.
    """
    close_file = False
    if not hasattr(file, 'read'):
        file = open(file)
        close_file = True  # we opened it, so we must close it
    try:
        root = _parse(file, beautifulsoup, makeelement, **bsargs)
    finally:
        if close_file:
            file.close()
    return etree.ElementTree(root)

def convert_tree(beautiful_soup_tree, makeelement=None):
    """Convert a BeautifulSoup tree to a list of Element trees.

    Returns a list instead of a single root Element to support
    HTML-like soup with more than one root element.

    You can pass a different Element factory through the `makeelement`
    keyword.
    """
    if makeelement is None:
        makeelement = html.html_parser.makeelement
    root = _convert_tree(beautiful_soup_tree, makeelement)
    # Detach the children from the artificial root so each one can act
    # as an independent tree.
    children = list(root)
    for child in children:
        root.remove(child)
    return children


# helpers

def _parse(source, beautifulsoup, makeelement, **bsargs):
    # Parse `source` with BeautifulSoup and convert the resulting soup
    # into an lxml Element, wrapped in an <html> root if necessary.
    if beautifulsoup is None:
        beautifulsoup = BeautifulSoup
    if makeelement is None:
        makeelement = html.html_parser.makeelement
    if 'convertEntities' not in bsargs:
        bsargs['convertEntities'] = 'html'
    tree = beautifulsoup(source, **bsargs)
    root = _convert_tree(tree, makeelement)
    # from ET: wrap the document in a html root element, if necessary
    if len(root) == 1 and root[0].tag == "html":
        return root[0]
    root.tag = "html"
    return root

def _convert_tree(beautiful_soup_tree, makeelement):
    # Build an Element for the soup node itself, then recurse into its
    # children.
    root = makeelement(beautiful_soup_tree.name,
                       attrib=dict(beautiful_soup_tree.attrs))
    _convert_children(root, beautiful_soup_tree, makeelement)
    return root

def _convert_children(parent, beautiful_soup_tree, makeelement):
    # Convert each soup child into the matching lxml construct:
    # Tag -> SubElement (recursing), NavigableString -> text/tail,
    # Comment / ProcessingInstruction -> their etree equivalents,
    # anything else (CData) -> plain text.
    SubElement = etree.SubElement
    et_child = None
    for child in beautiful_soup_tree:
        if isinstance(child, Tag):
            et_child = SubElement(parent, child.name, attrib=dict(
                [(k, unescape(v)) for (k, v) in child.attrs]))
            _convert_children(et_child, child, makeelement)
        elif type(child) is NavigableString:
            _append_text(parent, et_child, unescape(child))
        else:
            if isinstance(child, Comment):
                parent.append(etree.Comment(child))
            elif isinstance(child, ProcessingInstruction):
                parent.append(etree.ProcessingInstruction(
                    *child.split(' ', 1)))
            else:  # CData
                _append_text(parent, et_child, unescape(child))

def _append_text(parent, element, text):
    # lxml stores text either on parent.text (before the first child) or
    # on the tail of the preceding sibling element.
    if element is None:
        parent.text = (parent.text or '') + text
    else:
        element.tail = (element.tail or '') + text

def unescape(string):
    """Replace named entity references in `string` with their unicode
    characters, working around oddities in BeautifulSoup's entity
    handling.  Unknown entities are left untouched."""
    if not string:
        return ''
    def unescape_entity(m):
        try:
            return unichr(name2codepoint[m.group(1)])
        except KeyError:
            return m.group(0)  # use as is
    return handle_entities(unescape_entity, string)