diff --git a/src/calibre/ebooks/metadata/sources/overdrive.py b/src/calibre/ebooks/metadata/sources/overdrive.py index 471f7a669a..8b93f6c78b 100755 --- a/src/calibre/ebooks/metadata/sources/overdrive.py +++ b/src/calibre/ebooks/metadata/sources/overdrive.py @@ -26,7 +26,7 @@ base_url = 'https://search.overdrive.com/' class OverDrive(Source): name = 'Overdrive' - version = (1, 0, 0) + version = (1, 0, 1) minimum_calibre_version = (2, 80, 0) description = _('Downloads metadata and covers from Overdrive\'s Content Reserve') @@ -401,9 +401,9 @@ class OverDrive(Source): cover_url) def get_book_detail(self, br, metadata_url, mi, ovrdrv_id, log): + from html5_parser import parse from lxml import html from calibre.ebooks.chardet import xml_to_unicode - from calibre.utils.soupparser import fromstring from calibre.library.comments import sanitize_comments_html try: @@ -415,9 +415,10 @@ class OverDrive(Source): raise raw = xml_to_unicode(raw, strip_encoding_pats=True, resolve_entities=True)[0] + try: - root = fromstring(raw) - except: + root = parse(raw, maybe_xhtml=False, sanitize_names=True) + except Exception: return False pub_date = root.xpath("//div/label[@id='ctl00_ContentPlaceHolder1_lblPubDate']/text()") diff --git a/src/calibre/ebooks/mobi/reader/mobi6.py b/src/calibre/ebooks/mobi/reader/mobi6.py index cc04f75d79..43d575c832 100644 --- a/src/calibre/ebooks/mobi/reader/mobi6.py +++ b/src/calibre/ebooks/mobi/reader/mobi6.py @@ -11,7 +11,7 @@ import shutil, os, re, struct, textwrap, cStringIO from lxml import html, etree from calibre import (xml_entity_to_unicode, entity_to_unicode) -from calibre.utils.cleantext import clean_ascii_chars +from calibre.utils.cleantext import clean_ascii_chars, clean_xml_chars from calibre.ebooks import DRMError, unit_convert from calibre.ebooks.chardet import ENCODING_PATS from calibre.ebooks.mobi import MobiError @@ -184,30 +184,32 @@ class MobiReader(object): self.cleanup_html() self.log.debug('Parsing HTML...') - self.processed_html = clean_ascii_chars(self.processed_html) + self.processed_html = clean_xml_chars(self.processed_html) try: root = html.fromstring(self.processed_html) if len(root.xpath('//html')) > 5: root = html.fromstring(self.processed_html.replace('\x0c', '').replace('\x14', '')) - except: + except Exception: self.log.warning('MOBI markup appears to contain random bytes. Stripping.') self.processed_html = self.remove_random_bytes(self.processed_html) root = html.fromstring(self.processed_html) if root.xpath('descendant::p/descendant::p'): - from calibre.utils.soupparser import fromstring - self.log.warning('Malformed markup, parsing using BeautifulSoup') + from html5_parser import parse + from calibre.ebooks.chardet import strip_encoding_declarations + self.log.warning('Malformed markup, parsing using html5-parser') + self.processed_html = strip_encoding_declarations(self.processed_html) try: - root = fromstring(self.processed_html) + root = parse(self.processed_html, maybe_xhtml=False, keep_doctype=False, sanitize_names=True) except Exception: self.log.warning('MOBI markup appears to contain random bytes. Stripping.') self.processed_html = self.remove_random_bytes(self.processed_html) - root = fromstring(self.processed_html) + root = parse(self.processed_html, maybe_xhtml=False, keep_doctype=False, sanitize_names=True) if len(root.xpath('body/descendant::*')) < 1: # There are probably stray s in the markup self.processed_html = self.processed_html.replace('', '') - root = fromstring(self.processed_html) + root = parse(self.processed_html, maybe_xhtml=False, keep_doctype=False, sanitize_names=True) if root.tag != 'html': self.log.warn('File does not have opening tag') diff --git a/src/calibre/ebooks/oeb/parse_utils.py b/src/calibre/ebooks/oeb/parse_utils.py index ed80b7d083..28bb10932a 100644 --- a/src/calibre/ebooks/oeb/parse_utils.py +++ b/src/calibre/ebooks/oeb/parse_utils.py @@ -107,12 +107,8 @@ def html5_parse(data, max_nesting_depth=100): return data -def _html4_parse(data, prefer_soup=False): - if prefer_soup: - from calibre.utils.soupparser import fromstring - data = fromstring(data) - else: - data = html.fromstring(data) +def _html4_parse(data): + data = html.fromstring(data) data.attrib.pop('xmlns', None) for elem in data.iter(tag=etree.Comment): if elem.text: diff --git a/src/calibre/gui2/comments_editor.py b/src/calibre/gui2/comments_editor.py index b5c9a73606..546422014e 100644 --- a/src/calibre/gui2/comments_editor.py +++ b/src/calibre/gui2/comments_editor.py @@ -24,7 +24,7 @@ from calibre.ebooks.chardet import xml_to_unicode from calibre import xml_replace_entities, prepare_string_for_xml from calibre.gui2 import open_url, error_dialog, choose_files, gprefs, NO_URL_FORMATTING, secure_web_page from calibre.gui2.widgets import LineEditECM -from calibre.utils.soupparser import fromstring +from html5_parser import parse from calibre.utils.config import tweaks from calibre.utils.imghdr import what from polyglot.builtins import unicode_type @@ -355,8 +355,8 @@ class EditorWidget(QWebView, LineEditECM): # {{{ try: root = html.fromstring(raw) - except: - root = fromstring(raw) + except Exception: + root = parse(raw, maybe_xhtml=False, sanitize_names=True) elems = [] for body in root.xpath('//body'): diff --git a/src/calibre/utils/soupparser.py b/src/calibre/utils/soupparser.py deleted file mode 100644 index 8d798385a3..0000000000 --- a/src/calibre/utils/soupparser.py +++ /dev/null @@ -1,134 +0,0 @@ -__doc__ = """External interface to the BeautifulSoup HTML parser. -""" - -__all__ = ["fromstring", "parse", "convert_tree"] - -from lxml import etree, html -from calibre.ebooks.BeautifulSoup import \ - BeautifulSoup, Tag, Comment, ProcessingInstruction, NavigableString -from polyglot.builtins import codepoint_to_chr - - -def fromstring(data, beautifulsoup=None, makeelement=None, **bsargs): - """Parse a string of HTML data into an Element tree using the - BeautifulSoup parser. - - Returns the root ```` Element of the tree. - - You can pass a different BeautifulSoup parser through the - `beautifulsoup` keyword, and a diffent Element factory function - through the `makeelement` keyword. By default, the standard - ``BeautifulSoup`` class and the default factory of `lxml.html` are - used. - """ - return _parse(data, beautifulsoup, makeelement, **bsargs) - - -def parse(file, beautifulsoup=None, makeelement=None, **bsargs): - """Parse a file into an ElemenTree using the BeautifulSoup parser. - - You can pass a different BeautifulSoup parser through the - `beautifulsoup` keyword, and a diffent Element factory function - through the `makeelement` keyword. By default, the standard - ``BeautifulSoup`` class and the default factory of `lxml.html` are - used. - """ - if not hasattr(file, 'read'): - file = open(file) - root = _parse(file, beautifulsoup, makeelement, **bsargs) - return etree.ElementTree(root) - - -def convert_tree(beautiful_soup_tree, makeelement=None): - """Convert a BeautifulSoup tree to a list of Element trees. - - Returns a list instead of a single root Element to support - HTML-like soup with more than one root element. - - You can pass a different Element factory through the `makeelement` - keyword. - """ - if makeelement is None: - makeelement = html.html_parser.makeelement - root = _convert_tree(beautiful_soup_tree, makeelement) - children = root.getchildren() - for child in children: - root.remove(child) - return children - - -# helpers - -def _parse(source, beautifulsoup, makeelement, **bsargs): - if beautifulsoup is None: - beautifulsoup = BeautifulSoup - if makeelement is None: - makeelement = html.html_parser.makeelement - if 'convertEntities' not in bsargs: - bsargs['convertEntities'] = 'xhtml' # Changed by Kovid, otherwise ' is mangled, see https://bugs.launchpad.net/calibre/+bug/1197585 - tree = beautifulsoup(source, **bsargs) - root = _convert_tree(tree, makeelement) - # from ET: wrap the document in a html root element, if necessary - if len(root) == 1 and root[0].tag == "html": - return root[0] - root.tag = "html" - return root - - -def _convert_tree(beautiful_soup_tree, makeelement): - root = makeelement(beautiful_soup_tree.name, - attrib=dict(beautiful_soup_tree.attrs)) - _convert_children(root, beautiful_soup_tree, makeelement) - return root - - -def _convert_children(parent, beautiful_soup_tree, makeelement): - SubElement = etree.SubElement - et_child = None - for child in beautiful_soup_tree: - if isinstance(child, Tag): - et_child = SubElement(parent, child.name, attrib=dict( - [(k, unescape(v)) for (k,v) in child.attrs])) - _convert_children(et_child, child, makeelement) - elif type(child) is NavigableString: - _append_text(parent, et_child, unescape(child)) - else: - if isinstance(child, Comment): - parent.append(etree.Comment(child)) - elif isinstance(child, ProcessingInstruction): - parent.append(etree.ProcessingInstruction( - *child.split(' ', 1))) - else: # CData - _append_text(parent, et_child, unescape(child)) - - -def _append_text(parent, element, text): - if element is None: - parent.text = (parent.text or '') + text - else: - element.tail = (element.tail or '') + text - - -# copied from ET's ElementSoup - -try: - from html.entities import name2codepoint # Python 3 - name2codepoint -except ImportError: - from htmlentitydefs import name2codepoint -import re - -handle_entities = re.compile(r"&(\w+);").sub - - -def unescape(string): - if not string: - return '' - # work around oddities in BeautifulSoup's entity handling - - def unescape_entity(m): - try: - return codepoint_to_chr(name2codepoint[m.group(1)]) - except KeyError: - return m.group(0) # use as is - return handle_entities(unescape_entity, string)