Use html5_parser instead of html5lib in the metadata source plugins.

Each touched plugin gets a module-level parse_html(raw) helper. The helper
tries to import html5_parser and calls its parse(); if that import fails
(old versions of calibre) it falls back to
html5lib.parse(raw, treebuilder='lxml', namespaceHTMLElements=False).
The function-level and module-level "import html5lib" statements that are no
longer needed are removed, and a bare "except:" in amazon.py becomes
"except Exception:".

Review notes:
- edelweiss.py: the parse_html() being replaced returned
  html5lib.parse(...).getroot(). The html5lib fallback below keeps the
  .getroot() call so that old calibre versions still get the root element
  rather than an ElementTree. Because this one added line was edited during
  review, the post-image blob id on the edelweiss.py "index" line no longer
  describes the resulting file.
- NOTE(review): this patch was recovered from a copy in which line breaks and
  indentation had been collapsed. Every hunk below matches the line counts in
  its @@ header, but leading whitespace and the position of blank context
  lines are reconstructed -- verify against the tree (or apply with
  whitespace tolerance) before relying on it.

diff --git a/src/calibre/ebooks/metadata/sources/amazon.py b/src/calibre/ebooks/metadata/sources/amazon.py
index ece81f0a2c..59c9924f18 100644
--- a/src/calibre/ebooks/metadata/sources/amazon.py
+++ b/src/calibre/ebooks/metadata/sources/amazon.py
@@ -30,10 +30,20 @@ class SearchFailed(ValueError):
 ua_index = -1
 
 
+def parse_html(raw):
+    try:
+        from html5_parser import parse
+    except ImportError:
+        # Old versions of calibre
+        import html5lib
+        return html5lib.parse(raw, treebuilder='lxml', namespaceHTMLElements=False)
+    else:
+        return parse(raw)
+
+
 def parse_details_page(url, log, timeout, browser, domain):
     from calibre.utils.cleantext import clean_ascii_chars
     from calibre.ebooks.chardet import xml_to_unicode
-    import html5lib
     from lxml.html import tostring
     log('Getting details from:', url)
     try:
@@ -65,9 +75,8 @@ def parse_details_page(url, log, timeout, browser, domain):
         raise ValueError('No cached entry for %s found' % url)
 
     try:
-        root = html5lib.parse(clean_ascii_chars(raw), treebuilder='lxml',
-                              namespaceHTMLElements=False)
-    except:
+        root = parse_html(clean_ascii_chars(raw))
+    except Exception:
         msg = 'Failed to parse amazon details page: %r' % url
         log.exception(msg)
         return
@@ -589,8 +598,7 @@ class Worker(Thread): # Get details {{{
             if m is not None:
                 try:
                     text = unquote(m.group(1)).decode('utf-8')
-                    nr = html5lib.parse(
-                        text, treebuilder='lxml', namespaceHTMLElements=False)
+                    nr = parse_html(text)
                     desc = nr.xpath(
                         '//div[@id="productDescription"]/*[@class="content"]')
                     if desc:
@@ -1201,7 +1209,6 @@ class Amazon(Source):
     # }}}
 
     def search_amazon(self, br, testing, log, abort, title, authors, identifiers, timeout): # {{{
-        import html5lib
         from calibre.utils.cleantext import clean_ascii_chars
         from calibre.ebooks.chardet import xml_to_unicode
         matches = []
@@ -1242,8 +1249,7 @@ class Amazon(Source):
 
         if found:
             try:
-                root = html5lib.parse(raw, treebuilder='lxml',
-                                      namespaceHTMLElements=False)
+                root = parse_html(raw)
             except Exception:
                 msg = 'Failed to parse amazon page for query: %r' % query
                 log.exception(msg)
diff --git a/src/calibre/ebooks/metadata/sources/edelweiss.py b/src/calibre/ebooks/metadata/sources/edelweiss.py
index a7a29ca93c..1f47d90a82 100644
--- a/src/calibre/ebooks/metadata/sources/edelweiss.py
+++ b/src/calibre/ebooks/metadata/sources/edelweiss.py
@@ -16,14 +16,23 @@ from calibre.ebooks.metadata import check_isbn
 from calibre.ebooks.metadata.sources.base import Source
 
 
-def parse_html(raw):
-    import html5lib
+def clean_html(raw):
     from calibre.ebooks.chardet import xml_to_unicode
     from calibre.utils.cleantext import clean_ascii_chars
-    raw = clean_ascii_chars(xml_to_unicode(raw, strip_encoding_pats=True,
+    return clean_ascii_chars(xml_to_unicode(raw, strip_encoding_pats=True,
                                              resolve_entities=True, assume_utf8=True)[0])
-    return html5lib.parse(raw, treebuilder='lxml',
-                          namespaceHTMLElements=False).getroot()
+
+
+def parse_html(raw):
+    raw = clean_html(raw)
+    try:
+        from html5_parser import parse
+    except ImportError:
+        # Old versions of calibre: keep returning the root element, as before
+        import html5lib
+        return html5lib.parse(raw, treebuilder='lxml', namespaceHTMLElements=False).getroot()
+    else:
+        return parse(raw)
 
 
 def astext(node):
diff --git a/src/calibre/ebooks/metadata/sources/google_images.py b/src/calibre/ebooks/metadata/sources/google_images.py
index 436540ace5..843a5d4107 100644
--- a/src/calibre/ebooks/metadata/sources/google_images.py
+++ b/src/calibre/ebooks/metadata/sources/google_images.py
@@ -13,6 +13,17 @@ from calibre import random_user_agent
 from calibre.ebooks.metadata.sources.base import Source, Option
 
 
+def parse_html(raw):
+    try:
+        from html5_parser import parse
+    except ImportError:
+        # Old versions of calibre
+        import html5lib
+        return html5lib.parse(raw, treebuilder='lxml', namespaceHTMLElements=False)
+    else:
+        return parse(raw)
+
+
 class GoogleImages(Source):
 
     name = 'Google Images'
@@ -55,7 +66,6 @@ class GoogleImages(Source):
     def get_image_urls(self, title, author, log, abort, timeout):
         from calibre.utils.cleantext import clean_ascii_chars
         from urllib import urlencode
-        import html5lib
         import json
         from collections import OrderedDict
         ans = OrderedDict()
@@ -72,8 +82,8 @@ class GoogleImages(Source):
         # URL scheme
         url = 'https://www.google.com/search?as_st=y&tbm=isch&{}&as_epq=&as_oq=&as_eq=&cr=&as_sitesearch=&safe=images&tbs={}iar:t,ift:jpg'.format(q, sz)
         log('Search URL: ' + url)
-        raw = br.open(url).read().decode('utf-8')
-        root = html5lib.parse(clean_ascii_chars(raw), treebuilder='lxml', namespaceHTMLElements=False)
+        raw = clean_ascii_chars(br.open(url).read().decode('utf-8'))
+        root = parse_html(raw)
         for div in root.xpath('//div[@class="rg_meta notranslate"]'):
             try:
                 data = json.loads(div.text)
diff --git a/src/calibre/ebooks/metadata/sources/search_engines.py b/src/calibre/ebooks/metadata/sources/search_engines.py
index fca9aa3750..87dd8d788a 100644
--- a/src/calibre/ebooks/metadata/sources/search_engines.py
+++ b/src/calibre/ebooks/metadata/sources/search_engines.py
@@ -14,7 +14,6 @@ from urlparse import parse_qs
 
 from lxml import etree
 
-import html5lib
 from calibre import browser as _browser, prints, random_user_agent
 from calibre.utils.monotonic import monotonic
 from calibre.utils.random_ua import accept_header_for_ua
@@ -48,7 +47,14 @@ def encode_query(**query):
 
 
 def parse_html(raw):
-    return html5lib.parse(raw, treebuilder='lxml', namespaceHTMLElements=False)
+    try:
+        from html5_parser import parse
+    except ImportError:
+        # Old versions of calibre
+        import html5lib
+        return html5lib.parse(raw, treebuilder='lxml', namespaceHTMLElements=False)
+    else:
+        return parse(raw)
 
 
 def query(br, url, key, dump_raw=None, limit=1, parser=parse_html, timeout=60):