Use html5-parser for all metadata download sources

This commit is contained in:
Kovid Goyal 2017-07-08 13:28:09 +05:30
parent 3ebc51b29b
commit 2b78277799
No known key found for this signature in database
GPG Key ID: 06BC317B515ACE7C
4 changed files with 50 additions and 19 deletions

View File

@ -30,10 +30,20 @@ class SearchFailed(ValueError):
ua_index = -1 ua_index = -1
def parse_html(raw):
    """Parse the HTML markup in *raw* and return the parsed tree.

    ``html5_parser`` is tried first. If it cannot be imported, the markup is
    parsed with ``html5lib`` instead, using the lxml tree builder with
    element namespacing disabled.
    """
    try:
        from html5_parser import parse as fast_parse
    except ImportError:
        # Old versions of calibre
        import html5lib
        return html5lib.parse(raw, treebuilder='lxml', namespaceHTMLElements=False)
    # Import succeeded, so hand the markup to html5_parser.
    return fast_parse(raw)
def parse_details_page(url, log, timeout, browser, domain): def parse_details_page(url, log, timeout, browser, domain):
from calibre.utils.cleantext import clean_ascii_chars from calibre.utils.cleantext import clean_ascii_chars
from calibre.ebooks.chardet import xml_to_unicode from calibre.ebooks.chardet import xml_to_unicode
import html5lib
from lxml.html import tostring from lxml.html import tostring
log('Getting details from:', url) log('Getting details from:', url)
try: try:
@ -65,9 +75,8 @@ def parse_details_page(url, log, timeout, browser, domain):
raise ValueError('No cached entry for %s found' % url) raise ValueError('No cached entry for %s found' % url)
try: try:
root = html5lib.parse(clean_ascii_chars(raw), treebuilder='lxml', root = parse_html(clean_ascii_chars(raw))
namespaceHTMLElements=False) except Exception:
except:
msg = 'Failed to parse amazon details page: %r' % url msg = 'Failed to parse amazon details page: %r' % url
log.exception(msg) log.exception(msg)
return return
@ -589,8 +598,7 @@ class Worker(Thread): # Get details {{{
if m is not None: if m is not None:
try: try:
text = unquote(m.group(1)).decode('utf-8') text = unquote(m.group(1)).decode('utf-8')
nr = html5lib.parse( nr = parse_html(text)
text, treebuilder='lxml', namespaceHTMLElements=False)
desc = nr.xpath( desc = nr.xpath(
'//div[@id="productDescription"]/*[@class="content"]') '//div[@id="productDescription"]/*[@class="content"]')
if desc: if desc:
@ -1201,7 +1209,6 @@ class Amazon(Source):
# }}} # }}}
def search_amazon(self, br, testing, log, abort, title, authors, identifiers, timeout): # {{{ def search_amazon(self, br, testing, log, abort, title, authors, identifiers, timeout): # {{{
import html5lib
from calibre.utils.cleantext import clean_ascii_chars from calibre.utils.cleantext import clean_ascii_chars
from calibre.ebooks.chardet import xml_to_unicode from calibre.ebooks.chardet import xml_to_unicode
matches = [] matches = []
@ -1242,8 +1249,7 @@ class Amazon(Source):
if found: if found:
try: try:
root = html5lib.parse(raw, treebuilder='lxml', root = parse_html(raw)
namespaceHTMLElements=False)
except Exception: except Exception:
msg = 'Failed to parse amazon page for query: %r' % query msg = 'Failed to parse amazon page for query: %r' % query
log.exception(msg) log.exception(msg)

View File

@ -16,14 +16,23 @@ from calibre.ebooks.metadata import check_isbn
from calibre.ebooks.metadata.sources.base import Source from calibre.ebooks.metadata.sources.base import Source
def parse_html(raw): def clean_html(raw):
import html5lib
from calibre.ebooks.chardet import xml_to_unicode from calibre.ebooks.chardet import xml_to_unicode
from calibre.utils.cleantext import clean_ascii_chars from calibre.utils.cleantext import clean_ascii_chars
raw = clean_ascii_chars(xml_to_unicode(raw, strip_encoding_pats=True, return clean_ascii_chars(xml_to_unicode(raw, strip_encoding_pats=True,
resolve_entities=True, assume_utf8=True)[0]) resolve_entities=True, assume_utf8=True)[0])
return html5lib.parse(raw, treebuilder='lxml',
namespaceHTMLElements=False).getroot()
def parse_html(raw):
    """Clean *raw* with ``clean_html()`` and parse it into a root element.

    ``html5_parser`` is preferred. When it cannot be imported, ``html5lib``
    is used with the lxml tree builder and element namespacing disabled.

    Both paths return the document's root element: ``html5_parser.parse``
    returns the root by default, and the ``html5lib`` result is unwrapped
    with ``getroot()``, so callers see the same kind of object either way.
    """
    raw = clean_html(raw)
    try:
        from html5_parser import parse
    except ImportError:
        # Old versions of calibre
        import html5lib
        # html5lib's lxml tree builder returns an ElementTree wrapper;
        # unwrap it so this fallback matches the html5_parser path.
        return html5lib.parse(
            raw, treebuilder='lxml', namespaceHTMLElements=False).getroot()
    else:
        return parse(raw)
def astext(node): def astext(node):

View File

@ -13,6 +13,17 @@ from calibre import random_user_agent
from calibre.ebooks.metadata.sources.base import Source, Option from calibre.ebooks.metadata.sources.base import Source, Option
def parse_html(raw):
    """Return the tree produced by parsing the HTML markup in *raw*.

    Uses ``html5_parser`` when it is importable; otherwise falls back to
    ``html5lib`` configured for lxml trees without namespaced elements.
    """
    try:
        from html5_parser import parse
    except ImportError:
        # Old versions of calibre
        import html5lib
        legacy_options = {
            'treebuilder': 'lxml',
            'namespaceHTMLElements': False,
        }
        return html5lib.parse(raw, **legacy_options)
    else:
        return parse(raw)
class GoogleImages(Source): class GoogleImages(Source):
name = 'Google Images' name = 'Google Images'
@ -55,7 +66,6 @@ class GoogleImages(Source):
def get_image_urls(self, title, author, log, abort, timeout): def get_image_urls(self, title, author, log, abort, timeout):
from calibre.utils.cleantext import clean_ascii_chars from calibre.utils.cleantext import clean_ascii_chars
from urllib import urlencode from urllib import urlencode
import html5lib
import json import json
from collections import OrderedDict from collections import OrderedDict
ans = OrderedDict() ans = OrderedDict()
@ -72,8 +82,8 @@ class GoogleImages(Source):
# URL scheme # URL scheme
url = 'https://www.google.com/search?as_st=y&tbm=isch&{}&as_epq=&as_oq=&as_eq=&cr=&as_sitesearch=&safe=images&tbs={}iar:t,ift:jpg'.format(q, sz) url = 'https://www.google.com/search?as_st=y&tbm=isch&{}&as_epq=&as_oq=&as_eq=&cr=&as_sitesearch=&safe=images&tbs={}iar:t,ift:jpg'.format(q, sz)
log('Search URL: ' + url) log('Search URL: ' + url)
raw = br.open(url).read().decode('utf-8') raw = clean_ascii_chars(br.open(url).read().decode('utf-8'))
root = html5lib.parse(clean_ascii_chars(raw), treebuilder='lxml', namespaceHTMLElements=False) root = parse_html(raw)
for div in root.xpath('//div[@class="rg_meta notranslate"]'): for div in root.xpath('//div[@class="rg_meta notranslate"]'):
try: try:
data = json.loads(div.text) data = json.loads(div.text)

View File

@ -14,7 +14,6 @@ from urlparse import parse_qs
from lxml import etree from lxml import etree
import html5lib
from calibre import browser as _browser, prints, random_user_agent from calibre import browser as _browser, prints, random_user_agent
from calibre.utils.monotonic import monotonic from calibre.utils.monotonic import monotonic
from calibre.utils.random_ua import accept_header_for_ua from calibre.utils.random_ua import accept_header_for_ua
@ -48,7 +47,14 @@ def encode_query(**query):
def parse_html(raw):
    """Parse the HTML markup in *raw* and return the resulting tree.

    ``html5_parser`` is used when available, with ``html5lib`` (lxml tree
    builder, no element namespaces) as the fallback for installations that
    lack it.
    """
    try:
        from html5_parser import parse as parse_with_html5_parser
    except ImportError:
        # Old versions of calibre
        import html5lib
        tree = html5lib.parse(raw, treebuilder='lxml', namespaceHTMLElements=False)
        return tree
    parsed = parse_with_html5_parser(raw)
    return parsed
def query(br, url, key, dump_raw=None, limit=1, parser=parse_html, timeout=60): def query(br, url, key, dump_raw=None, limit=1, parser=parse_html, timeout=60):