mirror of
https://github.com/kovidgoyal/calibre.git
synced 2025-07-08 18:54:09 -04:00
Use html5-parser for all metadata download sources
This commit is contained in:
parent
3ebc51b29b
commit
2b78277799
@ -30,10 +30,20 @@ class SearchFailed(ValueError):
|
|||||||
ua_index = -1
|
ua_index = -1
|
||||||
|
|
||||||
|
|
||||||
|
def parse_html(raw):
    """Parse the HTML markup in *raw* and return the parser's result.

    ``html5_parser.parse`` is used whenever that module can be imported.
    Otherwise the markup is handed to ``html5lib`` using the lxml tree
    builder with HTML element namespacing turned off.
    """
    try:
        from html5_parser import parse as fast_parse
    except ImportError:
        fast_parse = None

    if fast_parse is not None:
        return fast_parse(raw)

    # Old versions of calibre: fall back to html5lib
    import html5lib
    legacy_options = {'treebuilder': 'lxml', 'namespaceHTMLElements': False}
    return html5lib.parse(raw, **legacy_options)
|
||||||
|
|
||||||
|
|
||||||
def parse_details_page(url, log, timeout, browser, domain):
|
def parse_details_page(url, log, timeout, browser, domain):
|
||||||
from calibre.utils.cleantext import clean_ascii_chars
|
from calibre.utils.cleantext import clean_ascii_chars
|
||||||
from calibre.ebooks.chardet import xml_to_unicode
|
from calibre.ebooks.chardet import xml_to_unicode
|
||||||
import html5lib
|
|
||||||
from lxml.html import tostring
|
from lxml.html import tostring
|
||||||
log('Getting details from:', url)
|
log('Getting details from:', url)
|
||||||
try:
|
try:
|
||||||
@ -65,9 +75,8 @@ def parse_details_page(url, log, timeout, browser, domain):
|
|||||||
raise ValueError('No cached entry for %s found' % url)
|
raise ValueError('No cached entry for %s found' % url)
|
||||||
|
|
||||||
try:
|
try:
|
||||||
root = html5lib.parse(clean_ascii_chars(raw), treebuilder='lxml',
|
root = parse_html(clean_ascii_chars(raw))
|
||||||
namespaceHTMLElements=False)
|
except Exception:
|
||||||
except:
|
|
||||||
msg = 'Failed to parse amazon details page: %r' % url
|
msg = 'Failed to parse amazon details page: %r' % url
|
||||||
log.exception(msg)
|
log.exception(msg)
|
||||||
return
|
return
|
||||||
@ -589,8 +598,7 @@ class Worker(Thread): # Get details {{{
|
|||||||
if m is not None:
|
if m is not None:
|
||||||
try:
|
try:
|
||||||
text = unquote(m.group(1)).decode('utf-8')
|
text = unquote(m.group(1)).decode('utf-8')
|
||||||
nr = html5lib.parse(
|
nr = parse_html(text)
|
||||||
text, treebuilder='lxml', namespaceHTMLElements=False)
|
|
||||||
desc = nr.xpath(
|
desc = nr.xpath(
|
||||||
'//div[@id="productDescription"]/*[@class="content"]')
|
'//div[@id="productDescription"]/*[@class="content"]')
|
||||||
if desc:
|
if desc:
|
||||||
@ -1201,7 +1209,6 @@ class Amazon(Source):
|
|||||||
# }}}
|
# }}}
|
||||||
|
|
||||||
def search_amazon(self, br, testing, log, abort, title, authors, identifiers, timeout): # {{{
|
def search_amazon(self, br, testing, log, abort, title, authors, identifiers, timeout): # {{{
|
||||||
import html5lib
|
|
||||||
from calibre.utils.cleantext import clean_ascii_chars
|
from calibre.utils.cleantext import clean_ascii_chars
|
||||||
from calibre.ebooks.chardet import xml_to_unicode
|
from calibre.ebooks.chardet import xml_to_unicode
|
||||||
matches = []
|
matches = []
|
||||||
@ -1242,8 +1249,7 @@ class Amazon(Source):
|
|||||||
|
|
||||||
if found:
|
if found:
|
||||||
try:
|
try:
|
||||||
root = html5lib.parse(raw, treebuilder='lxml',
|
root = parse_html(raw)
|
||||||
namespaceHTMLElements=False)
|
|
||||||
except Exception:
|
except Exception:
|
||||||
msg = 'Failed to parse amazon page for query: %r' % query
|
msg = 'Failed to parse amazon page for query: %r' % query
|
||||||
log.exception(msg)
|
log.exception(msg)
|
||||||
|
@ -16,14 +16,23 @@ from calibre.ebooks.metadata import check_isbn
|
|||||||
from calibre.ebooks.metadata.sources.base import Source
|
from calibre.ebooks.metadata.sources.base import Source
|
||||||
|
|
||||||
|
|
||||||
def clean_html(raw):
    """Return *raw* converted by ``xml_to_unicode`` and cleaned of bad chars.

    *raw* is passed through ``xml_to_unicode`` with ``strip_encoding_pats``,
    ``resolve_entities`` and ``assume_utf8`` all enabled; the first item of
    its result is then run through ``clean_ascii_chars`` and returned.
    """
    from calibre.ebooks.chardet import xml_to_unicode
    from calibre.utils.cleantext import clean_ascii_chars
    decoded = xml_to_unicode(raw, strip_encoding_pats=True,
                             resolve_entities=True, assume_utf8=True)[0]
    return clean_ascii_chars(decoded)


def parse_html(raw):
    """Clean *raw* with ``clean_html`` and parse the result as HTML.

    ``html5_parser.parse`` is preferred; when it cannot be imported the
    markup is parsed with ``html5lib`` (lxml tree builder, HTML element
    namespacing disabled) and the root element of that tree is returned.

    NOTE(review): ``html5_parser.parse`` is expected to return the root
    element as well, so both branches should yield the same kind of
    object -- confirm against the html5-parser documentation.
    """
    raw = clean_html(raw)
    try:
        from html5_parser import parse
    except ImportError:
        # Old versions of calibre
        import html5lib
        # With the lxml tree builder html5lib hands back a document tree,
        # not an element; take its root so callers get an element here too.
        return html5lib.parse(
            raw, treebuilder='lxml', namespaceHTMLElements=False).getroot()
    else:
        return parse(raw)
|
||||||
|
|
||||||
|
|
||||||
def astext(node):
|
def astext(node):
|
||||||
|
@ -13,6 +13,17 @@ from calibre import random_user_agent
|
|||||||
from calibre.ebooks.metadata.sources.base import Source, Option
|
from calibre.ebooks.metadata.sources.base import Source, Option
|
||||||
|
|
||||||
|
|
||||||
|
def parse_html(raw):
    """Parse the HTML markup in *raw* and return the parser's result.

    ``html5_parser.parse`` is used whenever that module can be imported.
    Otherwise the markup is handed to ``html5lib`` using the lxml tree
    builder with HTML element namespacing turned off.
    """
    try:
        from html5_parser import parse as fast_parse
    except ImportError:
        fast_parse = None

    if fast_parse is not None:
        return fast_parse(raw)

    # Old versions of calibre: fall back to html5lib
    import html5lib
    legacy_options = {'treebuilder': 'lxml', 'namespaceHTMLElements': False}
    return html5lib.parse(raw, **legacy_options)
|
||||||
|
|
||||||
|
|
||||||
class GoogleImages(Source):
|
class GoogleImages(Source):
|
||||||
|
|
||||||
name = 'Google Images'
|
name = 'Google Images'
|
||||||
@ -55,7 +66,6 @@ class GoogleImages(Source):
|
|||||||
def get_image_urls(self, title, author, log, abort, timeout):
|
def get_image_urls(self, title, author, log, abort, timeout):
|
||||||
from calibre.utils.cleantext import clean_ascii_chars
|
from calibre.utils.cleantext import clean_ascii_chars
|
||||||
from urllib import urlencode
|
from urllib import urlencode
|
||||||
import html5lib
|
|
||||||
import json
|
import json
|
||||||
from collections import OrderedDict
|
from collections import OrderedDict
|
||||||
ans = OrderedDict()
|
ans = OrderedDict()
|
||||||
@ -72,8 +82,8 @@ class GoogleImages(Source):
|
|||||||
# URL scheme
|
# URL scheme
|
||||||
url = 'https://www.google.com/search?as_st=y&tbm=isch&{}&as_epq=&as_oq=&as_eq=&cr=&as_sitesearch=&safe=images&tbs={}iar:t,ift:jpg'.format(q, sz)
|
url = 'https://www.google.com/search?as_st=y&tbm=isch&{}&as_epq=&as_oq=&as_eq=&cr=&as_sitesearch=&safe=images&tbs={}iar:t,ift:jpg'.format(q, sz)
|
||||||
log('Search URL: ' + url)
|
log('Search URL: ' + url)
|
||||||
raw = br.open(url).read().decode('utf-8')
|
raw = clean_ascii_chars(br.open(url).read().decode('utf-8'))
|
||||||
root = html5lib.parse(clean_ascii_chars(raw), treebuilder='lxml', namespaceHTMLElements=False)
|
root = parse_html(raw)
|
||||||
for div in root.xpath('//div[@class="rg_meta notranslate"]'):
|
for div in root.xpath('//div[@class="rg_meta notranslate"]'):
|
||||||
try:
|
try:
|
||||||
data = json.loads(div.text)
|
data = json.loads(div.text)
|
||||||
|
@ -14,7 +14,6 @@ from urlparse import parse_qs
|
|||||||
|
|
||||||
from lxml import etree
|
from lxml import etree
|
||||||
|
|
||||||
import html5lib
|
|
||||||
from calibre import browser as _browser, prints, random_user_agent
|
from calibre import browser as _browser, prints, random_user_agent
|
||||||
from calibre.utils.monotonic import monotonic
|
from calibre.utils.monotonic import monotonic
|
||||||
from calibre.utils.random_ua import accept_header_for_ua
|
from calibre.utils.random_ua import accept_header_for_ua
|
||||||
@ -48,7 +47,14 @@ def encode_query(**query):
|
|||||||
|
|
||||||
|
|
||||||
def parse_html(raw):
    """Parse the HTML markup in *raw* and return the parser's result.

    ``html5_parser.parse`` is used whenever that module can be imported.
    Otherwise the markup is handed to ``html5lib`` using the lxml tree
    builder with HTML element namespacing turned off.
    """
    try:
        from html5_parser import parse as fast_parse
    except ImportError:
        fast_parse = None

    if fast_parse is not None:
        return fast_parse(raw)

    # Old versions of calibre: fall back to html5lib
    import html5lib
    legacy_options = {'treebuilder': 'lxml', 'namespaceHTMLElements': False}
    return html5lib.parse(raw, **legacy_options)
|
||||||
|
|
||||||
|
|
||||||
def query(br, url, key, dump_raw=None, limit=1, parser=parse_html, timeout=60):
|
def query(br, url, key, dump_raw=None, limit=1, parser=parse_html, timeout=60):
|
||||||
|
Loading…
x
Reference in New Issue
Block a user