mirror of
https://github.com/kovidgoyal/calibre.git
synced 2025-07-08 10:44:09 -04:00
Get rid of cssselect from the Edelweiss metadata download plugin
This commit is contained in:
parent
88dbbefa7b
commit
deca8d35e5
@ -24,11 +24,6 @@ def parse_html(raw):
|
|||||||
return html5lib.parse(raw, treebuilder='lxml',
|
return html5lib.parse(raw, treebuilder='lxml',
|
||||||
namespaceHTMLElements=False).getroot()
|
namespaceHTMLElements=False).getroot()
|
||||||
|
|
||||||
def CSSSelect(expr):
|
|
||||||
from cssselect import HTMLTranslator
|
|
||||||
from lxml.etree import XPath
|
|
||||||
return XPath(HTMLTranslator().css_to_xpath(expr))
|
|
||||||
|
|
||||||
def astext(node):
|
def astext(node):
|
||||||
from lxml import etree
|
from lxml import etree
|
||||||
return etree.tostring(node, method='text', encoding=unicode,
|
return etree.tostring(node, method='text', encoding=unicode,
|
||||||
@ -61,8 +56,10 @@ class Worker(Thread): # {{{
|
|||||||
def parse(self, raw):
|
def parse(self, raw):
|
||||||
from calibre.ebooks.metadata.book.base import Metadata
|
from calibre.ebooks.metadata.book.base import Metadata
|
||||||
from calibre.utils.date import parse_only_date, UNDEFINED_DATE
|
from calibre.utils.date import parse_only_date, UNDEFINED_DATE
|
||||||
|
from css_selectors import Select
|
||||||
root = parse_html(raw)
|
root = parse_html(raw)
|
||||||
sku = CSSSelect('div.sku.attGroup')(root)[0]
|
selector = Select(root)
|
||||||
|
sku = next(selector('div.sku.attGroup'))
|
||||||
info = sku.getparent()
|
info = sku.getparent()
|
||||||
top = info.getparent().getparent()
|
top = info.getparent().getparent()
|
||||||
banner = top.find('div')
|
banner = top.find('div')
|
||||||
@ -87,20 +84,20 @@ class Worker(Thread): # {{{
|
|||||||
mi.set_identifier('edelweiss', self.sku)
|
mi.set_identifier('edelweiss', self.sku)
|
||||||
|
|
||||||
# Tags
|
# Tags
|
||||||
bisac = CSSSelect('div.bisac.attGroup')(root)
|
bisac = tuple(selector('div.bisac.attGroup'))
|
||||||
if bisac:
|
if bisac:
|
||||||
bisac = astext(bisac[0])
|
bisac = astext(bisac[0])
|
||||||
mi.tags = [x.strip() for x in bisac.split(',')]
|
mi.tags = [x.strip() for x in bisac.split(',')]
|
||||||
mi.tags = [t[1:].strip() if t.startswith('&') else t for t in mi.tags]
|
mi.tags = [t[1:].strip() if t.startswith('&') else t for t in mi.tags]
|
||||||
|
|
||||||
# Publisher
|
# Publisher
|
||||||
pub = CSSSelect('div.supplier.attGroup')(root)
|
pub = tuple(selector('div.supplier.attGroup'))
|
||||||
if pub:
|
if pub:
|
||||||
pub = astext(pub[0])
|
pub = astext(pub[0])
|
||||||
mi.publisher = pub
|
mi.publisher = pub
|
||||||
|
|
||||||
# Pubdate
|
# Pubdate
|
||||||
pub = CSSSelect('div.shipDate.attGroupItem')(root)
|
pub = tuple(selector('div.shipDate.attGroupItem'))
|
||||||
if pub:
|
if pub:
|
||||||
pub = astext(pub[0])
|
pub = astext(pub[0])
|
||||||
parts = pub.partition(':')[0::2]
|
parts = pub.partition(':')[0::2]
|
||||||
@ -116,22 +113,22 @@ class Worker(Thread): # {{{
|
|||||||
|
|
||||||
# Comments
|
# Comments
|
||||||
comm = ''
|
comm = ''
|
||||||
general = CSSSelect('div#pd-general-overview-content')(root)
|
general = tuple(selector('div#pd-general-overview-content'))
|
||||||
if general:
|
if general:
|
||||||
q = self.render_comments(general[0])
|
q = self.render_comments(general[0])
|
||||||
if q != '<p>No title summary available. </p>':
|
if q != '<p>No title summary available. </p>':
|
||||||
comm += q
|
comm += q
|
||||||
general = CSSSelect('div#pd-general-contributor-content')(root)
|
general = tuple(selector('div#pd-general-contributor-content'))
|
||||||
if general:
|
if general:
|
||||||
comm += self.render_comments(general[0])
|
comm += self.render_comments(general[0])
|
||||||
general = CSSSelect('div#pd-general-quotes-content')(root)
|
general = tuple(selector('div#pd-general-quotes-content'))
|
||||||
if general:
|
if general:
|
||||||
comm += self.render_comments(general[0])
|
comm += self.render_comments(general[0])
|
||||||
if comm:
|
if comm:
|
||||||
mi.comments = comm
|
mi.comments = comm
|
||||||
|
|
||||||
# Cover
|
# Cover
|
||||||
img = CSSSelect('img.title-image[src]')(root)
|
img = tuple(selector('img.title-image[src]'))
|
||||||
if img:
|
if img:
|
||||||
href = img[0].get('src').replace('jacket_covers/medium/',
|
href = img[0].get('src').replace('jacket_covers/medium/',
|
||||||
'jacket_covers/flyout/')
|
'jacket_covers/flyout/')
|
||||||
@ -252,11 +249,12 @@ class Edelweiss(Source):
|
|||||||
except Exception as e:
|
except Exception as e:
|
||||||
log.exception('Failed to parse identify results')
|
log.exception('Failed to parse identify results')
|
||||||
return as_unicode(e)
|
return as_unicode(e)
|
||||||
|
from css_selectors import Select
|
||||||
|
select = Select(root)
|
||||||
has_isbn = check_isbn(identifiers.get('isbn', None)) is not None
|
has_isbn = check_isbn(identifiers.get('isbn', None)) is not None
|
||||||
if not has_isbn:
|
if not has_isbn:
|
||||||
author_tokens = set(x.lower() for x in self.get_author_tokens(authors, only_first_author=True))
|
author_tokens = set(x.lower() for x in self.get_author_tokens(authors, only_first_author=True))
|
||||||
for entry in CSSSelect('div.listRow div.listRowMain')(root):
|
for entry in select('div.listRow div.listRowMain'):
|
||||||
a = entry.xpath('descendant::a[contains(@href, "sku=") and contains(@href, "productDetailPage.aspx")]')
|
a = entry.xpath('descendant::a[contains(@href, "sku=") and contains(@href, "productDetailPage.aspx")]')
|
||||||
if not a:
|
if not a:
|
||||||
continue
|
continue
|
||||||
@ -265,7 +263,7 @@ class Edelweiss(Source):
|
|||||||
sku = parse_qs(qs).get('sku', None)
|
sku = parse_qs(qs).get('sku', None)
|
||||||
if sku and sku[0]:
|
if sku and sku[0]:
|
||||||
sku = sku[0]
|
sku = sku[0]
|
||||||
div = CSSSelect('div.sku.attGroup')(entry)
|
div = tuple(select('div.sku.attGroup'))
|
||||||
if div:
|
if div:
|
||||||
text = astext(div[0])
|
text = astext(div[0])
|
||||||
isbns = [check_isbn(x.strip()) for x in text.split(',')]
|
isbns = [check_isbn(x.strip()) for x in text.split(',')]
|
||||||
@ -275,14 +273,14 @@ class Edelweiss(Source):
|
|||||||
for img in entry.xpath('descendant::img[contains(@src, "/jacket_covers/thumbnail/")]'):
|
for img in entry.xpath('descendant::img[contains(@src, "/jacket_covers/thumbnail/")]'):
|
||||||
self.cache_identifier_to_cover_url(sku, img.get('src').replace('/thumbnail/', '/flyout/'))
|
self.cache_identifier_to_cover_url(sku, img.get('src').replace('/thumbnail/', '/flyout/'))
|
||||||
|
|
||||||
div = CSSSelect('div.format.attGroup')(entry)
|
div = tuple(select('div.format.attGroup'))
|
||||||
text = astext(div[0]).lower()
|
text = astext(div[0]).lower()
|
||||||
if 'audio' in text or 'mp3' in text: # Audio-book, ignore
|
if 'audio' in text or 'mp3' in text: # Audio-book, ignore
|
||||||
continue
|
continue
|
||||||
if not has_isbn:
|
if not has_isbn:
|
||||||
# edelweiss returns matches based only on title, so we
|
# edelweiss returns matches based only on title, so we
|
||||||
# filter by author manually
|
# filter by author manually
|
||||||
div = CSSSelect('div.contributor.attGroup')(entry)
|
div = tuple(select('div.contributor.attGroup'))
|
||||||
try:
|
try:
|
||||||
entry_authors = set(self.get_author_tokens([x.strip() for x in astext(div[0]).lower().split(',')]))
|
entry_authors = set(self.get_author_tokens([x.strip() for x in astext(div[0]).lower().split(',')]))
|
||||||
except IndexError:
|
except IndexError:
|
||||||
@ -389,7 +387,3 @@ if __name__ == '__main__':
|
|||||||
|
|
||||||
tests = tests[start:stop]
|
tests = tests[start:stop]
|
||||||
test_identify_plugin(Edelweiss.name, tests)
|
test_identify_plugin(Edelweiss.name, tests)
|
||||||
|
|
||||||
|
|
||||||
|
|
||||||
|
|
||||||
|
Loading…
x
Reference in New Issue
Block a user