Get rid of cssselect from the Edelweiss metadata download plugin

This commit is contained in:
Kovid Goyal 2015-02-20 18:37:04 +05:30
parent 88dbbefa7b
commit deca8d35e5

View File

@@ -24,11 +24,6 @@ def parse_html(raw):
return html5lib.parse(raw, treebuilder='lxml', return html5lib.parse(raw, treebuilder='lxml',
namespaceHTMLElements=False).getroot() namespaceHTMLElements=False).getroot()
def CSSSelect(expr):
from cssselect import HTMLTranslator
from lxml.etree import XPath
return XPath(HTMLTranslator().css_to_xpath(expr))
def astext(node): def astext(node):
from lxml import etree from lxml import etree
return etree.tostring(node, method='text', encoding=unicode, return etree.tostring(node, method='text', encoding=unicode,
@@ -61,8 +56,10 @@ class Worker(Thread): # {{{
def parse(self, raw): def parse(self, raw):
from calibre.ebooks.metadata.book.base import Metadata from calibre.ebooks.metadata.book.base import Metadata
from calibre.utils.date import parse_only_date, UNDEFINED_DATE from calibre.utils.date import parse_only_date, UNDEFINED_DATE
from css_selectors import Select
root = parse_html(raw) root = parse_html(raw)
sku = CSSSelect('div.sku.attGroup')(root)[0] selector = Select(root)
sku = next(selector('div.sku.attGroup'))
info = sku.getparent() info = sku.getparent()
top = info.getparent().getparent() top = info.getparent().getparent()
banner = top.find('div') banner = top.find('div')
@@ -87,20 +84,20 @@ class Worker(Thread): # {{{
mi.set_identifier('edelweiss', self.sku) mi.set_identifier('edelweiss', self.sku)
# Tags # Tags
bisac = CSSSelect('div.bisac.attGroup')(root) bisac = tuple(selector('div.bisac.attGroup'))
if bisac: if bisac:
bisac = astext(bisac[0]) bisac = astext(bisac[0])
mi.tags = [x.strip() for x in bisac.split(',')] mi.tags = [x.strip() for x in bisac.split(',')]
mi.tags = [t[1:].strip() if t.startswith('&') else t for t in mi.tags] mi.tags = [t[1:].strip() if t.startswith('&') else t for t in mi.tags]
# Publisher # Publisher
pub = CSSSelect('div.supplier.attGroup')(root) pub = tuple(selector('div.supplier.attGroup'))
if pub: if pub:
pub = astext(pub[0]) pub = astext(pub[0])
mi.publisher = pub mi.publisher = pub
# Pubdate # Pubdate
pub = CSSSelect('div.shipDate.attGroupItem')(root) pub = tuple(selector('div.shipDate.attGroupItem'))
if pub: if pub:
pub = astext(pub[0]) pub = astext(pub[0])
parts = pub.partition(':')[0::2] parts = pub.partition(':')[0::2]
@@ -116,22 +113,22 @@ class Worker(Thread): # {{{
# Comments # Comments
comm = '' comm = ''
general = CSSSelect('div#pd-general-overview-content')(root) general = tuple(selector('div#pd-general-overview-content'))
if general: if general:
q = self.render_comments(general[0]) q = self.render_comments(general[0])
if q != '<p>No title summary available. </p>': if q != '<p>No title summary available. </p>':
comm += q comm += q
general = CSSSelect('div#pd-general-contributor-content')(root) general = tuple(selector('div#pd-general-contributor-content'))
if general: if general:
comm += self.render_comments(general[0]) comm += self.render_comments(general[0])
general = CSSSelect('div#pd-general-quotes-content')(root) general = tuple(selector('div#pd-general-quotes-content'))
if general: if general:
comm += self.render_comments(general[0]) comm += self.render_comments(general[0])
if comm: if comm:
mi.comments = comm mi.comments = comm
# Cover # Cover
img = CSSSelect('img.title-image[src]')(root) img = tuple(selector('img.title-image[src]'))
if img: if img:
href = img[0].get('src').replace('jacket_covers/medium/', href = img[0].get('src').replace('jacket_covers/medium/',
'jacket_covers/flyout/') 'jacket_covers/flyout/')
@@ -252,11 +249,12 @@ class Edelweiss(Source):
except Exception as e: except Exception as e:
log.exception('Failed to parse identify results') log.exception('Failed to parse identify results')
return as_unicode(e) return as_unicode(e)
from css_selectors import Select
select = Select(root)
has_isbn = check_isbn(identifiers.get('isbn', None)) is not None has_isbn = check_isbn(identifiers.get('isbn', None)) is not None
if not has_isbn: if not has_isbn:
author_tokens = set(x.lower() for x in self.get_author_tokens(authors, only_first_author=True)) author_tokens = set(x.lower() for x in self.get_author_tokens(authors, only_first_author=True))
for entry in CSSSelect('div.listRow div.listRowMain')(root): for entry in select('div.listRow div.listRowMain'):
a = entry.xpath('descendant::a[contains(@href, "sku=") and contains(@href, "productDetailPage.aspx")]') a = entry.xpath('descendant::a[contains(@href, "sku=") and contains(@href, "productDetailPage.aspx")]')
if not a: if not a:
continue continue
@@ -265,7 +263,7 @@ class Edelweiss(Source):
sku = parse_qs(qs).get('sku', None) sku = parse_qs(qs).get('sku', None)
if sku and sku[0]: if sku and sku[0]:
sku = sku[0] sku = sku[0]
div = CSSSelect('div.sku.attGroup')(entry) div = tuple(select('div.sku.attGroup'))
if div: if div:
text = astext(div[0]) text = astext(div[0])
isbns = [check_isbn(x.strip()) for x in text.split(',')] isbns = [check_isbn(x.strip()) for x in text.split(',')]
@@ -275,14 +273,14 @@ class Edelweiss(Source):
for img in entry.xpath('descendant::img[contains(@src, "/jacket_covers/thumbnail/")]'): for img in entry.xpath('descendant::img[contains(@src, "/jacket_covers/thumbnail/")]'):
self.cache_identifier_to_cover_url(sku, img.get('src').replace('/thumbnail/', '/flyout/')) self.cache_identifier_to_cover_url(sku, img.get('src').replace('/thumbnail/', '/flyout/'))
div = CSSSelect('div.format.attGroup')(entry) div = tuple(select('div.format.attGroup'))
text = astext(div[0]).lower() text = astext(div[0]).lower()
if 'audio' in text or 'mp3' in text: # Audio-book, ignore if 'audio' in text or 'mp3' in text: # Audio-book, ignore
continue continue
if not has_isbn: if not has_isbn:
# edelweiss returns matches based only on title, so we # edelweiss returns matches based only on title, so we
# filter by author manually # filter by author manually
div = CSSSelect('div.contributor.attGroup')(entry) div = tuple(select('div.contributor.attGroup'))
try: try:
entry_authors = set(self.get_author_tokens([x.strip() for x in astext(div[0]).lower().split(',')])) entry_authors = set(self.get_author_tokens([x.strip() for x in astext(div[0]).lower().split(',')]))
except IndexError: except IndexError:
@@ -389,7 +387,3 @@ if __name__ == '__main__':
tests = tests[start:stop] tests = tests[start:stop]
test_identify_plugin(Edelweiss.name, tests) test_identify_plugin(Edelweiss.name, tests)