This commit is contained in:
Kovid Goyal 2011-08-18 16:55:35 -06:00
parent 65a2931f68
commit 9076fe4a13

View File

@ -1,6 +1,5 @@
# -*- coding: utf-8 -*- # -*- coding: utf-8 -*-
from __future__ import (unicode_literals, division, absolute_import, print_function) from __future__ import (unicode_literals, division, absolute_import, print_function)
from xml.etree.ElementTree import _Element
__license__ = 'GPL 3' __license__ = 'GPL 3'
__copyright__ = '2011, Roman Mukhin <ramses_ru at hotmail.com>' __copyright__ = '2011, Roman Mukhin <ramses_ru at hotmail.com>'
@ -12,10 +11,8 @@ import datetime
from urllib import quote_plus from urllib import quote_plus
from Queue import Queue, Empty from Queue import Queue, Empty
from lxml import etree, html from lxml import etree, html
from lxml.etree import ElementBase
from calibre import as_unicode from calibre import as_unicode
from calibre import prints
from calibre.ebooks.chardet import xml_to_unicode from calibre.ebooks.chardet import xml_to_unicode
from calibre.ebooks.metadata import check_isbn from calibre.ebooks.metadata import check_isbn
@ -27,16 +24,16 @@ class Ozon(Source):
description = _('Downloads metadata and covers from OZON.ru') description = _('Downloads metadata and covers from OZON.ru')
capabilities = frozenset(['identify', 'cover']) capabilities = frozenset(['identify', 'cover'])
touched_fields = frozenset(['title', 'authors', 'identifier:isbn', 'identifier:ozon', touched_fields = frozenset(['title', 'authors', 'identifier:isbn', 'identifier:ozon',
'publisher', 'pubdate', 'comments', 'series', 'rating', 'language']) 'publisher', 'pubdate', 'comments', 'series', 'rating', 'language'])
# For test purposes only: the test function fails when some fields are occasionally empty # For test purposes only: the test function fails when some fields are occasionally empty
#touched_fields = frozenset(['title', 'authors', 'identifier:isbn', 'identifier:ozon', #touched_fields = frozenset(['title', 'authors', 'identifier:isbn', 'identifier:ozon',
# 'publisher', 'pubdate', 'comments']) # 'publisher', 'pubdate', 'comments'])
supports_gzip_transfer_encoding = True supports_gzip_transfer_encoding = True
has_html_comments = True has_html_comments = True
ozon_url = 'http://www.ozon.ru' ozon_url = 'http://www.ozon.ru'
# match any ISBN10/13. From "Regular Expressions Cookbook" # match any ISBN10/13. From "Regular Expressions Cookbook"
@ -53,11 +50,11 @@ class Ozon(Source):
res = ('ozon', ozon_id, url) res = ('ozon', ozon_id, url)
return res return res
# }}} # }}}
def create_query(self, log, title=None, authors=None, identifiers={}): # {{{ def create_query(self, log, title=None, authors=None, identifiers={}): # {{{
# div_book -> search only books, ebooks and audio books # div_book -> search only books, ebooks and audio books
search_url = self.ozon_url + '/webservice/webservice.asmx/SearchWebService?searchContext=div_book&searchText=' search_url = self.ozon_url + '/webservice/webservice.asmx/SearchWebService?searchContext=div_book&searchText='
isbn = _format_isbn(log, identifiers.get('isbn', None)) isbn = _format_isbn(log, identifiers.get('isbn', None))
# TODO: format isbn! # TODO: format isbn!
qItems = set([isbn, title]) qItems = set([isbn, title])
@ -66,7 +63,7 @@ class Ozon(Source):
qItems.discard(None) qItems.discard(None)
qItems.discard('') qItems.discard('')
qItems = map(_quoteString, qItems) qItems = map(_quoteString, qItems)
q = ' '.join(qItems).strip() q = ' '.join(qItems).strip()
log.info(u'search string: ' + q) log.info(u'search string: ' + q)
@ -74,10 +71,10 @@ class Ozon(Source):
q = q.encode('utf-8') q = q.encode('utf-8')
if not q: if not q:
return None return None
search_url += quote_plus(q) search_url += quote_plus(q)
log.debug(u'search url: %r'%search_url) log.debug(u'search url: %r'%search_url)
return search_url return search_url
# }}} # }}}
@ -93,11 +90,11 @@ class Ozon(Source):
try: try:
raw = self.browser.open_novisit(query).read() raw = self.browser.open_novisit(query).read()
except Exception as e: except Exception as e:
log.exception(u'Failed to make identify query: %r'%query) log.exception(u'Failed to make identify query: %r'%query)
return as_unicode(e) return as_unicode(e)
try: try:
parser = etree.XMLParser(recover=True, no_network=True) parser = etree.XMLParser(recover=True, no_network=True)
feed = etree.fromstring(xml_to_unicode(raw, strip_encoding_pats=True, assume_utf8=True)[0], parser=parser) feed = etree.fromstring(xml_to_unicode(raw, strip_encoding_pats=True, assume_utf8=True)[0], parser=parser)
@ -110,14 +107,14 @@ class Ozon(Source):
return as_unicode(e) return as_unicode(e)
# }}} # }}}
def get_metadata(self, log, entries, title, authors, identifiers): # {{{ def get_metadata(self, log, entries, title, authors, identifiers): # {{{
title = unicode(title).upper() if title else '' title = unicode(title).upper() if title else ''
authors = map(unicode.upper, map(unicode, authors)) if authors else None authors = map(unicode.upper, map(unicode, authors)) if authors else None
ozon_id = identifiers.get('ozon', None) ozon_id = identifiers.get('ozon', None)
unk = unicode(_('Unknown')).upper() unk = unicode(_('Unknown')).upper()
if title == unk: if title == unk:
title = None title = None
@ -129,7 +126,7 @@ class Ozon(Source):
for miauthor in miauthors: for miauthor in miauthors:
if author in miauthor: return True if author in miauthor: return True
return None return None
def ensure_metadata_match(mi): # {{{ def ensure_metadata_match(mi): # {{{
match = True match = True
if title: if title:
@ -138,13 +135,13 @@ class Ozon(Source):
if match and authors: if match and authors:
miauthors = map(unicode.upper, map(unicode, mi.authors)) if mi.authors else [] miauthors = map(unicode.upper, map(unicode, mi.authors)) if mi.authors else []
match = in_authors(authors, miauthors) match = in_authors(authors, miauthors)
if match and ozon_id: if match and ozon_id:
mozon_id = mi.identifiers['ozon'] mozon_id = mi.identifiers['ozon']
match = ozon_id == mozon_id match = ozon_id == mozon_id
return match return match
metadata = [] metadata = []
for i, entry in enumerate(entries): for i, entry in enumerate(entries):
mi = self.to_metadata(log, entry) mi = self.to_metadata(log, entry)
@ -159,64 +156,64 @@ class Ozon(Source):
def get_all_details(self, log, metadata, abort, result_queue, identifiers, timeout): # {{{
    '''
    Fetch the details of every candidate in ``metadata`` and put the
    accepted candidates onto ``result_queue``.

    :param log: logger; ``debug`` and ``exception`` are used
    :param metadata: iterable of metadata objects whose ``identifiers``
        mapping holds an ``'ozon'`` entry
    :param abort: event-like object; processing stops once ``is_set()``
        returns true
    :param result_queue: queue that receives each accepted metadata object
    :param identifiers: identifiers of the request; only ``'isbn'`` is read
    :param timeout: passed through unchanged to ``get_book_details``

    A failure while handling one candidate is logged and does not stop
    the remaining candidates from being processed.
    '''
    req_isbn = identifiers.get('isbn', None)

    for mi in metadata:
        if abort.is_set():
            break
        try:
            ozon_id = mi.identifiers['ozon']

            try:
                self.get_book_details(log, mi, timeout)
            except Exception:
                # A failed detail lookup is logged, but the candidate is
                # still processed with whatever data it already carries.
                log.exception(u'Failed to get details for metadata: %s'%mi.title)

            all_isbns = getattr(mi, 'all_isbns', [])
            # Reject the candidate only when it lists ISBNs and the
            # requested one is not among them.
            if req_isbn and all_isbns and check_isbn(req_isbn) not in all_isbns:
                log.debug(u'skipped, no requested ISBN %s found'%req_isbn)
                continue

            for isbn in all_isbns:
                self.cache_isbn_to_identifier(isbn, ozon_id)

            if mi.ozon_cover_url:
                self.cache_identifier_to_cover_url(ozon_id, mi.ozon_cover_url)

            self.clean_downloaded_metadata(mi)
            result_queue.put(mi)
        except Exception:
            # ``Exception`` rather than a bare ``except`` so that
            # KeyboardInterrupt/SystemExit still propagate.
            log.exception(u'Failed to get details for metadata: %s'%mi.title)
# }}}
def to_metadata(self, log, entry): # {{{
    '''
    Build a ``Metadata`` object from one search-result element.

    :param log: logger (not used here)
    :param entry: element exposing ``xpath``; the child elements ``Name``,
        ``Author``, ``ID``, ``Annotation``, ``Picture`` and
        ``ClientRatingValue`` are read by local name
    :return: the populated ``Metadata`` object; ``ozon_cover_url`` is
        ``None`` when the entry has no picture
    '''
    xp_template = 'normalize-space(./*[local-name() = "{0}"]/text())'

    def field(name):
        # Whitespace-normalized text of the named child element.
        return entry.xpath(xp_template.format(name))

    title = field('Name')
    author = field('Author')
    # Authors are comma separated; drop the blanks left around each comma.
    mi = Metadata(title, [a.strip() for a in author.split(',')])

    ozon_id = field('ID')
    mi.identifiers = {'ozon':ozon_id}

    mi.comments = field('Annotation')

    mi.ozon_cover_url = None
    cover = field('Picture')
    if cover:
        mi.ozon_cover_url = _translateToBigCoverUrl(cover)

    rating = field('ClientRatingValue')
    if rating:
        try:
            #'rating', A floating point number between 0 and 10
            # OZON rating is N of 5, calibre's is of 10, but there is a bug? in identify
            mi.rating = float(rating)
        except (TypeError, ValueError):
            # An unparsable rating is ignored; the field stays unset.
            pass
    return mi
# }}}
def get_cached_cover_url(self, identifiers): # {{{ def get_cached_cover_url(self, identifiers): # {{{
url = None url = None
ozon_id = identifiers.get('ozon', None) ozon_id = identifiers.get('ozon', None)
@ -248,14 +245,14 @@ class Ozon(Source):
cached_url = self.get_cached_cover_url(mi.identifiers) cached_url = self.get_cached_cover_url(mi.identifiers)
if cached_url is not None: if cached_url is not None:
break break
if cached_url is None: if cached_url is None:
log.info('No cover found') log.info('No cover found')
return return
if abort.is_set(): if abort.is_set():
return return
log.debug('Downloading cover from:', cached_url) log.debug('Downloading cover from:', cached_url)
try: try:
cdata = self.browser.open_novisit(cached_url, timeout=timeout).read() cdata = self.browser.open_novisit(cached_url, timeout=timeout).read()
@ -265,10 +262,10 @@ class Ozon(Source):
log.exception(u'Failed to download cover from: %s'%cached_url) log.exception(u'Failed to download cover from: %s'%cached_url)
return as_unicode(e) return as_unicode(e)
# }}} # }}}
def get_book_details(self, log, metadata, timeout): # {{{ def get_book_details(self, log, metadata, timeout): # {{{
url = self.get_book_url(metadata.get_identifiers())[2] url = self.get_book_url(metadata.get_identifiers())[2]
raw = self.browser.open_novisit(url, timeout=timeout).read() raw = self.browser.open_novisit(url, timeout=timeout).read()
doc = html.fromstring(raw) doc = html.fromstring(raw)
@ -298,14 +295,14 @@ class Ozon(Source):
if matcher: if matcher:
year = int(matcher.group(0)) year = int(matcher.group(0))
# only year is available, so use 1-st of Jan # only year is available, so use 1-st of Jan
metadata.pubdate = datetime.datetime(year, 1, 1) #<- failed comparation in identify.py metadata.pubdate = datetime.datetime(year, 1, 1) #<- failed comparation in identify.py
#metadata.pubdate = datetime(year, 1, 1) #metadata.pubdate = datetime(year, 1, 1)
xpt = u'substring-after(string(../text()[contains(., "Язык")]), ": ")' xpt = u'substring-after(string(../text()[contains(., "Язык")]), ": ")'
displLang = publishers[0].xpath(xpt) displLang = publishers[0].xpath(xpt)
lang_code =_translageLanguageToCode(displLang) lang_code =_translageLanguageToCode(displLang)
if lang_code: if lang_code:
metadata.language = lang_code metadata.language = lang_code
# overwrite comments from HTML if any # overwrite comments from HTML if any
# tr/td[contains(.//text(), "От издателя")] -> does not work, why? # tr/td[contains(.//text(), "От издателя")] -> does not work, why?
xpt = u'//div[contains(@class, "detail")]//tr/td//text()[contains(., "От издателя")]'\ xpt = u'//div[contains(@class, "detail")]//tr/td//text()[contains(., "От издателя")]'\
@ -323,14 +320,14 @@ class Ozon(Source):
# }}} # }}}
def _quoteString(str): # {{{ def _quoteString(str): # {{{
return '"' + str + '"' if str and str.find(' ') != -1 else str return '"' + str + '"' if str and str.find(' ') != -1 else str
# }}} # }}}
# TODO: make customizable # TODO: make customizable
def _translateToBigCoverUrl(coverUrl): # {{{ def _translateToBigCoverUrl(coverUrl): # {{{
# http://www.ozon.ru/multimedia/books_covers/small/1002986468.gif # http://www.ozon.ru/multimedia/books_covers/small/1002986468.gif
# http://www.ozon.ru/multimedia/books_covers/1002986468.jpg # http://www.ozon.ru/multimedia/books_covers/1002986468.jpg
m = re.match(r'^(.+\/)small\/(.+\.).+$', coverUrl) m = re.match(r'^(.+\/)small\/(.+\.).+$', coverUrl)
if m: if m:
coverUrl = m.group(1) + m.group(2) + 'jpg' coverUrl = m.group(1) + m.group(2) + 'jpg'
@ -339,12 +336,12 @@ def _translateToBigCoverUrl(coverUrl): # {{{
def _get_affiliateId(): # {{{ def _get_affiliateId(): # {{{
import random import random
aff_id = 'romuk' aff_id = 'romuk'
# Use Kovid's affiliate id 30% of the time. # Use Kovid's affiliate id 30% of the time.
if random.randint(1, 10) in (1, 2, 3): if random.randint(1, 10) in (1, 2, 3):
aff_id = 'kovidgoyal' aff_id = 'kovidgoyal'
return aff_id return aff_id
# }}} # }}}
# for now only RUS ISBN are supported # for now only RUS ISBN are supported
@ -387,10 +384,10 @@ def _format_isbn(log, isbn): # {{{
def _translageLanguageToCode(displayLang): # {{{ def _translageLanguageToCode(displayLang): # {{{
displayLang = unicode(displayLang).strip() if displayLang else None displayLang = unicode(displayLang).strip() if displayLang else None
langTbl = { None: 'ru', langTbl = { None: 'ru',
u'Немецкий': 'de', u'Немецкий': 'de',
u'Английский': 'en', u'Английский': 'en',
u'Французский': 'fr', u'Французский': 'fr',
u'Итальянский': 'it', u'Итальянский': 'it',
u'Испанский': 'es', u'Испанский': 'es',
u'Китайский': 'zh', u'Китайский': 'zh',
u'Японский': 'ja' } u'Японский': 'ja' }
@ -406,7 +403,7 @@ if __name__ == '__main__': # tests {{{
test_identify_plugin(Ozon.name, test_identify_plugin(Ozon.name,
[ [
( (
{'identifiers':{'isbn': '9785916572629'} }, {'identifiers':{'isbn': '9785916572629'} },
[title_test(u'На все четыре стороны', exact=True), [title_test(u'На все четыре стороны', exact=True),
@ -442,4 +439,4 @@ if __name__ == '__main__': # tests {{{
[title_test(u'Метро', exact=False)] [title_test(u'Метро', exact=False)]
), ),
]) ])
# }}} # }}}