mirror of https://github.com/kovidgoyal/calibre.git
synced 2025-07-08 10:44:09 -04:00
...
This commit is contained in:
parent 65a2931f68
commit 9076fe4a13
@@ -1,6 +1,5 @@
# -*- coding: utf-8 -*-
from __future__ import (unicode_literals, division, absolute_import, print_function)
-from xml.etree.ElementTree import _Element

__license__ = 'GPL 3'
__copyright__ = '2011, Roman Mukhin <ramses_ru at hotmail.com>'
@@ -12,10 +11,8 @@ import datetime
from urllib import quote_plus
from Queue import Queue, Empty
from lxml import etree, html
-from lxml.etree import ElementBase
from calibre import as_unicode

-from calibre import prints
from calibre.ebooks.chardet import xml_to_unicode

from calibre.ebooks.metadata import check_isbn
@@ -27,16 +24,16 @@ class Ozon(Source):
    description = _('Downloads metadata and covers from OZON.ru')

    capabilities = frozenset(['identify', 'cover'])

    touched_fields = frozenset(['title', 'authors', 'identifier:isbn', 'identifier:ozon',
                                'publisher', 'pubdate', 'comments', 'series', 'rating', 'language'])
    # For testing only: the test function does not like it when some fields come back empty
    #touched_fields = frozenset(['title', 'authors', 'identifier:isbn', 'identifier:ozon',
    #                            'publisher', 'pubdate', 'comments'])

    supports_gzip_transfer_encoding = True
    has_html_comments = True

    ozon_url = 'http://www.ozon.ru'

    # match any ISBN10/13. From "Regular Expressions Cookbook"
@@ -53,11 +50,11 @@ class Ozon(Source):
        res = ('ozon', ozon_id, url)
        return res
    # }}}

    def create_query(self, log, title=None, authors=None, identifiers={}): # {{{
        # div_book -> search only books, ebooks and audio books
        search_url = self.ozon_url + '/webservice/webservice.asmx/SearchWebService?searchContext=div_book&searchText='

        isbn = _format_isbn(log, identifiers.get('isbn', None))
        # TODO: format isbn!
        qItems = set([isbn, title])
@@ -66,7 +63,7 @@ class Ozon(Source):
        qItems.discard(None)
        qItems.discard('')
        qItems = map(_quoteString, qItems)

        q = ' '.join(qItems).strip()
        log.info(u'search string: ' + q)

@@ -74,10 +71,10 @@ class Ozon(Source):
            q = q.encode('utf-8')
        if not q:
            return None

        search_url += quote_plus(q)
        log.debug(u'search url: %r'%search_url)

        return search_url
    # }}}

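Aside, not part of the commit: a minimal sketch of how create_query above turns a quoted search term into a request URL. The base URL and quote_plus come from the code shown; the title is borrowed from the test data at the bottom of the file.

# Sketch only: building a search URL the way create_query does.
from urllib import quote_plus  # Python 2 urllib, as imported by this module

base = 'http://www.ozon.ru/webservice/webservice.asmx/SearchWebService?searchContext=div_book&searchText='
q = u'"На все четыре стороны"'  # a title containing spaces gets wrapped in quotes by _quoteString
print(base + quote_plus(q.encode('utf-8')))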
@@ -93,11 +90,11 @@ class Ozon(Source):

        try:
            raw = self.browser.open_novisit(query).read()

        except Exception as e:
            log.exception(u'Failed to make identify query: %r'%query)
            return as_unicode(e)

        try:
            parser = etree.XMLParser(recover=True, no_network=True)
            feed = etree.fromstring(xml_to_unicode(raw, strip_encoding_pats=True, assume_utf8=True)[0], parser=parser)
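Aside, not part of the commit: the recover=True parser used above lets identify survive slightly malformed XML from the web service. A self-contained sketch, without calibre's xml_to_unicode helper:

# Sketch only: lxml's recovering parser repairs broken markup instead of raising.
from lxml import etree

parser = etree.XMLParser(recover=True, no_network=True)
broken = '<SearchItems><Item><Name>Sample</Name></Item>'  # missing closing </SearchItems>
feed = etree.fromstring(broken, parser=parser)
print(etree.tostring(feed))  # the recovered tree has the closing tag restored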
@@ -110,14 +107,14 @@ class Ozon(Source):
            return as_unicode(e)

    # }}}

    def get_metadata(self, log, entries, title, authors, identifiers): # {{{
        title = unicode(title).upper() if title else ''
        authors = map(unicode.upper, map(unicode, authors)) if authors else None
        ozon_id = identifiers.get('ozon', None)

        unk = unicode(_('Unknown')).upper()

        if title == unk:
            title = None

@@ -129,7 +126,7 @@ class Ozon(Source):
                for miauthor in miauthors:
                    if author in miauthor: return True
            return None

        def ensure_metadata_match(mi): # {{{
            match = True
            if title:
@@ -138,13 +135,13 @@ class Ozon(Source):
            if match and authors:
                miauthors = map(unicode.upper, map(unicode, mi.authors)) if mi.authors else []
                match = in_authors(authors, miauthors)

            if match and ozon_id:
                mozon_id = mi.identifiers['ozon']
                match = ozon_id == mozon_id

            return match

        metadata = []
        for i, entry in enumerate(entries):
            mi = self.to_metadata(log, entry)
@@ -159,64 +156,64 @@ class Ozon(Source):

    def get_all_details(self, log, metadata, abort, result_queue, identifiers, timeout): # {{{
        req_isbn = identifiers.get('isbn', None)

        for mi in metadata:
            if abort.is_set():
                break
            try:
                ozon_id = mi.identifiers['ozon']

                try:
                    self.get_book_details(log, mi, timeout)
                except:
                    log.exception(u'Failed to get details for metadata: %s'%mi.title)

                all_isbns = getattr(mi, 'all_isbns', [])
                if req_isbn and all_isbns and check_isbn(req_isbn) not in all_isbns:
                    log.debug(u'skipped, no requested ISBN %s found'%req_isbn)
                    continue

                for isbn in all_isbns:
                    self.cache_isbn_to_identifier(isbn, ozon_id)

                if mi.ozon_cover_url:
                    self.cache_identifier_to_cover_url(ozon_id, mi.ozon_cover_url)

                self.clean_downloaded_metadata(mi)
                result_queue.put(mi)
            except:
                log.exception(u'Failed to get details for metadata: %s'%mi.title)
    # }}}

    def to_metadata(self, log, entry): # {{{
        xp_template = 'normalize-space(./*[local-name() = "{0}"]/text())'

        title = entry.xpath(xp_template.format('Name'))
        author = entry.xpath(xp_template.format('Author'))
        mi = Metadata(title, author.split(','))

        ozon_id = entry.xpath(xp_template.format('ID'))
        mi.identifiers = {'ozon':ozon_id}

        mi.comments = entry.xpath(xp_template.format('Annotation'))

        mi.ozon_cover_url = None
        cover = entry.xpath(xp_template.format('Picture'))
        if cover:
            mi.ozon_cover_url = _translateToBigCoverUrl(cover)

        rating = entry.xpath(xp_template.format('ClientRatingValue'))
        if rating:
            try:
                # 'rating' is a floating point number between 0 and 10
                # OZON rates out of 5, calibre out of 10, but there is a bug? in identify
                mi.rating = float(rating)
            except:
                pass
        return mi
    # }}}

    def get_cached_cover_url(self, identifiers): # {{{
        url = None
        ozon_id = identifiers.get('ozon', None)
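Aside, not part of the commit: the local-name() template in to_metadata matches elements regardless of the XML namespace the web service wraps them in. A sketch on a made-up entry (the namespace URI and values are invented for illustration):

# Sketch only: the namespace-agnostic XPath template used by to_metadata above.
from lxml import etree

xp_template = 'normalize-space(./*[local-name() = "{0}"]/text())'
entry = etree.fromstring(
    '<Item xmlns="urn:example"><Name>Sample Book</Name>'
    '<Author>A. Author, B. Author</Author><ID>12345</ID></Item>')
print(entry.xpath(xp_template.format('Name')))    # -> Sample Book
print(entry.xpath(xp_template.format('Author')))  # -> A. Author, B. Author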
@@ -248,14 +245,14 @@ class Ozon(Source):
            cached_url = self.get_cached_cover_url(mi.identifiers)
            if cached_url is not None:
                break

        if cached_url is None:
            log.info('No cover found')
            return

        if abort.is_set():
            return

        log.debug('Downloading cover from:', cached_url)
        try:
            cdata = self.browser.open_novisit(cached_url, timeout=timeout).read()
@@ -265,10 +262,10 @@ class Ozon(Source):
            log.exception(u'Failed to download cover from: %s'%cached_url)
            return as_unicode(e)
    # }}}

    def get_book_details(self, log, metadata, timeout): # {{{
        url = self.get_book_url(metadata.get_identifiers())[2]

        raw = self.browser.open_novisit(url, timeout=timeout).read()
        doc = html.fromstring(raw)

@@ -298,14 +295,14 @@ class Ozon(Source):
            if matcher:
                year = int(matcher.group(0))
                # only the year is available, so use the 1st of Jan
                metadata.pubdate = datetime.datetime(year, 1, 1) #<- failed comparison in identify.py
                #metadata.pubdate = datetime(year, 1, 1)
            xpt = u'substring-after(string(../text()[contains(., "Язык")]), ": ")'
            displLang = publishers[0].xpath(xpt)
            lang_code = _translageLanguageToCode(displLang)
            if lang_code:
                metadata.language = lang_code

        # overwrite comments from HTML if any
        # tr/td[contains(.//text(), "От издателя")] -> does not work, why?
        xpt = u'//div[contains(@class, "detail")]//tr/td//text()[contains(., "От издателя")]'\
@@ -323,14 +320,14 @@ class Ozon(Source):
    # }}}

def _quoteString(str): # {{{
    return '"' + str + '"' if str and str.find(' ') != -1 else str
# }}}

# TODO: make customizable
def _translateToBigCoverUrl(coverUrl): # {{{
    # http://www.ozon.ru/multimedia/books_covers/small/1002986468.gif
    # http://www.ozon.ru/multimedia/books_covers/1002986468.jpg

    m = re.match(r'^(.+\/)small\/(.+\.).+$', coverUrl)
    if m:
        coverUrl = m.group(1) + m.group(2) + 'jpg'
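Aside, not part of the commit: applying the regular expression above to the small-cover URL from the function's own comments yields the full-size JPEG URL.

# Sketch only: the small -> big cover URL rewrite done by _translateToBigCoverUrl.
import re

coverUrl = 'http://www.ozon.ru/multimedia/books_covers/small/1002986468.gif'
m = re.match(r'^(.+\/)small\/(.+\.).+$', coverUrl)
if m:
    coverUrl = m.group(1) + m.group(2) + 'jpg'
print(coverUrl)  # -> http://www.ozon.ru/multimedia/books_covers/1002986468.jpg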
@@ -339,12 +336,12 @@ def _translateToBigCoverUrl(coverUrl): # {{{

def _get_affiliateId(): # {{{
    import random

    aff_id = 'romuk'
    # Use Kovid's affiliate id 30% of the time.
    if random.randint(1, 10) in (1, 2, 3):
        aff_id = 'kovidgoyal'
    return aff_id
# }}}

# for now only RUS ISBN are supported
@@ -387,10 +384,10 @@ def _format_isbn(log, isbn): # {{{
def _translageLanguageToCode(displayLang): # {{{
    displayLang = unicode(displayLang).strip() if displayLang else None
    langTbl = { None: 'ru',
                u'Немецкий': 'de',
                u'Английский': 'en',
                u'Французский': 'fr',
                u'Итальянский': 'it',
                u'Испанский': 'es',
                u'Китайский': 'zh',
                u'Японский': 'ja' }
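Aside, not part of the commit: the rest of _translageLanguageToCode is not shown in this diff, but assuming it does a plain lookup on the table above, the mapping behaves like this; note the None key, which makes a page with no language shown default to Russian.

# Sketch only: looking up a displayed language name in the table above.
langTbl = {None: 'ru', u'Английский': 'en', u'Немецкий': 'de'}
print(langTbl.get(u'Английский'))  # -> en
print(langTbl.get(None))           # -> ru (no language on the page defaults to Russian)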
@@ -406,7 +403,7 @@ if __name__ == '__main__': # tests {{{

    test_identify_plugin(Ozon.name,
        [

            (
                {'identifiers':{'isbn': '9785916572629'} },
                [title_test(u'На все четыре стороны', exact=True),
@@ -442,4 +439,4 @@ if __name__ == '__main__': # tests {{{
                [title_test(u'Метро', exact=False)]
            ),
        ])
# }}}