mirror of
https://github.com/kovidgoyal/calibre.git
synced 2025-07-09 03:04:10 -04:00
Fix bug processing author names with initials when downloading metadata from ozon.ru. Fixes #845420 (Problems with processing metadata in plugin ozon.ru)
This commit is contained in:
commit
c8a78a83bc
@ -11,7 +11,7 @@ __docformat__ = 'restructuredtext en'
|
|||||||
Input plugin for HTML or OPF ebooks.
|
Input plugin for HTML or OPF ebooks.
|
||||||
'''
|
'''
|
||||||
|
|
||||||
import os, re, sys, uuid, tempfile, errno
|
import os, re, sys, uuid, tempfile
|
||||||
from urlparse import urlparse, urlunparse
|
from urlparse import urlparse, urlunparse
|
||||||
from urllib import unquote
|
from urllib import unquote
|
||||||
from functools import partial
|
from functools import partial
|
||||||
|
@ -116,7 +116,8 @@ def cap_author_token(token):
|
|||||||
lt = lower(token)
|
lt = lower(token)
|
||||||
if lt in ('von', 'de', 'el', 'van', 'le'):
|
if lt in ('von', 'de', 'el', 'van', 'le'):
|
||||||
return lt
|
return lt
|
||||||
if re.match(r'([a-z]\.){2,}$', lt) is not None:
|
# no digits, no special characters
|
||||||
|
if re.match(r'([^\d\W]\.){2,}$', lt, re.UNICODE) is not None:
|
||||||
# Normalize tokens of the form J.K. to J. K.
|
# Normalize tokens of the form J.K. to J. K.
|
||||||
parts = token.split('.')
|
parts = token.split('.')
|
||||||
return '. '.join(map(capitalize, parts)).strip()
|
return '. '.join(map(capitalize, parts)).strip()
|
||||||
|
@ -109,8 +109,16 @@ class Ozon(Source):
|
|||||||
# }}}
|
# }}}
|
||||||
|
|
||||||
def get_metadata(self, log, entries, title, authors, identifiers): # {{{
|
def get_metadata(self, log, entries, title, authors, identifiers): # {{{
|
||||||
|
# some book titles have extra characters like these
|
||||||
|
# TODO: make this a tweak
|
||||||
|
reRemoveFromTitle = None
|
||||||
|
#reRemoveFromTitle = re.compile(r'[?!:.,;+-/&%"\'=]')
|
||||||
|
|
||||||
title = unicode(title).upper() if title else ''
|
title = unicode(title).upper() if title else ''
|
||||||
authors = map(unicode.upper, map(unicode, authors)) if authors else None
|
if reRemoveFromTitle:
|
||||||
|
title = reRemoveFromTitle.sub('', title)
|
||||||
|
authors = map(_normalizeAuthorNameWithInitials,
|
||||||
|
map(unicode.upper, map(unicode, authors))) if authors else None
|
||||||
ozon_id = identifiers.get('ozon', None)
|
ozon_id = identifiers.get('ozon', None)
|
||||||
|
|
||||||
unk = unicode(_('Unknown')).upper()
|
unk = unicode(_('Unknown')).upper()
|
||||||
@ -124,6 +132,7 @@ class Ozon(Source):
|
|||||||
def in_authors(authors, miauthors):
|
def in_authors(authors, miauthors):
|
||||||
for author in authors:
|
for author in authors:
|
||||||
for miauthor in miauthors:
|
for miauthor in miauthors:
|
||||||
|
#log.debug(u'=> %s <> %s'%(author, miauthor))
|
||||||
if author in miauthor: return True
|
if author in miauthor: return True
|
||||||
return None
|
return None
|
||||||
|
|
||||||
@ -131,7 +140,10 @@ class Ozon(Source):
|
|||||||
match = True
|
match = True
|
||||||
if title:
|
if title:
|
||||||
mititle = unicode(mi.title).upper() if mi.title else ''
|
mititle = unicode(mi.title).upper() if mi.title else ''
|
||||||
|
if reRemoveFromTitle:
|
||||||
|
mititle = reRemoveFromTitle.sub('', mititle)
|
||||||
match = title in mititle
|
match = title in mititle
|
||||||
|
#log.debug(u't=> %s <> %s'%(title, mititle))
|
||||||
if match and authors:
|
if match and authors:
|
||||||
miauthors = map(unicode.upper, map(unicode, mi.authors)) if mi.authors else []
|
miauthors = map(unicode.upper, map(unicode, mi.authors)) if mi.authors else []
|
||||||
match = in_authors(authors, miauthors)
|
match = in_authors(authors, miauthors)
|
||||||
@ -190,7 +202,8 @@ class Ozon(Source):
|
|||||||
|
|
||||||
title = entry.xpath(xp_template.format('Name'))
|
title = entry.xpath(xp_template.format('Name'))
|
||||||
author = entry.xpath(xp_template.format('Author'))
|
author = entry.xpath(xp_template.format('Author'))
|
||||||
mi = Metadata(title, author.split(','))
|
norm_authors = map(_normalizeAuthorNameWithInitials, map(unicode.strip, unicode(author).split(u',')))
|
||||||
|
mi = Metadata(title, norm_authors)
|
||||||
|
|
||||||
ozon_id = entry.xpath(xp_template.format('ID'))
|
ozon_id = entry.xpath(xp_template.format('ID'))
|
||||||
mi.identifiers = {'ozon':ozon_id}
|
mi.identifiers = {'ozon':ozon_id}
|
||||||
@ -202,6 +215,11 @@ class Ozon(Source):
|
|||||||
if cover:
|
if cover:
|
||||||
mi.ozon_cover_url = _translateToBigCoverUrl(cover)
|
mi.ozon_cover_url = _translateToBigCoverUrl(cover)
|
||||||
|
|
||||||
|
pub_year = entry.xpath(xp_template.format('Year'))
|
||||||
|
if pub_year:
|
||||||
|
mi.pubdate = toPubdate(log, pub_year)
|
||||||
|
#log.debug('pubdate %s'%mi.pubdate)
|
||||||
|
|
||||||
rating = entry.xpath(xp_template.format('ClientRatingValue'))
|
rating = entry.xpath(xp_template.format('ClientRatingValue'))
|
||||||
if rating:
|
if rating:
|
||||||
try:
|
try:
|
||||||
@ -269,13 +287,17 @@ class Ozon(Source):
|
|||||||
raw = self.browser.open_novisit(url, timeout=timeout).read()
|
raw = self.browser.open_novisit(url, timeout=timeout).read()
|
||||||
doc = html.fromstring(raw)
|
doc = html.fromstring(raw)
|
||||||
|
|
||||||
|
xpt_prod_det_at = u'string(//div[contains(@class, "product-detail")]//*[contains(normalize-space(text()), "%s")]/a[1]/@title)'
|
||||||
|
xpt_prod_det_tx = u'substring-after(//div[contains(@class, "product-detail")]//text()[contains(., "%s")], ":")'
|
||||||
|
|
||||||
# series
|
# series
|
||||||
xpt = u'normalize-space(//div[@class="frame_content"]//div[contains(normalize-space(text()), "Серия:")]//a/@title)'
|
xpt = xpt_prod_det_at % u'Сери'
|
||||||
|
# % u'Серия:'
|
||||||
series = doc.xpath(xpt)
|
series = doc.xpath(xpt)
|
||||||
if series:
|
if series:
|
||||||
metadata.series = series
|
metadata.series = series
|
||||||
|
|
||||||
xpt = u'substring-after(//meta[@name="description"]/@content, "ISBN")'
|
xpt = u'normalize-space(substring-after(//meta[@name="description"]/@content, "ISBN"))'
|
||||||
isbn_str = doc.xpath(xpt)
|
isbn_str = doc.xpath(xpt)
|
||||||
if isbn_str:
|
if isbn_str:
|
||||||
all_isbns = [check_isbn(isbn) for isbn in self.isbnRegex.findall(isbn_str) if check_isbn(isbn)]
|
all_isbns = [check_isbn(isbn) for isbn in self.isbnRegex.findall(isbn_str) if check_isbn(isbn)]
|
||||||
@ -283,38 +305,42 @@ class Ozon(Source):
|
|||||||
metadata.all_isbns = all_isbns
|
metadata.all_isbns = all_isbns
|
||||||
metadata.isbn = all_isbns[0]
|
metadata.isbn = all_isbns[0]
|
||||||
|
|
||||||
xpt = u'//div[@class="frame_content"]//div[contains(normalize-space(text()), "Издатель")]//a[@title="Издательство"]'
|
xpt = xpt_prod_det_at % u'Издатель'
|
||||||
publishers = doc.xpath(xpt)
|
publishers = doc.xpath(xpt)
|
||||||
if publishers:
|
if publishers:
|
||||||
metadata.publisher = publishers[0].text
|
metadata.publisher = publishers
|
||||||
|
|
||||||
xpt = u'string(../text()[contains(., "г.")])'
|
displ_lang = None
|
||||||
yearIn = publishers[0].xpath(xpt)
|
xpt = xpt_prod_det_tx % u'Язык'
|
||||||
|
langs = doc.xpath(xpt)
|
||||||
|
if langs:
|
||||||
|
lng_splt = langs.split(u',')
|
||||||
|
if lng_splt:
|
||||||
|
displ_lang = lng_splt[0].strip()
|
||||||
|
metadata.language = _translageLanguageToCode(displ_lang)
|
||||||
|
#log.debug(u'language: %s'%displ_lang)
|
||||||
|
|
||||||
|
# may already have been set from the XML search response
|
||||||
|
if not metadata.pubdate:
|
||||||
|
xpt = u'normalize-space(//div[@class="product-misc"]//text()[contains(., "г.")])'
|
||||||
|
yearIn = doc.xpath(xpt)
|
||||||
if yearIn:
|
if yearIn:
|
||||||
matcher = re.search(r'\d{4}', yearIn)
|
matcher = re.search(r'\d{4}', yearIn)
|
||||||
if matcher:
|
if matcher:
|
||||||
year = int(matcher.group(0))
|
metadata.pubdate = toPubdate(log, matcher.group(0))
|
||||||
# only year is available, so use 1-st of Jan
|
|
||||||
metadata.pubdate = datetime.datetime(year, 1, 1) #<- failed comparation in identify.py
|
|
||||||
#metadata.pubdate = datetime(year, 1, 1)
|
|
||||||
xpt = u'substring-after(string(../text()[contains(., "Язык")]), ": ")'
|
|
||||||
displLang = publishers[0].xpath(xpt)
|
|
||||||
lang_code =_translageLanguageToCode(displLang)
|
|
||||||
if lang_code:
|
|
||||||
metadata.language = lang_code
|
|
||||||
|
|
||||||
# overwrite comments from HTML if any
|
# overwrite comments from HTML if any
|
||||||
# tr/td[contains(.//text(), "От издателя")] -> does not work, why?
|
xpt = u'//table[@id="detail_description"]//tr/td'
|
||||||
xpt = u'//div[contains(@class, "detail")]//tr/td//text()[contains(., "От издателя")]'\
|
|
||||||
u'/ancestor::tr[1]/following-sibling::tr[1]/td[contains(./@class, "description")][1]'
|
|
||||||
comment_elem = doc.xpath(xpt)
|
comment_elem = doc.xpath(xpt)
|
||||||
if comment_elem:
|
if comment_elem:
|
||||||
comments = unicode(etree.tostring(comment_elem[0]))
|
comments = unicode(etree.tostring(comment_elem[0]))
|
||||||
if comments:
|
if comments:
|
||||||
# cleanup root tag, TODO: remove tags like object/embeded
|
# cleanup root tag, TODO: remove tags like object/embeded
|
||||||
comments = re.sub(r'^<td.+?>|</td>.+?$', u'', comments).strip()
|
comments = re.sub(r'\A.*?<td.*?>|</td>.*\Z', u'', comments.strip(), re.MULTILINE).strip()
|
||||||
if comments:
|
if comments and (not metadata.comments or len(comments) > len(metadata.comments)):
|
||||||
metadata.comments = comments
|
metadata.comments = comments
|
||||||
|
else:
|
||||||
|
log.debug('HTML book description skipped in favour of search service xml responce')
|
||||||
else:
|
else:
|
||||||
log.debug('No book description found in HTML')
|
log.debug('No book description found in HTML')
|
||||||
# }}}
|
# }}}
|
||||||
@ -390,10 +416,40 @@ def _translageLanguageToCode(displayLang): # {{{
|
|||||||
u'Итальянский': 'it',
|
u'Итальянский': 'it',
|
||||||
u'Испанский': 'es',
|
u'Испанский': 'es',
|
||||||
u'Китайский': 'zh',
|
u'Китайский': 'zh',
|
||||||
u'Японский': 'ja' }
|
u'Японский': 'ja',
|
||||||
|
u'Финский' : 'fi',
|
||||||
|
u'Польский' : 'pl',}
|
||||||
return langTbl.get(displayLang, None)
|
return langTbl.get(displayLang, None)
|
||||||
# }}}
|
# }}}
|
||||||
|
|
||||||
|
# [В.П. Колесников | Колесников В.П.] -> В. П. Колесников
|
||||||
|
def _normalizeAuthorNameWithInitials(name): # {{{
|
||||||
|
res = name
|
||||||
|
if name:
|
||||||
|
re1 = u'^(?P<lname>\S+)\s+(?P<fname>[^\d\W]\.)(?:\s*(?P<mname>[^\d\W]\.))?$'
|
||||||
|
re2 = u'^(?P<fname>[^\d\W]\.)(?:\s*(?P<mname>[^\d\W]\.))?\s+(?P<lname>\S+)$'
|
||||||
|
matcher = re.match(re1, unicode(name), re.UNICODE)
|
||||||
|
if not matcher:
|
||||||
|
matcher = re.match(re2, unicode(name), re.UNICODE)
|
||||||
|
|
||||||
|
if matcher:
|
||||||
|
d = matcher.groupdict()
|
||||||
|
res = ' '.join(x for x in (d['fname'], d['mname'], d['lname']) if x)
|
||||||
|
return res
|
||||||
|
# }}}
|
||||||
|
|
||||||
|
def toPubdate(log, yearAsString):
    """Convert a publication year given as text into a ``datetime``.

    :param log: logger-like object; only its ``error(msg)`` method is used.
    :param yearAsString: the year in any form ``int()`` accepts; a falsy
        value means no year is available.
    :return: ``datetime.datetime(year, 1, 1)``, or ``None`` when no year was
        supplied or it could not be converted (the failure is logged).
    """
    if not yearAsString:
        return None
    try:
        # only the year is available, so use the 1st of January
        return datetime.datetime(int(yearAsString), 1, 1)
    except (TypeError, ValueError, OverflowError):
        # int() rejects non-numeric input; datetime() rejects years outside
        # its supported range. Anything else should propagate.
        log.error('cannot parse to date %s'%yearAsString)
        return None
|
||||||
|
|
||||||
|
|
||||||
if __name__ == '__main__': # tests {{{
|
if __name__ == '__main__': # tests {{{
|
||||||
# To run these test use: calibre-debug -e src/calibre/ebooks/metadata/sources/ozon.py
|
# To run these test use: calibre-debug -e src/calibre/ebooks/metadata/sources/ozon.py
|
||||||
# comment some touched_fields before run thoses tests
|
# comment some touched_fields before run thoses tests
|
||||||
@ -403,7 +459,12 @@ if __name__ == '__main__': # tests {{{
|
|||||||
|
|
||||||
test_identify_plugin(Ozon.name,
|
test_identify_plugin(Ozon.name,
|
||||||
[
|
[
|
||||||
|
# (
|
||||||
|
# {'identifiers':{}, 'title':u'Норвежский язык: Практический курс',
|
||||||
|
# 'authors':[u'Колесников В.П.', u'Г.В. Шатков']},
|
||||||
|
# [title_test(u'Норвежский язык: Практический курс', exact=True),
|
||||||
|
# authors_test([u'В. П. Колесников', u'Г. В. Шатков'])]
|
||||||
|
# ),
|
||||||
(
|
(
|
||||||
{'identifiers':{'isbn': '9785916572629'} },
|
{'identifiers':{'isbn': '9785916572629'} },
|
||||||
[title_test(u'На все четыре стороны', exact=True),
|
[title_test(u'На все четыре стороны', exact=True),
|
||||||
|
@ -80,13 +80,15 @@ class OzonRUStore(BasicStoreConfig, StorePlugin):
|
|||||||
doc = html.fromstring(f.read())
|
doc = html.fromstring(f.read())
|
||||||
|
|
||||||
# example where we are going to find formats
|
# example where we are going to find formats
|
||||||
# <div class="box">
|
# <div class="l">
|
||||||
# ...
|
# <p>
|
||||||
# <b>Доступные форматы:</b>
|
# Доступно:
|
||||||
# <div class="vertpadd">.epub, .fb2, .pdf, .pdf, .txt</div>
|
# </p>
|
||||||
# ...
|
|
||||||
# </div>
|
# </div>
|
||||||
xpt = u'normalize-space(//div[@class="box"]//*[contains(normalize-space(text()), "Доступные форматы:")][1]/following-sibling::div[1]/text())'
|
# <div class="l">
|
||||||
|
# <p>.epub, .fb2.zip, .pdf</p>
|
||||||
|
# </div>
|
||||||
|
xpt = u'normalize-space(//div[contains(@class, "product-detail")]//*[contains(normalize-space(text()), "Доступ")]/ancestor-or-self::div[1]/following-sibling::div[1]/*[1])'
|
||||||
formats = doc.xpath(xpt)
|
formats = doc.xpath(xpt)
|
||||||
if formats:
|
if formats:
|
||||||
result = True
|
result = True
|
||||||
|
Loading…
x
Reference in New Issue
Block a user