Fix bug processing author names with initials when downloading metadata from ozon.ru. Fixes #845420 (Problems with processing metadata in plugin ozon.ru)

This commit is contained in:
Kovid Goyal 2011-09-14 17:03:43 -06:00
commit c8a78a83bc
4 changed files with 109 additions and 45 deletions

View File

@ -11,7 +11,7 @@ __docformat__ = 'restructuredtext en'
Input plugin for HTML or OPF ebooks. Input plugin for HTML or OPF ebooks.
''' '''
import os, re, sys, uuid, tempfile, errno import os, re, sys, uuid, tempfile
from urlparse import urlparse, urlunparse from urlparse import urlparse, urlunparse
from urllib import unquote from urllib import unquote
from functools import partial from functools import partial

View File

@ -116,7 +116,8 @@ def cap_author_token(token):
lt = lower(token) lt = lower(token)
if lt in ('von', 'de', 'el', 'van', 'le'): if lt in ('von', 'de', 'el', 'van', 'le'):
return lt return lt
if re.match(r'([a-z]\.){2,}$', lt) is not None: # no digits, no special characters
if re.match(r'([^\d\W]\.){2,}$', lt, re.UNICODE) is not None:
# Normalize tokens of the form J.K. to J. K. # Normalize tokens of the form J.K. to J. K.
parts = token.split('.') parts = token.split('.')
return '. '.join(map(capitalize, parts)).strip() return '. '.join(map(capitalize, parts)).strip()

View File

@ -28,7 +28,7 @@ class Ozon(Source):
touched_fields = frozenset(['title', 'authors', 'identifier:isbn', 'identifier:ozon', touched_fields = frozenset(['title', 'authors', 'identifier:isbn', 'identifier:ozon',
'publisher', 'pubdate', 'comments', 'series', 'rating', 'language']) 'publisher', 'pubdate', 'comments', 'series', 'rating', 'language'])
# Test purpose only, test function does not like when sometimes some fields are empty # Test purpose only, test function does not like when sometimes some fields are empty
#touched_fields = frozenset(['title', 'authors', 'identifier:isbn', 'identifier:ozon', # touched_fields = frozenset(['title', 'authors', 'identifier:isbn', 'identifier:ozon',
# 'publisher', 'pubdate', 'comments']) # 'publisher', 'pubdate', 'comments'])
supports_gzip_transfer_encoding = True supports_gzip_transfer_encoding = True
@ -109,8 +109,16 @@ class Ozon(Source):
# }}} # }}}
def get_metadata(self, log, entries, title, authors, identifiers): # {{{ def get_metadata(self, log, entries, title, authors, identifiers): # {{{
# some book titles have extra characters like this
# TODO: make a tweak
reRemoveFromTitle = None
#reRemoveFromTitle = re.compile(r'[?!:.,;+-/&%"\'=]')
title = unicode(title).upper() if title else '' title = unicode(title).upper() if title else ''
authors = map(unicode.upper, map(unicode, authors)) if authors else None if reRemoveFromTitle:
title = reRemoveFromTitle.sub('', title)
authors = map(_normalizeAuthorNameWithInitials,
map(unicode.upper, map(unicode, authors))) if authors else None
ozon_id = identifiers.get('ozon', None) ozon_id = identifiers.get('ozon', None)
unk = unicode(_('Unknown')).upper() unk = unicode(_('Unknown')).upper()
@ -124,6 +132,7 @@ class Ozon(Source):
def in_authors(authors, miauthors): def in_authors(authors, miauthors):
for author in authors: for author in authors:
for miauthor in miauthors: for miauthor in miauthors:
#log.debug(u'=> %s <> %s'%(author, miauthor))
if author in miauthor: return True if author in miauthor: return True
return None return None
@ -131,7 +140,10 @@ class Ozon(Source):
match = True match = True
if title: if title:
mititle = unicode(mi.title).upper() if mi.title else '' mititle = unicode(mi.title).upper() if mi.title else ''
if reRemoveFromTitle:
mititle = reRemoveFromTitle.sub('', mititle)
match = title in mititle match = title in mititle
#log.debug(u't=> %s <> %s'%(title, mititle))
if match and authors: if match and authors:
miauthors = map(unicode.upper, map(unicode, mi.authors)) if mi.authors else [] miauthors = map(unicode.upper, map(unicode, mi.authors)) if mi.authors else []
match = in_authors(authors, miauthors) match = in_authors(authors, miauthors)
@ -190,7 +202,8 @@ class Ozon(Source):
title = entry.xpath(xp_template.format('Name')) title = entry.xpath(xp_template.format('Name'))
author = entry.xpath(xp_template.format('Author')) author = entry.xpath(xp_template.format('Author'))
mi = Metadata(title, author.split(',')) norm_authors = map(_normalizeAuthorNameWithInitials, map(unicode.strip, unicode(author).split(u',')))
mi = Metadata(title, norm_authors)
ozon_id = entry.xpath(xp_template.format('ID')) ozon_id = entry.xpath(xp_template.format('ID'))
mi.identifiers = {'ozon':ozon_id} mi.identifiers = {'ozon':ozon_id}
@ -202,6 +215,11 @@ class Ozon(Source):
if cover: if cover:
mi.ozon_cover_url = _translateToBigCoverUrl(cover) mi.ozon_cover_url = _translateToBigCoverUrl(cover)
pub_year = entry.xpath(xp_template.format('Year'))
if pub_year:
mi.pubdate = toPubdate(log, pub_year)
#log.debug('pubdate %s'%mi.pubdate)
rating = entry.xpath(xp_template.format('ClientRatingValue')) rating = entry.xpath(xp_template.format('ClientRatingValue'))
if rating: if rating:
try: try:
@ -269,13 +287,17 @@ class Ozon(Source):
raw = self.browser.open_novisit(url, timeout=timeout).read() raw = self.browser.open_novisit(url, timeout=timeout).read()
doc = html.fromstring(raw) doc = html.fromstring(raw)
xpt_prod_det_at = u'string(//div[contains(@class, "product-detail")]//*[contains(normalize-space(text()), "%s")]/a[1]/@title)'
xpt_prod_det_tx = u'substring-after(//div[contains(@class, "product-detail")]//text()[contains(., "%s")], ":")'
# series # series
xpt = u'normalize-space(//div[@class="frame_content"]//div[contains(normalize-space(text()), "Серия:")]//a/@title)' xpt = xpt_prod_det_at % u'Сери'
# % u'Серия:'
series = doc.xpath(xpt) series = doc.xpath(xpt)
if series: if series:
metadata.series = series metadata.series = series
xpt = u'substring-after(//meta[@name="description"]/@content, "ISBN")' xpt = u'normalize-space(substring-after(//meta[@name="description"]/@content, "ISBN"))'
isbn_str = doc.xpath(xpt) isbn_str = doc.xpath(xpt)
if isbn_str: if isbn_str:
all_isbns = [check_isbn(isbn) for isbn in self.isbnRegex.findall(isbn_str) if check_isbn(isbn)] all_isbns = [check_isbn(isbn) for isbn in self.isbnRegex.findall(isbn_str) if check_isbn(isbn)]
@ -283,38 +305,42 @@ class Ozon(Source):
metadata.all_isbns = all_isbns metadata.all_isbns = all_isbns
metadata.isbn = all_isbns[0] metadata.isbn = all_isbns[0]
xpt = u'//div[@class="frame_content"]//div[contains(normalize-space(text()), "Издатель")]//a[@title="Издательство"]' xpt = xpt_prod_det_at % u'Издатель'
publishers = doc.xpath(xpt) publishers = doc.xpath(xpt)
if publishers: if publishers:
metadata.publisher = publishers[0].text metadata.publisher = publishers
xpt = u'string(../text()[contains(., "г.")])' displ_lang = None
yearIn = publishers[0].xpath(xpt) xpt = xpt_prod_det_tx % u'Язык'
langs = doc.xpath(xpt)
if langs:
lng_splt = langs.split(u',')
if lng_splt:
displ_lang = lng_splt[0].strip()
metadata.language = _translageLanguageToCode(displ_lang)
#log.debug(u'language: %s'%displ_lang)
# may have been set already from the xml search response
if not metadata.pubdate:
xpt = u'normalize-space(//div[@class="product-misc"]//text()[contains(., "г.")])'
yearIn = doc.xpath(xpt)
if yearIn: if yearIn:
matcher = re.search(r'\d{4}', yearIn) matcher = re.search(r'\d{4}', yearIn)
if matcher: if matcher:
year = int(matcher.group(0)) metadata.pubdate = toPubdate(log, matcher.group(0))
# only year is available, so use 1-st of Jan
metadata.pubdate = datetime.datetime(year, 1, 1) #<- failed comparation in identify.py
#metadata.pubdate = datetime(year, 1, 1)
xpt = u'substring-after(string(../text()[contains(., "Язык")]), ": ")'
displLang = publishers[0].xpath(xpt)
lang_code =_translageLanguageToCode(displLang)
if lang_code:
metadata.language = lang_code
# overwrite comments from HTML if any # overwrite comments from HTML if any
# tr/td[contains(.//text(), "От издателя")] -> does not work, why? xpt = u'//table[@id="detail_description"]//tr/td'
xpt = u'//div[contains(@class, "detail")]//tr/td//text()[contains(., "От издателя")]'\
u'/ancestor::tr[1]/following-sibling::tr[1]/td[contains(./@class, "description")][1]'
comment_elem = doc.xpath(xpt) comment_elem = doc.xpath(xpt)
if comment_elem: if comment_elem:
comments = unicode(etree.tostring(comment_elem[0])) comments = unicode(etree.tostring(comment_elem[0]))
if comments: if comments:
# cleanup root tag, TODO: remove tags like object/embeded # cleanup root tag, TODO: remove tags like object/embeded
comments = re.sub(r'^<td.+?>|</td>.+?$', u'', comments).strip() comments = re.sub(r'\A.*?<td.*?>|</td>.*\Z', u'', comments.strip(), re.MULTILINE).strip()
if comments: if comments and (not metadata.comments or len(comments) > len(metadata.comments)):
metadata.comments = comments metadata.comments = comments
else:
log.debug('HTML book description skipped in favour of search service xml responce')
else: else:
log.debug('No book description found in HTML') log.debug('No book description found in HTML')
# }}} # }}}
@ -390,10 +416,40 @@ def _translageLanguageToCode(displayLang): # {{{
u'Итальянский': 'it', u'Итальянский': 'it',
u'Испанский': 'es', u'Испанский': 'es',
u'Китайский': 'zh', u'Китайский': 'zh',
u'Японский': 'ja' } u'Японский': 'ja',
u'Финский' : 'fi',
u'Польский' : 'pl',}
return langTbl.get(displayLang, None) return langTbl.get(displayLang, None)
# }}} # }}}
# [В.П. Колесников | Колесников В.П.] -> В. П. Колесников
def _normalizeAuthorNameWithInitials(name): # {{{
res = name
if name:
re1 = u'^(?P<lname>\S+)\s+(?P<fname>[^\d\W]\.)(?:\s*(?P<mname>[^\d\W]\.))?$'
re2 = u'^(?P<fname>[^\d\W]\.)(?:\s*(?P<mname>[^\d\W]\.))?\s+(?P<lname>\S+)$'
matcher = re.match(re1, unicode(name), re.UNICODE)
if not matcher:
matcher = re.match(re2, unicode(name), re.UNICODE)
if matcher:
d = matcher.groupdict()
res = ' '.join(x for x in (d['fname'], d['mname'], d['lname']) if x)
return res
# }}}
def toPubdate(log, yearAsString):
    """Convert a publication-year string (e.g. u'2005') to a datetime.

    Only the year is available from the ozon.ru search response, so
    January 1st is used as a placeholder month/day (a bare year fails
    comparisons in identify.py).  Returns None for empty input or when
    the string cannot be parsed; parse failures are logged.
    """
    res = None
    if yearAsString:
        try:
            year = int(yearAsString)
            # only year is available, so use 1-st of Jan
            res = datetime.datetime(year, 1, 1)
        # narrowed from a bare `except:` so that unrelated programming
        # errors are no longer silently swallowed; OverflowError covers
        # absurdly large year values rejected by datetime
        except (ValueError, TypeError, OverflowError):
            log.error('cannot parse to date %s'%yearAsString)
    return res
if __name__ == '__main__': # tests {{{ if __name__ == '__main__': # tests {{{
# To run these test use: calibre-debug -e src/calibre/ebooks/metadata/sources/ozon.py # To run these test use: calibre-debug -e src/calibre/ebooks/metadata/sources/ozon.py
# comment some touched_fields before run thoses tests # comment some touched_fields before run thoses tests
@ -403,7 +459,12 @@ if __name__ == '__main__': # tests {{{
test_identify_plugin(Ozon.name, test_identify_plugin(Ozon.name,
[ [
# (
# {'identifiers':{}, 'title':u'Норвежский язык: Практический курс',
# 'authors':[u'Колесников В.П.', u'Г.В. Шатков']},
# [title_test(u'Норвежский язык: Практический курс', exact=True),
# authors_test([u'В. П. Колесников', u'Г. В. Шатков'])]
# ),
( (
{'identifiers':{'isbn': '9785916572629'} }, {'identifiers':{'isbn': '9785916572629'} },
[title_test(u'На все четыре стороны', exact=True), [title_test(u'На все четыре стороны', exact=True),

View File

@ -80,13 +80,15 @@ class OzonRUStore(BasicStoreConfig, StorePlugin):
doc = html.fromstring(f.read()) doc = html.fromstring(f.read())
# example where we are going to find formats # example where we are going to find formats
# <div class="box"> # <div class="l">
# ... # <p>
# <b>Доступные&nbsp;форматы:</b> # Доступно:
# <div class="vertpadd">.epub, .fb2, .pdf, .pdf, .txt</div> # </p>
# ...
# </div> # </div>
xpt = u'normalize-space(//div[@class="box"]//*[contains(normalize-space(text()), "Доступные форматы:")][1]/following-sibling::div[1]/text())' # <div class="l">
# <p>.epub, .fb2.zip, .pdf</p>
# </div>
xpt = u'normalize-space(//div[contains(@class, "product-detail")]//*[contains(normalize-space(text()), "Доступ")]/ancestor-or-self::div[1]/following-sibling::div[1]/*[1])'
formats = doc.xpath(xpt) formats = doc.xpath(xpt)
if formats: if formats:
result = True result = True