diff --git a/src/calibre/ebooks/html/input.py b/src/calibre/ebooks/html/input.py
index f68ea4f678..4ebf344a2a 100644
--- a/src/calibre/ebooks/html/input.py
+++ b/src/calibre/ebooks/html/input.py
@@ -11,7 +11,7 @@ __docformat__ = 'restructuredtext en'
 Input plugin for HTML or OPF ebooks.
 '''

-import os, re, sys, uuid, tempfile, errno
+import os, re, sys, uuid, tempfile
 from urlparse import urlparse, urlunparse
 from urllib import unquote
 from functools import partial
diff --git a/src/calibre/ebooks/metadata/sources/base.py b/src/calibre/ebooks/metadata/sources/base.py
index 2d7bb73e9c..701394e1a5 100644
--- a/src/calibre/ebooks/metadata/sources/base.py
+++ b/src/calibre/ebooks/metadata/sources/base.py
@@ -116,7 +116,8 @@ def cap_author_token(token):
     lt = lower(token)
     if lt in ('von', 'de', 'el', 'van', 'le'):
         return lt
-    if re.match(r'([a-z]\.){2,}$', lt) is not None:
+    # no digits, no special characters
+    if re.match(r'([^\d\W]\.){2,}$', lt, re.UNICODE) is not None:
        # Normalize tokens of the form J.K. to J. K.
        parts = token.split('.')
        return '. '.join(map(capitalize, parts)).strip()
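The cap_author_token hunk above swaps the ASCII-only [a-z] class for [^\d\W] with re.UNICODE, so runs of initials in non-Latin scripts are normalized as well. A minimal standalone sketch of the difference (illustrative only, not part of the patch):

    # -*- coding: utf-8 -*-
    import re

    old_pat = re.compile(r'([a-z]\.){2,}$')
    new_pat = re.compile(r'([^\d\W]\.){2,}$', re.UNICODE)

    for token in (u'j.k.', u'в.п.'):
        # the old pattern only accepts Latin initials, the new one accepts any letter
        print token, bool(old_pat.match(token)), bool(new_pat.match(token))
    # j.k.  True  True
    # в.п.  False True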
diff --git a/src/calibre/ebooks/metadata/sources/ozon.py b/src/calibre/ebooks/metadata/sources/ozon.py
index 3f5f956fae..fa9951c40c 100644
--- a/src/calibre/ebooks/metadata/sources/ozon.py
+++ b/src/calibre/ebooks/metadata/sources/ozon.py
@@ -28,7 +28,7 @@ class Ozon(Source):
     touched_fields = frozenset(['title', 'authors', 'identifier:isbn', 'identifier:ozon',
                                 'publisher', 'pubdate', 'comments', 'series', 'rating', 'language'])
     # Test purpose only, test function does not like when sometimes some filed are empty
-    #touched_fields = frozenset(['title', 'authors', 'identifier:isbn', 'identifier:ozon',
+    # touched_fields = frozenset(['title', 'authors', 'identifier:isbn', 'identifier:ozon',
     #                 'publisher', 'pubdate', 'comments'])

     supports_gzip_transfer_encoding = True
@@ -109,8 +109,16 @@ class Ozon(Source):
     # }}}

     def get_metadata(self, log, entries, title, authors, identifiers): # {{{
+        # some book titles have extra characters like these
+        # TODO: make this a tweak
+        reRemoveFromTitle = None
+        #reRemoveFromTitle = re.compile(r'[?!:.,;+-/&%"\'=]')
+
         title = unicode(title).upper() if title else ''
-        authors = map(unicode.upper, map(unicode, authors)) if authors else None
+        if reRemoveFromTitle:
+            title = reRemoveFromTitle.sub('', title)
+        authors = map(_normalizeAuthorNameWithInitials,
+                      map(unicode.upper, map(unicode, authors))) if authors else None
         ozon_id = identifiers.get('ozon', None)

         unk = unicode(_('Unknown')).upper()
@@ -124,6 +132,7 @@ class Ozon(Source):
         def in_authors(authors, miauthors):
             for author in authors:
                 for miauthor in miauthors:
+                    #log.debug(u'=> %s <> %s'%(author, miauthor))
                     if author in miauthor:
                         return True
             return None
@@ -131,7 +140,10 @@ class Ozon(Source):
             match = True
             if title:
                 mititle = unicode(mi.title).upper() if mi.title else ''
+                if reRemoveFromTitle:
+                    mititle = reRemoveFromTitle.sub('', mititle)
                 match = title in mititle
+                #log.debug(u't=> %s <> %s'%(title, mititle))
             if match and authors:
                 miauthors = map(unicode.upper, map(unicode, mi.authors)) if mi.authors else []
                 match = in_authors(authors, miauthors)
@@ -190,7 +202,8 @@ class Ozon(Source):
             title = entry.xpath(xp_template.format('Name'))
             author = entry.xpath(xp_template.format('Author'))
-            mi = Metadata(title, author.split(','))
+            norm_authors = map(_normalizeAuthorNameWithInitials, map(unicode.strip, unicode(author).split(u',')))
+            mi = Metadata(title, norm_authors)

             ozon_id = entry.xpath(xp_template.format('ID'))
             mi.identifiers = {'ozon':ozon_id}

@@ -202,6 +215,11 @@ class Ozon(Source):
             if cover:
                 mi.ozon_cover_url = _translateToBigCoverUrl(cover)

+            pub_year = entry.xpath(xp_template.format('Year'))
+            if pub_year:
+                mi.pubdate = toPubdate(log, pub_year)
+                #log.debug('pubdate %s'%mi.pubdate)
+
             rating = entry.xpath(xp_template.format('ClientRatingValue'))
             if rating:
                 try:
@@ -269,13 +287,17 @@ class Ozon(Source):
         raw = self.browser.open_novisit(url, timeout=timeout).read()
         doc = html.fromstring(raw)

+        xpt_prod_det_at = u'string(//div[contains(@class, "product-detail")]//*[contains(normalize-space(text()), "%s")]/a[1]/@title)'
+        xpt_prod_det_tx = u'substring-after(//div[contains(@class, "product-detail")]//text()[contains(., "%s")], ":")'
+
         # series
-        xpt = u'normalize-space(//div[@class="frame_content"]//div[contains(normalize-space(text()), "Серия:")]//a/@title)'
+        xpt = xpt_prod_det_at % u'Сери'
+        # % u'Серия:'
         series = doc.xpath(xpt)
         if series:
             metadata.series = series

-        xpt = u'substring-after(//meta[@name="description"]/@content, "ISBN")'
+        xpt = u'normalize-space(substring-after(//meta[@name="description"]/@content, "ISBN"))'
         isbn_str = doc.xpath(xpt)
         if isbn_str:
             all_isbns = [check_isbn(isbn) for isbn in self.isbnRegex.findall(isbn_str) if check_isbn(isbn)]
@@ -283,38 +305,42 @@ class Ozon(Source):
             metadata.all_isbns = all_isbns
             metadata.isbn = all_isbns[0]

-        xpt = u'//div[@class="frame_content"]//div[contains(normalize-space(text()), "Издатель")]//a[@title="Издательство"]'
+        xpt = xpt_prod_det_at % u'Издатель'
         publishers = doc.xpath(xpt)
         if publishers:
-            metadata.publisher = publishers[0].text
+            metadata.publisher = publishers

-            xpt = u'string(../text()[contains(., "г.")])'
-            yearIn = publishers[0].xpath(xpt)
+        displ_lang = None
+        xpt = xpt_prod_det_tx % u'Язык'
+        langs = doc.xpath(xpt)
+        if langs:
+            lng_splt = langs.split(u',')
+            if lng_splt:
+                displ_lang = lng_splt[0].strip()
+        metadata.language = _translageLanguageToCode(displ_lang)
+        #log.debug(u'language: %s'%displ_lang)
+
+        # can be set beforehand from the xml search response
+        if not metadata.pubdate:
+            xpt = u'normalize-space(//div[@class="product-misc"]//text()[contains(., "г.")])'
+            yearIn = doc.xpath(xpt)
             if yearIn:
                 matcher = re.search(r'\d{4}', yearIn)
                 if matcher:
-                    year = int(matcher.group(0))
-                    # only year is available, so use 1-st of Jan
-                    metadata.pubdate = datetime.datetime(year, 1, 1) #<- failed comparation in identify.py
-                    #metadata.pubdate = datetime(year, 1, 1)
-                xpt = u'substring-after(string(../text()[contains(., "Язык")]), ": ")'
-                displLang = publishers[0].xpath(xpt)
-                lang_code =_translageLanguageToCode(displLang)
-                if lang_code:
-                    metadata.language = lang_code
+                    metadata.pubdate = toPubdate(log, matcher.group(0))

         # overwrite comments from HTML if any
-        # tr/td[contains(.//text(), "От издателя")] -> does not work, why?
-        xpt = u'//div[contains(@class, "detail")]//tr/td//text()[contains(., "От издателя")]'\
-            u'/ancestor::tr[1]/following-sibling::tr[1]/td[contains(./@class, "description")][1]'
+        xpt = u'//table[@id="detail_description"]//tr/td'
         comment_elem = doc.xpath(xpt)
         if comment_elem:
             comments = unicode(etree.tostring(comment_elem[0]))
             if comments:
                 # cleanup root tag, TODO: remove tags like object/embeded
-                comments = re.sub(r'^<td.*?>|</td>$', u'', comments).strip()
-                if comments:
+                comments = re.sub(r'\A.*?<td.*?>|</td>.*\Z', u'', comments.strip(), re.MULTILINE).strip()
+                if comments and (not metadata.comments or len(comments) > len(metadata.comments)):
                     metadata.comments = comments
+                else:
+                    log.debug('HTML book description skipped in favour of search service xml response')
         else:
             log.debug('No book description found in HTML')
     # }}}
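Because xpt_prod_det_at and xpt_prod_det_tx wrap the whole query in string()/substring-after() (and the series/ISBN expressions in normalize-space()), doc.xpath() now returns a single string rather than a list of elements, which is why the code above can assign metadata.publisher = publishers directly and call langs.split(u','). A small lxml sketch of the two return types, with invented markup (not a real Ozon page):

    from lxml import html

    # invented markup, only to show what xpath() returns for each kind of expression
    doc = html.fromstring(u'<html><body><div class="product-detail">'
                          u'<a title="Eksmo Press">publisher</a></div></body></html>')

    nodes = doc.xpath(u'//div[contains(@class, "product-detail")]//a')
    text = doc.xpath(u'string(//div[contains(@class, "product-detail")]//a[1]/@title)')

    print type(nodes), len(nodes)  # <type 'list'> 1 -> element queries return a list
    print type(text), text         # lxml "smart string" (a unicode subclass) -> Eksmo Press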
@@ -390,10 +416,40 @@ def _translageLanguageToCode(displayLang): # {{{
                u'Итальянский': 'it',
                u'Испанский': 'es',
                u'Китайский': 'zh',
-               u'Японский': 'ja' }
+               u'Японский': 'ja',
+               u'Финский' : 'fi',
+               u'Польский' : 'pl',}
     return langTbl.get(displayLang, None)
 # }}}

+# [В.П. Колесников | Колесников В.П.] -> В. П. Колесников
+def _normalizeAuthorNameWithInitials(name): # {{{
+    res = name
+    if name:
+        re1 = u'^(?P<lname>\S+)\s+(?P<fname>[^\d\W]\.)(?:\s*(?P<mname>[^\d\W]\.))?$'
+        re2 = u'^(?P<fname>[^\d\W]\.)(?:\s*(?P<mname>[^\d\W]\.))?\s+(?P<lname>\S+)$'
+        matcher = re.match(re1, unicode(name), re.UNICODE)
+        if not matcher:
+            matcher = re.match(re2, unicode(name), re.UNICODE)
+
+        if matcher:
+            d = matcher.groupdict()
+            res = ' '.join(x for x in (d['fname'], d['mname'], d['lname']) if x)
+    return res
+# }}}
+
+def toPubdate(log, yearAsString):
+    res = None
+    if yearAsString:
+        try:
+            year = int(yearAsString)
+            # only year is available, so use 1-st of Jan
+            res = datetime.datetime(year, 1, 1)
+        except:
+            log.error('cannot parse to date %s'%yearAsString)
+    return res
+
+
 if __name__ == '__main__': # tests {{{
     # To run these test use: calibre-debug -e src/calibre/ebooks/metadata/sources/ozon.py
     # comment some touched_fields before run thoses tests
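A quick usage sketch of the two helpers added above; it assumes a calibre development environment (for example run through calibre-debug -e, as the test comment above suggests) so that the import resolves, and the expected values follow from the regexes and the datetime construction:

    # -*- coding: utf-8 -*-
    import logging
    from calibre.ebooks.metadata.sources.ozon import _normalizeAuthorNameWithInitials, toPubdate

    log = logging.getLogger('ozon-sketch')

    # both orderings collapse to the same "initials first, surname last" form
    print _normalizeAuthorNameWithInitials(u'Колесников В.П.')       # В. П. Колесников
    print _normalizeAuthorNameWithInitials(u'В.П. Колесников')       # В. П. Колесников
    # names without an initials pattern pass through unchanged
    print _normalizeAuthorNameWithInitials(u'Erich Maria Remarque')  # Erich Maria Remarque

    # only the year is known, so January 1st is used; bad input logs an error and yields None
    print toPubdate(log, u'2010')  # 2010-01-01 00:00:00
    print toPubdate(log, u'')      # None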
@@ -403,40 +459,45 @@ if __name__ == '__main__': # tests {{{
     test_identify_plugin(Ozon.name,
         [
-
-            (
+#            (
+#                {'identifiers':{}, 'title':u'Норвежский язык: Практический курс',
+#                 'authors':[u'Колесников В.П.', u'Г.В. Шатков']},
+#                [title_test(u'Норвежский язык: Практический курс', exact=True),
+#                 authors_test([u'В. П. Колесников', u'Г. В. Шатков'])]
+#            ),
+            (
                 {'identifiers':{'isbn': '9785916572629'} },
                 [title_test(u'На все четыре стороны', exact=True),
                  authors_test([u'А. А. Гилл'])]
-            ),
-            (
+            ),
+            (
                 {'identifiers':{}, 'title':u'Der Himmel Kennt Keine Gunstlinge', 'authors':[u'Erich Maria Remarque']},
                 [title_test(u'Der Himmel Kennt Keine Gunstlinge', exact=True),
                  authors_test([u'Erich Maria Remarque'])]
-            ),
-            (
+            ),
+            (
                 {'identifiers':{ }, 'title':u'Метро 2033', 'authors':[u'Дмитрий Глуховский']},
                 [title_test(u'Метро 2033', exact=False)]
-            ),
-            (
+            ),
+            (
                 {'identifiers':{'isbn': '9785170727209'}, 'title':u'Метро 2033', 'authors':[u'Дмитрий Глуховский']},
                 [title_test(u'Метро 2033', exact=True),
                  authors_test([u'Дмитрий Глуховский']),
                  isbn_test('9785170727209')]
-            ),
-            (
+            ),
+            (
                 {'identifiers':{'isbn': '5-699-13613-4'}, 'title':u'Метро 2033', 'authors':[u'Дмитрий Глуховский']},
                 [title_test(u'Метро 2033', exact=True),
                  authors_test([u'Дмитрий Глуховский'])]
-            ),
-            (
+            ),
+            (
                 {'identifiers':{}, 'title':u'Метро', 'authors':[u'Глуховский']},
                 [title_test(u'Метро', exact=False)]
-            ),
+            ),
     ])
 # }}}
diff --git a/src/calibre/gui2/store/stores/ozon_ru_plugin.py b/src/calibre/gui2/store/stores/ozon_ru_plugin.py
index 866c1c2732..3934ebbbb3 100644
--- a/src/calibre/gui2/store/stores/ozon_ru_plugin.py
+++ b/src/calibre/gui2/store/stores/ozon_ru_plugin.py
@@ -80,13 +80,15 @@ class OzonRUStore(BasicStoreConfig, StorePlugin):
         doc = html.fromstring(f.read())

         # example where we are going to find formats
-        # <div class="box">
-        #   ...
-        #   Доступные форматы:
-        #   <div>.epub, .fb2, .pdf, .pdf, .txt</div>
-        #   ...
-        # </div>
-        xpt = u'normalize-space(//div[@class="box"]//*[contains(normalize-space(text()), "Доступные форматы:")][1]/following-sibling::div[1]/text())'
+        # <div class="product-detail">
+        #   <div>
+        #     Доступно:
+        #   </div>
+        #   <div>
+        #     <div>.epub, .fb2.zip, .pdf</div>
+        #   </div>
+        # </div>
+        xpt = u'normalize-space(//div[contains(@class, "product-detail")]//*[contains(normalize-space(text()), "Доступ")]/ancestor-or-self::div[1]/following-sibling::div[1]/*[1])'
         formats = doc.xpath(xpt)
         if formats:
             result = True
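The replacement expression finds the element whose text contains "Доступ", climbs to its nearest enclosing div, and reads the first child of the next sibling div. A standalone check with lxml against made-up markup shaped like the comment above (not a real Ozon page; an empty string comes back if the layout differs):

    # -*- coding: utf-8 -*-
    from lxml import html

    # made-up markup mirroring the structure sketched in the comment above
    raw = u'''<html><body>
      <div class="product-detail">
        <div><p>Доступно:</p></div>
        <div><div>.epub, .fb2.zip, .pdf</div></div>
      </div>
    </body></html>'''
    doc = html.fromstring(raw)

    xpt = (u'normalize-space(//div[contains(@class, "product-detail")]'
           u'//*[contains(normalize-space(text()), "Доступ")]'
           u'/ancestor-or-self::div[1]/following-sibling::div[1]/*[1])')
    print doc.xpath(xpt)  # .epub, .fb2.zip, .pdf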