diff --git a/src/calibre/ebooks/html/input.py b/src/calibre/ebooks/html/input.py index f68ea4f678..4ebf344a2a 100644 --- a/src/calibre/ebooks/html/input.py +++ b/src/calibre/ebooks/html/input.py @@ -11,7 +11,7 @@ __docformat__ = 'restructuredtext en' Input plugin for HTML or OPF ebooks. ''' -import os, re, sys, uuid, tempfile, errno +import os, re, sys, uuid, tempfile from urlparse import urlparse, urlunparse from urllib import unquote from functools import partial diff --git a/src/calibre/ebooks/metadata/sources/base.py b/src/calibre/ebooks/metadata/sources/base.py index 2d7bb73e9c..701394e1a5 100644 --- a/src/calibre/ebooks/metadata/sources/base.py +++ b/src/calibre/ebooks/metadata/sources/base.py @@ -116,7 +116,8 @@ def cap_author_token(token): lt = lower(token) if lt in ('von', 'de', 'el', 'van', 'le'): return lt - if re.match(r'([a-z]\.){2,}$', lt) is not None: + # no digits, no special characters + if re.match(r'([^\d\W]\.){2,}$', lt, re.UNICODE) is not None: # Normalize tokens of the form J.K. to J. K. parts = token.split('.') return '. 
'.join(map(capitalize, parts)).strip() diff --git a/src/calibre/ebooks/metadata/sources/ozon.py b/src/calibre/ebooks/metadata/sources/ozon.py index 3f5f956fae..fa9951c40c 100644 --- a/src/calibre/ebooks/metadata/sources/ozon.py +++ b/src/calibre/ebooks/metadata/sources/ozon.py @@ -28,7 +28,7 @@ class Ozon(Source): touched_fields = frozenset(['title', 'authors', 'identifier:isbn', 'identifier:ozon', 'publisher', 'pubdate', 'comments', 'series', 'rating', 'language']) # Test purpose only, test function does not like when sometimes some filed are empty - #touched_fields = frozenset(['title', 'authors', 'identifier:isbn', 'identifier:ozon', + # touched_fields = frozenset(['title', 'authors', 'identifier:isbn', 'identifier:ozon', # 'publisher', 'pubdate', 'comments']) supports_gzip_transfer_encoding = True @@ -109,8 +109,16 @@ class Ozon(Source): # }}} def get_metadata(self, log, entries, title, authors, identifiers): # {{{ + # some book titles have extra characters like this + # TODO: make a tweak + reRemoveFromTitle = None + #reRemoveFromTitle = re.compile(r'[?!:.,;+-/&%"\'=]') + title = unicode(title).upper() if title else '' - authors = map(unicode.upper, map(unicode, authors)) if authors else None + if reRemoveFromTitle: + title = reRemoveFromTitle.sub('', title) + authors = map(_normalizeAuthorNameWithInitials, + map(unicode.upper, map(unicode, authors))) if authors else None ozon_id = identifiers.get('ozon', None) unk = unicode(_('Unknown')).upper() @@ -124,6 +132,7 @@ class Ozon(Source): def in_authors(authors, miauthors): for author in authors: for miauthor in miauthors: + #log.debug(u'=> %s <> %s'%(author, miauthor)) if author in miauthor: return True return None @@ -131,7 +140,10 @@ class Ozon(Source): match = True if title: mititle = unicode(mi.title).upper() if mi.title else '' + if reRemoveFromTitle: + mititle = reRemoveFromTitle.sub('', mititle) match = title in mititle + #log.debug(u't=> %s <> %s'%(title, mititle)) if match and authors: miauthors = 
map(unicode.upper, map(unicode, mi.authors)) if mi.authors else [] match = in_authors(authors, miauthors) @@ -190,7 +202,8 @@ class Ozon(Source): title = entry.xpath(xp_template.format('Name')) author = entry.xpath(xp_template.format('Author')) - mi = Metadata(title, author.split(',')) + norm_authors = map(_normalizeAuthorNameWithInitials, map(unicode.strip, unicode(author).split(u','))) + mi = Metadata(title, norm_authors) ozon_id = entry.xpath(xp_template.format('ID')) mi.identifiers = {'ozon':ozon_id} @@ -202,6 +215,11 @@ class Ozon(Source): if cover: mi.ozon_cover_url = _translateToBigCoverUrl(cover) + pub_year = entry.xpath(xp_template.format('Year')) + if pub_year: + mi.pubdate = toPubdate(log, pub_year) + #log.debug('pubdate %s'%mi.pubdate) + rating = entry.xpath(xp_template.format('ClientRatingValue')) if rating: try: @@ -269,13 +287,17 @@ class Ozon(Source): raw = self.browser.open_novisit(url, timeout=timeout).read() doc = html.fromstring(raw) + xpt_prod_det_at = u'string(//div[contains(@class, "product-detail")]//*[contains(normalize-space(text()), "%s")]/a[1]/@title)' + xpt_prod_det_tx = u'substring-after(//div[contains(@class, "product-detail")]//text()[contains(., "%s")], ":")' + # series - xpt = u'normalize-space(//div[@class="frame_content"]//div[contains(normalize-space(text()), "Серия:")]//a/@title)' + xpt = xpt_prod_det_at % u'Сери' + # % u'Серия:' series = doc.xpath(xpt) if series: metadata.series = series - xpt = u'substring-after(//meta[@name="description"]/@content, "ISBN")' + xpt = u'normalize-space(substring-after(//meta[@name="description"]/@content, "ISBN"))' isbn_str = doc.xpath(xpt) if isbn_str: all_isbns = [check_isbn(isbn) for isbn in self.isbnRegex.findall(isbn_str) if check_isbn(isbn)] @@ -283,38 +305,42 @@ class Ozon(Source): metadata.all_isbns = all_isbns metadata.isbn = all_isbns[0] - xpt = u'//div[@class="frame_content"]//div[contains(normalize-space(text()), "Издатель")]//a[@title="Издательство"]' + xpt = xpt_prod_det_at % 
u'Издатель' publishers = doc.xpath(xpt) if publishers: - metadata.publisher = publishers[0].text + metadata.publisher = publishers - xpt = u'string(../text()[contains(., "г.")])' - yearIn = publishers[0].xpath(xpt) + displ_lang = None + xpt = xpt_prod_det_tx % u'Язык' + langs = doc.xpath(xpt) + if langs: + lng_splt = langs.split(u',') + if lng_splt: + displ_lang = lng_splt[0].strip() + metadata.language = _translageLanguageToCode(displ_lang) + #log.debug(u'language: %s'%displ_lang) + + # can be set before from xml search response + if not metadata.pubdate: + xpt = u'normalize-space(//div[@class="product-misc"]//text()[contains(., "г.")])' + yearIn = doc.xpath(xpt) if yearIn: matcher = re.search(r'\d{4}', yearIn) if matcher: - year = int(matcher.group(0)) - # only year is available, so use 1-st of Jan - metadata.pubdate = datetime.datetime(year, 1, 1) #<- failed comparation in identify.py - #metadata.pubdate = datetime(year, 1, 1) - xpt = u'substring-after(string(../text()[contains(., "Язык")]), ": ")' - displLang = publishers[0].xpath(xpt) - lang_code =_translageLanguageToCode(displLang) - if lang_code: - metadata.language = lang_code + metadata.pubdate = toPubdate(log, matcher.group(0)) # overwrite comments from HTML if any - # tr/td[contains(.//text(), "От издателя")] -> does not work, why? - xpt = u'//div[contains(@class, "detail")]//tr/td//text()[contains(., "От издателя")]'\ - u'/ancestor::tr[1]/following-sibling::tr[1]/td[contains(./@class, "description")][1]' + xpt = u'//table[@id="detail_description"]//tr/td' comment_elem = doc.xpath(xpt) if comment_elem: comments = unicode(etree.tostring(comment_elem[0])) if comments: # cleanup root tag, TODO: remove tags like object/embeded - comments = re.sub(r'^
+ # Доступно: + #
#.epub, .fb2.zip, .pdf
+ #