Fix encoding of comments incorrectly detected when downloading metadata from ozon.ru

This commit is contained in:
Kovid Goyal 2011-11-19 20:37:33 +05:30
commit 17f9da26c4
4 changed files with 39 additions and 24 deletions

View File

@ -710,7 +710,8 @@ class Metadata(object):
fmt('Title sort', self.title_sort)
if self.authors:
fmt('Author(s)', authors_to_string(self.authors) + \
((' [' + self.author_sort + ']') if self.author_sort else ''))
((' [' + self.author_sort + ']')
if self.author_sort and self.author_sort != _('Unknown') else ''))
if self.publisher:
fmt('Publisher', self.publisher)
if getattr(self, 'book_producer', False):

View File

@ -11,7 +11,7 @@ import datetime
from urllib import quote_plus
from Queue import Queue, Empty
from lxml import etree, html
from calibre import as_unicode
from calibre import prints, as_unicode
from calibre.ebooks.chardet import xml_to_unicode
@ -54,7 +54,8 @@ class Ozon(Source):
def create_query(self, log, title=None, authors=None, identifiers={}): # {{{
# div_book -> search only books, ebooks and audio books
search_url = self.ozon_url + '/webservice/webservice.asmx/SearchWebService?searchContext=div_book&searchText='
# for ozon.ru search we have to format ISBN with '-'
isbn = _format_isbn(log, identifiers.get('isbn', None))
# TODO: format isbn!
qItems = set([isbn, title])
@ -64,7 +65,7 @@ class Ozon(Source):
qItems.discard('')
qItems = map(_quoteString, qItems)
q = ' '.join(qItems).strip()
q = u' '.join(qItems).strip()
log.info(u'search string: ' + q)
if isinstance(q, unicode):
@ -78,13 +79,13 @@ class Ozon(Source):
return search_url
# }}}
def identify(self, log, result_queue, abort, title=None, authors=None, # {{{
identifiers={}, timeout=30):
def identify(self, log, result_queue, abort, title=None, authors=None,
identifiers={}, timeout=30): # {{{
if not self.is_configured():
return
query = self.create_query(log, title=title, authors=authors, identifiers=identifiers)
if not query:
err = 'Insufficient metadata to construct query'
err = u'Insufficient metadata to construct query'
log.error(err)
return err
@ -109,7 +110,7 @@ class Ozon(Source):
# }}}
def get_metadata(self, log, entries, title, authors, identifiers): # {{{
# some book titles have extra charactes like this
# some book titles have extra characters like this
# TODO: make a twick
reRemoveFromTitle = None
#reRemoveFromTitle = re.compile(r'[?!:.,;+-/&%"\'=]')
@ -160,7 +161,7 @@ class Ozon(Source):
mi.source_relevance = i
if ensure_metadata_match(mi):
metadata.append(mi)
# log.debug(u'added metadata %s %s. '%(mi.title, mi.authors))
#log.debug(u'added metadata %s %s.'%(mi.title, mi.authors))
else:
log.debug(u'skipped metadata %s %s. (does not match the query)'%(mi.title, mi.authors))
return metadata
@ -285,12 +286,12 @@ class Ozon(Source):
url = self.get_book_url(metadata.get_identifiers())[2]
raw = self.browser.open_novisit(url, timeout=timeout).read()
doc = html.fromstring(raw)
doc = html.fromstring(xml_to_unicode(raw, verbose=True)[0])
xpt_prod_det_at = u'string(//div[contains(@class, "product-detail")]//*[contains(normalize-space(text()), "%s")]/a[1]/@title)'
xpt_prod_det_tx = u'substring-after(//div[contains(@class, "product-detail")]//text()[contains(., "%s")], ":")'
# series
# series Серия/Серии
xpt = xpt_prod_det_at % u'Сери'
# % u'Серия:'
series = doc.xpath(xpt)
@ -300,7 +301,7 @@ class Ozon(Source):
xpt = u'normalize-space(substring-after(//meta[@name="description"]/@content, "ISBN"))'
isbn_str = doc.xpath(xpt)
if isbn_str:
all_isbns = [check_isbn(isbn) for isbn in self.isbnRegex.findall(isbn_str) if check_isbn(isbn)]
all_isbns = [check_isbn(isbn) for isbn in self.isbnRegex.findall(isbn_str) if _verifyISBNIntegrity(log, isbn)]
if all_isbns:
metadata.all_isbns = all_isbns
metadata.isbn = all_isbns[0]
@ -333,10 +334,10 @@ class Ozon(Source):
xpt = u'//table[@id="detail_description"]//tr/td'
comment_elem = doc.xpath(xpt)
if comment_elem:
comments = unicode(etree.tostring(comment_elem[0]))
comments = unicode(etree.tostring(comment_elem[0], encoding=unicode))
if comments:
# cleanup root tag, TODO: remove tags like object/embeded
comments = re.sub(r'\A.*?<td.*?>|</td>.*\Z', u'', comments.strip(), re.MULTILINE).strip()
comments = re.sub(ur'\A.*?<td.*?>|</td>.*\Z', u'', comments.strip(), re.MULTILINE).strip()
if comments and (not metadata.comments or len(comments) > len(metadata.comments)):
metadata.comments = comments
else:
@ -345,8 +346,16 @@ class Ozon(Source):
log.debug('No book description found in HTML')
# }}}
def _quoteString(str): # {{{
return '"' + str + '"' if str and str.find(' ') != -1 else str
def _quoteString(strToQuote): # {{{
return '"' + strToQuote + '"' if strToQuote and strToQuote.find(' ') != -1 else strToQuote
# }}}
def _verifyISBNIntegrity(log, isbn): # {{{
# Online ISBN-Check http://www.isbn-check.de/
res = check_isbn(isbn)
if not res:
log.error(u'ISBN integrity check failed for "%s"'%isbn)
return res is not None
# }}}
# TODO: make customizable
@ -438,7 +447,7 @@ def _normalizeAuthorNameWithInitials(name): # {{{
return res
# }}}
def toPubdate(log, yearAsString):
def toPubdate(log, yearAsString): # {{{
res = None
if yearAsString:
try:
@ -448,7 +457,11 @@ def toPubdate(log, yearAsString):
except:
log.error('cannot parse to date %s'%yearAsString)
return res
# }}}
def _listToUnicodePrintStr(lst): # {{{
return u'[' + u', '.join(unicode(x) for x in lst) + u']'
# }}}
if __name__ == '__main__': # tests {{{
# To run these test use: calibre-debug -e src/calibre/ebooks/metadata/sources/ozon.py

View File

@ -77,7 +77,8 @@ class OzonRUStore(BasicStoreConfig, StorePlugin):
result = False
with closing(br.open(url, timeout=timeout)) as f:
doc = html.fromstring(f.read())
raw = xml_to_unicode(f.read(), verbose=True)[0]
doc = html.fromstring(raw)
# example where we are going to find formats
# <div class="l">
@ -88,7 +89,7 @@ class OzonRUStore(BasicStoreConfig, StorePlugin):
# <div class="l">
# <p>.epub, .fb2.zip, .pdf</p>
# </div>
xpt = u'normalize-space(//div[contains(@class, "product-detail")]//*[contains(normalize-space(text()), "Доступ")]/ancestor-or-self::div[1]/following-sibling::div[1]/*[1])'
xpt = u'normalize-space(//div[contains(@id, "saleBlock")]//*[contains(normalize-space(text()), "Доступ")]/ancestor-or-self::div[1]/following-sibling::div[1]/*[1])'
formats = doc.xpath(xpt)
if formats:
result = True

View File

@ -12539,7 +12539,7 @@ msgstr "За&грузить метаданные"
#: /home/kovid/work/calibre/src/calibre/gui2/metadata/single.py:226
msgid "Configure download metadata"
msgstr ""
msgstr "Настроить загрузку метаданных"
#: /home/kovid/work/calibre/src/calibre/gui2/metadata/single.py:230
msgid "Change how calibre downloads metadata"
@ -12595,7 +12595,7 @@ msgstr "&Пользовательские метаданные"
#: /home/kovid/work/calibre/src/calibre/gui2/metadata/single.py:788
msgid "&Comments"
msgstr "Комментарии"
msgstr "&Комментарии"
#: /home/kovid/work/calibre/src/calibre/gui2/metadata/single.py:854
msgid "Basic metadata"
@ -12603,11 +12603,11 @@ msgstr "Основные метаданные"
#: /home/kovid/work/calibre/src/calibre/gui2/metadata/single_download.py:133
msgid "Has cover"
msgstr "Есть обложка"
msgstr "Обложка"
#: /home/kovid/work/calibre/src/calibre/gui2/metadata/single_download.py:133
msgid "Has summary"
msgstr ""
msgstr "Аннотация"
#: /home/kovid/work/calibre/src/calibre/gui2/metadata/single_download.py:190
msgid ""
@ -12619,7 +12619,7 @@ msgstr ""
#: /home/kovid/work/calibre/src/calibre/gui2/metadata/single_download.py:268
msgid "See at"
msgstr ""
msgstr "Посмотреть на"
#: /home/kovid/work/calibre/src/calibre/gui2/metadata/single_download.py:403
msgid "calibre is downloading metadata from: "