mirror of
https://github.com/kovidgoyal/calibre.git
synced 2025-07-07 10:14:46 -04:00
Fix encoding of comments incorrectly detected when downloading metadata from ozon.ru
This commit is contained in:
commit
17f9da26c4
@ -710,7 +710,8 @@ class Metadata(object):
|
||||
fmt('Title sort', self.title_sort)
|
||||
if self.authors:
|
||||
fmt('Author(s)', authors_to_string(self.authors) + \
|
||||
((' [' + self.author_sort + ']') if self.author_sort else ''))
|
||||
((' [' + self.author_sort + ']')
|
||||
if self.author_sort and self.author_sort != _('Unknown') else ''))
|
||||
if self.publisher:
|
||||
fmt('Publisher', self.publisher)
|
||||
if getattr(self, 'book_producer', False):
|
||||
|
@ -11,7 +11,7 @@ import datetime
|
||||
from urllib import quote_plus
|
||||
from Queue import Queue, Empty
|
||||
from lxml import etree, html
|
||||
from calibre import as_unicode
|
||||
from calibre import prints, as_unicode
|
||||
|
||||
from calibre.ebooks.chardet import xml_to_unicode
|
||||
|
||||
@ -55,6 +55,7 @@ class Ozon(Source):
|
||||
# div_book -> search only books, ebooks and audio books
|
||||
search_url = self.ozon_url + '/webservice/webservice.asmx/SearchWebService?searchContext=div_book&searchText='
|
||||
|
||||
# for ozon.ru search we have to format ISBN with '-'
|
||||
isbn = _format_isbn(log, identifiers.get('isbn', None))
|
||||
# TODO: format isbn!
|
||||
qItems = set([isbn, title])
|
||||
@ -64,7 +65,7 @@ class Ozon(Source):
|
||||
qItems.discard('')
|
||||
qItems = map(_quoteString, qItems)
|
||||
|
||||
q = ' '.join(qItems).strip()
|
||||
q = u' '.join(qItems).strip()
|
||||
log.info(u'search string: ' + q)
|
||||
|
||||
if isinstance(q, unicode):
|
||||
@ -78,13 +79,13 @@ class Ozon(Source):
|
||||
return search_url
|
||||
# }}}
|
||||
|
||||
def identify(self, log, result_queue, abort, title=None, authors=None, # {{{
|
||||
identifiers={}, timeout=30):
|
||||
def identify(self, log, result_queue, abort, title=None, authors=None,
|
||||
identifiers={}, timeout=30): # {{{
|
||||
if not self.is_configured():
|
||||
return
|
||||
query = self.create_query(log, title=title, authors=authors, identifiers=identifiers)
|
||||
if not query:
|
||||
err = 'Insufficient metadata to construct query'
|
||||
err = u'Insufficient metadata to construct query'
|
||||
log.error(err)
|
||||
return err
|
||||
|
||||
@ -109,7 +110,7 @@ class Ozon(Source):
|
||||
# }}}
|
||||
|
||||
def get_metadata(self, log, entries, title, authors, identifiers): # {{{
|
||||
# some book titles have extra charactes like this
|
||||
# some book titles have extra characters like this
|
||||
# TODO: make a twick
|
||||
reRemoveFromTitle = None
|
||||
#reRemoveFromTitle = re.compile(r'[?!:.,;+-/&%"\'=]')
|
||||
@ -160,7 +161,7 @@ class Ozon(Source):
|
||||
mi.source_relevance = i
|
||||
if ensure_metadata_match(mi):
|
||||
metadata.append(mi)
|
||||
# log.debug(u'added metadata %s %s. '%(mi.title, mi.authors))
|
||||
#log.debug(u'added metadata %s %s.'%(mi.title, mi.authors))
|
||||
else:
|
||||
log.debug(u'skipped metadata %s %s. (does not match the query)'%(mi.title, mi.authors))
|
||||
return metadata
|
||||
@ -285,12 +286,12 @@ class Ozon(Source):
|
||||
url = self.get_book_url(metadata.get_identifiers())[2]
|
||||
|
||||
raw = self.browser.open_novisit(url, timeout=timeout).read()
|
||||
doc = html.fromstring(raw)
|
||||
doc = html.fromstring(xml_to_unicode(raw, verbose=True)[0])
|
||||
|
||||
xpt_prod_det_at = u'string(//div[contains(@class, "product-detail")]//*[contains(normalize-space(text()), "%s")]/a[1]/@title)'
|
||||
xpt_prod_det_tx = u'substring-after(//div[contains(@class, "product-detail")]//text()[contains(., "%s")], ":")'
|
||||
|
||||
# series
|
||||
# series Серия/Серии
|
||||
xpt = xpt_prod_det_at % u'Сери'
|
||||
# % u'Серия:'
|
||||
series = doc.xpath(xpt)
|
||||
@ -300,7 +301,7 @@ class Ozon(Source):
|
||||
xpt = u'normalize-space(substring-after(//meta[@name="description"]/@content, "ISBN"))'
|
||||
isbn_str = doc.xpath(xpt)
|
||||
if isbn_str:
|
||||
all_isbns = [check_isbn(isbn) for isbn in self.isbnRegex.findall(isbn_str) if check_isbn(isbn)]
|
||||
all_isbns = [check_isbn(isbn) for isbn in self.isbnRegex.findall(isbn_str) if _verifyISBNIntegrity(log, isbn)]
|
||||
if all_isbns:
|
||||
metadata.all_isbns = all_isbns
|
||||
metadata.isbn = all_isbns[0]
|
||||
@ -333,10 +334,10 @@ class Ozon(Source):
|
||||
xpt = u'//table[@id="detail_description"]//tr/td'
|
||||
comment_elem = doc.xpath(xpt)
|
||||
if comment_elem:
|
||||
comments = unicode(etree.tostring(comment_elem[0]))
|
||||
comments = unicode(etree.tostring(comment_elem[0], encoding=unicode))
|
||||
if comments:
|
||||
# cleanup root tag, TODO: remove tags like object/embeded
|
||||
comments = re.sub(r'\A.*?<td.*?>|</td>.*\Z', u'', comments.strip(), re.MULTILINE).strip()
|
||||
comments = re.sub(ur'\A.*?<td.*?>|</td>.*\Z', u'', comments.strip(), re.MULTILINE).strip()
|
||||
if comments and (not metadata.comments or len(comments) > len(metadata.comments)):
|
||||
metadata.comments = comments
|
||||
else:
|
||||
@ -345,8 +346,16 @@ class Ozon(Source):
|
||||
log.debug('No book description found in HTML')
|
||||
# }}}
|
||||
|
||||
def _quoteString(str): # {{{
|
||||
return '"' + str + '"' if str and str.find(' ') != -1 else str
|
||||
def _quoteString(strToQuote): # {{{
|
||||
return '"' + strToQuote + '"' if strToQuote and strToQuote.find(' ') != -1 else strToQuote
|
||||
# }}}
|
||||
|
||||
def _verifyISBNIntegrity(log, isbn): # {{{
|
||||
# Online ISBN-Check http://www.isbn-check.de/
|
||||
res = check_isbn(isbn)
|
||||
if not res:
|
||||
log.error(u'ISBN integrity check failed for "%s"'%isbn)
|
||||
return res is not None
|
||||
# }}}
|
||||
|
||||
# TODO: make customizable
|
||||
@ -438,7 +447,7 @@ def _normalizeAuthorNameWithInitials(name): # {{{
|
||||
return res
|
||||
# }}}
|
||||
|
||||
def toPubdate(log, yearAsString):
|
||||
def toPubdate(log, yearAsString): # {{{
|
||||
res = None
|
||||
if yearAsString:
|
||||
try:
|
||||
@ -448,7 +457,11 @@ def toPubdate(log, yearAsString):
|
||||
except:
|
||||
log.error('cannot parse to date %s'%yearAsString)
|
||||
return res
|
||||
# }}}
|
||||
|
||||
def _listToUnicodePrintStr(lst): # {{{
|
||||
return u'[' + u', '.join(unicode(x) for x in lst) + u']'
|
||||
# }}}
|
||||
|
||||
if __name__ == '__main__': # tests {{{
|
||||
# To run these test use: calibre-debug -e src/calibre/ebooks/metadata/sources/ozon.py
|
||||
|
@ -77,7 +77,8 @@ class OzonRUStore(BasicStoreConfig, StorePlugin):
|
||||
|
||||
result = False
|
||||
with closing(br.open(url, timeout=timeout)) as f:
|
||||
doc = html.fromstring(f.read())
|
||||
raw = xml_to_unicode(f.read(), verbose=True)[0]
|
||||
doc = html.fromstring(raw)
|
||||
|
||||
# example where we are going to find formats
|
||||
# <div class="l">
|
||||
@ -88,7 +89,7 @@ class OzonRUStore(BasicStoreConfig, StorePlugin):
|
||||
# <div class="l">
|
||||
# <p>.epub, .fb2.zip, .pdf</p>
|
||||
# </div>
|
||||
xpt = u'normalize-space(//div[contains(@class, "product-detail")]//*[contains(normalize-space(text()), "Доступ")]/ancestor-or-self::div[1]/following-sibling::div[1]/*[1])'
|
||||
xpt = u'normalize-space(//div[contains(@id, "saleBlock")]//*[contains(normalize-space(text()), "Доступ")]/ancestor-or-self::div[1]/following-sibling::div[1]/*[1])'
|
||||
formats = doc.xpath(xpt)
|
||||
if formats:
|
||||
result = True
|
||||
|
@ -12539,7 +12539,7 @@ msgstr "За&грузить метаданные"
|
||||
|
||||
#: /home/kovid/work/calibre/src/calibre/gui2/metadata/single.py:226
|
||||
msgid "Configure download metadata"
|
||||
msgstr ""
|
||||
msgstr "Настроить загрузку метаданных"
|
||||
|
||||
#: /home/kovid/work/calibre/src/calibre/gui2/metadata/single.py:230
|
||||
msgid "Change how calibre downloads metadata"
|
||||
@ -12595,7 +12595,7 @@ msgstr "&Пользовательские метаданные"
|
||||
|
||||
#: /home/kovid/work/calibre/src/calibre/gui2/metadata/single.py:788
|
||||
msgid "&Comments"
|
||||
msgstr "Комментарии"
|
||||
msgstr "&Комментарии"
|
||||
|
||||
#: /home/kovid/work/calibre/src/calibre/gui2/metadata/single.py:854
|
||||
msgid "Basic metadata"
|
||||
@ -12603,11 +12603,11 @@ msgstr "Основные метаданные"
|
||||
|
||||
#: /home/kovid/work/calibre/src/calibre/gui2/metadata/single_download.py:133
|
||||
msgid "Has cover"
|
||||
msgstr "Есть обложка"
|
||||
msgstr "Обложка"
|
||||
|
||||
#: /home/kovid/work/calibre/src/calibre/gui2/metadata/single_download.py:133
|
||||
msgid "Has summary"
|
||||
msgstr ""
|
||||
msgstr "Аннотация"
|
||||
|
||||
#: /home/kovid/work/calibre/src/calibre/gui2/metadata/single_download.py:190
|
||||
msgid ""
|
||||
@ -12619,7 +12619,7 @@ msgstr ""
|
||||
|
||||
#: /home/kovid/work/calibre/src/calibre/gui2/metadata/single_download.py:268
|
||||
msgid "See at"
|
||||
msgstr ""
|
||||
msgstr "Посмотреть на"
|
||||
|
||||
#: /home/kovid/work/calibre/src/calibre/gui2/metadata/single_download.py:403
|
||||
msgid "calibre is downloading metadata from: "
|
||||
|
Loading…
x
Reference in New Issue
Block a user