mirror of
https://github.com/kovidgoyal/calibre.git
synced 2025-06-23 15:30:45 -04:00
Fix encoding of comments incorrectly detected when downloading metadata from ozon.ru
This commit is contained in:
commit
17f9da26c4
@ -710,7 +710,8 @@ class Metadata(object):
|
|||||||
fmt('Title sort', self.title_sort)
|
fmt('Title sort', self.title_sort)
|
||||||
if self.authors:
|
if self.authors:
|
||||||
fmt('Author(s)', authors_to_string(self.authors) + \
|
fmt('Author(s)', authors_to_string(self.authors) + \
|
||||||
((' [' + self.author_sort + ']') if self.author_sort else ''))
|
((' [' + self.author_sort + ']')
|
||||||
|
if self.author_sort and self.author_sort != _('Unknown') else ''))
|
||||||
if self.publisher:
|
if self.publisher:
|
||||||
fmt('Publisher', self.publisher)
|
fmt('Publisher', self.publisher)
|
||||||
if getattr(self, 'book_producer', False):
|
if getattr(self, 'book_producer', False):
|
||||||
|
@ -11,7 +11,7 @@ import datetime
|
|||||||
from urllib import quote_plus
|
from urllib import quote_plus
|
||||||
from Queue import Queue, Empty
|
from Queue import Queue, Empty
|
||||||
from lxml import etree, html
|
from lxml import etree, html
|
||||||
from calibre import as_unicode
|
from calibre import prints, as_unicode
|
||||||
|
|
||||||
from calibre.ebooks.chardet import xml_to_unicode
|
from calibre.ebooks.chardet import xml_to_unicode
|
||||||
|
|
||||||
@ -55,6 +55,7 @@ class Ozon(Source):
|
|||||||
# div_book -> search only books, ebooks and audio books
|
# div_book -> search only books, ebooks and audio books
|
||||||
search_url = self.ozon_url + '/webservice/webservice.asmx/SearchWebService?searchContext=div_book&searchText='
|
search_url = self.ozon_url + '/webservice/webservice.asmx/SearchWebService?searchContext=div_book&searchText='
|
||||||
|
|
||||||
|
# for ozon.ru search we have to format ISBN with '-'
|
||||||
isbn = _format_isbn(log, identifiers.get('isbn', None))
|
isbn = _format_isbn(log, identifiers.get('isbn', None))
|
||||||
# TODO: format isbn!
|
# TODO: format isbn!
|
||||||
qItems = set([isbn, title])
|
qItems = set([isbn, title])
|
||||||
@ -64,7 +65,7 @@ class Ozon(Source):
|
|||||||
qItems.discard('')
|
qItems.discard('')
|
||||||
qItems = map(_quoteString, qItems)
|
qItems = map(_quoteString, qItems)
|
||||||
|
|
||||||
q = ' '.join(qItems).strip()
|
q = u' '.join(qItems).strip()
|
||||||
log.info(u'search string: ' + q)
|
log.info(u'search string: ' + q)
|
||||||
|
|
||||||
if isinstance(q, unicode):
|
if isinstance(q, unicode):
|
||||||
@ -78,13 +79,13 @@ class Ozon(Source):
|
|||||||
return search_url
|
return search_url
|
||||||
# }}}
|
# }}}
|
||||||
|
|
||||||
def identify(self, log, result_queue, abort, title=None, authors=None, # {{{
|
def identify(self, log, result_queue, abort, title=None, authors=None,
|
||||||
identifiers={}, timeout=30):
|
identifiers={}, timeout=30): # {{{
|
||||||
if not self.is_configured():
|
if not self.is_configured():
|
||||||
return
|
return
|
||||||
query = self.create_query(log, title=title, authors=authors, identifiers=identifiers)
|
query = self.create_query(log, title=title, authors=authors, identifiers=identifiers)
|
||||||
if not query:
|
if not query:
|
||||||
err = 'Insufficient metadata to construct query'
|
err = u'Insufficient metadata to construct query'
|
||||||
log.error(err)
|
log.error(err)
|
||||||
return err
|
return err
|
||||||
|
|
||||||
@ -109,7 +110,7 @@ class Ozon(Source):
|
|||||||
# }}}
|
# }}}
|
||||||
|
|
||||||
def get_metadata(self, log, entries, title, authors, identifiers): # {{{
|
def get_metadata(self, log, entries, title, authors, identifiers): # {{{
|
||||||
# some book titles have extra charactes like this
|
# some book titles have extra characters like this
|
||||||
# TODO: make a tweak
|
# TODO: make a tweak
|
||||||
reRemoveFromTitle = None
|
reRemoveFromTitle = None
|
||||||
#reRemoveFromTitle = re.compile(r'[?!:.,;+-/&%"\'=]')
|
#reRemoveFromTitle = re.compile(r'[?!:.,;+-/&%"\'=]')
|
||||||
@ -285,12 +286,12 @@ class Ozon(Source):
|
|||||||
url = self.get_book_url(metadata.get_identifiers())[2]
|
url = self.get_book_url(metadata.get_identifiers())[2]
|
||||||
|
|
||||||
raw = self.browser.open_novisit(url, timeout=timeout).read()
|
raw = self.browser.open_novisit(url, timeout=timeout).read()
|
||||||
doc = html.fromstring(raw)
|
doc = html.fromstring(xml_to_unicode(raw, verbose=True)[0])
|
||||||
|
|
||||||
xpt_prod_det_at = u'string(//div[contains(@class, "product-detail")]//*[contains(normalize-space(text()), "%s")]/a[1]/@title)'
|
xpt_prod_det_at = u'string(//div[contains(@class, "product-detail")]//*[contains(normalize-space(text()), "%s")]/a[1]/@title)'
|
||||||
xpt_prod_det_tx = u'substring-after(//div[contains(@class, "product-detail")]//text()[contains(., "%s")], ":")'
|
xpt_prod_det_tx = u'substring-after(//div[contains(@class, "product-detail")]//text()[contains(., "%s")], ":")'
|
||||||
|
|
||||||
# series
|
# series Серия/Серии
|
||||||
xpt = xpt_prod_det_at % u'Сери'
|
xpt = xpt_prod_det_at % u'Сери'
|
||||||
# % u'Серия:'
|
# % u'Серия:'
|
||||||
series = doc.xpath(xpt)
|
series = doc.xpath(xpt)
|
||||||
@ -300,7 +301,7 @@ class Ozon(Source):
|
|||||||
xpt = u'normalize-space(substring-after(//meta[@name="description"]/@content, "ISBN"))'
|
xpt = u'normalize-space(substring-after(//meta[@name="description"]/@content, "ISBN"))'
|
||||||
isbn_str = doc.xpath(xpt)
|
isbn_str = doc.xpath(xpt)
|
||||||
if isbn_str:
|
if isbn_str:
|
||||||
all_isbns = [check_isbn(isbn) for isbn in self.isbnRegex.findall(isbn_str) if check_isbn(isbn)]
|
all_isbns = [check_isbn(isbn) for isbn in self.isbnRegex.findall(isbn_str) if _verifyISBNIntegrity(log, isbn)]
|
||||||
if all_isbns:
|
if all_isbns:
|
||||||
metadata.all_isbns = all_isbns
|
metadata.all_isbns = all_isbns
|
||||||
metadata.isbn = all_isbns[0]
|
metadata.isbn = all_isbns[0]
|
||||||
@ -333,10 +334,10 @@ class Ozon(Source):
|
|||||||
xpt = u'//table[@id="detail_description"]//tr/td'
|
xpt = u'//table[@id="detail_description"]//tr/td'
|
||||||
comment_elem = doc.xpath(xpt)
|
comment_elem = doc.xpath(xpt)
|
||||||
if comment_elem:
|
if comment_elem:
|
||||||
comments = unicode(etree.tostring(comment_elem[0]))
|
comments = unicode(etree.tostring(comment_elem[0], encoding=unicode))
|
||||||
if comments:
|
if comments:
|
||||||
# cleanup root tag, TODO: remove tags like object/embedded
|
# cleanup root tag, TODO: remove tags like object/embedded
|
||||||
comments = re.sub(r'\A.*?<td.*?>|</td>.*\Z', u'', comments.strip(), re.MULTILINE).strip()
|
comments = re.sub(ur'\A.*?<td.*?>|</td>.*\Z', u'', comments.strip(), re.MULTILINE).strip()
|
||||||
if comments and (not metadata.comments or len(comments) > len(metadata.comments)):
|
if comments and (not metadata.comments or len(comments) > len(metadata.comments)):
|
||||||
metadata.comments = comments
|
metadata.comments = comments
|
||||||
else:
|
else:
|
||||||
@ -345,8 +346,16 @@ class Ozon(Source):
|
|||||||
log.debug('No book description found in HTML')
|
log.debug('No book description found in HTML')
|
||||||
# }}}
|
# }}}
|
||||||
|
|
||||||
def _quoteString(str): # {{{
|
def _quoteString(strToQuote): # {{{
|
||||||
return '"' + str + '"' if str and str.find(' ') != -1 else str
|
return '"' + strToQuote + '"' if strToQuote and strToQuote.find(' ') != -1 else strToQuote
|
||||||
|
# }}}
|
||||||
|
|
||||||
|
def _verifyISBNIntegrity(log, isbn): # {{{
|
||||||
|
# Online ISBN-Check http://www.isbn-check.de/
|
||||||
|
res = check_isbn(isbn)
|
||||||
|
if not res:
|
||||||
|
log.error(u'ISBN integrity check failed for "%s"'%isbn)
|
||||||
|
return res is not None
|
||||||
# }}}
|
# }}}
|
||||||
|
|
||||||
# TODO: make customizable
|
# TODO: make customizable
|
||||||
@ -438,7 +447,7 @@ def _normalizeAuthorNameWithInitials(name): # {{{
|
|||||||
return res
|
return res
|
||||||
# }}}
|
# }}}
|
||||||
|
|
||||||
def toPubdate(log, yearAsString):
|
def toPubdate(log, yearAsString): # {{{
|
||||||
res = None
|
res = None
|
||||||
if yearAsString:
|
if yearAsString:
|
||||||
try:
|
try:
|
||||||
@ -448,7 +457,11 @@ def toPubdate(log, yearAsString):
|
|||||||
except:
|
except:
|
||||||
log.error('cannot parse to date %s'%yearAsString)
|
log.error('cannot parse to date %s'%yearAsString)
|
||||||
return res
|
return res
|
||||||
|
# }}}
|
||||||
|
|
||||||
|
def _listToUnicodePrintStr(lst): # {{{
|
||||||
|
return u'[' + u', '.join(unicode(x) for x in lst) + u']'
|
||||||
|
# }}}
|
||||||
|
|
||||||
if __name__ == '__main__': # tests {{{
|
if __name__ == '__main__': # tests {{{
|
||||||
# To run these tests use: calibre-debug -e src/calibre/ebooks/metadata/sources/ozon.py
|
# To run these tests use: calibre-debug -e src/calibre/ebooks/metadata/sources/ozon.py
|
||||||
|
@ -77,7 +77,8 @@ class OzonRUStore(BasicStoreConfig, StorePlugin):
|
|||||||
|
|
||||||
result = False
|
result = False
|
||||||
with closing(br.open(url, timeout=timeout)) as f:
|
with closing(br.open(url, timeout=timeout)) as f:
|
||||||
doc = html.fromstring(f.read())
|
raw = xml_to_unicode(f.read(), verbose=True)[0]
|
||||||
|
doc = html.fromstring(raw)
|
||||||
|
|
||||||
# example where we are going to find formats
|
# example where we are going to find formats
|
||||||
# <div class="l">
|
# <div class="l">
|
||||||
@ -88,7 +89,7 @@ class OzonRUStore(BasicStoreConfig, StorePlugin):
|
|||||||
# <div class="l">
|
# <div class="l">
|
||||||
# <p>.epub, .fb2.zip, .pdf</p>
|
# <p>.epub, .fb2.zip, .pdf</p>
|
||||||
# </div>
|
# </div>
|
||||||
xpt = u'normalize-space(//div[contains(@class, "product-detail")]//*[contains(normalize-space(text()), "Доступ")]/ancestor-or-self::div[1]/following-sibling::div[1]/*[1])'
|
xpt = u'normalize-space(//div[contains(@id, "saleBlock")]//*[contains(normalize-space(text()), "Доступ")]/ancestor-or-self::div[1]/following-sibling::div[1]/*[1])'
|
||||||
formats = doc.xpath(xpt)
|
formats = doc.xpath(xpt)
|
||||||
if formats:
|
if formats:
|
||||||
result = True
|
result = True
|
||||||
|
@ -12539,7 +12539,7 @@ msgstr "За&грузить метаданные"
|
|||||||
|
|
||||||
#: /home/kovid/work/calibre/src/calibre/gui2/metadata/single.py:226
|
#: /home/kovid/work/calibre/src/calibre/gui2/metadata/single.py:226
|
||||||
msgid "Configure download metadata"
|
msgid "Configure download metadata"
|
||||||
msgstr ""
|
msgstr "Настроить загрузку метаданных"
|
||||||
|
|
||||||
#: /home/kovid/work/calibre/src/calibre/gui2/metadata/single.py:230
|
#: /home/kovid/work/calibre/src/calibre/gui2/metadata/single.py:230
|
||||||
msgid "Change how calibre downloads metadata"
|
msgid "Change how calibre downloads metadata"
|
||||||
@ -12595,7 +12595,7 @@ msgstr "&Пользовательские метаданные"
|
|||||||
|
|
||||||
#: /home/kovid/work/calibre/src/calibre/gui2/metadata/single.py:788
|
#: /home/kovid/work/calibre/src/calibre/gui2/metadata/single.py:788
|
||||||
msgid "&Comments"
|
msgid "&Comments"
|
||||||
msgstr "Комментарии"
|
msgstr "&Комментарии"
|
||||||
|
|
||||||
#: /home/kovid/work/calibre/src/calibre/gui2/metadata/single.py:854
|
#: /home/kovid/work/calibre/src/calibre/gui2/metadata/single.py:854
|
||||||
msgid "Basic metadata"
|
msgid "Basic metadata"
|
||||||
@ -12603,11 +12603,11 @@ msgstr "Основные метаданные"
|
|||||||
|
|
||||||
#: /home/kovid/work/calibre/src/calibre/gui2/metadata/single_download.py:133
|
#: /home/kovid/work/calibre/src/calibre/gui2/metadata/single_download.py:133
|
||||||
msgid "Has cover"
|
msgid "Has cover"
|
||||||
msgstr "Есть обложка"
|
msgstr "Обложка"
|
||||||
|
|
||||||
#: /home/kovid/work/calibre/src/calibre/gui2/metadata/single_download.py:133
|
#: /home/kovid/work/calibre/src/calibre/gui2/metadata/single_download.py:133
|
||||||
msgid "Has summary"
|
msgid "Has summary"
|
||||||
msgstr ""
|
msgstr "Аннотация"
|
||||||
|
|
||||||
#: /home/kovid/work/calibre/src/calibre/gui2/metadata/single_download.py:190
|
#: /home/kovid/work/calibre/src/calibre/gui2/metadata/single_download.py:190
|
||||||
msgid ""
|
msgid ""
|
||||||
@ -12619,7 +12619,7 @@ msgstr ""
|
|||||||
|
|
||||||
#: /home/kovid/work/calibre/src/calibre/gui2/metadata/single_download.py:268
|
#: /home/kovid/work/calibre/src/calibre/gui2/metadata/single_download.py:268
|
||||||
msgid "See at"
|
msgid "See at"
|
||||||
msgstr ""
|
msgstr "Посмотреть на"
|
||||||
|
|
||||||
#: /home/kovid/work/calibre/src/calibre/gui2/metadata/single_download.py:403
|
#: /home/kovid/work/calibre/src/calibre/gui2/metadata/single_download.py:403
|
||||||
msgid "calibre is downloading metadata from: "
|
msgid "calibre is downloading metadata from: "
|
||||||
|
Loading…
x
Reference in New Issue
Block a user