Fix encoding of comments incorrectly detected when downloading metadata from ozon.ru

This commit is contained in:
Kovid Goyal 2011-11-19 20:37:33 +05:30
commit 17f9da26c4
4 changed files with 39 additions and 24 deletions

View File

@ -710,7 +710,8 @@ class Metadata(object):
fmt('Title sort', self.title_sort) fmt('Title sort', self.title_sort)
if self.authors: if self.authors:
fmt('Author(s)', authors_to_string(self.authors) + \ fmt('Author(s)', authors_to_string(self.authors) + \
((' [' + self.author_sort + ']') if self.author_sort else '')) ((' [' + self.author_sort + ']')
if self.author_sort and self.author_sort != _('Unknown') else ''))
if self.publisher: if self.publisher:
fmt('Publisher', self.publisher) fmt('Publisher', self.publisher)
if getattr(self, 'book_producer', False): if getattr(self, 'book_producer', False):

View File

@ -11,7 +11,7 @@ import datetime
from urllib import quote_plus from urllib import quote_plus
from Queue import Queue, Empty from Queue import Queue, Empty
from lxml import etree, html from lxml import etree, html
from calibre import as_unicode from calibre import prints, as_unicode
from calibre.ebooks.chardet import xml_to_unicode from calibre.ebooks.chardet import xml_to_unicode
@ -55,6 +55,7 @@ class Ozon(Source):
# div_book -> search only books, ebooks and audio books # div_book -> search only books, ebooks and audio books
search_url = self.ozon_url + '/webservice/webservice.asmx/SearchWebService?searchContext=div_book&searchText=' search_url = self.ozon_url + '/webservice/webservice.asmx/SearchWebService?searchContext=div_book&searchText='
# for ozon.ru search we have to format ISBN with '-'
isbn = _format_isbn(log, identifiers.get('isbn', None)) isbn = _format_isbn(log, identifiers.get('isbn', None))
# TODO: format isbn! # TODO: format isbn!
qItems = set([isbn, title]) qItems = set([isbn, title])
@ -64,7 +65,7 @@ class Ozon(Source):
qItems.discard('') qItems.discard('')
qItems = map(_quoteString, qItems) qItems = map(_quoteString, qItems)
q = ' '.join(qItems).strip() q = u' '.join(qItems).strip()
log.info(u'search string: ' + q) log.info(u'search string: ' + q)
if isinstance(q, unicode): if isinstance(q, unicode):
@ -78,13 +79,13 @@ class Ozon(Source):
return search_url return search_url
# }}} # }}}
def identify(self, log, result_queue, abort, title=None, authors=None, # {{{ def identify(self, log, result_queue, abort, title=None, authors=None,
identifiers={}, timeout=30): identifiers={}, timeout=30): # {{{
if not self.is_configured(): if not self.is_configured():
return return
query = self.create_query(log, title=title, authors=authors, identifiers=identifiers) query = self.create_query(log, title=title, authors=authors, identifiers=identifiers)
if not query: if not query:
err = 'Insufficient metadata to construct query' err = u'Insufficient metadata to construct query'
log.error(err) log.error(err)
return err return err
@ -109,7 +110,7 @@ class Ozon(Source):
# }}} # }}}
def get_metadata(self, log, entries, title, authors, identifiers): # {{{ def get_metadata(self, log, entries, title, authors, identifiers): # {{{
# some book titles have extra charactes like this # some book titles have extra characters like this
# TODO: make a twick # TODO: make a twick
reRemoveFromTitle = None reRemoveFromTitle = None
#reRemoveFromTitle = re.compile(r'[?!:.,;+-/&%"\'=]') #reRemoveFromTitle = re.compile(r'[?!:.,;+-/&%"\'=]')
@ -285,12 +286,12 @@ class Ozon(Source):
url = self.get_book_url(metadata.get_identifiers())[2] url = self.get_book_url(metadata.get_identifiers())[2]
raw = self.browser.open_novisit(url, timeout=timeout).read() raw = self.browser.open_novisit(url, timeout=timeout).read()
doc = html.fromstring(raw) doc = html.fromstring(xml_to_unicode(raw, verbose=True)[0])
xpt_prod_det_at = u'string(//div[contains(@class, "product-detail")]//*[contains(normalize-space(text()), "%s")]/a[1]/@title)' xpt_prod_det_at = u'string(//div[contains(@class, "product-detail")]//*[contains(normalize-space(text()), "%s")]/a[1]/@title)'
xpt_prod_det_tx = u'substring-after(//div[contains(@class, "product-detail")]//text()[contains(., "%s")], ":")' xpt_prod_det_tx = u'substring-after(//div[contains(@class, "product-detail")]//text()[contains(., "%s")], ":")'
# series # series Серия/Серии
xpt = xpt_prod_det_at % u'Сери' xpt = xpt_prod_det_at % u'Сери'
# % u'Серия:' # % u'Серия:'
series = doc.xpath(xpt) series = doc.xpath(xpt)
@ -300,7 +301,7 @@ class Ozon(Source):
xpt = u'normalize-space(substring-after(//meta[@name="description"]/@content, "ISBN"))' xpt = u'normalize-space(substring-after(//meta[@name="description"]/@content, "ISBN"))'
isbn_str = doc.xpath(xpt) isbn_str = doc.xpath(xpt)
if isbn_str: if isbn_str:
all_isbns = [check_isbn(isbn) for isbn in self.isbnRegex.findall(isbn_str) if check_isbn(isbn)] all_isbns = [check_isbn(isbn) for isbn in self.isbnRegex.findall(isbn_str) if _verifyISBNIntegrity(log, isbn)]
if all_isbns: if all_isbns:
metadata.all_isbns = all_isbns metadata.all_isbns = all_isbns
metadata.isbn = all_isbns[0] metadata.isbn = all_isbns[0]
@ -333,10 +334,10 @@ class Ozon(Source):
xpt = u'//table[@id="detail_description"]//tr/td' xpt = u'//table[@id="detail_description"]//tr/td'
comment_elem = doc.xpath(xpt) comment_elem = doc.xpath(xpt)
if comment_elem: if comment_elem:
comments = unicode(etree.tostring(comment_elem[0])) comments = unicode(etree.tostring(comment_elem[0], encoding=unicode))
if comments: if comments:
# cleanup root tag, TODO: remove tags like object/embeded # cleanup root tag, TODO: remove tags like object/embeded
comments = re.sub(r'\A.*?<td.*?>|</td>.*\Z', u'', comments.strip(), re.MULTILINE).strip() comments = re.sub(ur'\A.*?<td.*?>|</td>.*\Z', u'', comments.strip(), re.MULTILINE).strip()
if comments and (not metadata.comments or len(comments) > len(metadata.comments)): if comments and (not metadata.comments or len(comments) > len(metadata.comments)):
metadata.comments = comments metadata.comments = comments
else: else:
@ -345,8 +346,16 @@ class Ozon(Source):
log.debug('No book description found in HTML') log.debug('No book description found in HTML')
# }}} # }}}
def _quoteString(str): # {{{ def _quoteString(strToQuote): # {{{
return '"' + str + '"' if str and str.find(' ') != -1 else str return '"' + strToQuote + '"' if strToQuote and strToQuote.find(' ') != -1 else strToQuote
# }}}
def _verifyISBNIntegrity(log, isbn): # {{{
    '''
    Report whether check_isbn() produced a result for ``isbn``.

    Returns True when check_isbn() gives back anything other than None.
    An error is written to ``log`` whenever that result is falsy.

    Suspicious values can be cross-checked by hand with the online
    ISBN checker at http://www.isbn-check.de/
    '''
    checked = check_isbn(isbn)
    if checked:
        # Truthy result: nothing to report.
        return True
    log.error(u'ISBN integrity check failed for "%s"'%isbn)
    # Mirror the original contract: only a None result counts as failure.
    return checked is not None
# }}} # }}}
# TODO: make customizable # TODO: make customizable
@ -438,7 +447,7 @@ def _normalizeAuthorNameWithInitials(name): # {{{
return res return res
# }}} # }}}
def toPubdate(log, yearAsString): def toPubdate(log, yearAsString): # {{{
res = None res = None
if yearAsString: if yearAsString:
try: try:
@ -448,7 +457,11 @@ def toPubdate(log, yearAsString):
except: except:
log.error('cannot parse to date %s'%yearAsString) log.error('cannot parse to date %s'%yearAsString)
return res return res
# }}}
def _listToUnicodePrintStr(lst): # {{{
return u'[' + u', '.join(unicode(x) for x in lst) + u']'
# }}}
if __name__ == '__main__': # tests {{{ if __name__ == '__main__': # tests {{{
# To run these test use: calibre-debug -e src/calibre/ebooks/metadata/sources/ozon.py # To run these test use: calibre-debug -e src/calibre/ebooks/metadata/sources/ozon.py

View File

@ -77,7 +77,8 @@ class OzonRUStore(BasicStoreConfig, StorePlugin):
result = False result = False
with closing(br.open(url, timeout=timeout)) as f: with closing(br.open(url, timeout=timeout)) as f:
doc = html.fromstring(f.read()) raw = xml_to_unicode(f.read(), verbose=True)[0]
doc = html.fromstring(raw)
# example where we are going to find formats # example where we are going to find formats
# <div class="l"> # <div class="l">
@ -88,7 +89,7 @@ class OzonRUStore(BasicStoreConfig, StorePlugin):
# <div class="l"> # <div class="l">
# <p>.epub, .fb2.zip, .pdf</p> # <p>.epub, .fb2.zip, .pdf</p>
# </div> # </div>
xpt = u'normalize-space(//div[contains(@class, "product-detail")]//*[contains(normalize-space(text()), "Доступ")]/ancestor-or-self::div[1]/following-sibling::div[1]/*[1])' xpt = u'normalize-space(//div[contains(@id, "saleBlock")]//*[contains(normalize-space(text()), "Доступ")]/ancestor-or-self::div[1]/following-sibling::div[1]/*[1])'
formats = doc.xpath(xpt) formats = doc.xpath(xpt)
if formats: if formats:
result = True result = True

View File

@ -12539,7 +12539,7 @@ msgstr "За&грузить метаданные"
#: /home/kovid/work/calibre/src/calibre/gui2/metadata/single.py:226 #: /home/kovid/work/calibre/src/calibre/gui2/metadata/single.py:226
msgid "Configure download metadata" msgid "Configure download metadata"
msgstr "" msgstr "Настроить загрузку метаданных"
#: /home/kovid/work/calibre/src/calibre/gui2/metadata/single.py:230 #: /home/kovid/work/calibre/src/calibre/gui2/metadata/single.py:230
msgid "Change how calibre downloads metadata" msgid "Change how calibre downloads metadata"
@ -12595,7 +12595,7 @@ msgstr "&Пользовательские метаданные"
#: /home/kovid/work/calibre/src/calibre/gui2/metadata/single.py:788 #: /home/kovid/work/calibre/src/calibre/gui2/metadata/single.py:788
msgid "&Comments" msgid "&Comments"
msgstr "Комментарии" msgstr "&Комментарии"
#: /home/kovid/work/calibre/src/calibre/gui2/metadata/single.py:854 #: /home/kovid/work/calibre/src/calibre/gui2/metadata/single.py:854
msgid "Basic metadata" msgid "Basic metadata"
@ -12603,11 +12603,11 @@ msgstr "Основные метаданные"
#: /home/kovid/work/calibre/src/calibre/gui2/metadata/single_download.py:133 #: /home/kovid/work/calibre/src/calibre/gui2/metadata/single_download.py:133
msgid "Has cover" msgid "Has cover"
msgstr "Есть обложка" msgstr "Обложка"
#: /home/kovid/work/calibre/src/calibre/gui2/metadata/single_download.py:133 #: /home/kovid/work/calibre/src/calibre/gui2/metadata/single_download.py:133
msgid "Has summary" msgid "Has summary"
msgstr "" msgstr "Аннотация"
#: /home/kovid/work/calibre/src/calibre/gui2/metadata/single_download.py:190 #: /home/kovid/work/calibre/src/calibre/gui2/metadata/single_download.py:190
msgid "" msgid ""
@ -12619,7 +12619,7 @@ msgstr ""
#: /home/kovid/work/calibre/src/calibre/gui2/metadata/single_download.py:268 #: /home/kovid/work/calibre/src/calibre/gui2/metadata/single_download.py:268
msgid "See at" msgid "See at"
msgstr "" msgstr "Посмотреть на"
#: /home/kovid/work/calibre/src/calibre/gui2/metadata/single_download.py:403 #: /home/kovid/work/calibre/src/calibre/gui2/metadata/single_download.py:403
msgid "calibre is downloading metadata from: " msgid "calibre is downloading metadata from: "