From 9714f722e72b8f166476a867c8c443d85ce19062 Mon Sep 17 00:00:00 2001 From: Kovid Goyal Date: Wed, 1 May 2019 19:04:12 +0530 Subject: [PATCH] Fix metadata download from Amazon stripping accents from comments text in the binary builds Seems to be caused by a bug in the lxml version in the builds, which causes it to output unicode chars as entities, which in turn were getting stripped by the sanitize_html() function. Fixes #1825905 [amazon scrapper not retrieving accented caracters](https://bugs.launchpad.net/calibre/+bug/1825905) --- src/calibre/ebooks/metadata/sources/amazon.py | 5 +++-- 1 file changed, 3 insertions(+), 2 deletions(-) diff --git a/src/calibre/ebooks/metadata/sources/amazon.py b/src/calibre/ebooks/metadata/sources/amazon.py index eca960a97a..94479d99dd 100644 --- a/src/calibre/ebooks/metadata/sources/amazon.py +++ b/src/calibre/ebooks/metadata/sources/amazon.py @@ -17,7 +17,7 @@ try: except ImportError: from urlparse import urlparse -from calibre import as_unicode, browser, random_user_agent +from calibre import as_unicode, browser, random_user_agent, xml_replace_entities from calibre.ebooks.metadata import check_isbn from calibre.ebooks.metadata.book.base import Metadata from calibre.ebooks.metadata.sources.base import Option, Source, fixauthors, fixcase @@ -577,6 +577,7 @@ class Worker(Thread): # Get details {{{ del a.attrib['href'] a.tag = 'span' desc = self.tostring(desc, method='html', encoding='unicode').strip() + desc = xml_replace_entities(desc, 'utf-8') # Encoding bug in Amazon data U+fffd (replacement char) # in some examples it is present in place of ' @@ -862,7 +863,7 @@ class Worker(Thread): # Get details {{{ class Amazon(Source): name = 'Amazon.com' - version = (1, 2, 8) + version = (1, 2, 9) minimum_calibre_version = (2, 82, 0) description = _('Downloads metadata and covers from Amazon')