From 9714f722e72b8f166476a867c8c443d85ce19062 Mon Sep 17 00:00:00 2001
From: Kovid Goyal <kovid@kovidgoyal.net>
Date: Wed, 1 May 2019 19:04:12 +0530
Subject: [PATCH] Fix metadata download from Amazon stripping accents from
 comments text in the binary builds

Seems to be caused by a bug in the lxml version in the builds, which
causes it to output unicode chars as entities, which in turn are
stripped by the sanitize_html() function. Fixes #1825905 [amazon scrapper not retrieving accented caracters](https://bugs.launchpad.net/calibre/+bug/1825905)
---
 src/calibre/ebooks/metadata/sources/amazon.py | 5 +++--
 1 file changed, 3 insertions(+), 2 deletions(-)

diff --git a/src/calibre/ebooks/metadata/sources/amazon.py b/src/calibre/ebooks/metadata/sources/amazon.py
index eca960a97a..94479d99dd 100644
--- a/src/calibre/ebooks/metadata/sources/amazon.py
+++ b/src/calibre/ebooks/metadata/sources/amazon.py
@@ -17,7 +17,7 @@ try:
 except ImportError:
     from urlparse import urlparse
 
-from calibre import as_unicode, browser, random_user_agent
+from calibre import as_unicode, browser, random_user_agent, xml_replace_entities
 from calibre.ebooks.metadata import check_isbn
 from calibre.ebooks.metadata.book.base import Metadata
 from calibre.ebooks.metadata.sources.base import Option, Source, fixauthors, fixcase
@@ -577,6 +577,7 @@ class Worker(Thread):  # Get details {{{
             del a.attrib['href']
             a.tag = 'span'
         desc = self.tostring(desc, method='html', encoding='unicode').strip()
+        desc = xml_replace_entities(desc, 'utf-8')
 
         # Encoding bug in Amazon data U+fffd (replacement char)
         # in some examples it is present in place of '
@@ -862,7 +863,7 @@ class Worker(Thread):  # Get details {{{
 class Amazon(Source):
 
     name = 'Amazon.com'
-    version = (1, 2, 8)
+    version = (1, 2, 9)
     minimum_calibre_version = (2, 82, 0)
     description = _('Downloads metadata and covers from Amazon')