Fix metadata download from Amazon stripping accents from comments text in the binary builds

This seems to be caused by a bug in the lxml version in the builds, which
causes it to output unicode chars as entities; those entities were in turn
getting stripped by the sanitize_html() function. Fixes #1825905 [amazon scrapper not retrieving accented caracters](https://bugs.launchpad.net/calibre/+bug/1825905)
This commit is contained in:
Kovid Goyal 2019-05-01 19:04:12 +05:30
parent ad9627f14a
commit 9714f722e7
No known key found for this signature in database
GPG Key ID: 06BC317B515ACE7C

View File

@ -17,7 +17,7 @@ try:
except ImportError: except ImportError:
from urlparse import urlparse from urlparse import urlparse
from calibre import as_unicode, browser, random_user_agent from calibre import as_unicode, browser, random_user_agent, xml_replace_entities
from calibre.ebooks.metadata import check_isbn from calibre.ebooks.metadata import check_isbn
from calibre.ebooks.metadata.book.base import Metadata from calibre.ebooks.metadata.book.base import Metadata
from calibre.ebooks.metadata.sources.base import Option, Source, fixauthors, fixcase from calibre.ebooks.metadata.sources.base import Option, Source, fixauthors, fixcase
@ -577,6 +577,7 @@ class Worker(Thread): # Get details {{{
del a.attrib['href'] del a.attrib['href']
a.tag = 'span' a.tag = 'span'
desc = self.tostring(desc, method='html', encoding='unicode').strip() desc = self.tostring(desc, method='html', encoding='unicode').strip()
desc = xml_replace_entities(desc, 'utf-8')
# Encoding bug in Amazon data U+fffd (replacement char) # Encoding bug in Amazon data U+fffd (replacement char)
# in some examples it is present in place of ' # in some examples it is present in place of '
@ -862,7 +863,7 @@ class Worker(Thread): # Get details {{{
class Amazon(Source): class Amazon(Source):
name = 'Amazon.com' name = 'Amazon.com'
version = (1, 2, 8) version = (1, 2, 9)
minimum_calibre_version = (2, 82, 0) minimum_calibre_version = (2, 82, 0)
description = _('Downloads metadata and covers from Amazon') description = _('Downloads metadata and covers from Amazon')