Update the Amazon metadata download plugin to handle an amazon.com website change that was preventing any metadata from being downloaded.

Fixes a bug in the default html5lib lxml treebuilder that caused it to
fail on pages whose comments contain -- or end with a hyphen.
This commit is contained in:
Kovid Goyal 2015-11-21 13:31:32 +05:30
parent 9a9e5d5d7b
commit c68b9b7d64

View File

@ -309,6 +309,11 @@ class TreeBuilder(_base.TreeBuilder):
if (parent == self.document and if (parent == self.document and
self.document._elementTree.getroot()[-1].tag == comment_type): self.document._elementTree.getroot()[-1].tag == comment_type):
warnings.warn("lxml cannot represent adjacent comments beyond the root elements", DataLossWarning) warnings.warn("lxml cannot represent adjacent comments beyond the root elements", DataLossWarning)
if data['data']:
# lxml cannot handle comment text that contains -- or ends with -
# Should really check if changes happened and issue a data loss
# warning, but that's a fairly big performance hit.
data['data'] = data['data'].replace('--', '\u2010\u2010').rstrip('-')
super(TreeBuilder, self).insertComment(data, parent) super(TreeBuilder, self).insertComment(data, parent)
def insertRoot(self, token): def insertRoot(self, token):