From c68b9b7d64ab08ae78bf461b0e4683ce54222aa2 Mon Sep 17 00:00:00 2001 From: Kovid Goyal Date: Sat, 21 Nov 2015 13:31:32 +0530 Subject: [PATCH] Update Amazon metadata download plugin to handle amazon.com website change that was preventing any metadata from being downloaded. Fixes a bug in the default html5lib lxml treebuilder that caused it to fail on pages that have comments with -- or trailing hyphens. --- src/html5lib/treebuilders/etree_lxml.py | 5 +++++ 1 file changed, 5 insertions(+) diff --git a/src/html5lib/treebuilders/etree_lxml.py b/src/html5lib/treebuilders/etree_lxml.py index 35d08efaa6..842f3ddbea 100644 --- a/src/html5lib/treebuilders/etree_lxml.py +++ b/src/html5lib/treebuilders/etree_lxml.py @@ -309,6 +309,11 @@ class TreeBuilder(_base.TreeBuilder): if (parent == self.document and self.document._elementTree.getroot()[-1].tag == comment_type): warnings.warn("lxml cannot represent adjacent comments beyond the root elements", DataLossWarning) + if data['data']: + # lxml cannot handle comment text that contains -- or ends with - + # Should really check if changes happened and issue a data loss + # warning, but that's a fairly big performance hit. + data['data'] = data['data'].replace('--', '\u2010\u2010').rstrip('-') super(TreeBuilder, self).insertComment(data, parent) def insertRoot(self, token):