Amazon metadata download plugin: Improved parsing of broken HTML

This commit is contained in:
Kovid Goyal 2010-10-31 12:01:16 -06:00
parent e268beaa90
commit 92fe7d3725

View File

@@ -9,6 +9,7 @@ Fetch metadata using Amazon AWS
import sys, re import sys, re
from lxml import html from lxml import html
from lxml.html import soupparser
from calibre import browser from calibre import browser
from calibre.ebooks.metadata import check_isbn from calibre.ebooks.metadata import check_isbn
@@ -71,7 +72,7 @@ def get_metadata(br, asin, mi):
return False return False
raw = xml_to_unicode(raw, strip_encoding_pats=True, raw = xml_to_unicode(raw, strip_encoding_pats=True,
resolve_entities=True)[0] resolve_entities=True)[0]
root = html.fromstring(raw) root = soupparser.fromstring(raw)
ratings = root.xpath('//form[@id="handleBuy"]/descendant::*[@class="asinReviewsSummary"]') ratings = root.xpath('//form[@id="handleBuy"]/descendant::*[@class="asinReviewsSummary"]')
if ratings: if ratings:
pat = re.compile(r'([0-9.]+) out of (\d+) stars') pat = re.compile(r'([0-9.]+) out of (\d+) stars')