Deal with Amazon changing their websites to use (randomly?) two different HTML constructs around the format text

This commit is contained in:
Charles Haley 2015-03-08 12:44:15 +01:00
parent 0ca49a65f1
commit 993b9f6bcb
5 changed files with 46 additions and 20 deletions

View File

@ -1,7 +1,7 @@
# -*- coding: utf-8 -*- # -*- coding: utf-8 -*-
from __future__ import (unicode_literals, division, absolute_import, print_function) from __future__ import (unicode_literals, division, absolute_import, print_function)
store_version = 6 # Needed for dynamic plugin loading store_version = 7 # Needed for dynamic plugin loading
__license__ = 'GPL 3' __license__ = 'GPL 3'
__copyright__ = '2011, John Schember <john@nachtimwald.com>' __copyright__ = '2011, John Schember <john@nachtimwald.com>'
@ -59,6 +59,7 @@ class AmazonDEKindleStore(StorePlugin):
allText = f.read() allText = f.read()
doc = html.fromstring(allText)#.decode('latin-1', 'replace')) doc = html.fromstring(allText)#.decode('latin-1', 'replace'))
format_xpath2 = ''
if doc.xpath('//div[@id = "atfResults" and contains(@class, "grid")]'): if doc.xpath('//div[@id = "atfResults" and contains(@class, "grid")]'):
#print('grid form') #print('grid form')
data_xpath = '//div[contains(@class, "prod")]' data_xpath = '//div[contains(@class, "prod")]'
@ -89,8 +90,8 @@ class AmazonDEKindleStore(StorePlugin):
elif doc.xpath('//div[@id = "atfResults" and contains(@class, "list")]'): elif doc.xpath('//div[@id = "atfResults" and contains(@class, "list")]'):
#print('list form') #print('list form')
data_xpath = '//li[@class="s-result-item"]' data_xpath = '//li[@class="s-result-item"]'
format_xpath = ( format_xpath = './/a[contains(@class, "a-size-small")]/text()'
'.//h3[contains(@class, "s-inline")]/text()') format_xpath2 = './/h3[contains(@class, "s-inline")]/text()'
asin_xpath = '@data-asin' asin_xpath = '@data-asin'
cover_xpath = './/img[contains(@class, "cfMarker")]/@src' cover_xpath = './/img[contains(@class, "cfMarker")]/@src'
title_xpath = './/h2[contains(@class, "s-access-title")]/text()' title_xpath = './/h2[contains(@class, "s-access-title")]/text()'
@ -115,7 +116,11 @@ class AmazonDEKindleStore(StorePlugin):
# if it isn't. # if it isn't.
format_ = ''.join(data.xpath(format_xpath)) format_ = ''.join(data.xpath(format_xpath))
if 'kindle' not in format_.lower(): if 'kindle' not in format_.lower():
continue if format_xpath2:
format_ = ''.join(data.xpath(format_xpath2))
if 'kindle' not in format_.lower():
# print(etree.tostring(data, pretty_print=True))
continue
# We must have an asin otherwise we can't easily reference the # We must have an asin otherwise we can't easily reference the
# book later. # book later.

View File

@ -1,7 +1,7 @@
# -*- coding: utf-8 -*- # -*- coding: utf-8 -*-
from __future__ import (unicode_literals, division, absolute_import, print_function) from __future__ import (unicode_literals, division, absolute_import, print_function)
store_version = 6 # Needed for dynamic plugin loading store_version = 7 # Needed for dynamic plugin loading
__license__ = 'GPL 3' __license__ = 'GPL 3'
__copyright__ = '2011, John Schember <john@nachtimwald.com>' __copyright__ = '2011, John Schember <john@nachtimwald.com>'
@ -58,6 +58,7 @@ class AmazonESKindleStore(StorePlugin):
allText = f.read() allText = f.read()
doc = html.fromstring(allText)#.decode('latin-1', 'replace')) doc = html.fromstring(allText)#.decode('latin-1', 'replace'))
format_xpath2 = ''
if doc.xpath('//div[@id = "atfResults" and contains(@class, "grid")]'): if doc.xpath('//div[@id = "atfResults" and contains(@class, "grid")]'):
#print('grid form') #print('grid form')
data_xpath = '//div[contains(@class, "prod")]' data_xpath = '//div[contains(@class, "prod")]'
@ -88,8 +89,8 @@ class AmazonESKindleStore(StorePlugin):
elif doc.xpath('//div[@id = "atfResults" and contains(@class, "list")]'): elif doc.xpath('//div[@id = "atfResults" and contains(@class, "list")]'):
#print('list form') #print('list form')
data_xpath = '//li[@class="s-result-item"]' data_xpath = '//li[@class="s-result-item"]'
format_xpath = ( format_xpath = './/a[contains(@class, "a-size-small")]/text()'
'.//h3[contains(@class, "s-inline")]/text()') format_xpath2 = './/h3[contains(@class, "s-inline")]/text()'
asin_xpath = '@data-asin' asin_xpath = '@data-asin'
cover_xpath = './/img[contains(@class, "cfMarker")]/@src' cover_xpath = './/img[contains(@class, "cfMarker")]/@src'
title_xpath = './/h2[contains(@class, "s-access-title")]/text()' title_xpath = './/h2[contains(@class, "s-access-title")]/text()'
@ -114,7 +115,11 @@ class AmazonESKindleStore(StorePlugin):
# if it isn't. # if it isn't.
format_ = ''.join(data.xpath(format_xpath)) format_ = ''.join(data.xpath(format_xpath))
if 'kindle' not in format_.lower(): if 'kindle' not in format_.lower():
continue if format_xpath2:
format_ = ''.join(data.xpath(format_xpath2))
if 'kindle' not in format_.lower():
# print(etree.tostring(data, pretty_print=True))
continue
# We must have an asin otherwise we can't easily reference the # We must have an asin otherwise we can't easily reference the
# book later. # book later.

View File

@ -1,7 +1,7 @@
# -*- coding: utf-8 -*- # -*- coding: utf-8 -*-
from __future__ import (unicode_literals, division, absolute_import, print_function) from __future__ import (unicode_literals, division, absolute_import, print_function)
store_version = 6 # Needed for dynamic plugin loading store_version = 7 # Needed for dynamic plugin loading
__license__ = 'GPL 3' __license__ = 'GPL 3'
__copyright__ = '2011, John Schember <john@nachtimwald.com>' __copyright__ = '2011, John Schember <john@nachtimwald.com>'
@ -55,6 +55,7 @@ class AmazonFRKindleStore(StorePlugin):
allText = f.read() allText = f.read()
doc = html.fromstring(allText)#.decode('latin-1', 'replace')) doc = html.fromstring(allText)#.decode('latin-1', 'replace'))
format_xpath2 = ''
if doc.xpath('//div[@id = "atfResults" and contains(@class, "grid")]'): if doc.xpath('//div[@id = "atfResults" and contains(@class, "grid")]'):
#print('grid form') #print('grid form')
data_xpath = '//div[contains(@class, "prod")]' data_xpath = '//div[contains(@class, "prod")]'
@ -85,8 +86,8 @@ class AmazonFRKindleStore(StorePlugin):
elif doc.xpath('//div[@id = "atfResults" and contains(@class, "list")]'): elif doc.xpath('//div[@id = "atfResults" and contains(@class, "list")]'):
#print('list form') #print('list form')
data_xpath = '//li[@class="s-result-item"]' data_xpath = '//li[@class="s-result-item"]'
format_xpath = ( format_xpath = './/a[contains(@class, "a-size-small")]/text()'
'.//h3[contains(@class, "s-inline")]/text()') format_xpath2 = './/h3[contains(@class, "s-inline")]/text()'
asin_xpath = '@data-asin' asin_xpath = '@data-asin'
cover_xpath = './/img[contains(@class, "cfMarker")]/@src' cover_xpath = './/img[contains(@class, "cfMarker")]/@src'
title_xpath = './/h2[contains(@class, "s-access-title")]/text()' title_xpath = './/h2[contains(@class, "s-access-title")]/text()'
@ -111,7 +112,11 @@ class AmazonFRKindleStore(StorePlugin):
# if it isn't. # if it isn't.
format_ = ''.join(data.xpath(format_xpath)) format_ = ''.join(data.xpath(format_xpath))
if 'kindle' not in format_.lower(): if 'kindle' not in format_.lower():
continue if format_xpath2:
format_ = ''.join(data.xpath(format_xpath2))
if 'kindle' not in format_.lower():
# print(etree.tostring(data, pretty_print=True))
continue
# We must have an asin otherwise we can't easily reference the # We must have an asin otherwise we can't easily reference the
# book later. # book later.

View File

@ -1,7 +1,7 @@
# -*- coding: utf-8 -*- # -*- coding: utf-8 -*-
from __future__ import (unicode_literals, division, absolute_import, print_function) from __future__ import (unicode_literals, division, absolute_import, print_function)
store_version = 6 # Needed for dynamic plugin loading store_version = 7 # Needed for dynamic plugin loading
__license__ = 'GPL 3' __license__ = 'GPL 3'
__copyright__ = '2011, John Schember <john@nachtimwald.com>' __copyright__ = '2011, John Schember <john@nachtimwald.com>'
@ -58,6 +58,7 @@ class AmazonITKindleStore(StorePlugin):
allText = f.read() allText = f.read()
doc = html.fromstring(allText)#.decode('latin-1', 'replace')) doc = html.fromstring(allText)#.decode('latin-1', 'replace'))
format_xpath2 = ''
if doc.xpath('//div[@id = "atfResults" and contains(@class, "grid")]'): if doc.xpath('//div[@id = "atfResults" and contains(@class, "grid")]'):
#print('grid form') #print('grid form')
data_xpath = '//div[contains(@class, "prod")]' data_xpath = '//div[contains(@class, "prod")]'
@ -88,8 +89,8 @@ class AmazonITKindleStore(StorePlugin):
elif doc.xpath('//div[@id = "atfResults" and contains(@class, "list")]'): elif doc.xpath('//div[@id = "atfResults" and contains(@class, "list")]'):
#print('list form') #print('list form')
data_xpath = '//li[@class="s-result-item"]' data_xpath = '//li[@class="s-result-item"]'
format_xpath = ( format_xpath = './/a[contains(@class, "a-size-small")]/text()'
'.//h3[contains(@class, "s-inline")]/text()') format_xpath2 = './/h3[contains(@class, "s-inline")]/text()'
asin_xpath = '@data-asin' asin_xpath = '@data-asin'
cover_xpath = './/img[contains(@class, "cfMarker")]/@src' cover_xpath = './/img[contains(@class, "cfMarker")]/@src'
title_xpath = './/h2[contains(@class, "s-access-title")]/text()' title_xpath = './/h2[contains(@class, "s-access-title")]/text()'
@ -114,7 +115,11 @@ class AmazonITKindleStore(StorePlugin):
# if it isn't. # if it isn't.
format_ = ''.join(data.xpath(format_xpath)) format_ = ''.join(data.xpath(format_xpath))
if 'kindle' not in format_.lower(): if 'kindle' not in format_.lower():
continue if format_xpath2:
format_ = ''.join(data.xpath(format_xpath2))
if 'kindle' not in format_.lower():
# print(etree.tostring(data, pretty_print=True))
continue
# We must have an asin otherwise we can't easily reference the # We must have an asin otherwise we can't easily reference the
# book later. # book later.

View File

@ -1,7 +1,7 @@
# -*- coding: utf-8 -*- # -*- coding: utf-8 -*-
from __future__ import (unicode_literals, division, absolute_import, print_function) from __future__ import (unicode_literals, division, absolute_import, print_function)
store_version = 6 # Needed for dynamic plugin loading store_version = 7 # Needed for dynamic plugin loading
__license__ = 'GPL 3' __license__ = 'GPL 3'
__copyright__ = '2011, John Schember <john@nachtimwald.com>' __copyright__ = '2011, John Schember <john@nachtimwald.com>'
@ -10,6 +10,7 @@ __docformat__ = 'restructuredtext en'
import re import re
from contextlib import closing from contextlib import closing
from lxml import html from lxml import html
# from lxml import html
from PyQt5.Qt import QUrl from PyQt5.Qt import QUrl
@ -62,6 +63,7 @@ class AmazonUKKindleStore(StorePlugin):
allText = f.read() allText = f.read()
doc = html.fromstring(allText)#.decode('latin-1', 'replace')) doc = html.fromstring(allText)#.decode('latin-1', 'replace'))
format_xpath2 = ''
if doc.xpath('//div[@id = "atfResults" and contains(@class, "grid")]'): if doc.xpath('//div[@id = "atfResults" and contains(@class, "grid")]'):
#print('grid form') #print('grid form')
data_xpath = '//div[contains(@class, "prod")]' data_xpath = '//div[contains(@class, "prod")]'
@ -92,8 +94,8 @@ class AmazonUKKindleStore(StorePlugin):
elif doc.xpath('//div[@id = "atfResults" and contains(@class, "list")]'): elif doc.xpath('//div[@id = "atfResults" and contains(@class, "list")]'):
#print('list form') #print('list form')
data_xpath = '//li[@class="s-result-item"]' data_xpath = '//li[@class="s-result-item"]'
format_xpath = ( format_xpath = './/a[contains(@class, "a-size-small")]/text()'
'.//h3[contains(@class, "s-inline")]/text()') format_xpath2 = './/h3[contains(@class, "s-inline")]/text()'
asin_xpath = '@data-asin' asin_xpath = '@data-asin'
cover_xpath = './/img[contains(@class, "cfMarker")]/@src' cover_xpath = './/img[contains(@class, "cfMarker")]/@src'
title_xpath = './/h2[contains(@class, "s-access-title")]/text()' title_xpath = './/h2[contains(@class, "s-access-title")]/text()'
@ -118,7 +120,11 @@ class AmazonUKKindleStore(StorePlugin):
# if it isn't. # if it isn't.
format_ = ''.join(data.xpath(format_xpath)) format_ = ''.join(data.xpath(format_xpath))
if 'kindle' not in format_.lower(): if 'kindle' not in format_.lower():
continue if format_xpath2:
format_ = ''.join(data.xpath(format_xpath2))
if 'kindle' not in format_.lower():
# print(etree.tostring(data, pretty_print=True))
continue
# We must have an asin otherwise we can't easily reference the # We must have an asin otherwise we can't easily reference the
# book later. # book later.