mirror of
https://github.com/kovidgoyal/calibre.git
synced 2025-07-09 03:04:10 -04:00
Update Granta
This commit is contained in:
parent
a57ea59adb
commit
a4b6b79829
@ -17,6 +17,12 @@ force_issue_download = None
|
|||||||
# Adapted from https://gist.github.com/FlyingTopHat/7cfdd5434ec704916174
|
# Adapted from https://gist.github.com/FlyingTopHat/7cfdd5434ec704916174
|
||||||
|
|
||||||
|
|
||||||
|
def classes(classes):
    """Build ``find``/``findAll`` keyword arguments matching any of several CSS classes.

    :param classes: Whitespace-separated class names, for example
        ``'article-header article-content'``.
    :return: A dict of the form ``{'attrs': {'class': matcher}}``. ``matcher``
        takes a ``class`` attribute string and returns True when it shares at
        least one class name with ``classes``.
    """
    # split() with no argument treats any run of spaces, tabs or newlines as
    # one separator, so a wrapped or double-spaced spec never produces
    # entries such as '' or 'a\tb' that could not match any class.
    wanted = frozenset(classes.split())

    def matches(value):
        # Guard against a falsy value (e.g. None) before splitting it.
        return bool(value) and not wanted.isdisjoint(value.split())

    return {'attrs': {'class': matches}}
|
||||||
|
|
||||||
|
|
||||||
def plus_with_unknown_component(first_comp, second_comp, result):
|
def plus_with_unknown_component(first_comp, second_comp, result):
|
||||||
if result is None:
|
if result is None:
|
||||||
return first_comp + second_comp
|
return first_comp + second_comp
|
||||||
@ -152,26 +158,6 @@ def text2num(s):
|
|||||||
##################################################################
|
##################################################################
|
||||||
|
|
||||||
|
|
||||||
##################################################################
|
|
||||||
# Utilities
|
|
||||||
def absurl(url):
    """Turn a link taken from a Granta page into an absolute URL.

    :param url: A link as a string: protocol-relative (``//host/path``),
        site-relative (``/path``), or anything else.
    :return: ``https:`` + url for protocol-relative links,
        ``https://www.granta.com`` + url for site-relative links, and the
        input unchanged otherwise.
    """
    # A protocol-relative link also starts with '/', so it must be checked
    # first; otherwise it would be glued onto the site root and become
    # 'https://www.granta.com//host/path'.
    if url.startswith('//'):
        return 'https:' + url
    if url.startswith('/'):
        return 'https://www.granta.com' + url
    return url
|
|
||||||
|
|
||||||
|
|
||||||
def stripstyle(tag):
    """Delete the ``'style'`` item from ``tag`` in place; do nothing for ``None``."""
    if tag is None:
        return
    del tag['style']
|
|
||||||
|
|
||||||
|
|
||||||
def get_innermost_string(tag):
    """Follow first children down from ``tag`` and return the last node as stripped text.

    Descent continues while the current node has a ``contents`` attribute
    that is non-empty and whose first entry is not ``None``. The node
    reached is converted with ``str()`` and stripped of surrounding
    whitespace.
    """
    node = tag
    while hasattr(node, 'contents'):
        children = node.contents
        if not len(children) > 0:
            break
        first = children[0]
        if first is None:
            break
        node = first
    return str(node).strip()
|
|
||||||
##################################################################
|
|
||||||
|
|
||||||
|
|
||||||
class Granta(BasicNewsRecipe):
|
class Granta(BasicNewsRecipe):
|
||||||
|
|
||||||
title = u'Granta'
|
title = u'Granta'
|
||||||
@ -180,17 +166,17 @@ class Granta(BasicNewsRecipe):
|
|||||||
|
|
||||||
__author__ = 'Gary Arnold'
|
__author__ = 'Gary Arnold'
|
||||||
|
|
||||||
needs_subscription = True
|
needs_subscription = 'optional'
|
||||||
|
|
||||||
keep_only_tags = [
|
keep_only_tags = [
|
||||||
dict(name='div', attrs={'class': 'article-feature-image-container'}),
|
classes(
|
||||||
dict(name='div', attrs={'class': 'col-xs-12 col-sm-12 col-md-12 article-header'}),
|
'article-header article-content article-feature-image-standard-container article-feature-image-full-width-container'
|
||||||
dict(name='div', attrs={'class': 'carousel-inner'}),
|
),
|
||||||
dict(name='div', attrs={'class': 'article-content'}),
|
|
||||||
]
|
]
|
||||||
|
remove_tags = [
|
||||||
preprocess_regexps = [(re.compile(r'<head>.*?</head>', re.DOTALL), lambda
|
classes('social-share-container'),
|
||||||
m:'<head></head>')]
|
]
|
||||||
|
remove_attributes = ['style']
|
||||||
|
|
||||||
def get_browser(self):
|
def get_browser(self):
|
||||||
br = BasicNewsRecipe.get_browser(self)
|
br = BasicNewsRecipe.get_browser(self)
|
||||||
@ -219,26 +205,10 @@ class Granta(BasicNewsRecipe):
|
|||||||
return br
|
return br
|
||||||
|
|
||||||
def preprocess_html(self, soup):
|
def preprocess_html(self, soup):
|
||||||
articleHeader = soup.find(
|
for div in soup.findAll(attrs={'data-background': True}):
|
||||||
'div', attrs={'class': 'article-feature-image-container'})
|
img = soup.new_tag('img')
|
||||||
if articleHeader is None:
|
img['src'] = div['data-background']
|
||||||
articleHeader = soup.find(
|
div.append(img)
|
||||||
'div', attrs={'class': lambda x: x and 'article-header' in x.split()})
|
|
||||||
if articleHeader is not None:
|
|
||||||
image = articleHeader.find(
|
|
||||||
'div', attrs={'class': 'article-feature-image'})
|
|
||||||
if image is not None and image.attrs is not None:
|
|
||||||
style = dict(image.attrs)['style']
|
|
||||||
if style is not None:
|
|
||||||
m = re.search(r'url\(([^\)]*)\)', style)
|
|
||||||
if m.group(1) is not None:
|
|
||||||
stripstyle(image)
|
|
||||||
image.name = 'img'
|
|
||||||
image['src'] = m.group(1)
|
|
||||||
|
|
||||||
stripstyle(articleHeader.find('h1'))
|
|
||||||
stripstyle(articleHeader.find('h2'))
|
|
||||||
|
|
||||||
return soup
|
return soup
|
||||||
|
|
||||||
def parse_index(self):
|
def parse_index(self):
|
||||||
@ -246,51 +216,38 @@ class Granta(BasicNewsRecipe):
|
|||||||
soup = self.index_to_soup('https://granta.com/')
|
soup = self.index_to_soup('https://granta.com/')
|
||||||
|
|
||||||
# Get latest issue
|
# Get latest issue
|
||||||
issueInfo = soup.find(
|
issueInfo = soup.find(**classes('featured_product__image'))
|
||||||
'div', attrs={'class': lambda x: x and 'dnd_container__heading' in x.split()})
|
issueAnchor = issueInfo.findParent('a', href=True)
|
||||||
|
|
||||||
issueAnchor = issueInfo.find('a')
|
|
||||||
issueTitle = issueAnchor.contents[0]
|
|
||||||
issueLink = issueAnchor.get('href')
|
issueLink = issueAnchor.get('href')
|
||||||
else:
|
else:
|
||||||
issueLink = force_issue_download
|
issueLink = force_issue_download
|
||||||
issueTitle = ''
|
|
||||||
|
|
||||||
|
self.log('Fetching issue:', issueLink)
|
||||||
soup = self.index_to_soup(issueLink)
|
soup = self.index_to_soup(issueLink)
|
||||||
|
# open('/t/raw.html', 'w').write(str(soup))
|
||||||
|
|
||||||
# Find cover
|
# Find cover
|
||||||
cover = soup.find('div', attrs={'class': 'product-img-container'})
|
cover = soup.find(**classes('single-issue__cover-image'))
|
||||||
if cover is not None:
|
if cover is not None:
|
||||||
img = cover.find('img', src=True)
|
self.cover_url = cover['data-background']
|
||||||
self.cover_url = absurl(img['src'])
|
|
||||||
self.log.info('Found cover at:', self.cover_url)
|
self.log.info('Found cover at:', self.cover_url)
|
||||||
|
|
||||||
# Find TOC
|
sections = {}
|
||||||
tocs = soup.findAll('div', attrs={'class': 'product-article'})
|
for item in soup.findAll(**classes('single-contributor_related-row_container')):
|
||||||
articles = []
|
h6 = item.find('h6')
|
||||||
for toc in tocs:
|
section = self.tag_to_string(h6.find('a')).strip()
|
||||||
if (self.username and self.password) or (toc.find('img') is None):
|
sections.setdefault(section, [])
|
||||||
# Either user is logged in or the article is unlocked
|
h1 = item.find('h1')
|
||||||
h1 = toc.find('h1')
|
title = self.tag_to_string(h1).strip()
|
||||||
h2 = toc.find('h2')
|
url = h1.findParent('a')['href']
|
||||||
if h1.find('a') is not None and len(h1.find('a').contents) > 0 and h1.find('a').contents[0] is not None:
|
author = self.tag_to_string(item.findAll('h3')[-1]).strip()
|
||||||
title = get_innermost_string(h1.find('a').contents[0])
|
desc = ''
|
||||||
elif len(h1.contents) > 0 and h1.contents[0] is not None:
|
for p in item.findAll('p'):
|
||||||
title = get_innermost_string(h1.contents[0])
|
desc += self.tag_to_string(p)
|
||||||
else:
|
sections[section].append({
|
||||||
title = ''
|
'title': title, 'url': url, 'description': 'by ' + author + '. ' + desc})
|
||||||
if h2.find('a') is not None and len(h2.find('a').contents) > 0 and h2.find('a').contents[0] is not None:
|
|
||||||
author = get_innermost_string(h2.find('a').contents[0])
|
|
||||||
title = title + u' (%s)' % author
|
|
||||||
elif len(h2.contents) > 0 and h2.contents[0] is not None:
|
|
||||||
author = get_innermost_string(h2.contents[0])
|
|
||||||
title = title + u' (%s)' % author
|
|
||||||
else:
|
|
||||||
author = ''
|
|
||||||
url = absurl(h1.find('a', href=True)['href'])
|
|
||||||
self.log.info('Found article:', title)
|
|
||||||
self.log.info('\t', url)
|
|
||||||
articles.append({'title': title, 'url': url,
|
|
||||||
'date': '', 'description': ''})
|
|
||||||
|
|
||||||
return [(issueTitle, articles)]
|
self.log.info('Found article:', title)
|
||||||
|
self.log.info('\t', url)
|
||||||
|
|
||||||
|
return [(sec, sections[sec]) for sec in sections]
|
||||||
|
Loading…
x
Reference in New Issue
Block a user