Merge branch 'master' of https://github.com/unkn0w7n/calibre
commit 838a056ad2
@@ -26,18 +26,24 @@ class LondonReviewOfBooksPayed(BasicNewsRecipe):
     no_stylesheets = True
     delay = 1
     encoding = 'utf-8'
-    INDEX = 'https://www.lrb.co.uk'
+    INDEX = 'https://www.lrb.co.uk/the-paper/'
     publication_type = 'magazine'
-    needs_subscription = True
+    needs_subscription = 'optional'
     requires_version = (3, 0, 0)
+    masthead_url = 'https://www.mylrb.co.uk/out/lrb-2014/img/logo-2x.png'
+    extra_css = '''
+        .article-word-count, #article-tag-holder { font-size:small; color:#202020; }
+        .embedded-image-caption { font-size:small; text-align:center; }
+        blockquote, em { color:#202020; }
+    '''
+    resolve_internal_links = True
 
     keep_only_tags = [
-        classes('article-header--title paperArticle-reviewsHeader article-content letters-content'),
+        dict(attrs={'id':['article-tag-holder', 'article-heading-holder']}),
+        classes('article-copy article-word-count'),
     ]
-    remove_tags = [
-        classes('social-button article-mask lrb-readmorelink article-send-letter article-share'),
-    ]
-    remove_attributes = ['width', 'height']
+    remove_attributes = ['style', 'width', 'height']
 
     def get_browser(self):
         br = BasicNewsRecipe.get_browser(self)
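A note on the new keep_only_tags query above: with BeautifulSoup, a list passed as an attribute value matches a tag whose attribute equals any item in the list, which is what the dict(attrs={'id': [...]}) entry relies on; the classes() entry beside it is calibre's helper for matching on CSS classes. A minimal sketch of the list-valued query, using invented markup:

    from bs4 import BeautifulSoup

    html = '''
    <div id="article-heading-holder"><h1>Title</h1></div>
    <div class="article-copy"><p>Body text.</p></div>
    <div class="article-share">share buttons</div>
    '''
    soup = BeautifulSoup(html, 'html.parser')

    # A list value matches tags whose id equals any entry in the list.
    keep = soup.find_all(attrs={'id': ['article-tag-holder', 'article-heading-holder']})
    print([t.get('id') for t in keep])   # ['article-heading-holder']
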
@@ -52,22 +58,23 @@ class LondonReviewOfBooksPayed(BasicNewsRecipe):
         return br
 
     def preprocess_html(self, soup):
+        for h2 in soup.findAll('h2'):
+            h2.name = 'h4'
+        for cap in soup.findAll(**classes('embedded-image-caption')):
+            for p in cap.findAll('p'):
+                p.name = 'div'
         for img in soup.findAll('img', attrs={'data-srcset': True}):
-            for x in img['data-srcset'].split():
-                if '/' in x:
-                    img['src'] = x
+            img['src'] = 'https://www.lrb.co.uk/storage/400_filter/images/' + img['data-appsrc'].split('/images/')[-1]
         return soup
 
     def parse_index(self):
-        articles = []
         soup = self.index_to_soup(self.INDEX)
-        container = soup.find(attrs={'class': 'issue-grid'})
-        img = container.find('img')
-        self.cover_url = img['data-srcset'].split()[-2]
-        h3 = container.find('h3')
-        self.timefmt = ' [{}]'.format(self.tag_to_string(h3))
-        a = img.findParent('a')
-        soup = self.index_to_soup(absolutize(a['href']))
+        container = soup.find('div', attrs={'class': 'article-issue-cover-image'})
+        if container:
+            self.cover_url = 'https://www.lrb.co.uk/storage/800_filter/images/' + container.img['data-appsrc'].split('/images/')[-1]
+        edition = self.tag_to_string(soup.find('h1', attrs={'class': 'toc-title'}))
+        self.timefmt = ' [{}]'.format(edition)
+        self.log('Downloading: ', edition)
         grid = soup.find(attrs={'class': 'toc-grid-items'})
         articles = []
         for a in grid.findAll(**classes('toc-item')):
@@ -77,5 +84,4 @@ class LondonReviewOfBooksPayed(BasicNewsRecipe):
             title = '{}: {}'.format(self.tag_to_string(h3), self.tag_to_string(h4))
             self.log(title, url)
             articles.append({'title': title, 'url': url})
-
         return [('Articles', articles)]

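Both LRB image rewrites above (400_filter in preprocess_html, 800_filter for the cover) follow the same pattern: take everything after '/images/' in the data-appsrc attribute and graft it onto the storage URL. A quick illustration, with a made-up attribute value:

    # The data-appsrc value here is invented; it only shows the shape of the rewrite.
    data_appsrc = '/some/cdn/path/images/2023/10/cover-4519.jpg'
    src = 'https://www.lrb.co.uk/storage/400_filter/images/' + data_appsrc.split('/images/')[-1]
    print(src)  # https://www.lrb.co.uk/storage/400_filter/images/2023/10/cover-4519.jpg
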
@@ -85,6 +85,11 @@ class WSJ(BasicNewsRecipe):
             div = col.findParent('div')
             if div:
                 div.extract()
+        time = soup.find('time')
+        if time:
+            p = time.findParent('div')
+            if p:
+                p.name = 'p'
         return soup
 
     if not past_edition:
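The added <time> handling renames the div around the timestamp to a plain <p>, presumably so the dateline renders as ordinary paragraph text. A self-contained sketch of the same BeautifulSoup calls, on invented markup:

    from bs4 import BeautifulSoup

    soup = BeautifulSoup('<div class="dateline"><time>June 1, 2024</time></div>', 'html.parser')
    t = soup.find('time')
    if t:
        parent = t.findParent('div')
        if parent:
            parent.name = 'p'   # renaming keeps the tag's attributes and children
    print(soup)  # <p class="dateline"><time>June 1, 2024</time></p>
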
@@ -117,30 +122,26 @@ class WSJ(BasicNewsRecipe):
     def parse_index(self):
         index = 'https://bartender.mobile.dowjones.io'
         catalog = json.loads(self.index_to_soup(index + '/catalogs/v1/wsj/us/catalog.json', raw=True))
-        edit = []
-        for itm in catalog['items']:
-            if itm['type'] == 'ITP':
-                edit.append(itm['key'][3:])
+        edit = [itm['key'][3:] for itm in catalog['items'] if itm['type'] == 'ITP'][1:]
         self.log('**Past Editions available :', ', '.join(edit))
         for itm in catalog['items']:
             if past_edition:
                 if itm['key'] == 'ITP' + past_edition:
                     key = itm['key']
                     manifest = itm['manifest']
-                    dt = datetime.fromisoformat(itm['date'][:-1]) + timedelta(seconds=time.timezone)
-                    dt = dt.strftime('%b %d, %Y')
-                    self.log('Downloading Past Edition ', dt)
-                    self.timefmt = ' [' + dt + ']'
+                    date = itm['date']
                     break
             elif itm['type'] == 'ITP':
                 key = itm['key']
                 manifest = itm['manifest']
-                dt = datetime.fromisoformat(itm['date'][:-1]) + timedelta(seconds=time.timezone)
-                dt = dt.strftime('%b %d, %Y')
-                self.log('Downloading ', dt)
-                self.timefmt = ' [' + dt + ']'
+                date = itm['date']
                 break
 
+        dt = datetime.fromisoformat(date[:-1]) + timedelta(seconds=time.timezone)
+        dt = dt.strftime('%b %d, %Y')
+        self.log('Downloading ', dt)
+        self.timefmt = ' [' + dt + ']'
+
         feeds = []
 
         manif = json.loads(self.index_to_soup(index + manifest, raw=True))
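The date handling that this change pulls out of the loop strips the trailing 'Z' before calling datetime.fromisoformat() (older Python versions do not accept the 'Z' suffix) and then shifts the result by time.timezone, the local offset from UTC in seconds. A standalone sketch, with an invented timestamp in the same assumed shape as the catalog's 'date' field:

    import time
    from datetime import datetime, timedelta

    date = '2024-06-01T04:00:00Z'   # made-up value, assumed catalog format
    dt = datetime.fromisoformat(date[:-1]) + timedelta(seconds=time.timezone)
    print(dt.strftime('%b %d, %Y'))  # e.g. 'Jun 01, 2024', depending on the local offset
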
@@ -179,6 +180,10 @@ class WSJ(BasicNewsRecipe):
         h1 = soup.find('h1')
         if h1:
             h1['title'] = url['content']
+        h2 = soup.find('h2')
+        if h2:
+            h2['id'] = 'subhed'
+            h2.name = 'p'
         return soup.prettify()
 
     def populate_article_metadata(self, article, soup, first):
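The new h2 handling tags the subheading with id='subhed' and demotes it to a paragraph. A minimal sketch of those two BeautifulSoup operations on invented markup (the styling purpose of the id is an assumption):

    from bs4 import BeautifulSoup

    soup = BeautifulSoup('<h1>Headline</h1><h2>Standfirst</h2>', 'html.parser')
    h2 = soup.find('h2')
    if h2:
        h2['id'] = 'subhed'   # give it a stable id, presumably for extra_css to target
        h2.name = 'p'         # demote the subheading to an ordinary paragraph
    print(soup)  # <h1>Headline</h1><p id="subhed">Standfirst</p>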