calibre/recipes/world_archeology.recipe
unkn0w7n b71e3ef705 Update recipe_specific_options
for feeds based recipes
2024-07-22 15:31:08 +05:30

115 lines
4.4 KiB
Python
Raw Blame History

This file contains ambiguous Unicode characters

This file contains Unicode characters that might be confused with other characters. If you think that this is intentional, you can safely ignore this warning. Use the Escape button to reveal them.

#!/usr/bin/env python
# vim:fileencoding=utf-8
'''
https://www.world-archaeology.com
'''
from calibre import browser
from calibre.web.feeds.news import BasicNewsRecipe
class worldarch(BasicNewsRecipe):
    '''
    Recipe for "Current World Archaeology" as published on the-past.com.

    By default the most recent issue listed on the magazine's category page
    is downloaded. A specific issue can be selected through the ``issue``
    recipe-specific option (e.g. ``136``).
    '''

    title = 'The Past: Current World Archaeology'
    language = 'en'
    __author__ = 'unkn0wn'
    description = (
        'Travel the globe with Current World Archaeology, the magazine that brings you up-to-date with the latest archaeological discoveries. '
        'Explore sites and sights through our exclusive features and eye-popping photography. We bring you the stories from the '
        'archaeologists themselves, so you learn first-hand from the experts about the latest finds and most up-to-date research. '
        'Published six times a year.'
    )
    no_stylesheets = True
    use_embedded_content = False
    remove_attributes = ['style', 'height', 'width']
    ignore_duplicate_articles = {'url'}
    resolve_internal_links = True
    masthead_url = 'https://i0.wp.com/www.world-archaeology.com/wp-content/uploads/2016/02/cwa-logo.png'
    simultaneous_downloads = 1

    # Built from adjacent literals so the stylesheet text is independent of
    # how this class body happens to be indented.
    extra_css = (
        '\n'
        '[class^="meta"] { font-size:small; }\n'
        '.post-subtitle { font-style: italic; color:#202020; }\n'
        '.wp-block-image { font-size:small; text-align:center; }\n'
    )

    # Only elements whose class attribute contains one of these fragments
    # are kept from each article page.
    keep_only_tags = [
        dict(attrs={'class': lambda x: x and '__header' in x}),
        dict(attrs={'class': lambda x: x and '__background' in x}),
        dict(attrs={'class': lambda x: x and '__body_area' in x}),
    ]

    remove_tags = [
        dict(attrs={'class': 'ad-break'}),
        dict(attrs={'class': lambda x: x and 'avatar' in x.split()}),
        dict(attrs={'class': lambda x: x and '--share' in x})
    ]

    def preprocess_html(self, soup):
        '''Rename the first element carrying the ``post-subtitle`` class to ``<p>``.'''
        subtitle = soup.find(attrs={'class': lambda x: x and 'post-subtitle' in x.split()})
        if subtitle:
            subtitle.name = 'p'
        return soup

    recipe_specific_options = {
        'issue': {
            'short': 'Enter the Issue Number you want to download ',
            'long': 'For example, 136'
        }
    }

    def parse_index(self):
        '''
        Build the list of feeds for one issue of the magazine.

        Side effects: may set ``self.title``, ``self.timefmt``,
        ``self.description`` and ``self.cover_url`` from the issue page.

        :return: list of ``(section_title, [article_dict, ...])`` tuples where
                 each article dict has ``title``, ``description`` and ``url``.
                 Empty when the issue page has no ``entry-content`` container.
        :raises ValueError: when no issue number was given and the latest
                 issue cannot be located on the category page.
        '''
        # The class-level 'issue' entry is the option's description dict;
        # only a str value is treated as a user-supplied issue number.
        requested = self.recipe_specific_options.get('issue')
        issue_number = requested.strip() if isinstance(requested, str) else ''

        if issue_number:
            # The category listing is not needed for an explicit issue, so a
            # layout change there cannot break this download.
            issue_url = 'https://the-past.com/magazines/current-world-archaeology-' + issue_number + '/'
            issue = self.index_to_soup(issue_url)
            # The listing's first date belongs to the latest issue, so for an
            # explicit issue the date is looked up on the issue page itself.
            # NOTE(review): assumes the issue page exposes a '__date' element;
            # if it does not, timefmt simply keeps its default.
            date_source = issue
        else:
            index_url = 'https://the-past.com/category/magazines/cwa/'
            soup = self.index_to_soup(index_url)
            art = soup.find('article', attrs={'class': lambda x: x and 'tag-magazines' in x.split()})
            heading = art.h2 if art is not None else None
            link = heading.a if heading is not None else None
            if link is None or not link.get('href'):
                raise ValueError('Could not locate the latest issue on ' + index_url)
            issue = self.index_to_soup(link['href'])
            date_source = soup

        ti = issue.find('h1', attrs={'class': lambda x: x and 'post-title' in x.split()})
        if ti:
            self.title = self.tag_to_string(ti).strip()

        dt = date_source.find(attrs={'class': lambda x: x and '__date' in x})
        if dt:
            self.timefmt = ' [' + self.tag_to_string(dt).strip() + ']'

        # Use the text of the div enclosing the "from the editor" heading as
        # the description, when the page has one.
        edit = issue.find('h2', attrs={'id': 'from-the-editor'})
        editorial = edit.findParent('div') if edit else None
        if editorial:
            self.description = self.tag_to_string(editorial)

        cov = issue.find('figure', attrs={'class': lambda x: x and 'wp-block-image' in x.split()})
        cover_img = cov.img if cov is not None else None
        if cover_img is not None and cover_img.get('src'):
            self.cover_url = cover_img['src']

        feeds = []
        div = issue.find('div', attrs={'class': lambda x: x and 'entry-content' in x.split()})
        if div is None:
            self.log('No entry-content container found on the issue page')
            return feeds

        # Section headings and article listings are paired by position.
        headings = div.findAll('h2', attrs={'class': lambda x: x and 'wp-block-heading' in x.split()})
        listings = div.findAll(attrs={'class': 'display-posts-listing'})
        for heading, listing in zip(headings, listings):
            section = self.tag_to_string(heading).strip()
            self.log(section)
            articles = []
            for a in listing.findAll('a', href=True, attrs={'class': 'title'}):
                article_url = a['href']
                title = self.tag_to_string(a).strip()
                desc = ''
                exp = a.findNext(attrs={'class': 'excerpt'})
                if exp:
                    desc = self.tag_to_string(exp).strip()
                self.log('\t', title, '\n\t', desc, '\n\t\t', article_url)
                articles.append({'title': title, 'description': desc, 'url': article_url})
            if articles:
                feeds.append((section, articles))
        return feeds

    # The recipe acts as its own "browser": get_browser()/clone_browser()
    # hand back the recipe, and open()/open_novisit() below create a fresh
    # calibre browser() for every request.
    def get_browser(self, *args, **kwargs):
        '''Return the recipe itself as the browser object.'''
        return self

    def clone_browser(self, *args, **kwargs):
        '''Return the same object as :meth:`get_browser`.'''
        return self.get_browser()

    def open_novisit(self, *args, **kwargs):
        '''Open the request with a newly created ``browser()`` instance.'''
        br = browser()
        return br.open_novisit(*args, **kwargs)

    # open() is served by the same implementation as open_novisit().
    open = open_novisit