mirror of
https://github.com/kovidgoyal/calibre.git
synced 2025-07-08 10:44:09 -04:00
Merge branch 'patch-20' of https://github.com/Sophist-UK/calibre
This commit is contained in:
commit
050c9caa3d
@ -1,49 +1,182 @@
|
|||||||
|
'''
|
||||||
|
Fetch Private Eye (Online Edition)
|
||||||
|
'''
|
||||||
|
|
||||||
import re
|
import re
|
||||||
from calibre.web.feeds.news import BasicNewsRecipe
|
from calibre.web.feeds.news import BasicNewsRecipe
|
||||||
|
from datetime import datetime, timedelta
|
||||||
|
|
||||||
|
class PrivateEyeRecipe(BasicNewsRecipe):
|
||||||
|
##
|
||||||
|
# Last Edited: 2023-07-01
|
||||||
|
#
|
||||||
|
# Remark: Version 3.0
|
||||||
|
# Rewrite (by Sophist-UK) to fit latest web pages, correctly identify pages to include
|
||||||
|
# and improve formatting.
|
||||||
|
# Edited to add: inclusion of About page,
|
||||||
|
# identifying series number and publication date and setting metadata.
|
||||||
|
#
|
||||||
|
|
||||||
class AdvancedUserRecipe1359406781(BasicNewsRecipe):
|
title = u'Private Eye (Online Edition)'
|
||||||
title = u'Private Eye'
|
|
||||||
publication_type = 'magazine'
|
|
||||||
description = u'Private Eye is a fortnightly British satirical and current affairs magazine, edited by Ian Hislop'
|
description = u'Private Eye is a fortnightly British satirical and current affairs magazine, edited by Ian Hislop'
|
||||||
oldest_article = 13
|
publication_type = 'magazine'
|
||||||
max_articles_per_feed = 100
|
|
||||||
remove_empty_feeds = True
|
|
||||||
remove_javascript = True
|
|
||||||
no_stylesheets = True
|
|
||||||
ignore_duplicate_articles = {'title'}
|
|
||||||
language = 'en_GB'
|
language = 'en_GB'
|
||||||
encoding = 'utf-8'
|
encoding = 'utf-8'
|
||||||
__author__ = u'Martyn Pritchard'
|
oldest_article = 13
|
||||||
__copyright__ = '2020, Martyn Pritchard <MPritchard2k9@gmail.com>'
|
max_articles_per_feed = 100
|
||||||
|
remove_javascript = True
|
||||||
|
ignore_duplicate_articles = {'url'}
|
||||||
|
|
||||||
|
__author__ = u'Martyn Pritchard & Sophist-UK'
|
||||||
|
__copyright__ = '2020, Martyn Pritchard <MPritchard2k9@gmail.com> & Sophist-UK <sophist-uk@sodalis.co.uk>'
|
||||||
|
|
||||||
|
current_issue = 'https://www.private-eye.co.uk/current-issue'
|
||||||
|
about_page = 'https://www.private-eye.co.uk/about'
|
||||||
|
masthead_url = 'https://www.private-eye.co.uk/grfx/logos/logo-new.png'
|
||||||
|
author = 'Private Eye'
|
||||||
|
series = title = 'Private Eye Online'
|
||||||
|
conversion_options = {
|
||||||
|
'authors': author,
|
||||||
|
'author_sort': author,
|
||||||
|
'series': series,
|
||||||
|
'series_index': 0,
|
||||||
|
'title': title,
|
||||||
|
'title_sort': title,
|
||||||
|
}
|
||||||
|
|
||||||
def get_cover_url(self):
|
def get_cover_url(self):
|
||||||
cover_url = None
|
soup = self.index_to_soup(self.current_issue)
|
||||||
soup = self.index_to_soup('https://www.private-eye.co.uk')
|
|
||||||
for citem in soup.findAll('img'):
|
|
||||||
if citem['src'].endswith('big.jpg'):
|
|
||||||
return citem['src']
|
|
||||||
return cover_url
|
|
||||||
|
|
||||||
remove_tags_before = {'class': "article"}
|
for img in soup.findAll('img'):
|
||||||
remove_tags_after = {'class': "article"}
|
src = img['src']
|
||||||
remove_tags = [dict(name='div', attrs={'id': 'sections-sidebar'})]
|
if src.endswith('_big.jpg'):
|
||||||
remove_tags = {'class': "sub-nav-bar"}
|
file_name = src.rsplit('/',1)[1]
|
||||||
remove_tags = [dict(name='a', attrs={'class': 'twitter-share-button'})]
|
if file_name is None:
|
||||||
remove_tags = [dict(name='div', attrs={'id': 'nav-box-sections-mobile'})]
|
file_name = src
|
||||||
|
try:
|
||||||
|
self.conversion_options.update({'series_index': int(file_name[:-len('_big.jpg')])})
|
||||||
|
self.log('series-index:', self.conversion_options['series_index'])
|
||||||
|
except (TypeError, ValueError):
|
||||||
|
# wrong big image
|
||||||
|
continue
|
||||||
|
return src
|
||||||
|
return None
|
||||||
|
|
||||||
|
def parse_index(self):
|
||||||
|
soup = self.index_to_soup(self.current_issue)
|
||||||
|
|
||||||
|
# Get publication date
|
||||||
|
sidebar = soup.find('div', attrs={'id': 'current-issue-sidebar'})
|
||||||
|
next_issue_text = sidebar.find('b').nextSibling.strip()
|
||||||
|
try:
|
||||||
|
day, month, year = next_issue_text.split(' ')
|
||||||
|
day = ''.join(c for c in day if c.isdigit())
|
||||||
|
pub_date = datetime.strptime(" ".join((day, month, year)), "%d %B %Y") - timedelta(12)
|
||||||
|
self.log('pub-date:', pub_date)
|
||||||
|
self.conversion_options.update({'pubdate': datetime.strftime(pub_date, "%d %B %Y").lstrip("0")})
|
||||||
|
title = self.title + " " + datetime.strftime(pub_date, "%Y-%m-%d")
|
||||||
|
self.conversion_options.update({'title': title})
|
||||||
|
self.conversion_options.update({'title_sort': title})
|
||||||
|
except (TypeError, ValueError):
|
||||||
|
# Bad date
|
||||||
|
self.log('Cannot parse next issue date from:', next_issue_date)
|
||||||
|
|
||||||
|
# Get pages first from the sub-menu, and then from the contents panel.
|
||||||
|
# Duplicates will be eliminated automatically.
|
||||||
|
articles = []
|
||||||
|
for menu_attrs in (
|
||||||
|
{'class': 'sub-nav-bar', 'id':'sub-nav-box'},
|
||||||
|
{'class': 'article', 'id': 'block-left'},
|
||||||
|
):
|
||||||
|
menu = soup.find('div', attrs=menu_attrs)
|
||||||
|
|
||||||
|
if not menu:
|
||||||
|
continue
|
||||||
|
|
||||||
|
for a in menu.findAll('a', href=True):
|
||||||
|
title = a.getText().rstrip(' »\n')
|
||||||
|
if not title:
|
||||||
|
continue
|
||||||
|
articles.append({
|
||||||
|
'title': title,
|
||||||
|
'url': a.get('href'),
|
||||||
|
})
|
||||||
|
|
||||||
|
if not articles:
|
||||||
|
abort_recipe_processing('Private-Eye Online index of pages not found');
|
||||||
|
|
||||||
|
# Add the About page as a final article
|
||||||
|
articles.append({
|
||||||
|
'title': 'About Private Eye',
|
||||||
|
'url': self.about_page,
|
||||||
|
})
|
||||||
|
|
||||||
|
self.log('parse_index:', articles)
|
||||||
|
|
||||||
|
return [('Private Eye', articles)]
|
||||||
|
|
||||||
|
# We remove vast swathes of HTML which is not part of the articles.
|
||||||
|
# Remove sibling content
|
||||||
|
remove_tags_before = [
|
||||||
|
{'name': 'div', 'class': "article"},
|
||||||
|
{'name': 'div', 'id': "page"},
|
||||||
|
{'name': 'div', 'id': "page-wide"},
|
||||||
|
{'name': 'div', 'id': "content"},
|
||||||
|
{'name': 'a', ' attrs': {'href': 'https://shop.private-eye.co.uk'}},
|
||||||
|
]
|
||||||
|
remove_tags_after = remove_tags_before.copy()
|
||||||
|
remove_tags_after.append(
|
||||||
|
{'name': 'div', 'id': 'about-covers'},
|
||||||
|
)
|
||||||
|
# Remove non-sibling content
|
||||||
|
remove_tags = [
|
||||||
|
{'name': 'div', 'attrs': {'id': 'top-bar'}},
|
||||||
|
{'name': 'div', 'attrs': {'id': 'header-wide'}},
|
||||||
|
{'name': 'div', 'attrs': {'id': 'footer-wide'}},
|
||||||
|
{'name': 'div', 'attrs': {'id': 'follow-buttons'}},
|
||||||
|
{'name': 'div', 'attrs': {'id': 'sidebar'}},
|
||||||
|
{'name': 'div', 'attrs': {'id': 'sections-sidebar'}},
|
||||||
|
{'name': 'div', 'attrs': {'id': 'nav-box-sections-mobile'}},
|
||||||
|
{'name': 'div', 'attrs': {'id': 'nav-box-pages-mobile'}},
|
||||||
|
{'name': 'div', 'attrs': {'id': 'about-covers'}},
|
||||||
|
{'name': 'a', ' attrs': {'href': 'https://shop.private-eye.co.uk'}},
|
||||||
|
{'name': 'iframe'},
|
||||||
|
{'name': 'link', 'attrs': {'href': re.compile('/javastyle/lightbox/')}},
|
||||||
|
{'name': 'link', 'attrs': {'href': re.compile('/javastyle/news_ticker/')}},
|
||||||
|
{'name': 'link', 'attrs': {'href': re.compile('/javastyle/media-queries-')}},
|
||||||
|
]
|
||||||
|
|
||||||
|
# Convert headers to h1, strapline to h4
|
||||||
preprocess_regexps = [
|
preprocess_regexps = [
|
||||||
(
|
(
|
||||||
re.compile(
|
re.compile(
|
||||||
r'<a href="https://www.subscription.*?</a>',
|
r'<span class="headline(?:-new)?">(.*?)</span>\s*(?:<br>\s*)*(?:<span class="text">(.*?)</span>)?',
|
||||||
re.DOTALL | re.IGNORECASE
|
re.DOTALL | re.IGNORECASE
|
||||||
), lambda match: ''
|
),
|
||||||
),
|
lambda match: '<h1>' + match[1] + '</h1>' +
|
||||||
(
|
(('<h4>' + match[2] + '</h4>') if match[2] else '')
|
||||||
re.compile(
|
|
||||||
r'<a class="twitter-share-button.*?</a>', re.DOTALL | re.IGNORECASE
|
|
||||||
), lambda match: ''
|
|
||||||
),
|
),
|
||||||
]
|
]
|
||||||
|
|
||||||
feeds = [(u'http://bodybuilder3d.eu5.org/PrivateEyeStat.xml')]
|
# The following extra css is to tweak the formatting of various elements of various article pages.
|
||||||
|
# Unfortunately, there are a variety of different pages styles, hence the extended tweak list.
|
||||||
|
# Some of these mimic the actual layout.css which does not seem to make it across into the calibre
|
||||||
|
# ebook without duplicating it as extra css.
|
||||||
|
# However some is new css to tweak output when part of an ebook.
|
||||||
|
extra_css = ' \n '.join([
|
||||||
|
'#content img {float: right;}',
|
||||||
|
'#content img.cartoon-left {float: left;}',
|
||||||
|
'#content img.cartoon-right {float: right;}',
|
||||||
|
'#content img:first-child {float: none;}',
|
||||||
|
'#content #block-sections img {float: none;}',
|
||||||
|
'#content #block-sections img.crossword {float: none; width: 50%; margin-right: 20px;}',
|
||||||
|
'#article-caption-box {float: right; background: #222222; display: block; width: 40%; max-width: 40%;}',
|
||||||
|
'#caption-box {color: #ffffff; text-align: center; padding: 5px 20px 15px 20px;}',
|
||||||
|
'#whatsapp {border-left: 5px #8aba60 solid; border-right: 5px #8aba60 solid; border-bottom: 5px #8aba60 solid; padding: 0 20px 20px 20px;}',
|
||||||
|
'#whatsapp::after {clear:both;}',
|
||||||
|
'.whatsapp-left, .whatsapp-right {margin: 20px 0px 0px 0px; padding: 15px; border-radius: 10px;}',
|
||||||
|
'.whatsapp-left, .whatsapp-right {font-family: Helvetica, Arial, "sans-serif"; font-weight: 300; font-size: 18px; line-height: 24px;}',
|
||||||
|
'.whatsapp-left {text-align: left; margin-right: 30%; background-color: #eeeeee;}',
|
||||||
|
'.whatsapp-right {text-align: right; margin-left: 30%; background-color: #dce5ae;}',
|
||||||
|
'#whatsapp .whatsapp-left img, #whatsapp .whatsapp-right img {width: 35px; margin: 0 10px; vertical-align: middle;}',
|
||||||
|
])
|
||||||
|
Loading…
x
Reference in New Issue
Block a user