Update Private Eye

Merge branch 'patch-4' of https://github.com/Sophist-UK/calibre
This commit is contained in:
Kovid Goyal 2017-05-07 18:28:15 +05:30
commit 762d35b054
No known key found for this signature in database
GPG Key ID: 06BC317B515ACE7C

View File

@ -1,43 +1,220 @@
#!/usr/bin/env python2
# vim:fileencoding=utf-8
# License: GPLv3 Copyright: 2017, Kovid Goyal <kovid at kovidgoyal.net>
'''
private-eye.co.uk
'''
import re import re
from datetime import datetime, timedelta
from calibre.ebooks.BeautifulSoup import Comment, Tag
from calibre.web.feeds.news import BasicNewsRecipe from calibre.web.feeds.news import BasicNewsRecipe
class AdvancedUserRecipe1359406781(BasicNewsRecipe): class PrivateEyeRecipe(BasicNewsRecipe):
title = u'Private Eye' title = 'Private Eye'
__author__ = 'Sophist at sodalis.co.uk'
description = 'Private Eye is a fortnightly British satirical news and current affairs magazine, edited by Ian Hislop, offering a unique blend of humour, social and political observations and investigative journalism.' # noqa
publication_type = 'magazine' publication_type = 'magazine'
description = u'Private Eye is a fortnightly British satirical and current affairs magazine, edited by Ian Hislop' language = 'en'
encoding = 'utf-8'
DOMAIN = 'http://www.private-eye.co.uk/'
INDEX = DOMAIN + 'current-issue'
oldest_article = 13 oldest_article = 13
max_articles_per_feed = 100 max_articles_per_feed = 100
remove_empty_feeds = True # remove_empty_feeds = True
remove_javascript = True remove_javascript = True
no_stylesheets = True # no_stylesheets = True
ignore_duplicate_articles = {'title'} ignore_duplicate_articles = {'url'}
language = 'en_GB'
encoding = 'cp1252'
__author__ = u'MPritchard2k9@gmail.com'
__copyright__ = '2014, Martyn Pritchard <MPritchard2k9@gmail.com>'
def get_cover_url(self): remove_tags_before = [
cover_url = None {
soup = self.index_to_soup('http://www.private-eye.co.uk/current_issue.php') 'id': 'story',
for citem in soup.findAll('img'): 'class': 'article'
if citem['src'].endswith('big.jpg'): },
return 'http://www.private-eye.co.uk/' + citem['src'] {
return cover_url 'id': 'page'
},
remove_tags_before = {'class':"article"} ]
remove_tags_after = {'id' : "nav-box-sections-mobile"} remove_tags_after = [
remove_tags_after = {'class' : "gap-biggest"} {
remove_tags_after = {'id' : "subscribe-here"} 'class': 'section'
remove_tags = [dict(name='td', attrs={'class':'sub_dave'})] },
remove_tags = [dict(name='div', attrs={'class':'footer-block'})] ]
remove_tags = [dict(name='div', attrs={'class':'sub-nav-bar'})] remove_tags = [
dict(name='div', attrs={'class': 'sub-nav-bar'}),
preprocess_regexps = [ dict(name='img', attrs={'class': 'about-covers'}),
(re.compile(r'../grfx', re.DOTALL|re.IGNORECASE), lambda match: 'http://www.private-eye.co.uk/grfx'), dict(name='div', attrs={'id': 'follow-us',
(re.compile(r'More From This Issue.*</body>', re.DOTALL|re.IGNORECASE), lambda match: '</body>'), 'class': 'text'}),
(re.compile(r'More top stories in the latest issue:.*</body>', re.DOTALL|re.IGNORECASE), lambda match: '</body>'), dict(name='span', attrs={'class': 'section'}),
(re.compile(r'Also Available Online.*</body>', re.DOTALL|re.IGNORECASE), lambda match: '</body>'),
] ]
feeds = [(u'Private Eye', u'https://bodybuilder3d.000webhostapp.com/public/PrivateEyeStat.xml')] preprocess_regexps = [
(
re.compile(r'../grfx', re.DOTALL | re.IGNORECASE),
lambda match: 'http://www.private-eye.co.uk/grfx'
),
]
def fix_url(self, url):
    '''
    Turn a link found on the site into an absolute URL.

    :param url: href/src value as found in the page markup.
    :return: ``url`` unchanged when it is empty, protocol-relative (``//``)
        or already starts with ``http://``/``https://``; otherwise
        ``self.DOMAIN`` followed by the path with any leading ``/`` or
        ``../`` prefixes removed.
    '''
    # An empty/missing link must stay falsy so callers that test ``if url``
    # do not mistake it for a link to the site root.
    if not url:
        return url
    if url.startswith(('//', 'http://', 'https://')):
        return url
    if url.startswith('/'):
        # DOMAIN already ends with a slash, so drop the leading one.
        return self.DOMAIN + url[1:]
    # Parent-directory hops cannot climb above the site root, so every
    # leading "../" is dropped rather than just the first one.
    while url.startswith('../'):
        url = url[3:]
    return self.DOMAIN + url
# URLs already queued by add_article(); consulted there to skip duplicates.
# NOTE(review): this is a mutable class-level list, so it is shared by every
# instance that does not rebind it -- confirm that is intended.
urls = []
# Fallback date string that add_article() uses when called without a date.
publication_date = ""
def add_article(self, title, url, description="", date=None):
    '''
    Queue an article for the section currently being built.

    The article is appended to ``self.current_articles`` and its URL is
    recorded in ``self.urls``. Nothing happens when ``url`` is empty or has
    already been recorded.

    :param title: article title.
    :param url: absolute article URL.
    :param description: optional summary text.
    :param date: date string for the article; ``None`` means use
        ``self.publication_date``.
    '''
    if date is None:
        date = self.publication_date
    # Skip placeholder calls (no URL) and pages that were already queued.
    if not url or url in self.urls:
        return
    self.urls.append(url)
    self.log.info(
        "Page added: %s: %s: %s (%s)" % (date, title, description, url)
    )
    self.current_articles.append({
        'title': title,
        'url': url,
        'description': description,
        'date': date,
    })
def page_index_append(self, section):
    '''
    Close the section being built and start a new, empty one.

    When ``self.current_articles`` holds any articles they are appended to
    ``self.page_index`` as a ``(section, articles)`` tuple; empty sections
    are dropped. ``self.current_articles`` is reset in either case.

    :param section: title to file the queued articles under.
    '''
    if self.current_articles:
        self.page_index.append((section, self.current_articles))
    # Always rebind to a fresh list so the one just stored is not mutated.
    self.current_articles = []
# Process the Index page to get the content for the ebook
def parse_index(self):
    '''
    Scrape the current-issue page and build the section/article index.

    Sets ``masthead_url``, ``cover_url``, ``issue_no`` and
    ``publication_date`` on the recipe when the corresponding markup is
    found, and logs a warning when it is not.

    :return: the list of ``(section_title, articles)`` tuples accumulated by
        ``page_index_append``; each article is a dict as built by
        ``add_article``.
    '''
    self.page_index = []
    self.current_articles = []
    # Rebind on the instance so URL de-duplication starts empty on every
    # run instead of accumulating in a list shared through the class.
    self.urls = []
    # Descriptions made of several text fragments are joined with a real
    # line break (the escape sequence must not be a raw string).
    sep = "\r\n"

    soup = self.index_to_soup(self.INDEX)
    for comment in soup.findAll(text=lambda text: isinstance(text, Comment)):
        comment.extract()

    # Get masthead URL
    masthead = soup.find('img', id='site-logo')
    masthead_src = masthead.get('src') if masthead is not None else None
    if masthead_src:
        self.masthead_url = self.fix_url(masthead_src)
        self.log.debug('Masthead found: %s' % self.masthead_url)
    else:
        self.log.warning('Masthead not found.')

    content = soup.find('div', id='content')
    if content is None:
        # Degrade to searching the whole document rather than failing with
        # an AttributeError on None.
        self.log.warning('Content block not found: searching whole page')
        content = soup

    # Get cover image
    for img in content.findAll('img', {'class': 'current-issue'}):
        src = img.get('src') or ''
        if src.endswith('_big.jpg'):
            # Same URL normalisation as the masthead (no doubled slashes).
            self.cover_url = self.fix_url(src)
            filename = src.split('/')[-1]
            self.issue_no = filename.replace('_big.jpg', '')
            self.log.debug('Cover image found. Issue: %s' % self.issue_no)
            break
    else:
        self.log.warning('Cover image NOT found')

    # Get publication date as 14 days before next publication date
    for tag in content.findAll('span', {'class': 'only-smallest'}):
        tag_contents = tag.contents
        label = tag_contents[0].string if tag_contents else None
        if not label or label.lower().split()[:2] != ["next", "issue"]:
            continue
        # Fetched outside the try block so the warning below cannot itself
        # raise while reporting a malformed date.
        raw_date = tag_contents[2] if len(tag_contents) > 2 else ''
        try:
            day, month, year = raw_date.split()
            # Drop ordinal suffixes such as "st"/"nd"/"th".
            day = ''.join(c for c in day if c.isdigit())
            # NOTE: %B parsing depends on the current locale's month names.
            date = datetime.strptime(
                " ".join((day, month, year)), "%d %B %Y"
            )
        except (ValueError, TypeError, AttributeError):
            self.log.warning(
                "Invalid publication date: %s" % raw_date
            )
            continue
        date = date - timedelta(14)
        self.publication_date = datetime.strftime(
            date, "%d %B %Y"
        ).lstrip("0")
        self.log.debug("Publication date: %s" % self.publication_date)
        break
    else:
        self.log.warning("Publication date not found")

    # Online articles
    online = content.find('div', {'id': 'block-left', 'class': 'article'})
    headline = online.find('span', {'class': 'headline'}) if online is not None else None
    if headline is not None and headline.string:
        current_section = headline.string
        self.log.debug('Headline found: %s' % current_section)
    else:
        current_section = 'Online Edition'
        self.log.warning('Headline not found: Default used')
    if online is None:
        self.log.warning('Online articles block not found')
        pieces = []
    else:
        pieces = online.contents

    self.current_articles = []
    title, url, descriptions = "", "", []
    for piece in pieces:
        if isinstance(piece, Tag):
            tag_class = (piece.name, piece.get('class', ''))
            if tag_class == ('span', 'header'):
                # A new section starts: flush any pending article into the
                # section it belongs to before closing that section.
                self.add_article(title, url, sep.join(descriptions))
                title, url, descriptions = "", "", []
                self.page_index_append(current_section)
                current_section = piece.string
            elif tag_class == ('a', 'header'):
                # A new article link: flush the previous one first.
                self.add_article(title, url, sep.join(descriptions))
                title = (piece.string or u'').rstrip(u' »').strip()
                href = piece.get('href', '')
                url = self.fix_url(href) if href else ''
                descriptions = []
            else:
                # Any other tag terminates the pending article.
                self.add_article(title, url, sep.join(descriptions))
                title, url, descriptions = "", "", []
        else:
            # Bare text between tags is description for the pending article.
            desc = piece.strip(" \r\n")
            if desc:
                descriptions.append(desc)
    self.add_article(title, url, sep.join(descriptions))
    self.add_article("Number Crunching", self.DOMAIN + "number-crunching", "")
    self.page_index_append(current_section)

    # Process More From This Issue (crossword etc.)
    current_section = ""
    self.current_articles = []
    title, url, descriptions = "", "", []
    # Remove gaps
    for gap in content.findAll(lambda tag: tag.get('class', '').startswith('gap-')):
        gap.extract()
    # Find more items
    more = content.find('span', {'class': 'section'})
    if more is None:
        self.log.warning('More From This Issue section not found')
    else:
        current_section = more.string
        more = more.findNextSibling()
        # findNextSibling() returns None once the siblings run out, so test
        # for that before touching attributes.
        while (more is not None and more.name == 'div' and
                more.get('class', '') == 'box-contents'):
            title_tag = more.find('a', {'class': 'header-home'})
            if title_tag is not None:
                title = title_tag.string or ""
                href = title_tag.get('href', '')
                if href:
                    url = self.fix_url(href)
            desc_tag = more.find('a', {'class': 'header'})
            if desc_tag is not None:
                if desc_tag.string:
                    descriptions.append(desc_tag.string)
                # Only fall back to this link when the title had none.
                href = desc_tag.get('href', '')
                if not url and href:
                    url = self.fix_url(href)
            self.add_article(title, url, sep.join(descriptions))
            title, url, descriptions = "", "", []
            more = more.findNextSibling()
        self.page_index_append(current_section)

    # Add the PE About Us page.
    self.add_article(
        "About Private Eye",
        self.DOMAIN + "about",
        """Private Eye is the UK's number one best-selling news and current affairs magazine, edited by Ian Hislop.
It offers a unique blend of humour, social and political observations and investigative journalism.
Published fortnightly, the magazine is read by over 700,000 readers and costs just £1.80 an issue.""",
        date=""
    )
    self.page_index_append("About Private Eye")
    return self.page_index