Mirror of https://github.com/kovidgoyal/calibre.git, synced 2025-06-23 15:30:45 -04:00
New version which avoids needing a 3rd-party feed.
I have permission by email from the previous author to replace his version.
This commit is contained in:
parent f3a8c141af
commit a76d42342c
@@ -1,43 +1,200 @@
+from functools import partial
 __license__ = 'GPL v3'
+__copyright__ = '2017, Kovid Goyal <kovid at kovidgoyal.net>'
+'''
+private-eye.co.uk
+'''
+
+import re
+from datetime import datetime, timedelta
+from calibre.ebooks.BeautifulSoup import BeautifulSoup, NavigableString, Comment, Tag, __version__ as Soup_version
 from calibre.web.feeds.news import BasicNewsRecipe
 
 
-class AdvancedUserRecipe1359406781(BasicNewsRecipe):
-    title = u'Private Eye'
+class PrivateEyeRecipe(BasicNewsRecipe):
+    title = 'Private Eye'
+    __author__ = 'Sophist at sodalis.co.uk'
+    description = 'Private Eye is a fortnightly British satirical news and current affairs magazine, edited by Ian Hislop, offering a unique blend of humour, social and political observations and investigative journalism.'
+    publication_type = 'magazine'
-    description = u'Private Eye is a fortnightly British satirical and current affairs magazine, edited by Ian Hislop'
+    language = 'en'
+    encoding = 'utf-8'
+    DOMAIN = 'http://www.private-eye.co.uk/'
+    INDEX = DOMAIN + 'current-issue'
     oldest_article = 13
     max_articles_per_feed = 100
-    remove_empty_feeds = True
+    #remove_empty_feeds = True
     remove_javascript = True
-    no_stylesheets = True
-    ignore_duplicate_articles = {'title'}
-    language = 'en_GB'
-    encoding = 'cp1252'
-    __author__ = u'MPritchard2k9@gmail.com'
-    __copyright__ = '2014, Martyn Pritchard <MPritchard2k9@gmail.com>'
+    #no_stylesheets = True
+    ignore_duplicate_articles = {'url'}
 
-    def get_cover_url(self):
-        cover_url = None
-        soup = self.index_to_soup('http://www.private-eye.co.uk/current_issue.php')
-        for citem in soup.findAll('img'):
-            if citem['src'].endswith('big.jpg'):
-                return 'http://www.private-eye.co.uk/' + citem['src']
-        return cover_url
-
-    remove_tags_before = {'class':"article"}
-    remove_tags_after = {'id' : "nav-box-sections-mobile"}
-    remove_tags_after = {'class' : "gap-biggest"}
-    remove_tags_after = {'id' : "subscribe-here"}
-    remove_tags = [dict(name='td', attrs={'class':'sub_dave'})]
-    remove_tags = [dict(name='div', attrs={'class':'footer-block'})]
-    remove_tags = [dict(name='div', attrs={'class':'sub-nav-bar'})]
+    remove_tags_before = [
+        {'id': 'story', 'class': 'article'},
+        {'id': 'page'},
+    ]
+    remove_tags_after = [
+        {'class': 'section'},
+    ]
+    remove_tags = [
+        dict(name='div', attrs={'class': 'sub-nav-bar'}),
+        dict(name='img', attrs={'class': 'about-covers'}),
+        dict(name='div', attrs={'id': 'follow-us', 'class': 'text'}),
+        dict(name='span', attrs={'class': 'section'}),
+    ]
+
+    preprocess_regexps = [
+        (re.compile(r'../grfx', re.DOTALL|re.IGNORECASE), lambda match: 'http://www.private-eye.co.uk/grfx'),
+        (re.compile(r'More From This Issue.*</body>', re.DOTALL|re.IGNORECASE), lambda match: '</body>'),
+        (re.compile(r'More top stories in the latest issue:.*</body>', re.DOTALL|re.IGNORECASE), lambda match: '</body>'),
+        (re.compile(r'Also Available Online.*</body>', re.DOTALL|re.IGNORECASE), lambda match: '</body>'),
+    ]
 
-    feeds = [(u'Private Eye', u'https://bodybuilder3d.000webhostapp.com/public/PrivateEyeStat.xml')]
+    def fix_url(self, url):
+        if (url.startswith('//') or
+                url.startswith('http://') or
+                url.startswith('https://')):
+            return url
+        if url.startswith('/'):
+            url = self.DOMAIN + url[1:]
+        elif url.startswith('../'):
+            url = self.DOMAIN + url[3:]
+        else:
+            url = self.DOMAIN + url
+        return url
+
+    urls = []
+    publication_date = ""
+    def add_article(self, title, url, description="", date=None):
+        if date is None:
+            date = self.publication_date
+        if url and url not in self.urls:
+            self.urls.append(url)
+            self.log.info("Page added: %s: %s: %s (%s)" % (date, title, description, url))
+            self.current_articles.append({
+                'title': title,
+                'url': url,
+                'description': description,
+                'date': date,
+            })
+
+    def page_index_append(self, section):
+        if self.current_articles:
+            self.page_index.append((section, self.current_articles))
+            self.current_articles = []
+
+    # Process the Index page to get the content for the ebook
+    def parse_index(self):
+        self.log.debug("\nSoup version: %s" % Soup_version)
+        self.page_index = []
+
+        soup = self.index_to_soup(self.INDEX)
+        for comment in soup.findAll(text=lambda text: isinstance(text, Comment)):
+            comment.extract()
+        # Get masthead URL
+        masthead = soup.find('img', id='site-logo')
+        if masthead:
+            self.masthead_url = self.fix_url(masthead['src'])
+            self.log.debug('Masthead found: %s' % self.masthead_url)
+        else:
+            self.log.warning('Masthead not found.')
+
+        soup = soup.find('div', id='content')
+
+        # Get cover image
+        for img in soup.findAll('img', {'class': 'current-issue'}):
+            if img['src'].endswith('_big.jpg'):
+                self.cover_url = self.DOMAIN + img['src']
+                filename = img['src'].split('/')[-1]
+                self.issue_no = filename.replace('_big.jpg', '')
+                self.log.debug('Cover image found. Issue: %s' % self.issue_no)
+                break
+        else:
+            self.log.warning('Cover image NOT found')
+
+        # Get publication date as 14 days before next publication date
+        for tag in soup.findAll('span', {'class': 'only-smallest'}):
+            tag_contents = tag.contents
+            if tag_contents[0].string.lower().split()[:2] == ["next", "issue"]:
+                try:
+                    day, month, year = tag_contents[2].split()
+                    day = ''.join(c for c in day if c.isdigit())
+                    date = datetime.strptime(" ".join((day, month, year)), "%d %B %Y")
+                    date = date - timedelta(14)
+                    self.publication_date = datetime.strftime(date, "%d %B %Y").lstrip("0")
+                    self.log.debug("Publication date: %s" % self.publication_date)
+                    break
+                except:
+                    self.log.warning("Invalid publication date: %s" % tag.contents[2])
+        else:
+            self.log.warning("Publication date not found")
+
+        # Online articles
+        online = soup.find('div', {'id': 'block-left', 'class': 'article'})
+
+        headline = online.find('span', {'class': 'headline'})
+        if headline:
+            current_section = headline.string
+            self.log.debug('Headline found: %s' % current_section)
+        else:
+            current_section = 'Online Edition'
+            self.log.warning('Headline not found: Default used')
+
+        self.current_articles = []
+        title, url, descriptions = "", "", []
+        for piece in online.contents:
+            if isinstance(piece, Tag):
+                tag_class = (piece.name, piece.get('class', ''))
+                if tag_class == ('span', 'header'):
+                    self.page_index_append(current_section)
+                    current_section = piece.string
+                elif tag_class == ('a', 'header'):
+                    self.add_article(title, url, r"\r\n".join(descriptions))
+                    title = piece.string.rstrip(u' »').strip()
+                    url = self.fix_url(piece.get('href', ''))
+                    descriptions = []
+                else:
+                    self.add_article(title, url, r"\r\n".join(descriptions))
+                    title, url, descriptions = "", "", []
+            else:
+                desc = piece.strip(" \r\n")
+                if desc:
+                    descriptions.append(desc)
+        self.add_article(title, url, r"\r\n".join(descriptions))
+        self.add_article("Number Crunching", self.DOMAIN + "number-crunching", "")
+        self.page_index_append(current_section)
+
+        # Process More From This Issue (crossword etc.)
+        current_section = ""
+        self.current_articles = []
+        title, url, descriptions = "", "", []
+        # Remove gaps
+        for gap in soup.findAll(lambda tag: tag.get('class', '').startswith('gap-')):
+            gap.extract()
+        # Find more items
+        more = soup.find('span', {'class': 'section'})
+        current_section = more.string
+        more = more.findNextSibling()
+        while more.name == 'div' and more.get('class', '') == 'box-contents':
+            title_tag = more.find('a', {'class': 'header-home'})
+            if title_tag:
+                title = title_tag.string
+                if not url:
+                    url = self.fix_url(title_tag.get('href', ''))
+            desc_tag = more.find('a', {'class': 'header'})
+            if desc_tag:
+                descriptions.append(desc_tag.string)
+                if not url:
+                    url = self.fix_url(desc_tag.get('href', ''))
+            self.add_article(title, url, r"\r\n".join(descriptions))
+            title, url, descriptions = "", "", []
+            more = more.findNextSibling()
+        self.page_index_append(current_section)
+
+        # Add the PE About Us page.
+        self.add_article(
+            "About Private Eye",
+            self.DOMAIN + "about",
+            """Private Eye is the UK's number one best-selling news and current affairs magazine, edited by Ian Hislop.
+
+It offers a unique blend of humour, social and political observations and investigative journalism. Published fortnightly, the magazine is read by over 700,000 readers and costs just £1.80 an issue.""",
+            date="")
+        self.page_index_append("About Private Eye")
+
+        return self.page_index
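For anyone adapting this recipe: the page_index built by parse_index() above is calibre's standard return value for that method, a list of (section title, article list) pairs in which each article is a dict with the keys passed to add_article(). A minimal sketch of that shape (the section name and article values below are invented for illustration, not taken from the site):

# Illustrative shape of PrivateEyeRecipe.parse_index()'s return value;
# the section and article details here are made-up examples.
page_index = [
    ('Online Edition', [
        {'title': 'Example headline',
         'url': 'http://www.private-eye.co.uk/example-article',
         'description': 'One-line teaser text',
         'date': '1 May 2017'},
    ]),
]

A modified copy of the recipe can be test-built with calibre's documented recipe workflow, for example: ebook-convert private_eye.recipe .epub --test -vv (the recipe file name is arbitrary; --test limits the fetch to a few articles).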