diff --git a/recipes/private_eye.recipe b/recipes/private_eye.recipe index bbce44f129..9cfcb77aa8 100644 --- a/recipes/private_eye.recipe +++ b/recipes/private_eye.recipe @@ -1,49 +1,127 @@ +''' +Fetch Private Eye (Online Edition) +''' + import re from calibre.web.feeds.news import BasicNewsRecipe +class PrivateEyeRecipe(BasicNewsRecipe): + ## + # Last Edited: 2023-07-01 + # + # Remark: Version 3.0 + # Rewrite (by Sophist-UK) to fix bugs, fit latest web pages, + # correctly identify pages to include and improve formatting. + # -class AdvancedUserRecipe1359406781(BasicNewsRecipe): - title = u'Private Eye' - publication_type = 'magazine' + title = u'Private Eye (Online Edition)' description = u'Private Eye is a fortnightly British satirical and current affairs magazine, edited by Ian Hislop' - oldest_article = 13 - max_articles_per_feed = 100 - remove_empty_feeds = True - remove_javascript = True - no_stylesheets = True - ignore_duplicate_articles = {'title'} + publication_type = 'magazine' language = 'en_GB' encoding = 'utf-8' - __author__ = u'Martyn Pritchard' - __copyright__ = '2020, Martyn Pritchard ' + oldest_article = 13 + max_articles_per_feed = 100 + remove_javascript = True + ignore_duplicate_articles = {'url'} + + __author__ = u'Martyn Pritchard & Sophist-UK' + __copyright__ = '2020, Martyn Pritchard & Sophist-UK ' + + current_issue = 'https://www.private-eye.co.uk/current-issue' + masthead_url = 'https://www.private-eye.co.uk/grfx/logos/logo-new.png' def get_cover_url(self): - cover_url = None - soup = self.index_to_soup('https://www.private-eye.co.uk') + soup = self.index_to_soup(self.current_issue) for citem in soup.findAll('img'): if citem['src'].endswith('big.jpg'): return citem['src'] - return cover_url + return None - remove_tags_before = {'class': "article"} - remove_tags_after = {'class': "article"} - remove_tags = [dict(name='div', attrs={'id': 'sections-sidebar'})] - remove_tags = {'class': "sub-nav-bar"} - remove_tags = [dict(name='a', attrs={'class': 'twitter-share-button'})] - remove_tags = [dict(name='div', attrs={'id': 'nav-box-sections-mobile'})] + def parse_index(self): + soup = self.index_to_soup(self.current_issue) + key = None + articles = [] + + # Get pages first from the sub-menu, and then from the contents panel. + # Duplicates will be eliminated automatically. + for menu_attrs in ( + {'class': 'sub-nav-bar', 'id':'sub-nav-box'}, + {'class': 'article', 'id': 'block-left'}, + ): + menu = soup.find('div', attrs=menu_attrs) + + if not menu: + continue + + for a in menu.findAll('a', href=True): + title = a.getText().rstrip(' ยป\n') + if not title: + continue + articles.append({ + 'title': title, + 'url': a.get('href'), + }) + + if not articles: + abort_recipe_processing('Private-Eye Online index of pages not found'); + + index = [('Private Eye', articles)] + + self.log('parse_index', index) + + return index + + + remove_tags_before = remove_tags_after = [ + {'name': 'div', 'class': "article"}, + {'name': 'div', 'id': "page"}, + {'name': 'div', 'id': "page-wide"}, + {'name': 'div', 'id': "content"}, + ] + remove_tags = [ + {'name': 'div', 'attrs': {'id': 'top-bar'}}, + {'name': 'div', 'attrs': {'id': 'header-wide'}}, + {'name': 'div', 'attrs': {'id': 'footer-wide'}}, + {'name': 'div', 'attrs': {'id': 'follow-buttons'}}, + {'name': 'div', 'attrs': {'id': 'sidebar'}}, + {'name': 'div', 'attrs': {'id': 'sections-sidebar'}}, + {'name': 'div', 'attrs': {'id': 'nav-box-sections-mobile'}}, + {'name': 'iframe'}, + {'name': 'link', 'attrs': {'href': re.compile('/javastyle/lightbox/')}}, + {'name': 'link', 'attrs': {'href': re.compile('/javastyle/news_ticker/')}}, + {'name': 'link', 'attrs': {'href': re.compile('/javastyle/media-queries-')}}, + ] + + # Convert headers to h1, strapline to h4 preprocess_regexps = [ ( re.compile( - r'(.*?)', re.DOTALL | re.IGNORECASE - ), lambda match: '' + ), lambda match: '

' + match[0] + '

' ), ( re.compile( - r'