New version which avoids needing a 3rd party feed

I have permission by email from the previous author to replace his version.
2025-11-25 15:55:02 -05:00 · 2017-05-07 13:23:39 +01:00 · 2017-05-07 13:23:39 +01:00 · a76d42342c
commit a76d42342c
parent f3a8c141af
1 changed files with 185 additions and 28 deletions
--- a/recipes/private_eye.recipe
+++ b/recipes/private_eye.recipe
@ -1,43 +1,200 @@
+from functools import partial
+__license__ = 'GPL v3'
+__copyright__ = '2017, Kovid Goyal <kovid at kovidgoyal.net>'
+'''
+private-eye.co.uk
+'''
+
 import re
+from datetime import datetime, timedelta
+from calibre.ebooks.BeautifulSoup import BeautifulSoup, NavigableString, Comment, Tag, __version__ as Soup_version
 from calibre.web.feeds.news import BasicNewsRecipe


-class AdvancedUserRecipe1359406781(BasicNewsRecipe):
-    title          = u'Private Eye'
+class PrivateEyeRecipe(BasicNewsRecipe):
+    title = 'Private Eye'
+    __author__ = 'Sophist at sodalis.co.uk'
+    description = 'Private Eye is a fortnightly British satirical news and current affairs magazine, edited by Ian Hislop, offering a unique blend of humour, social and political observations and investigative journalism.'
    publication_type = 'magazine'
-    description = u'Private Eye is a fortnightly British satirical and current affairs magazine, edited by Ian Hislop'
+    language = 'en'
+    encoding = 'utf-8'
+    DOMAIN = 'http://www.private-eye.co.uk/'
+    INDEX = DOMAIN + 'current-issue'
    oldest_article = 13
    max_articles_per_feed = 100
-    remove_empty_feeds = True
+    #remove_empty_feeds = True
    remove_javascript     = True
-    no_stylesheets = True
-    ignore_duplicate_articles = {'title'}
-    language = 'en_GB'
-    encoding = 'cp1252'
-    __author__ = u'MPritchard2k9@gmail.com'
-    __copyright__ = '2014, Martyn Pritchard <MPritchard2k9@gmail.com>'
+    #no_stylesheets = True
+    ignore_duplicate_articles = {'url'}

-    def get_cover_url(self):
-        cover_url = None
-        soup = self.index_to_soup('http://www.private-eye.co.uk/current_issue.php')
-        for citem in soup.findAll('img'):
-            if citem['src'].endswith('big.jpg'):
-                return 'http://www.private-eye.co.uk/' + citem['src']
-        return cover_url

-    remove_tags_before = {'class':"article"}
-    remove_tags_after  = {'id' : "nav-box-sections-mobile"}
-    remove_tags_after  = {'class' : "gap-biggest"}
-    remove_tags_after  = {'id' : "subscribe-here"}
-    remove_tags = [dict(name='td', attrs={'class':'sub_dave'})]
-    remove_tags = [dict(name='div', attrs={'class':'footer-block'})]
-    remove_tags = [dict(name='div', attrs={'class':'sub-nav-bar'})]
+    remove_tags_before = [
+        {'id': 'story', 'class': 'article'},
+        {'id': 'page'},
+        ]
+    remove_tags_after  = [
+        {'class': 'section'},
+        ]
+    remove_tags = [
+        dict(name='div', attrs={'class': 'sub-nav-bar'}),
+        dict(name='img', attrs={'class': 'about-covers'}),
+        dict(name='div', attrs={'id': 'follow-us', 'class': 'text'}),
+        dict(name='span', attrs={'class': 'section'}),
+        ]

    preprocess_regexps = [
                   (re.compile(r'../grfx', re.DOTALL|re.IGNORECASE), lambda match: 'http://www.private-eye.co.uk/grfx'),
-                   (re.compile(r'More From This Issue.*</body>', re.DOTALL|re.IGNORECASE), lambda match: '</body>'),
-                   (re.compile(r'More top stories in the latest issue:.*</body>', re.DOTALL|re.IGNORECASE), lambda match: '</body>'),
-                   (re.compile(r'Also Available Online.*</body>', re.DOTALL|re.IGNORECASE), lambda match: '</body>'),
    ]

-    feeds          = [(u'Private Eye', u'https://bodybuilder3d.000webhostapp.com/public/PrivateEyeStat.xml')]
+    def fix_url(self, url):
+        """Return ``url`` as an absolute URL on the Private Eye site.
+
+        Absolute (``http://``, ``https://``) and protocol-relative (``//``)
+        URLs are returned unchanged. One leading ``/`` or ``../`` is stripped
+        and the remainder is appended to ``self.DOMAIN``; any other value is
+        appended to ``self.DOMAIN`` as-is.
+
+        An empty ``url`` is returned unchanged rather than being turned into
+        the bare site root, so callers that test ``if url`` can still detect
+        a missing link.
+        """
+        if not url or url.startswith(('//', 'http://', 'https://')):
+            return url
+        # DOMAIN already ends in '/', so drop one leading '/' or '../'.
+        if url.startswith('/'):
+            url = url[1:]
+        elif url.startswith('../'):
+            url = url[3:]
+        return self.DOMAIN + url
+
+    urls = []
+    publication_date = ""
+    def add_article(self, title, url, description="", date=None):
+        """Queue an article for the section currently being built.
+
+        The article is skipped when ``url`` is empty or has already been
+        recorded in ``self.urls``. When ``date`` is None the value of
+        ``self.publication_date`` is used instead.
+        """
+        article_date = self.publication_date if date is None else date
+        if not url or url in self.urls:
+            return
+        self.urls.append(url)
+        self.log.info("Page added: %s: %s: %s (%s)" % (article_date, title, description, url))
+        entry = {
+            'title': title,
+            'url': url,
+            'description': description,
+            'date': article_date,
+        }
+        self.current_articles.append(entry)
+
+    def page_index_append(self, section):
+        """Close the current section and start collecting a new one.
+
+        If any articles have been queued in ``self.current_articles`` they
+        are appended to ``self.page_index`` as a ``(section, articles)``
+        pair and the queue is replaced by a fresh empty list. Nothing
+        happens when the queue is empty.
+        """
+        queued = self.current_articles
+        if not queued:
+            return
+        self.page_index.append((section, queued))
+        self.current_articles = []
+
+    # Process the Index page to get the content for the ebook
+    def parse_index(self):
+        """Build the article index from the current-issue page.
+
+        Fetches ``self.INDEX`` and, as side effects, sets ``masthead_url``,
+        ``cover_url``, ``issue_no`` and ``publication_date`` when the
+        corresponding elements are found (a warning is logged otherwise).
+
+        Returns ``self.page_index``: a list of ``(section_title, articles)``
+        tuples, where each article is a dict with the keys ``title``,
+        ``url``, ``description`` and ``date``.
+
+        Raises ValueError if the main content block or the online-articles
+        block cannot be located in the page.
+        """
+        self.log.debug("\nSoup version: %s" % Soup_version)
+        self.page_index = []
+        # ``urls`` is declared as a class-level list; rebind it on the
+        # instance so de-duplication state is not shared between runs.
+        self.urls = []
+
+        soup = self.index_to_soup(self.INDEX)
+        for comment in soup.findAll(text=lambda text: isinstance(text, Comment)):
+            comment.extract()
+        # Get masthead URL
+        masthead = soup.find('img', id='site-logo')
+        if masthead:
+            self.masthead_url = self.fix_url(masthead['src'])
+            self.log.debug('Masthead found: %s' % self.masthead_url)
+        else:
+            self.log.warning('Masthead not found.')
+
+        soup = soup.find('div', id='content')
+        if soup is None:
+            raise ValueError('Private Eye: content block not found on index page')
+
+        # Get cover image
+        for img in soup.findAll('img', {'class': 'current-issue'}):
+            src = img.get('src', '')
+            if src.endswith('_big.jpg'):
+                # Use fix_url (as for the masthead) so '/' and '../'
+                # prefixes do not produce a malformed URL.
+                self.cover_url = self.fix_url(src)
+                self.issue_no = src.split('/')[-1].replace('_big.jpg', '')
+                self.log.debug('Cover image found. Issue: %s' % self.issue_no)
+                break
+        else:
+            self.log.warning('Cover image NOT found')
+
+        # Get publication date as 14 days before next publication date
+        for tag in soup.findAll('span', {'class': 'only-smallest'}):
+            tag_contents = tag.contents
+            label = tag_contents[0].string if tag_contents else None
+            if not label or label.lower().split()[:2] != ["next", "issue"]:
+                continue
+            raw_date = tag_contents[2] if len(tag_contents) > 2 else None
+            try:
+                day, month, year = raw_date.split()
+                # Drop ordinal suffixes such as "st"/"th" from the day.
+                day = ''.join(c for c in day if c.isdigit())
+                date = datetime.strptime(" ".join((day, month, year)), "%d %B %Y")
+                date = date - timedelta(14)
+                self.publication_date = datetime.strftime(date, "%d %B %Y").lstrip("0")
+                self.log.debug("Publication date: %s" % self.publication_date)
+                break
+            except (AttributeError, TypeError, ValueError):
+                # Keep looking: another matching span may hold a valid date.
+                self.log.warning("Invalid publication date: %s" % raw_date)
+        else:
+            self.log.warning("Publication date not found")
+
+        # Online articles
+        online = soup.find('div', {'id': 'block-left', 'class': 'article'})
+        if online is None:
+            raise ValueError('Private Eye: online articles block not found on index page')
+
+        headline = online.find('span', {'class': 'headline'})
+        if headline:
+            current_section = headline.string
+            self.log.debug('Headline found: %s' % current_section)
+        else:
+            current_section = 'Online Edition'
+            self.log.warning('Headline not found: Default used')
+
+        self.current_articles = []
+        title, url, descriptions = "", "", []
+        for piece in online.contents:
+            if isinstance(piece, Tag):
+                # Any tag ends the article being collected; flush it first
+                # so it is filed under the section it appeared in.
+                self.add_article(title, url, "\r\n".join(descriptions))
+                title, url, descriptions = "", "", []
+                tag_class = (piece.name, piece.get('class', ''))
+                if tag_class == ('span', 'header'):
+                    # Section header: close the previous section.
+                    self.page_index_append(current_section)
+                    current_section = piece.string
+                elif tag_class == ('a', 'header'):
+                    # Article link: start collecting a new article.
+                    title = (piece.string or u'').rstrip(u' »').strip()
+                    url = self.fix_url(piece.get('href', ''))
+            else:
+                # Bare text between tags is the article's description.
+                desc = piece.strip(" \r\n")
+                if desc:
+                    descriptions.append(desc)
+        self.add_article(title, url, "\r\n".join(descriptions))
+        self.add_article("Number Crunching", self.DOMAIN + "number-crunching", "")
+        self.page_index_append(current_section)
+
+        # Process More From This Issue (crossword etc.)
+        self.current_articles = []
+        # Remove gaps
+        for gap in soup.findAll(lambda tag: tag.get('class', '').startswith('gap-')):
+            gap.extract()
+        # Find more items
+        more = soup.find('span', {'class': 'section'})
+        if more is None:
+            self.log.warning('More From This Issue section not found')
+        else:
+            current_section = more.string
+            more = more.findNextSibling()
+            # findNextSibling() returns None after the last sibling.
+            while (more is not None and more.name == 'div' and
+                   more.get('class', '') == 'box-contents'):
+                title, url, descriptions = "", "", []
+                title_tag = more.find('a', {'class': 'header-home'})
+                if title_tag is not None:
+                    title = title_tag.string
+                    url = self.fix_url(title_tag.get('href', ''))
+                desc_tag = more.find('a', {'class': 'header'})
+                if desc_tag is not None:
+                    if desc_tag.string:
+                        descriptions.append(desc_tag.string)
+                    # Fall back to the description link when the title
+                    # link is missing or has no usable href.
+                    if not url:
+                        url = self.fix_url(desc_tag.get('href', ''))
+                self.add_article(title, url, "\r\n".join(descriptions))
+                more = more.findNextSibling()
+            self.page_index_append(current_section)
+
+        # Add the PE About Us page.
+        self.add_article(
+            "About Private Eye",
+            self.DOMAIN + "about",
+            """Private Eye is the UK's number one best-selling news and current affairs magazine, edited by Ian Hislop.
+
+It offers a unique blend of humour, social and political observations and investigative journalism. Published fortnightly, the magazine is read by over 700,000 readers and costs just £1.80 an issue.""",
+            date="")
+        self.page_index_append("About Private Eye")
+
+        return self.page_index