diff --git a/recipes/private_eye.recipe b/recipes/private_eye.recipe
index 90e7e44f74..f31678be53 100644
--- a/recipes/private_eye.recipe
+++ b/recipes/private_eye.recipe
@@ -1,43 +1,200 @@
+from functools import partial
+__license__ = 'GPL v3'
+__copyright__ = '2017, Kovid Goyal '
+'''
+private-eye.co.uk
+'''
+
 import re
+from datetime import datetime, timedelta
+from calibre.ebooks.BeautifulSoup import BeautifulSoup, NavigableString, Comment, Tag, __version__ as Soup_version
 from calibre.web.feeds.news import BasicNewsRecipe
 
-class AdvancedUserRecipe1359406781(BasicNewsRecipe):
-    title = u'Private Eye'
+class PrivateEyeRecipe(BasicNewsRecipe):
+    title = 'Private Eye'
+    __author__ = 'Sophist at sodalis.co.uk'
+    description = 'Private Eye is a fortnightly British satirical news and current affairs magazine, edited by Ian Hislop, offering a unique blend of humour, social and political observations and investigative journalism.'
     publication_type = 'magazine'
-    description = u'Private Eye is a fortnightly British satirical and current affairs magazine, edited by Ian Hislop'
+    language = 'en'
+    encoding = 'utf-8'
+    DOMAIN = 'http://www.private-eye.co.uk/'
+    INDEX = DOMAIN + 'current-issue'
     oldest_article = 13
     max_articles_per_feed = 100
-    remove_empty_feeds = True
+    #remove_empty_feeds = True
     remove_javascript = True
-    no_stylesheets = True
-    ignore_duplicate_articles = {'title'}
-    language = 'en_GB'
-    encoding = 'cp1252'
-    __author__ = u'MPritchard2k9@gmail.com'
-    __copyright__ = '2014, Martyn Pritchard '
+    #no_stylesheets = True
+    ignore_duplicate_articles = {'url'}
 
-    def get_cover_url(self):
-        cover_url = None
-        soup = self.index_to_soup('http://www.private-eye.co.uk/current_issue.php')
-        for citem in soup.findAll('img'):
-            if citem['src'].endswith('big.jpg'):
-                return 'http://www.private-eye.co.uk/' + citem['src']
-        return cover_url
-    remove_tags_before = {'class':"article"}
-    remove_tags_after = {'id' : "nav-box-sections-mobile"}
-    remove_tags_after = {'class' : "gap-biggest"}
-    remove_tags_after = {'id' : "subscribe-here"}
-    remove_tags = [dict(name='td', attrs={'class':'sub_dave'})]
-    remove_tags = [dict(name='div', attrs={'class':'footer-block'})]
-    remove_tags = [dict(name='div', attrs={'class':'sub-nav-bar'})]
+    remove_tags_before = [
+        {'id': 'story', 'class': 'article'},
+        {'id': 'page'},
+    ]
+    remove_tags_after = [
+        {'class': 'section'},
+    ]
+    remove_tags = [
+        dict(name='div', attrs={'class': 'sub-nav-bar'}),
+        dict(name='img', attrs={'class': 'about-covers'}),
+        dict(name='div', attrs={'id': 'follow-us', 'class': 'text'}),
+        dict(name='span', attrs={'class': 'section'}),
+    ]
 
     preprocess_regexps = [
         (re.compile(r'../grfx', re.DOTALL|re.IGNORECASE), lambda match: 'http://www.private-eye.co.uk/grfx'),
-        (re.compile(r'More From This Issue.*', re.DOTALL|re.IGNORECASE), lambda match: ''),
-        (re.compile(r'More top stories in the latest issue:.*', re.DOTALL|re.IGNORECASE), lambda match: ''),
-        (re.compile(r'Also Available Online.*', re.DOTALL|re.IGNORECASE), lambda match: ''),
     ]
 
-    feeds = [(u'Private Eye', u'https://bodybuilder3d.000webhostapp.com/public/PrivateEyeStat.xml')]
+    def fix_url(self, url):
+        if (url.startswith('//') or
+                url.startswith('http://') or
+                url.startswith('https://')):
+            return url
+        if url.startswith('/'):
+            url = self.DOMAIN + url[1:]
+        elif url.startswith('../'):
+            url = self.DOMAIN + url[3:]
+        else:
+            url = self.DOMAIN + url
+        return url
+
+    urls = []
+    publication_date = ""
+
+    def add_article(self, title, url, description="", date=None):
+        if date is None:
+            date = self.publication_date
+        if url and url not in self.urls:
+            self.urls.append(url)
+            self.log.info("Page added: %s: %s: %s (%s)" % (date, title, description, url))
+            self.current_articles.append({
+                'title': title,
+                'url': url,
+                'description': description,
+                'date': date,
+            })
+
+    def page_index_append(self, section):
+        if self.current_articles:
+            self.page_index.append((section, self.current_articles))
+        self.current_articles = []
+
+    # Process the Index page to get the content for the ebook
+    def parse_index(self):
+        self.log.debug("\nSoup version: %s" % Soup_version)
+        self.page_index = []
+
+        soup = self.index_to_soup(self.INDEX)
+        for comment in soup.findAll(text=lambda text: isinstance(text, Comment)):
+            comment.extract()
+        # Get masthead URL
+        masthead = soup.find('img', id='site-logo')
+        if masthead:
+            self.masthead_url = self.fix_url(masthead['src'])
+            self.log.debug('Masthead found: %s' % self.masthead_url)
+        else:
+            self.log.warning('Masthead not found.')
+
+        soup = soup.find('div', id='content')
+
+        # Get cover image
+        for img in soup.findAll('img', {'class': 'current-issue'}):
+            if img['src'].endswith('_big.jpg'):
+                self.cover_url = self.DOMAIN + img['src']
+                filename = img['src'].split('/')[-1]
+                self.issue_no = filename.replace('_big.jpg', '')
+                self.log.debug('Cover image found. Issue: %s' % self.issue_no)
+                break
+        else:
+            self.log.warning('Cover image NOT found')
+
+        # Get publication date as 14 days before next publication date
+        for tag in soup.findAll('span', {'class': 'only-smallest'}):
+            tag_contents = tag.contents
+            if tag_contents[0].string.lower().split()[:2] == ["next", "issue"]:
+                try:
+                    day, month, year = tag_contents[2].split()
+                    day = ''.join(c for c in day if c.isdigit())
+                    date = datetime.strptime(" ".join((day, month, year)), "%d %B %Y")
+                    date = date - timedelta(14)
+                    self.publication_date = datetime.strftime(date, "%d %B %Y").lstrip("0")
+                    self.log.debug("Publication date: %s" % self.publication_date)
+                    break
+                except:
+                    self.log.warning("Invalid publication date: %s" % tag.contents[2])
+        else:
+            self.log.warning("Publication date not found")
+
+        # Online articles
+        online = soup.find('div', {'id': 'block-left', 'class': 'article'})
+
+        headline = online.find('span', {'class': 'headline'})
+        if headline:
+            current_section = headline.string
+            self.log.debug('Headline found: %s' % current_section)
+        else:
+            current_section = 'Online Edition'
+            self.log.warning('Headline not found: Default used')
+
+        self.current_articles = []
+        title, url, descriptions = "", "", []
+        for piece in online.contents:
+            if isinstance(piece, Tag):
+                tag_class = (piece.name, piece.get('class', ''))
+                if tag_class == ('span', 'header'):
+                    self.page_index_append(current_section)
+                    current_section = piece.string
+                elif tag_class == ('a', 'header'):
+                    self.add_article(title, url, r"\r\n".join(descriptions))
+                    title = piece.string.rstrip(u' »').strip()
+                    url = self.fix_url(piece.get('href', ''))
+                    descriptions = []
+                else:
+                    self.add_article(title, url, r"\r\n".join(descriptions))
+                    title, url, descriptions = "", "", []
+            else:
+                desc = piece.strip(" \r\n")
+                if desc:
+                    descriptions.append(desc)
+        self.add_article(title, url, r"\r\n".join(descriptions))
+        self.add_article("Number Crunching", self.DOMAIN + "number-crunching", "")
+        self.page_index_append(current_section)
+
+        # Process More From This Issue (crossword etc.)
+        current_section = ""
+        self.current_articles = []
+        title, url, descriptions = "", "", []
+        # Remove gaps
+        for gap in soup.findAll(lambda tag: tag.get('class', '').startswith('gap-')):
+            gap.extract()
+        # Find more items
+        more = soup.find('span', {'class': 'section'})
+        current_section = more.string
+        more = more.findNextSibling()
+        while more.name == 'div' and more.get('class', '') == 'box-contents':
+            title_tag = more.find('a', {'class': 'header-home'})
+            if title_tag:
+                title = title_tag.string
+                if not url:
+                    url = self.fix_url(title_tag.get('href', ''))
+            desc_tag = more.find('a', {'class': 'header'})
+            if desc_tag:
+                descriptions.append(desc_tag.string)
+                if not url:
+                    url = self.fix_url(desc_tag.get('href', ''))
+            self.add_article(title, url, r"\r\n".join(descriptions))
+            title, url, descriptions = "", "", []
+            more = more.findNextSibling()
+        self.page_index_append(current_section)
+
+        # Add the PE About Us page.
+        self.add_article(
+            "About Private Eye",
+            self.DOMAIN + "about",
+            """Private Eye is the UK's number one best-selling news and current affairs magazine, edited by Ian Hislop.
+
+It offers a unique blend of humour, social and political observations and investigative journalism. Published fortnightly, the magazine is read by over 700,000 readers and costs just £1.80 an issue.""",
+            date="")
+        self.page_index_append("About Private Eye")
+
+        return self.page_index