From bb443d01f10d4640e96cd62f3bd23177dec34723 Mon Sep 17 00:00:00 2001
From: Kovid Goyal
Date: Mon, 2 Apr 2012 09:00:25 +0530
Subject: [PATCH] Updated OReilly Premium and Real Clear

---
 recipes/oreilly_premium.recipe | 325 +++++++++++++++++++++++----------
 recipes/real_clear.recipe      |  68 +++++--
 2 files changed, 284 insertions(+), 109 deletions(-)

diff --git a/recipes/oreilly_premium.recipe b/recipes/oreilly_premium.recipe
index 9dc11059c4..4a9b9e54c3 100644
--- a/recipes/oreilly_premium.recipe
+++ b/recipes/oreilly_premium.recipe
@@ -1,45 +1,73 @@
-# Talking Points is not grabbing everything.
-# The look is right, but only the last one added?
-import re
+import string, re
 import time
+import traceback
+# above for debugging via stack
 from calibre.web.feeds.recipes import BasicNewsRecipe
 # Allows the Python soup converter, which makes parsing easier.
 from calibre.ebooks.BeautifulSoup import BeautifulSoup
-# strip ads and graphics
-# Current Column lacks a title.
-# Talking Points Memo - shorten title - Remove year and Bill's name
-# The News letter archive https://www.billoreilly.com/newsletterarchive is covered by other entries.
-# Newsletters: Talking Points Memos covered by cat12
+import os, time, traceback, re, urlparse, sys, cStringIO
+from collections import defaultdict
+from functools import partial
+from contextlib import nested, closing
+
+
+from calibre.web.feeds import feed_from_xml, templates, feeds_from_index, Feed
+from calibre.utils.threadpool import WorkRequest, ThreadPool, NoResultsPending
+
+
+# To Do: strip ads and graphics; Current Column lacks a title.
+# The newsletter archive https://www.billoreilly.com/newsletterarchive is covered by other entries.
+# Newsletters: Talking Points Memos covered by cat12
+# ./ebook-convert --username xxx --password xxx
+
+# This recipe is derived from BasicNewsRecipe, so it can only override that class's methods.
+# Some of what we need is otherwise in article, so we have more copying to do than otherwise.
 class OReillyPremium(BasicNewsRecipe):
     title = u'OReilly Premium'
     __author__  = 'TMcN'
-    language = 'en'
     description = 'Retrieves Premium and News Letter content from BillOReilly.com. Requires a Bill OReilly Premium Membership.'
     cover_url = 'http://images.billoreilly.com/images/headers/billgray_header.png'
+    custom_title = 'Bill O\'Reilly Premium - '+ time.strftime('%d %b %Y')
+    title = 'Bill O\'Reilly Premium'
     auto_cleanup = True
+    conversion_options = {'linearize_tables': True}
     encoding = 'utf8'
-    needs_subscription = True
+    language = 'en'
     no_stylesheets = True
-    oldest_article = 20
+    needs_subscription = True
+    oldest_article = 31
     remove_javascript = True
     remove_tags = [dict(name='img', attrs={})]
    # Don't go down
     recursions = 0
-    max_articles_per_feed = 2000
-
+    max_articles_per_feed = 20
+
     debugMessages = True
-
     # Name, URL, Soup FindAll Attr if relevant (last two are special case), articleList
     catList = [ ["TV Archives", 'https://www.billoreilly.com/show?action=tvShowArchive', 'a', {'class':['showLinks','homeLinks']}, []],
-            ["No Spin Archives", 'https://www.billoreilly.com/blog?categoryID=7', True, {'class':['blogBody'], 'style':['padding-top:10px;']}, []],
-            ["Daily Briefings", 'http://www.billoreilly.com/blog?categoryID=11', True, {'class':['defaultHeaderSmallLinks']}, []],
-            ["Stratfor", 'http://www.billoreilly.com/blog?categoryID=5', 'a', {'class':['blogLinks']}, []],
-            ["Talking Points Memo", 'https://www.billoreilly.com/blog?categoryID=12', 'td', {}, []],
+            # ["No Spin Archives", 'https://www.billoreilly.com/blog?categoryID=7', True, {'class':['blogBody'], 'style':['padding-top:10px;']}, []],
+            # ["Daily Briefings", 'http://www.billoreilly.com/blog?categoryID=11', True, {'class':['defaultHeaderSmallLinks']}, []],
+            # ["Stratfor", 'http://www.billoreilly.com/blog?categoryID=5', 'a', {'class':['blogLinks']}, []],
+            # ["Talking Points Memo", 'https://www.billoreilly.com/blog?categoryID=12', 'td', {}, []],
             ["Current Column", 'https://www.billoreilly.com/currentcolumn', 'span', {'class':['defaultHeader']}, []]
             ]
-
+
+    feeds = [
+            (u'No Spin', u'http://www.billoreilly.com/blog?action=blogArchive&rss=true&categoryID=7'),
+            (u'Daily Briefing', u'http://www.billoreilly.com/blog?action=blogArchive&rss=true&categoryID=11'),
+            (u'Talking Points', u'https://www.billoreilly.com/blog?action=blogArchive&rss=true&categoryID=12'),
+            (u'Blog', u'http://www.billoreilly.com/blog?action=blogArchive&rss=true&categoryID=0'),
+            (u'StratFor', u'http://www.billoreilly.com/blog?action=blogArchive&rss=true&categoryID=5')
+            ]
+    # http://www.billoreilly.com/blog?action=blogArchive&rss=true&categoryID=8 is the word for the day.
+
+    # Note: Talking Points is broken in the above model; the site changed to be more Ajax-y.
+    # Now using RSS.
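+    # Only TV Archives and Current Column are still screen-scraped through catList below;
+    # the categories commented out of catList are fetched instead via the RSS feeds list above.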
+
     def get_browser(self):
+        print("In get_browser")
         br = BasicNewsRecipe.get_browser()
         if self.username is not None and self.password is not None:
             br.open('https://www.billoreilly.com/pg/jsp/member/membersignin.jsp')
@@ -48,7 +76,7 @@ class OReillyPremium(BasicNewsRecipe):
             br['formPasswordField'] = self.password
             br.submit()
         return br
-
+
     # Returns the best-guess print url.
     # The second parameter (pageURL) is returned if nothing is found.
     def extractPrintURL(self, baseURL, pageURL, printString):
@@ -62,17 +90,19 @@ class OReillyPremium(BasicNewsRecipe):
                 tag = printText.parent
                 tagURL = baseURL+tag['href']
         return tagURL
-
+
     def stripBadChars(self, inString) :
         return inString.replace("\'", "")
-
+
+
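+    # parseGeneric() returns a list of ('category name', article-dict list) tuples,
+    # the shape parse_index() is expected to hand back to calibre.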
     def parseGeneric(self, baseURL):
-        # Does a generic parsing of the articles. There are six categories (0-5)
+        # Does a generic parsing of the articles. Only the categories still active in catList are handled here.
         # Name, URL, Soup FindAll Attr if relevant (last two are special case), articleList
         # NoSpin and TV are generic
         fullReturn = []
-        for i in range(len(self.catList)) :
+        for i in range(len(self.catList)) :
             articleList = []
+            print("In "+self.catList[i][0]+", index: "+ str(i))
             soup = self.index_to_soup(self.catList[i][1])
             # Set defaults
             description = 'None'
@@ -80,15 +110,13 @@ class OReillyPremium(BasicNewsRecipe):
             # Problem: 0-2 create many in an array
             # 3-5 create one.
             # So no for-div for 3-5
-
-            if i < 3 :
+
+            if i == 0 :
+                print("Starting TV Archives")
                 for div in soup.findAll(self.catList[i][2], self.catList[i][3]):
+                    print("Next DIV:")
                     print(div)
-                    if i == 1:
-                        a = div.find('a', href=True)
-                    else :
-                        a = div
-                    print(a)
+                    a = div
                     summary = div.find(True, attrs={'class':'summary'})
                     if summary:
                         description = self.tag_to_string(summary, use_alt=False)
@@ -96,82 +124,63 @@ class OReillyPremium(BasicNewsRecipe):
                         continue
                     # url = baseURL+re.sub(r'\?.*', '', a['href'])
                     url = baseURL+a['href']
-                    if i < 2 :
-                        url = self.extractPrintURL(baseURL, url, "Print this entry")
-                        title = self.tag_to_string(a, use_alt=True).strip()
-                    elif i == 2 :
-                        # Daily Briefs
-                        url = self.extractPrintURL(baseURL, url, "Print this entry")
-                        title = div.contents[0]
-                    if self.debugMessages :
-                        print(title+" @ "+url)
+                    url = self.extractPrintURL(baseURL, url, "Print this entry")
+                    title = self.tag_to_string(a, use_alt=True).strip()
                     articleList.append(dict(title=title, url=url, date=pubdate, description=description, content=''))
-
-            elif i == 3 : # Stratfor
-                a = soup.find('a', self.catList[i][3])
-                if a is None :
-                    continue
-                url = baseURL+a['href']
-                title = self.tag_to_string(a, use_alt=True).strip()
-                # Get Stratfor contents so we can get the real title.
-                stratSoup = self.index_to_soup(url)
-                title = stratSoup.html.head.title.string
-                stratIndex = title.find('Stratfor.com:', 0)
-                if (stratIndex > -1) :
-                    title = title[stratIndex+14:-1]
-                # Look for first blogBody 2K, it is used as the article.
+
+
     # calibre.web.feeds.news.BasicNewsRecipe.parse_index() fetches the list of articles.
     # returns a list of tuple ('feed title', list of articles)
     # {
@@ -182,16 +191,148 @@ class OReillyPremium(BasicNewsRecipe):
     # 'content' : The full article (can be an empty string). This is used by FullContentProfile
     # }
     # this is used instead of BasicNewsRecipe.parse_feeds().
+    # It is called by download().
     def parse_index(self):
         # Parse the page into Python Soup
+        print("Entering recipe parse_index from:")
+        traceback.print_stack()
+        print("web")
         baseURL = "https://www.billoreilly.com"
-        return self.parseGeneric(baseURL)
-
+        masterList = self.parseGeneric(baseURL)
+        #print(masterList)
+        return masterList
+
     def preprocess_html(self, soup):
+        print("In preprocess_html")
         refresh = soup.find('meta', {'http-equiv':'refresh'})
         if refresh is None:
             return soup
         content = refresh.get('content').partition('=')[2]
         raw = self.browser.open('https://www.billoreilly.com'+content).read()
         return BeautifulSoup(raw.decode('cp1252', 'replace'))
+
+    def build_index(self):
+        print("In OReilly build_index()\n\n")
+        feedsRSS = []
+        self.report_progress(0, _('Fetching feeds...'))
+        #try:
+        feeds = feeds_from_index(self.parse_index(), oldest_article=self.oldest_article,
+                                 max_articles_per_feed=self.max_articles_per_feed,
+                                 log=self.log)
+        self.report_progress(0, _('Got feeds from index page'))
+        #except NotImplementedError:
+        #    feeds = self.parse_feeds()
+        # Now add regular feeds.
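+        # parse_feeds() reads the RSS URLs in self.feeds; each resulting Feed object
+        # is appended to the index-page feeds so both sources download together.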
+        feedsRSS = self.parse_feeds()
+        print("feedsRSS is type "+feedsRSS.__class__.__name__)
+
+        for articles in feedsRSS:
+            print("articles is type "+articles.__class__.__name__)
+            print("Title:" + articles.title)
+            feeds.append(articles)
+        if not feeds:
+            raise ValueError('No articles found, aborting')
+
+        #feeds = FeedCollection(feeds)
+
+        self.report_progress(0, _('Trying to download cover...'))
+        self.download_cover()
+        self.report_progress(0, _('Generating masthead...'))
+        self.masthead_path = None
+
+        try:
+            murl = self.get_masthead_url()
+        except:
+            self.log.exception('Failed to get masthead url')
+            murl = None
+
+        if murl is not None:
+            # Try downloading the user-supplied masthead_url
+            # Failure sets self.masthead_path to None
+            self.download_masthead(murl)
+        if self.masthead_path is None:
+            self.log.info("Synthesizing mastheadImage")
+            self.masthead_path = os.path.join(self.output_dir, 'mastheadImage.jpg')
+            try:
+                self.default_masthead_image(self.masthead_path)
+            except:
+                self.log.exception('Failed to generate default masthead image')
+                self.masthead_path = None
+
+        if self.test:
+            feeds = feeds[:2]
+        self.has_single_feed = len(feeds) == 1
+
+        index = os.path.join(self.output_dir, 'index.html')
+
+        html = self.feeds2index(feeds)
+        with open(index, 'wb') as fi:
+            fi.write(html)
+
+        self.jobs = []
+
+        if self.reverse_article_order:
+            for feed in feeds:
+                if hasattr(feed, 'reverse'):
+                    feed.reverse()
+
+        self.feed_objects = feeds
+        for f, feed in enumerate(feeds):
+            feed_dir = os.path.join(self.output_dir, 'feed_%d'%f)
+            if not os.path.isdir(feed_dir):
+                os.makedirs(feed_dir)
+
+            for a, article in enumerate(feed):
+                if a >= self.max_articles_per_feed:
+                    break
+                art_dir = os.path.join(feed_dir, 'article_%d'%a)
+                if not os.path.isdir(art_dir):
+                    os.makedirs(art_dir)
+                try:
+                    url = self.print_version(article.url)
+                except NotImplementedError:
+                    url = article.url
+                except:
+                    self.log.exception('Failed to find print version for: '+article.url)
+                    url = None
+                if not url:
+                    continue
+                func, arg = (self.fetch_embedded_article, article) \
+                            if self.use_embedded_content or (self.use_embedded_content == None and feed.has_embedded_content()) \
+                            else \
+                            ((self.fetch_obfuscated_article if self.articles_are_obfuscated \
+                                else self.fetch_article), url)
+                req = WorkRequest(func, (arg, art_dir, f, a, len(feed)),
+                                      {}, (f, a), self.article_downloaded,
+                                      self.error_in_article_download)
+                req.feed = feed
+                req.article = article
+                req.feed_dir = feed_dir
+                self.jobs.append(req)
+
+
+        self.jobs_done = 0
+        tp = ThreadPool(self.simultaneous_downloads)
+        for req in self.jobs:
+            tp.putRequest(req, block=True, timeout=0)
+
+
+        self.report_progress(0, _('Starting download [%d thread(s)]...')%self.simultaneous_downloads)
+        while True:
+            try:
+                tp.poll()
+                time.sleep(0.1)
+            except NoResultsPending:
+                break
+        for f, feed in enumerate(feeds):
+            print("Writing feeds for "+feed.title)
+            html = self.feed2index(f, feeds)
+            feed_dir = os.path.join(self.output_dir, 'feed_%d'%f)
+            with open(os.path.join(feed_dir, 'index.html'), 'wb') as fi:
+                fi.write(html)
+        self.create_opf(feeds)
+        self.report_progress(1, _('Feeds downloaded to %s')%index)
+
+        return index
+
diff --git a/recipes/real_clear.recipe b/recipes/real_clear.recipe
index 19add74fcd..2dfe56d207 100644
--- a/recipes/real_clear.recipe
+++ b/recipes/real_clear.recipe
@@ -1,7 +1,9 @@
 # Test with "\Program Files\Calibre2\ebook-convert.exe" RealClear.recipe .epub --test -vv --debug-pipeline debug
+import string, re
 import time
+from urlparse import urlparse
 from calibre.web.feeds.recipes import BasicNewsRecipe
-from calibre.ebooks.BeautifulSoup import NavigableString
+from calibre.ebooks.BeautifulSoup import BeautifulSoup, NavigableString

 class RealClear(BasicNewsRecipe):
     title = u'Real Clear'
@@ -20,12 +22,13 @@ class RealClear(BasicNewsRecipe):
     # Don't go down
     recursions = 0
     max_articles_per_feed = 400
-    debugMessages = False
-
-    # Numeric parameter is type, controls whether we look for
+    debugMessages = True
+
+    # Numeric parameter is type, controls whether we look for
     feedsets = [
-                ["Politics", "http://www.realclearpolitics.com/index.xml", 0],
-                ["Science", "http://www.realclearscience.com/index.xml", 0],
+                ["Politics", "http://www.realclearpolitics.com/index.xml", 0],
+                ["Policy", "http://www.realclearpolicy.com/index.xml", 0],
+                ["Science", "http://www.realclearscience.com/index.xml", 0],
                 ["Tech", "http://www.realcleartechnology.com/index.xml", 0],
                 # The feedburner is essentially the same as the top feed, politics.
                 # ["Politics Burner", "http://feeds.feedburner.com/realclearpolitics/qlMj", 1],
@@ -37,22 +40,37 @@ class RealClear(BasicNewsRecipe):
                ]
     # Hints to extractPrintURL.
     # First column is the URL snippet. Then the string to search for as text, and the attributes to look for above it. Start with attributes and drill down.
-    printhints = [
+    phUrlSnip, phLinkText, phMainSearch, phHrefSearch = range(4)
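+    # Named column indices for the printhints rows: phUrlSnip is matched against the
+    # article URL, phLinkText is the print link's text, phMainSearch is the tag to
+    # search for, and phHrefSearch is a regex applied to href attributes.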
+
+    printhints = [  ["realclear", "", '' , 'printpage'],
                  ["billoreilly.com", "Print this entry", 'a', ''],
                  ["billoreilly.com", "Print This Article", 'a', ''],
-                 ["politico.com", "Print", 'a', 'share-print'],
+                 ["politico.com", "Print", 'a', 'share-print'],
                  ["nationalreview.com", ">Print<", 'a', ''],
                  ["reason.com", "", 'a', 'printer']
                  # The following are not supported due to JavaScripting, and would require obfuscated_article to handle
-                 # forbes,
+                 # forbes,
                  # usatoday - just prints with all current crap anyhow
-
+
                 ]
-
+    # RCP - look for a strange compound. See http://www.realclearpolitics.com/articles/2012/01/24/in_speech_obama_to_call_for_fairness_--_and_four_more_years_112879.html
+    # The print link isn't obvious, and only the end is needed (the -full append), so maybe try that first.
+    # http://www.realclearpolitics.com/printpage/?url=http://www.realclearpolitics.com/articles/2012/01/24/in_speech_obama_to_call_for_fairness_--_and_four_more_years_112879-full.html
+    # Single page articles don't have a _full; e.g. http://www.realclearpolitics.com/articles/2012/01/25/obamas_green_robber_barons_112897.html
+    # Use the FULL PRINTPAGE URL; it formats it better too!
+    #
+    # NYT - try single page...
+    # Need special code - is it one page or several? Which URL?
+    # from http://www.nytimes.com/2012/01/22/business/apple-america-and-a-squeezed-middle-class.html?_r=1
+    # to http://www.nytimes.com/2012/01/22/business/apple-america-and-a-squeezed-middle-class.html?_r=1&pagewanted=all
+    # which is at link rel="canonical" and at ...
-            if len(self.printhints[x][3])>0 and len(self.printhints[x][1]) == 0:
+            if len(self.printhints[x][self.phHrefSearch])>0 and len(self.printhints[x][self.phLinkText]) == 0:
+                # e.g. RealClear
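+                # A nonempty phHrefSearch with an empty phLinkText means: match the regex
+                # against every href on the page ('printpage' finds RCP's /printpage/?url=... link).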
                 if self.debugMessages == True :
-                    print("search1")
+                    print("Search by href: "+self.printhints[x][self.phHrefSearch])
+                printFind = soup.find(href=re.compile(self.printhints[x][self.phHrefSearch]))
+            elif len(self.printhints[x][3])>0 and len(self.printhints[x][1]) == 0:
+                if self.debugMessages == True :
+                    print("Search 1: "+self.printhints[x][2]+" Attributes: ")
+                    print(self.printhints[x][3])
                 printFind = soup.find(self.printhints[x][2], attrs=self.printhints[x][3])
             elif len(self.printhints[x][3])>0 :
                 if self.debugMessages == True :
                     print("search2")
                 printFind = soup.find(self.printhints[x][2], attrs=self.printhints[x][3], text=self.printhints[x][1])
             else :
+                if self.debugMessages == True:
+                    print("Default Search: "+self.printhints[x][2]+" Text: "+self.printhints[x][1])
                 printFind = soup.find(self.printhints[x][2], text=self.printhints[x][1])
             if printFind is None:
                 if self.debugMessages == True :
                     print("Not Found")
+                    # print(soup)
+                    print("end soup\n\n")
                 continue
+            print(printFind)
             if isinstance(printFind, NavigableString)==False:
                 if printFind['href'] is not None:
+                    print("Check "+printFind['href']+" for base of "+baseURL)
+                    if printFind['href'].find("http")!=0 :
+                        return baseURL+printFind['href']
                     return printFind['href']
             tag = printFind.parent
             print(tag)
@@ -98,7 +130,7 @@ class RealClear(BasicNewsRecipe):
         print("In get_browser")
         br = BasicNewsRecipe.get_browser()
         return br
-
+
     def parseRSS(self, index) :
         if self.debugMessages == True :
             print("\n\nStarting "+self.feedsets[index][0])
@@ -128,7 +160,7 @@ class RealClear(BasicNewsRecipe):
             pubDateEl = div.find("pubDate")
             if pubDateEl is None :
                 pubDateEl = div.find("pubdate")
-            if pubDateEl is None :
+            if pubDateEl is None :
                 pubDate = time.strftime('%a, %d %b')
             else :
                 pubDate = pubDateEl.contents[0]
@@ -144,7 +176,7 @@ class RealClear(BasicNewsRecipe):
             pubdate = time.strftime('%a, %d %b')
             articleList.append(dict(title=title, url=url, date=pubdate, description=description, content=''))
         return articleList
-
+
     # calibre.web.feeds.news.BasicNewsRecipe.parse_index() fetches the list of articles.
     # returns a list of tuple ('feed title', list of articles)
     # {
@@ -157,7 +189,8 @@ class RealClear(BasicNewsRecipe):
     # this is used instead of BasicNewsRecipe.parse_feeds().
     def parse_index(self):
         # Parse the page into Python Soup
-
+
+        articleList = []
         ans = []
         feedsCount = len(self.feedsets)
         for x in range(0,feedsCount): # should be ,4
@@ -167,4 +200,5 @@ class RealClear(BasicNewsRecipe):
         if self.debugMessages == True :
             print(ans)
         return ans
+