From 554a3519667e1129af2b3b663b5b95cbdbe3c4ee Mon Sep 17 00:00:00 2001
From: Kovid Goyal
Date: Sun, 8 Jan 2012 08:58:59 +0530
Subject: [PATCH] OReilly Premium by TechnoCat

---
 recipes/oreilly_premium.recipe | 263 +++++++++++++++++++++++++++++++++
 1 file changed, 263 insertions(+)
 create mode 100644 recipes/oreilly_premium.recipe

diff --git a/recipes/oreilly_premium.recipe b/recipes/oreilly_premium.recipe
new file mode 100644
index 0000000000..c5a615900c
--- /dev/null
+++ b/recipes/oreilly_premium.recipe
@@ -0,0 +1,263 @@
+import re
+import time
+from calibre.web.feeds.recipes import BasicNewsRecipe
+# Allows the Python soup converter, which makes parsing easier.
+from calibre.ebooks.BeautifulSoup import BeautifulSoup
+
+
+class OReillyPremium(BasicNewsRecipe):
+    title = u'OReilly Premium'
+    __author__ = 'TMcN'
+    description = 'Retrieves Premium and Newsletter content from BillOReilly.com. Requires a Bill OReilly Premium Membership.'
+    cover_url = 'http://images.billoreilly.com/images/headers/billgray_header.png'
+    auto_cleanup = True
+    encoding = 'utf8'
+    needs_subscription = True
+    no_stylesheets = True
+    oldest_article = 20
+    remove_javascript = True
+    remove_tags = [dict(name='img', attrs={})]
+    # Don't recurse into linked pages.
+    recursions = 0
+    max_articles_per_feed = 2000
+    language = 'en'
+
+    def get_browser(self):
+        br = BasicNewsRecipe.get_browser(self)
+        if self.username is not None and self.password is not None:
+            br.open('https://www.billoreilly.com/pg/jsp/member/membersignin.jsp')
+            br.select_form(name='login')
+            br['formEmailField'] = self.username
+            br['formPasswordField'] = self.password
+            br.submit()
+        return br
+
+    # Returns the URL of the print version of a page if a link whose text is
+    # printString can be found on it; otherwise returns pageURL unchanged.
+    def extractPrintURL(self, baseURL, pageURL, printString):
+        tagURL = pageURL
+        printText = None
+        soup = self.index_to_soup(pageURL)
+        if soup:
+            printText = soup.find('a', text=printString)
+        else:
+            print("Failed to find print string " + printString + " in " + pageURL)
+        if printText:
+            tag = printText.parent
+            tagURL = baseURL + tag['href']
+        return tagURL
+
+    def stripBadChars(self, inString):
+        return inString.replace("'", "")
+
+    # Returns a list of qualifying articles from the No Spin archives.
+    def parseNoSpinArchives(self, baseURL, soupURL, debugMessages):
+        articleList = []
+        soup = self.index_to_soup(soupURL)
+        for div in soup.findAll(True, attrs={'class': ['blogBody'], 'style': ['padding-top:10px;']}):
+            a = div.find('a', href=True)
+            if not a:
+                continue
+            # Strip any query string from the link before building the full URL.
+            url = baseURL + re.sub(r'\?.*', '', a['href'])
+            # Prefer the print version of the article when one exists.
+            printURL = self.extractPrintURL(baseURL, url, "Print this entry")
+            if printURL:
+                url = printURL
+            title = self.tag_to_string(a, use_alt=True).strip()
+            if debugMessages:
+                print("No Spin Archive Title: " + title + " at url: " + url)
+            description = 'None'
+            pubdate = time.strftime('%a, %d %b')
+            summary = div.find(True, attrs={'class': 'summary'})
+            if summary:
+                description = self.tag_to_string(summary, use_alt=False)
+            articleList.append(dict(title=title, url=url, date=pubdate, description=description, content=''))
+        return articleList
+
+    def parseTVArchives(self, baseURL, soupURL, debugMessages):
+        # The TV Archives page is partly Ajax-driven, so parse only the static links.
+        articleList = []
+        soup = self.index_to_soup(soupURL)
+        if debugMessages:
+            print("In parseTVArchives")
+        for a in soup.findAll('a', {'class': ['showLinks', 'homeLinks']}):
+            url = baseURL + a['href']
+            # Prefer the print version of the article when one exists.
+            printURL = self.extractPrintURL(baseURL, url, "Print this entry")
+            if printURL:
+                url = printURL
+            title = self.tag_to_string(a, use_alt=True).strip()
+            title = self.stripBadChars(title)
+            if debugMessages:
+                print("TV Archive " + title + " at url: " + url)
+            description = 'None'
+            pubdate = time.strftime('%a, %d %b')
+            summary = a.find(True, attrs={'class': 'summary'})
+            if summary:
+                description = self.tag_to_string(summary, use_alt=False)
+            articleList.append(dict(title=title, url=url, date=pubdate, description=description, content=''))
+        if debugMessages:
+            print("Leaving TV Parse")
+        return articleList
+
+    # Get the Daily Briefing archives.
+    def parseDailyBriefs(self, baseURL, soupURL, debugMessages):
+        print("Starting daily briefs")
+        articleList = []
+        soup = self.index_to_soup(soupURL)
+        for div in soup.findAll(True, attrs={'class': ['defaultHeaderSmallLinks']}):
+            # Strip any query string from the link before building the full URL.
+            url = baseURL + re.sub(r'\?.*', '', div['href'])
+            printURL = self.extractPrintURL(baseURL, url, "Print this entry")
+            if printURL:
+                url = printURL
+            title = div.contents[0]
+            if debugMessages:
+                print("Daily Brief - title: " + title + " at url: " + url)
+            description = 'None'
+            pubdate = time.strftime('%a, %d %b')
+            summary = div.find(True, attrs={'class': 'summary'})
+            if summary:
+                description = self.tag_to_string(summary, use_alt=False)
+            articleList.append(dict(title=title, url=url, date=pubdate, description=description, content=''))
+        print("Leaving daily briefs")
+        return articleList
+
+    # Get the weekly Stratfor intelligence report.
+    def parseStratfor(self, baseURL, soupURL, debugMessages):
+        # http://www.billoreilly.com/blog?categoryID=5
+        articleList = []
+        soup = self.index_to_soup(soupURL)
+        if debugMessages:
+            print("In parseStratfor")
+        a = soup.find('a', {'class': ['blogLinks']})
+        url = baseURL + a['href']
+        title = self.tag_to_string(a, use_alt=True).strip()
+        if debugMessages:
+            print("url: " + url)
+            print("title: " + title)
+        # Fetch the Stratfor page itself so its real title can be used.
+        stratSoup = self.index_to_soup(url)
+        title = stratSoup.html.head.title.string
+        stratIndex = title.find('Stratfor.com:', 0)
+        if stratIndex > -1:
+            # Drop the 'Stratfor.com: ' prefix and the trailing character.
+            title = title[stratIndex + 14:-1]
+        # Look for first blogBody
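
The diff is truncated above, inside parseStratfor; the rest of the 263-line file is not shown. In a calibre recipe, per-section helpers like these are ultimately combined in parse_index(), which must return a list of (section title, article list) pairs. The sketch below is a minimal illustration of that convention, not the patch's actual code: the feed names and all categoryID values except 5 (the Stratfor ID documented in the code above) are hypothetical.

    # Illustrative sketch only -- not part of this patch.
    def parse_index(self):
        baseURL = 'https://www.billoreilly.com'
        debugMessages = False
        sections = [
            # categoryID=7 and categoryID=3 are hypothetical placeholders;
            # only categoryID=5 (Stratfor) appears in the code above.
            ('No Spin Archives', self.parseNoSpinArchives(baseURL, baseURL + '/blog?categoryID=7', debugMessages)),
            ('Daily Briefings', self.parseDailyBriefs(baseURL, baseURL + '/blog?categoryID=3', debugMessages)),
            ('Stratfor', self.parseStratfor(baseURL, baseURL + '/blog?categoryID=5', debugMessages)),
        ]
        # Drop sections that produced no articles so calibre does not build empty feeds.
        return [(title, articles) for (title, articles) in sections if articles]

A recipe like this can be smoke-tested without a full calibre run via, e.g., ebook-convert recipes/oreilly_premium.recipe .epub --test --username <user> --password <pass>, which downloads only a couple of articles per feed.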