Kovid Goyal 2012-04-06 09:00:37 +05:30
parent 768928a5d3
commit 921769bd6a
3 changed files with 50 additions and 56 deletions

View File

@@ -4,10 +4,8 @@ __copyright__ = '2012, Darko Miletic <darko.miletic at gmail.com>'
 www.buenosairesherald.com
 '''
 
-import re
 from calibre import strftime
 from calibre.web.feeds.news import BasicNewsRecipe
-from calibre.ebooks.BeautifulSoup import BeautifulSoup
 
 class BuenosAiresHerald(BasicNewsRecipe):
     title = 'Buenos Aires Herald'
@@ -62,7 +60,7 @@ class BuenosAiresHerald(BasicNewsRecipe):
         lfeeds = self.get_feeds()
         for feedobj in lfeeds:
             feedtitle, feedurl = feedobj
-            self.report_progress(0, _('Fetching feed')+' %s...'%(feedtitle if feedtitle else feedurl))
+            self.report_progress(0, ('Fetching feed')+' %s...'%(feedtitle if feedtitle else feedurl))
             articles = []
             soup = self.index_to_soup(feedurl)
             for item in soup.findAll('div', attrs={'class':'nota_texto_seccion'}):
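
Note: aside from dropping the unused re and BeautifulSoup imports, the only change in this file is removing the _() gettext wrapper from the progress message. A minimal sketch of the resulting pattern, assuming calibre's BasicNewsRecipe API (report_progress, get_feeds, and index_to_soup are the standard hooks used above; the recipe name and feed URL below are placeholders):

    from calibre.web.feeds.news import BasicNewsRecipe

    class ExampleRecipe(BasicNewsRecipe):
        title = 'Example'
        # Placeholder feed for illustration only
        feeds = [('Example feed', 'http://example.com/index.xml')]

        def parse_index(self):
            ans = []
            for feedtitle, feedurl in self.get_feeds():
                # Plain string: the _() translation wrapper is gone,
                # since these progress messages are not translated.
                self.report_progress(0, 'Fetching feed %s...' % (feedtitle or feedurl))
                soup = self.index_to_soup(feedurl)
                # articles would be scraped out of soup here
                ans.append((feedtitle, []))
            return ans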

View File

@@ -1,4 +1,3 @@
-import string, re
 import time
 import traceback
 # above for debugging via stack
@@ -6,22 +5,19 @@ from calibre.web.feeds.recipes import BasicNewsRecipe
 # Allows the Python soup converter, which makes parsing easier.
 from calibre.ebooks.BeautifulSoup import BeautifulSoup
-import os, time, traceback, re, urlparse, sys, cStringIO
-from collections import defaultdict
-from functools import partial
-from contextlib import nested, closing
-from calibre.web.feeds import feed_from_xml, templates, feeds_from_index, Feed
+import os
+from calibre.web.feeds import feeds_from_index
 from calibre.utils.threadpool import WorkRequest, ThreadPool, NoResultsPending
 
 # To Do: strip ads and graphics, Current Column lacks a title.
 # The News letter archive https://www.billoreilly.com/newsletterarchive is covered by other entries.
 # Newsletters: Talking Points Memos covered by cat12
 # ./ebook-convert --username xxx --password xxx
 
 # this is derived from BasicNewsRecipe, so it can only overload those.
 # Soome of what we need is otherwise in article, so we have more copy to do than otherwise.
 class OReillyPremium(BasicNewsRecipe):
     title = u'OReilly Premium'
@@ -42,9 +38,9 @@ class OReillyPremium(BasicNewsRecipe):
     # Don't go down
     recursions = 0
     max_articles_per_feed = 20
     debugMessages = True
 
     # Name, URL, Soup FindAll Attr if relevant (last two are special case), articleList
     catList = [ ["TV Archives", 'https://www.billoreilly.com/show?action=tvShowArchive', 'a', {'class':['showLinks','homeLinks']}, []],
         # ["No Spin Archives", 'https://www.billoreilly.com/blog?categoryID=7', True, {'class':['blogBody'], 'style':['padding-top:10px;']}, []],
@@ -53,19 +49,19 @@ class OReillyPremium(BasicNewsRecipe):
         # ["Talking Points Memo", 'https://www.billoreilly.com/blog?categoryID=12', 'td', {}, []],
         ["Current Column", 'https://www.billoreilly.com/currentcolumn', 'span', {'class':['defaultHeader']}, []]
     ]
 
     feeds = [
         (u'No Spin', u'http://www.billoreilly.com/blog?action=blogArchive&rss=true&categoryID=7'),
         (u'Daily Briefing', u'http://www.billoreilly.com/blog?action=blogArchive&rss=true&categoryID=11'),
         (u'Talking Points', u'https://www.billoreilly.com/blog?action=blogArchive&rss=true&categoryID=12'),
         (u'Blog', u'http://www.billoreilly.com/blog?action=blogArchive&rss=true&categoryID=0'),
         (u'StratFor', u'http://www.billoreilly.com/blog?action=blogArchive&rss=true&categoryID=5')
     ]
     # http://www.billoreilly.com/blog?action=blogArchive&rss=true&categoryID=8 is word for the day.
 
     # Note: Talking Points is broken in the above model; the site changed to more Ajax-y.
     # Now using RSS
 
     def get_browser(self):
         print("In get_browser")
         br = BasicNewsRecipe.get_browser()
@@ -76,7 +72,7 @@ class OReillyPremium(BasicNewsRecipe):
             br['formPasswordField'] = self.password
             br.submit()
         return br
 
     # Returns the best-guess print url.
     # The second parameter (pageURL) is returned if nothing is found.
     def extractPrintURL(self, baseURL, pageURL, printString):
@@ -90,17 +86,17 @@ class OReillyPremium(BasicNewsRecipe):
             tag = printText.parent
             tagURL = baseURL+tag['href']
         return tagURL
 
     def stripBadChars(self, inString) :
         return inString.replace("\'", "")
 
     def parseGeneric(self, baseURL):
         # Does a generic parsing of the articles. There are six categories (0-5)
         # Name, URL, Soup FindAll Attr if relevant (last two are special case), articleList
         # NoSpin and TV are generic
         fullReturn = []
         for i in range(len(self.catList)) :
             articleList = []
             print("In "+self.catList[i][0]+", index: "+ str(i))
             soup = self.index_to_soup(self.catList[i][1])
@@ -110,7 +106,7 @@ class OReillyPremium(BasicNewsRecipe):
             # Problem: 0-2 create many in an array
             # 3-5 create one.
             # So no for-div for 3-5
 
             if i == 0 :
                 print("Starting TV Archives")
                 for div in soup.findAll(self.catList[i][2], self.catList[i][3]):
@@ -151,7 +147,7 @@ class OReillyPremium(BasicNewsRecipe):
         print("Returning")
         # print fullReturn
         return fullReturn
 
     # build_index() starts with:
     # try:
@@ -161,7 +157,7 @@ class OReillyPremium(BasicNewsRecipe):
     # self.report_progress(0, _('Got feeds from index page'))
     # except NotImplementedError:
     # feeds = self.parse_feeds()
 
     # which in turn is from __init__.py
     #def feeds_from_index(index, oldest_article=7, max_articles_per_feed=100,
     # log=default_log):
@@ -177,10 +173,10 @@ class OReillyPremium(BasicNewsRecipe):
     # max_articles_per_feed=max_articles_per_feed)
     # feeds.append(pfeed)
     # return feeds
 
     # use_embedded_content defaults to None, at which point if the content is > 2K, it is used as the article.
 
     # calibre.web.feeds.news.BasicNewsRecipe.parse_index() fetches the list of articles.
     # returns a list of tuple ('feed title', list of articles)
     # {
@@ -201,7 +197,7 @@ class OReillyPremium(BasicNewsRecipe):
         masterList = self.parseGeneric(baseURL)
         #print(masterList)
         return masterList
 
     def preprocess_html(self, soup):
         print("In preprocess_html")
         refresh = soup.find('meta', {'http-equiv':'refresh'})
@@ -210,22 +206,22 @@ class OReillyPremium(BasicNewsRecipe):
         content = refresh.get('content').partition('=')[2]
         raw = self.browser.open('https://www.billoreilly.com'+content).read()
         return BeautifulSoup(raw.decode('cp1252', 'replace'))
 
     def build_index(self):
         print("In OReilly build_index()\n\n")
         feedsRSS = []
-        self.report_progress(0, _('Fetching feeds...'))
+        self.report_progress(0, ('Fetching feeds...'))
         #try:
         feeds = feeds_from_index(self.parse_index(), oldest_article=self.oldest_article,
             max_articles_per_feed=self.max_articles_per_feed,
             log=self.log)
-        self.report_progress(0, _('Got feeds from index page'))
+        self.report_progress(0, ('Got feeds from index page'))
         #except NotImplementedError:
         # feeds = self.parse_feeds()
         # Now add regular feeds.
         feedsRSS = self.parse_feeds()
         print ("feedsRSS is type "+feedsRSS.__class__.__name__)
         for articles in feedsRSS:
             print("articles is type "+articles.__class__.__name__)
             print("Title:" + articles.title)
@@ -235,9 +231,9 @@ class OReillyPremium(BasicNewsRecipe):
         #feeds = FeedCollection(feeds)
 
-        self.report_progress(0, _('Trying to download cover...'))
+        self.report_progress(0, ('Trying to download cover...'))
         self.download_cover()
-        self.report_progress(0, _('Generating masthead...'))
+        self.report_progress(0, ('Generating masthead...'))
         self.masthead_path = None
 
         try:
@@ -317,7 +313,7 @@ class OReillyPremium(BasicNewsRecipe):
                 tp.putRequest(req, block=True, timeout=0)
 
-        self.report_progress(0, _('Starting download [%d thread(s)]...')%self.simultaneous_downloads)
+        self.report_progress(0, ('Starting download [%d thread(s)]...')%self.simultaneous_downloads)
         while True:
             try:
                 tp.poll()
@@ -331,8 +327,8 @@ class OReillyPremium(BasicNewsRecipe):
             with open(os.path.join(feed_dir, 'index.html'), 'wb') as fi:
                 fi.write(html)
         self.create_opf(feeds)
-        self.report_progress(1, _('Feeds downloaded to %s')%index)
+        self.report_progress(1, ('Feeds downloaded to %s')%index)
 
         return index
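
Note: the build_index() override above leans on the contract spelled out in the commented-out excerpts: parse_index() returns a list of ('feed title', list-of-article-dicts) tuples, and feeds_from_index(), now the only name imported from calibre.web.feeds, converts that structure into Feed objects before the RSS feeds from parse_feeds() are appended. A sketch of the data shape, with placeholder values throughout:

    # Shape of the structure parse_index() hands to feeds_from_index().
    # Every value below is a placeholder for illustration.
    index = [
        ('TV Archives', [
            {'title': 'Example segment',                      # article title
             'url': 'https://www.billoreilly.com/b/example',  # best-guess print URL
             'date': 'Fri, 6 Apr',                            # formatted pubdate
             'description': 'One-line teaser',
             'content': ''},                                  # empty; body fetched later
        ]),
        ('Current Column', []),                               # a category may come back empty
    ]
    # Per the commented signature quoted above:
    # feeds = feeds_from_index(index, oldest_article=7,
    #                          max_articles_per_feed=100, log=default_log)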

View File

@@ -1,9 +1,9 @@
 # Test with "\Program Files\Calibre2\ebook-convert.exe" RealClear.recipe .epub --test -vv --debug-pipeline debug
-import string, re
+import re
 import time
 from urlparse import urlparse
 from calibre.web.feeds.recipes import BasicNewsRecipe
-from calibre.ebooks.BeautifulSoup import BeautifulSoup, NavigableString
+from calibre.ebooks.BeautifulSoup import NavigableString
 
 class RealClear(BasicNewsRecipe):
     title = u'Real Clear'
@@ -23,8 +23,8 @@ class RealClear(BasicNewsRecipe):
     recursions = 0
     max_articles_per_feed = 400
     debugMessages = True
 
     # Numeric parameter is type, controls whether we look for
     feedsets = [
         ["Politics", "http://www.realclearpolitics.com/index.xml", 0],
         ["Policy", "http://www.realclearpolicy.com/index.xml", 0],
@@ -41,17 +41,17 @@ class RealClear(BasicNewsRecipe):
     # Hints to extractPrintURL.
     # First column is the URL snippet. Then the string to search for as text, and the attributes to look for above it. Start with attributes and drill down.
     phUrlSnip, phLinkText, phMainSearch, phHrefSearch = range(4)
 
     printhints = [ ["realclear", "", '' , 'printpage'],
         ["billoreilly.com", "Print this entry", 'a', ''],
         ["billoreilly.com", "Print This Article", 'a', ''],
         ["politico.com", "Print", 'a', 'share-print'],
         ["nationalreview.com", ">Print<", 'a', ''],
         ["reason.com", "", 'a', 'printer']
         # The following are not supported due to JavaScripting, and would require obfuscated_article to handle
         # forbes,
         # usatoday - just prints with all current crap anyhow
     ]
 
     # RCP - look for a strange compound. See http://www.realclearpolitics.com/articles/2012/01/24/in_speech_obama_to_call_for_fairness_--_and_four_more_years_112879.html
     # The print link isn't obvious, and only the end is needed (the -full append.) SO maybe try that first?s
@@ -64,7 +64,7 @@ class RealClear(BasicNewsRecipe):
     # from http://www.nytimes.com/2012/01/22/business/apple-america-and-a-squeezed-middle-class.html?_r=1
     # to http://www.nytimes.com/2012/01/22/business/apple-america-and-a-squeezed-middle-class.html?_r=1&pagewanted=all
     # which is at link rel="canonical" and at <meta property="og:url" or look for "Single Page"
 
     # Returns the best-guess print url.
     # The second parameter (pageURL) is returned if nothing is found.
     def extractPrintURL(self, pageURL):
@@ -104,7 +104,7 @@ class RealClear(BasicNewsRecipe):
                 # print(soup)
                 print("end soup\n\n");
                 continue
 
             print(printFind)
             if isinstance(printFind, NavigableString)==False:
                 if printFind['href'] is not None:
@@ -130,7 +130,7 @@ class RealClear(BasicNewsRecipe):
         print("In get_browser")
         br = BasicNewsRecipe.get_browser()
         return br
 
     def parseRSS(self, index) :
         if self.debugMessages == True :
             print("\n\nStarting "+self.feedsets[index][0])
@@ -160,7 +160,7 @@ class RealClear(BasicNewsRecipe):
             pubDateEl = div.find("pubDate")
             if pubDateEl is None :
                 pubDateEl = div.find("pubdate")
 
             if pubDateEl is None :
                 pubDate = time.strftime('%a, %d %b')
             else :
                 pubDate = pubDateEl.contents[0]
@@ -176,7 +176,7 @@ class RealClear(BasicNewsRecipe):
             pubdate = time.strftime('%a, %d %b')
             articleList.append(dict(title=title, url=url, date=pubdate, description=description, content=''))
         return articleList
 
     # calibre.web.feeds.news.BasicNewsRecipe.parse_index() fetches the list of articles.
     # returns a list of tuple ('feed title', list of articles)
     # {
@@ -189,8 +189,8 @@ class RealClear(BasicNewsRecipe):
     # this is used instead of BasicNewsRecipe.parse_feeds().
     def parse_index(self):
         # Parse the page into Python Soup
-        articleList = []
+        #articleList = []
         ans = []
         feedsCount = len(self.feedsets)
         for x in range(0,feedsCount): # should be ,4
@@ -200,5 +200,5 @@ class RealClear(BasicNewsRecipe):
 
         if self.debugMessages == True :
             print(ans)
 
         return ans
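
Note: extractPrintURL() in this recipe is driven by the printhints table above. The first row whose URL snippet occurs in the page URL decides how the print link is located: by an href fragment (phHrefSearch, e.g. 'printpage') or by visible link text (phLinkText, where find(text=...) returns a NavigableString whose parent is the actual anchor tag, which is what the isinstance check above handles). A rough sketch of that dispatch, assuming a BeautifulSoup page object; the helper name findPrintLink is hypothetical:

    def findPrintLink(pageURL, soup, printhints):
        # Column indexes, mirroring the range(4) unpacking in the recipe.
        phUrlSnip, phLinkText, phMainSearch, phHrefSearch = range(4)
        for hint in printhints:
            if hint[phUrlSnip] not in pageURL:
                continue  # hint targets a different site
            if hint[phHrefSearch]:
                # Match by a fragment of the link's href, e.g. 'printpage'.
                tag = soup.find('a', href=lambda h: h and hint[phHrefSearch] in h)
            else:
                # Match by the link's visible text; find(text=...) returns the
                # NavigableString, so step up to the enclosing tag.
                node = soup.find(text=hint[phLinkText])
                tag = node.parent if node is not None else None
            if tag is not None and tag.get('href'):
                return tag['href']  # relative URLs would still need joining
        return pageURL  # nothing found: fall back to the original page URL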