Merge from trunk

Commit 20ec5de3f4 by Charles Haley, 2012-04-04 11:20:25 +02:00
28 changed files with 989 additions and 312 deletions

recipes/ba_herald.recipe (new file, 82 lines)

@ -0,0 +1,82 @@
__license__ = 'GPL v3'
__copyright__ = '2012, Darko Miletic <darko.miletic at gmail.com>'
'''
www.buenosairesherald.com
'''
import re
from calibre import strftime
from calibre.web.feeds.news import BasicNewsRecipe
from calibre.ebooks.BeautifulSoup import BeautifulSoup
class BuenosAiresHerald(BasicNewsRecipe):
title = 'Buenos Aires Herald'
__author__ = 'Darko Miletic'
description = 'A world of information in a few words'
publisher = 'Editorial Nefir S.A.'
category = 'news, politics, Argentina'
oldest_article = 2
max_articles_per_feed = 200
no_stylesheets = True
encoding = 'utf8'
use_embedded_content = False
language = 'en_AR'
remove_empty_feeds = True
publication_type = 'newspaper'
masthead_url = 'http://www.buenosairesherald.com/img/logo.jpg'
INDEX = 'http://www.buenosairesherald.com'
extra_css = """
body{font-family: Arial,Helvetica,sans-serif }
img{margin-bottom: 0.4em; display:block}
h1{font-family: Georgia,serif}
#fecha{text-align: right; font-size: small}
"""
conversion_options = {
'comment' : description
, 'tags' : category
, 'publisher' : publisher
, 'language' : language
}
remove_tags = [dict(name=['meta','link','iframe'])]
keep_only_tags = [dict(attrs={'class':'nota_texto p'})]
feeds = [
(u'Argentina' , u'http://www.buenosairesherald.com/argentina' )
,(u'World' , u'http://www.buenosairesherald.com/world' )
,(u'Latin America' , u'http://www.buenosairesherald.com/latin-america' )
,(u'Entertainment' , u'http://www.buenosairesherald.com/entertainment' )
,(u'Sports' , u'http://www.buenosairesherald.com/sports' )
]
def print_version(self, url):
artidraw = url.rpartition('/article/')[2]
artid = artidraw.partition('/')[0]
return 'http://www.buenosairesherald.com/articles/print.aspx?ix=' + artid
def parse_index(self):
totalfeeds = []
lfeeds = self.get_feeds()
for feedobj in lfeeds:
feedtitle, feedurl = feedobj
self.report_progress(0, _('Fetching feed')+' %s...'%(feedtitle if feedtitle else feedurl))
articles = []
soup = self.index_to_soup(feedurl)
for item in soup.findAll('div', attrs={'class':'nota_texto_seccion'}):
description = self.tag_to_string(item.h2)
atag = item.h2.find('a')
if atag and atag.has_key('href'):
url = self.INDEX + atag['href']
title = description
date = strftime(self.timefmt)
articles.append({
'title' :title
,'date' :date
,'url' :url
,'description':description
})
totalfeeds.append((feedtitle, articles))
return totalfeeds
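
A minimal, standalone sketch of the print_version() rewrite above: it only needs the numeric id that follows /article/ in a story URL. The sample URL below is made up for illustration.

def print_version(url):
    # Take everything after the last '/article/', then the id before the next '/'.
    artidraw = url.rpartition('/article/')[2]
    artid = artidraw.partition('/')[0]
    return 'http://www.buenosairesherald.com/articles/print.aspx?ix=' + artid

sample = 'http://www.buenosairesherald.com/article/12345/some-headline'  # hypothetical
print(print_version(sample))
# -> http://www.buenosairesherald.com/articles/print.aspx?ix=12345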

recipes/icons/ba_herald.png (new binary file, 978 B; not shown)

@ -1,45 +1,73 @@
# Talking Points is not grabbing everything.
# The look is right, but only the last one added?
import re
import string, re
import time
import traceback
# above for debugging via stack
from calibre.web.feeds.recipes import BasicNewsRecipe
# Allows the Python soup converter, which makes parsing easier.
from calibre.ebooks.BeautifulSoup import BeautifulSoup
# strip ads and graphics
# Current Column lacks a title.
# Talking Points Memo - shorten title - Remove year and Bill's name
# The News letter archive https://www.billoreilly.com/newsletterarchive is covered by other entries.
# Newsletters: Talking Points Memos covered by cat12
import os, time, traceback, re, urlparse, sys, cStringIO
from collections import defaultdict
from functools import partial
from contextlib import nested, closing
from calibre.web.feeds import feed_from_xml, templates, feeds_from_index, Feed
from calibre.utils.threadpool import WorkRequest, ThreadPool, NoResultsPending
# To Do: strip ads and graphics, Current Column lacks a title.
# The News letter archive https://www.billoreilly.com/newsletterarchive is covered by other entries.
# Newsletters: Talking Points Memos covered by cat12
# ./ebook-convert --username xxx --password xxx
# This recipe is derived from BasicNewsRecipe, so it can only override methods defined there.
# Some of what we need is only available in the article pages themselves, so there is more copying to do than usual.
class OReillyPremium(BasicNewsRecipe):
title = u'OReilly Premium'
__author__ = 'TMcN'
language = 'en'
description = 'Retrieves Premium and News Letter content from BillOReilly.com. Requires a Bill OReilly Premium Membership.'
cover_url = 'http://images.billoreilly.com/images/headers/billgray_header.png'
custom_title = 'Bill O\'Reilly Premium - '+ time.strftime('%d %b %Y')
title = 'Bill O\'Reilly Premium'
auto_cleanup = True
conversion_options = {'linearize_tables': True}
encoding = 'utf8'
needs_subscription = True
language = 'en'
no_stylesheets = True
oldest_article = 20
needs_subscription = True
oldest_article = 31
remove_javascript = True
remove_tags = [dict(name='img', attrs={})]
# Don't go down into linked pages (no recursion)
recursions = 0
max_articles_per_feed = 2000
max_articles_per_feed = 20
debugMessages = True
# Name, URL, Soup FindAll Attr if relevant (last two are special case), articleList
catList = [ ["TV Archives", 'https://www.billoreilly.com/show?action=tvShowArchive', 'a', {'class':['showLinks','homeLinks']}, []],
["No Spin Archives", 'https://www.billoreilly.com/blog?categoryID=7', True, {'class':['blogBody'], 'style':['padding-top:10px;']}, []],
["Daily Briefings", 'http://www.billoreilly.com/blog?categoryID=11', True, {'class':['defaultHeaderSmallLinks']}, []],
["Stratfor", 'http://www.billoreilly.com/blog?categoryID=5', 'a', {'class':['blogLinks']}, []],
["Talking Points Memo", 'https://www.billoreilly.com/blog?categoryID=12', 'td', {}, []],
# ["No Spin Archives", 'https://www.billoreilly.com/blog?categoryID=7', True, {'class':['blogBody'], 'style':['padding-top:10px;']}, []],
# ["Daily Briefings", 'http://www.billoreilly.com/blog?categoryID=11', True, {'class':['defaultHeaderSmallLinks']}, []],
# ["Stratfor", 'http://www.billoreilly.com/blog?categoryID=5', 'a', {'class':['blogLinks']}, []],
# ["Talking Points Memo", 'https://www.billoreilly.com/blog?categoryID=12', 'td', {}, []],
["Current Column", 'https://www.billoreilly.com/currentcolumn', 'span', {'class':['defaultHeader']}, []]
]
feeds = [
(u'No Spin', u'http://www.billoreilly.com/blog?action=blogArchive&rss=true&categoryID=7'),
(u'Daily Briefing', u'http://www.billoreilly.com/blog?action=blogArchive&rss=true&categoryID=11'),
(u'Talking Points', u'https://www.billoreilly.com/blog?action=blogArchive&rss=true&categoryID=12'),
(u'Blog', u'http://www.billoreilly.com/blog?action=blogArchive&rss=true&categoryID=0'),
(u'StratFor', u'http://www.billoreilly.com/blog?action=blogArchive&rss=true&categoryID=5')
]
# http://www.billoreilly.com/blog?action=blogArchive&rss=true&categoryID=8 is word for the day.
# Note: Talking Points is broken in the above model; the site changed to more Ajax-y.
# Now using RSS
def get_browser(self):
print("In get_browser")
br = BasicNewsRecipe.get_browser()
if self.username is not None and self.password is not None:
br.open('https://www.billoreilly.com/pg/jsp/member/membersignin.jsp')
@ -48,7 +76,7 @@ class OReillyPremium(BasicNewsRecipe):
br['formPasswordField'] = self.password
br.submit()
return br
# Returns the best-guess print url.
# The second parameter (pageURL) is returned if nothing is found.
def extractPrintURL(self, baseURL, pageURL, printString):
@ -62,17 +90,19 @@ class OReillyPremium(BasicNewsRecipe):
tag = printText.parent
tagURL = baseURL+tag['href']
return tagURL
def stripBadChars(self, inString) :
return inString.replace("\'", "")
def parseGeneric(self, baseURL):
# Does a generic parsing of the articles. There are six categories (0-5)
# Does a generic parsing of the articles. There are six categories (0-5)
# Name, URL, Soup FindAll Attr if relevant (last two are special case), articleList
# NoSpin and TV are generic
fullReturn = []
for i in range(len(self.catList)) :
for i in range(len(self.catList)) :
articleList = []
print("In "+self.catList[i][0]+", index: "+ str(i))
soup = self.index_to_soup(self.catList[i][1])
# Set defaults
description = 'None'
@ -80,15 +110,13 @@ class OReillyPremium(BasicNewsRecipe):
# Problem: 0-2 create many in an array
# 3-5 create one.
# So no for-div for 3-5
if i < 3 :
if i == 0 :
print("Starting TV Archives")
for div in soup.findAll(self.catList[i][2], self.catList[i][3]):
print("Next DIV:")
print(div)
if i == 1:
a = div.find('a', href=True)
else :
a = div
print(a)
a = div
summary = div.find(True, attrs={'class':'summary'})
if summary:
description = self.tag_to_string(summary, use_alt=False)
@ -96,82 +124,63 @@ class OReillyPremium(BasicNewsRecipe):
continue
# url = baseURL+re.sub(r'\?.*', '', a['href'])
url = baseURL+a['href']
if i < 2 :
url = self.extractPrintURL(baseURL, url, "Print this entry")
title = self.tag_to_string(a, use_alt=True).strip()
elif i == 2 :
# Daily Briefs
url = self.extractPrintURL(baseURL, url, "Print this entry")
title = div.contents[0]
if self.debugMessages :
print(title+" @ "+url)
url = self.extractPrintURL(baseURL, url, "Print this entry")
title = self.tag_to_string(a, use_alt=True).strip()
articleList.append(dict(title=title, url=url, date=pubdate, description=description, content=''))
elif i == 3 : # Stratfor
a = soup.find('a', self.catList[i][3])
if a is None :
continue
url = baseURL+a['href']
title = self.tag_to_string(a, use_alt=True).strip()
# Get Stratfor contents so we can get the real title.
stratSoup = self.index_to_soup(url)
title = stratSoup.html.head.title.string
stratIndex = title.find('Stratfor.com:', 0)
if (stratIndex > -1) :
title = title[stratIndex+14:-1]
# Look for first blogBody <td class="blogBody"
# Changed 12 Jan 2012 - new page format
#stratBlogTable = stratSoup.find('td', {'class':['blogBody']}).findParent('table')
#stratBody = stratSoup.find('td', {'class':['blogBody']})
elif i == 4 : # Talking Points
topDate = soup.find("td", "blogBody")
if not topDate :
print("Failed to find date in Talking Points")
# This page has the contents in double-wrapped tables!
myTable = topDate.findParents('table')[0]
if myTable is not None:
upOneTable = myTable.findParents('table')[0]
if upOneTable is not None:
upTwo = upOneTable.findParents('table')[0]
if upTwo is None:
continue
# Now navigate rows of upTwo
if self.debugMessages :
print("Entering rows")
for rows in upTwo.findChildren("tr", recursive=False):
# Inside top level table, each row is an article
rowTable = rows.find("table")
articleTable = rowTable.find("table")
# This looks wrong.
articleTable = rows.find("tr")
# The middle table is just for formatting the article buffer... but this means we can skip the inner table.
blogDate = articleTable.find("a","blogDate").contents[0]
# Skip to second blogBody for this.
blogTitle = articleTable.findAll("td", "blogBody")[1].contents[0]
blogURL = articleTable.find("a", "homeBlogReadMore bold")['href']
url = baseURL+re.sub(r'\?.*', '', blogURL)
title = blogDate+": "+self.stripBadChars(blogTitle.replace("Bill O'Reilly: ", ""))
if self.debugMessages :
print("Talking Points Memo title "+title+" at url: "+url)
pubdate = time.strftime('%a, %d %b')
articleList.append(dict(title=title, url=url, date=pubdate, description='None', content=''))
else : # Current Column
titleSpan = soup.find(self.catList[i][2], self.catList[i][3])
if titleSpan is None :
print("No Current Column Title Span")
print(soup)
continue
title = titleSpan.contents[0]
url = self.extractPrintURL(baseURL, self.catList[i][1], "Print This Article")
if i == 3 or i == 5 :
if i == 1 :
if self.debugMessages :
print(self.catList[i][0]+" Title:"+title+" at url: "+url)
summary = div.find(True, attrs={'class':'summary'})
if summary:
print("At Summary")
print(summary)
if summary is not None:
description = self.tag_to_string(summary, use_alt=False)
print("At append")
articleList.append(dict(title=title, url=url, date=pubdate, description=description, content=''))
self.catList[i][3] = articleList
fullReturn.append((self.catList[i][0], articleList))
print("Returning")
# print fullReturn
return fullReturn
# build_index() starts with:
# try:
# feeds = feeds_from_index(self.parse_index(), oldest_article=self.oldest_article,
# max_articles_per_feed=self.max_articles_per_feed,
# log=self.log)
# self.report_progress(0, _('Got feeds from index page'))
# except NotImplementedError:
# feeds = self.parse_feeds()
# which in turn is from __init__.py
#def feeds_from_index(index, oldest_article=7, max_articles_per_feed=100,
# log=default_log):
#'''
#@param index: A parsed index as returned by L{BasicNewsRecipe.parse_index}.
#@return: A list of L{Feed} objects.
#@rtype: list
#'''
#feeds = []
#for title, articles in index:
# pfeed = Feed(log=log)
# pfeed.populate_from_preparsed_feed(title, articles, oldest_article=oldest_article,
# max_articles_per_feed=max_articles_per_feed)
# feeds.append(pfeed)
# return feeds
# use_embedded_content defaults to None, at which point if the content is > 2K, it is used as the article.
# calibre.web.feeds.news.BasicNewsRecipe.parse_index() fetches the list of articles.
# returns a list of tuple ('feed title', list of articles)
# {
@ -182,16 +191,148 @@ class OReillyPremium(BasicNewsRecipe):
# 'content' : The full article (can be an empty string). This is used by FullContentProfile
# }
# this is used instead of BasicNewsRecipe.parse_feeds().
# it is called by download
def parse_index(self):
# Parse the page into Python Soup
print("Entering recipe print_index from:")
traceback.print_stack()
print("web")
baseURL = "https://www.billoreilly.com"
return self.parseGeneric(baseURL)
masterList = self.parseGeneric(baseURL)
#print(masterList)
return masterList
def preprocess_html(self, soup):
print("In preprocess_html")
refresh = soup.find('meta', {'http-equiv':'refresh'})
if refresh is None:
return soup
content = refresh.get('content').partition('=')[2]
raw = self.browser.open('https://www.billoreilly.com'+content).read()
return BeautifulSoup(raw.decode('cp1252', 'replace'))
def build_index(self):
print("In OReilly build_index()\n\n")
feedsRSS = []
self.report_progress(0, _('Fetching feeds...'))
#try:
feeds = feeds_from_index(self.parse_index(), oldest_article=self.oldest_article,
max_articles_per_feed=self.max_articles_per_feed,
log=self.log)
self.report_progress(0, _('Got feeds from index page'))
#except NotImplementedError:
# feeds = self.parse_feeds()
# Now add regular feeds.
feedsRSS = self.parse_feeds()
print ("feedsRSS is type "+feedsRSS.__class__.__name__)
for articles in feedsRSS:
print("articles is type "+articles.__class__.__name__)
print("Title:" + articles.title)
feeds.append(articles)
if not feeds:
raise ValueError('No articles found, aborting')
#feeds = FeedCollection(feeds)
self.report_progress(0, _('Trying to download cover...'))
self.download_cover()
self.report_progress(0, _('Generating masthead...'))
self.masthead_path = None
try:
murl = self.get_masthead_url()
except:
self.log.exception('Failed to get masthead url')
murl = None
if murl is not None:
# Try downloading the user-supplied masthead_url
# Failure sets self.masthead_path to None
self.download_masthead(murl)
if self.masthead_path is None:
self.log.info("Synthesizing mastheadImage")
self.masthead_path = os.path.join(self.output_dir, 'mastheadImage.jpg')
try:
self.default_masthead_image(self.masthead_path)
except:
self.log.exception('Failed to generate default masthead image')
self.masthead_path = None
if self.test:
feeds = feeds[:2]
self.has_single_feed = len(feeds) == 1
index = os.path.join(self.output_dir, 'index.html')
html = self.feeds2index(feeds)
with open(index, 'wb') as fi:
fi.write(html)
self.jobs = []
if self.reverse_article_order:
for feed in feeds:
if hasattr(feed, 'reverse'):
feed.reverse()
self.feed_objects = feeds
for f, feed in enumerate(feeds):
feed_dir = os.path.join(self.output_dir, 'feed_%d'%f)
if not os.path.isdir(feed_dir):
os.makedirs(feed_dir)
for a, article in enumerate(feed):
if a >= self.max_articles_per_feed:
break
art_dir = os.path.join(feed_dir, 'article_%d'%a)
if not os.path.isdir(art_dir):
os.makedirs(art_dir)
try:
url = self.print_version(article.url)
except NotImplementedError:
url = article.url
except:
self.log.exception('Failed to find print version for: '+article.url)
url = None
if not url:
continue
func, arg = (self.fetch_embedded_article, article) \
if self.use_embedded_content or (self.use_embedded_content == None and feed.has_embedded_content()) \
else \
((self.fetch_obfuscated_article if self.articles_are_obfuscated \
else self.fetch_article), url)
req = WorkRequest(func, (arg, art_dir, f, a, len(feed)),
{}, (f, a), self.article_downloaded,
self.error_in_article_download)
req.feed = feed
req.article = article
req.feed_dir = feed_dir
self.jobs.append(req)
self.jobs_done = 0
tp = ThreadPool(self.simultaneous_downloads)
for req in self.jobs:
tp.putRequest(req, block=True, timeout=0)
self.report_progress(0, _('Starting download [%d thread(s)]...')%self.simultaneous_downloads)
while True:
try:
tp.poll()
time.sleep(0.1)
except NoResultsPending:
break
for f, feed in enumerate(feeds):
print("Writing feeds for "+feed.title)
html = self.feed2index(f,feeds)
feed_dir = os.path.join(self.output_dir, 'feed_%d'%f)
with open(os.path.join(feed_dir, 'index.html'), 'wb') as fi:
fi.write(html)
self.create_opf(feeds)
self.report_progress(1, _('Feeds downloaded to %s')%index)
return index
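
The commented-out notes earlier in this recipe spell out the contract that build_index() relies on: parse_index() returns a list of (feed title, article list) tuples, each article being a dict with title, url, date, description and content keys, which feeds_from_index() then turns into Feed objects. A minimal, calibre-free sketch of that data shape (titles and URLs are placeholders):

# Shape of the value parse_index() is expected to return; everything here is
# illustrative dummy data, not real feed content.
def parse_index_stub():
    articles = [{
        'title'      : 'Example headline',
        'url'        : 'http://example.com/article/1',
        'date'       : 'Wed, 04 Apr',
        'description': 'One-line summary',
        'content'    : '',   # empty unless the full article is embedded
    }]
    return [('Example feed', articles)]

for feed_title, articles in parse_index_stub():
    for art in articles:
        print('%s: %s (%s)' % (feed_title, art['title'], art['url']))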


@ -1,7 +1,9 @@
# Test with "\Program Files\Calibre2\ebook-convert.exe" RealClear.recipe .epub --test -vv --debug-pipeline debug
import string, re
import time
from urlparse import urlparse
from calibre.web.feeds.recipes import BasicNewsRecipe
from calibre.ebooks.BeautifulSoup import NavigableString
from calibre.ebooks.BeautifulSoup import BeautifulSoup, NavigableString
class RealClear(BasicNewsRecipe):
title = u'Real Clear'
@ -20,12 +22,13 @@ class RealClear(BasicNewsRecipe):
# Don't go down into linked pages (no recursion)
recursions = 0
max_articles_per_feed = 400
debugMessages = False
# Numeric parameter is type, controls whether we look for
debugMessages = True
# Numeric parameter is type, controls whether we look for
feedsets = [
["Politics", "http://www.realclearpolitics.com/index.xml", 0],
["Science", "http://www.realclearscience.com/index.xml", 0],
["Politics", "http://www.realclearpolitics.com/index.xml", 0],
["Policy", "http://www.realclearpolicy.com/index.xml", 0],
["Science", "http://www.realclearscience.com/index.xml", 0],
["Tech", "http://www.realcleartechnology.com/index.xml", 0],
# The feedburner is essentially the same as the top feed, politics.
# ["Politics Burner", "http://feeds.feedburner.com/realclearpolitics/qlMj", 1],
@ -37,22 +40,37 @@ class RealClear(BasicNewsRecipe):
]
# Hints to extractPrintURL.
# First column is the URL snippet. Then the string to search for as text, and the attributes to look for above it. Start with attributes and drill down.
printhints = [
phUrlSnip, phLinkText, phMainSearch, phHrefSearch = range(4)
printhints = [ ["realclear", "", '' , 'printpage'],
["billoreilly.com", "Print this entry", 'a', ''],
["billoreilly.com", "Print This Article", 'a', ''],
["politico.com", "Print", 'a', 'share-print'],
["politico.com", "Print", 'a', 'share-print'],
["nationalreview.com", ">Print<", 'a', ''],
["reason.com", "", 'a', 'printer']
# The following are not supported due to JavaScripting, and would require obfuscated_article to handle
# forbes,
# forbes,
# usatoday - just prints with all current crap anyhow
]
# RCP - look for a strange compound. See http://www.realclearpolitics.com/articles/2012/01/24/in_speech_obama_to_call_for_fairness_--_and_four_more_years_112879.html
# The print link isn't obvious, and only the end is needed (the -full append), so maybe try that first.
# http://www.realclearpolitics.com/printpage/?url=http://www.realclearpolitics.com/articles/2012/01/24/in_speech_obama_to_call_for_fairness_--_and_four_more_years_112879-full.html
# Single page articles don't have a _full; e.g. http://www.realclearpolitics.com/articles/2012/01/25/obamas_green_robber_barons_112897.html
# Use the FULL PRINTPAGE URL; it formats it better too!
#
# NYT - try single page...
# Need special code - is it one page or several? Which URL?
# from http://www.nytimes.com/2012/01/22/business/apple-america-and-a-squeezed-middle-class.html?_r=1
# to http://www.nytimes.com/2012/01/22/business/apple-america-and-a-squeezed-middle-class.html?_r=1&pagewanted=all
# which is at link rel="canonical" and at <meta property="og:url" or look for "Single Page"
# Returns the best-guess print url.
# The second parameter (pageURL) is returned if nothing is found.
def extractPrintURL(self, pageURL):
tagURL = pageURL
baseParse = urlparse(pageURL)
baseURL = baseParse[0]+"://"+baseParse[1]
hintsCount =len(self.printhints)
for x in range(0,hintsCount):
if pageURL.find(self.printhints[x][0])== -1 :
@ -62,23 +80,37 @@ class RealClear(BasicNewsRecipe):
soup = self.index_to_soup(pageURL)
if soup is None:
return pageURL
if len(self.printhints[x][3])>0 and len(self.printhints[x][1]) == 0:
if len(self.printhints[x][self.phHrefSearch])>0 and len(self.printhints[x][self.phLinkText]) == 0:
# e.g. RealClear
if self.debugMessages == True :
print("search1")
print("Search by href: "+self.printhints[x][self.phHrefSearch])
printFind = soup.find(href=re.compile(self.printhints[x][self.phHrefSearch]))
elif len(self.printhints[x][3])>0 and len(self.printhints[x][1]) == 0:
if self.debugMessages == True :
print("Search 1: "+self.printhints[x][2]+" Attributes: ")
print(self.printhints[x][3])
printFind = soup.find(self.printhints[x][2], attrs=self.printhints[x][3])
elif len(self.printhints[x][3])>0 :
if self.debugMessages == True :
print("search2")
printFind = soup.find(self.printhints[x][2], attrs=self.printhints[x][3], text=self.printhints[x][1])
else :
if self.debugMessages == True:
print("Default Search: "+self.printhints[x][2]+" Text: "+self.printhints[x][1])
printFind = soup.find(self.printhints[x][2], text=self.printhints[x][1])
if printFind is None:
if self.debugMessages == True :
print("Not Found")
# print(soup)
print("end soup\n\n");
continue
print(printFind)
if isinstance(printFind, NavigableString)==False:
if printFind['href'] is not None:
print("Check "+printFind['href']+" for base of "+baseURL)
if printFind['href'].find("http")!=0 :
return baseURL+printFind['href']
return printFind['href']
tag = printFind.parent
print(tag)
@ -98,7 +130,7 @@ class RealClear(BasicNewsRecipe):
print("In get_browser")
br = BasicNewsRecipe.get_browser()
return br
def parseRSS(self, index) :
if self.debugMessages == True :
print("\n\nStarting "+self.feedsets[index][0])
@ -128,7 +160,7 @@ class RealClear(BasicNewsRecipe):
pubDateEl = div.find("pubDate")
if pubDateEl is None :
pubDateEl = div.find("pubdate")
if pubDateEl is None :
if pubDateEl is None :
pubDate = time.strftime('%a, %d %b')
else :
pubDate = pubDateEl.contents[0]
@ -144,7 +176,7 @@ class RealClear(BasicNewsRecipe):
pubdate = time.strftime('%a, %d %b')
articleList.append(dict(title=title, url=url, date=pubdate, description=description, content=''))
return articleList
# calibre.web.feeds.news.BasicNewsRecipe.parse_index() fetches the list of articles.
# returns a list of tuple ('feed title', list of articles)
# {
@ -157,7 +189,8 @@ class RealClear(BasicNewsRecipe):
# this is used instead of BasicNewsRecipe.parse_feeds().
def parse_index(self):
# Parse the page into Python Soup
articleList = []
ans = []
feedsCount = len(self.feedsets)
for x in range(0,feedsCount): # should be ,4
@ -167,4 +200,5 @@ class RealClear(BasicNewsRecipe):
if self.debugMessages == True :
print(ans)
return ans
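
The comments near the top of this recipe describe the RealClear print trick: the print view lives at /printpage/?url=<article URL>, and multi-page articles additionally need '-full' inserted before '.html'. A standalone sketch of that rewrite (whether an article is multi-page still has to be determined from the page itself; the sample URL is the one quoted in the comments):

def rcp_print_url(article_url, multipage):
    # Multi-page articles use a '-full' variant of the file name; single-page
    # articles are passed through unchanged, per the comments above.
    if multipage and article_url.endswith('.html'):
        article_url = article_url[:-len('.html')] + '-full.html'
    return 'http://www.realclearpolitics.com/printpage/?url=' + article_url

url = ('http://www.realclearpolitics.com/articles/2012/01/24/'
       'in_speech_obama_to_call_for_fairness_--_and_four_more_years_112879.html')
print(rcp_print_url(url, multipage=True))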


@ -15,6 +15,8 @@ class Soldiers(BasicNewsRecipe):
max_articles_per_feed = 100
no_stylesheets = True
use_embedded_content = False
auto_cleanup = True
auto_cleanup_keep = '//div[@id="mediaWrapper"]'
simultaneous_downloads = 1
delay = 4
max_connections = 1
@ -31,14 +33,14 @@ class Soldiers(BasicNewsRecipe):
, 'language' : language
}
keep_only_tags = [dict(name='div', attrs={'id':['storyHeader','textArea']})]
#keep_only_tags = [dict(name='div', attrs={'id':['storyHeader','textArea']})]
remove_tags = [
dict(name='div', attrs={'id':['addThis','comment','articleFooter']})
,dict(name=['object','link'])
]
#remove_tags = [
#dict(name='div', attrs={'id':['addThis','comment','articleFooter']})
#,dict(name=['object','link'])
#]
feeds = [(u'Frontpage', u'http://www.army.mil/rss/feeds/soldiersfrontpage.xml' )]
feeds = [(u'Frontpage', u'http://www.army.mil/rss/2/' )]
def get_cover_url(self):

recipes/southernstar.recipe (new file, 136 lines)

@ -0,0 +1,136 @@
#!/usr/bin/env python
__license__ = 'GPL v3'
__copyright__ = '2012, watou'
'''
southernstar.ie
'''
import re
import tempfile
import os
import codecs
from calibre.web.feeds.news import BasicNewsRecipe
from calibre.ebooks.BeautifulSoup import Tag, NavigableString
class TheSouthernStar(BasicNewsRecipe):
title = 'The Southern Star'
__author__ = 'watou'
description = 'West Cork\'s leading news and information provider since 1889'
NEWS_INDEX = 'http://www.southernstar.ie/news.php'
LOCAL_NOTES = 'http://www.southernstar.ie/localnotes.php'
SPORT_INDEX = 'http://www.southernstar.ie/sport.php'
CLASSIFIEDS = 'http://www.southernstar.ie/classifieds.php'
language = 'en_IE'
encoding = 'cp1252'
publication_type = 'newspaper'
masthead_url = 'http://www.southernstar.ie/images/logo.gif'
remove_tags_before = dict(name='div', attrs={'class':'article'})
remove_tags_after = dict(name='div', attrs={'class':'article'})
remove_tags = [dict(name='div', attrs={'style':'width:300px; position:relative'}),
dict(name='form'),
dict(name='div', attrs={'class':'endpanel'})]
no_stylesheets = True
tempfiles = []
pubdate = ''
preprocess_regexps = [(re.compile(r'<!--.*?-->', re.DOTALL), lambda m: '')]
def parse_index(self):
feeds = []
seen_titles = set([])
articles = self.fetch_ss_articles(self.NEWS_INDEX, seen_titles)
if articles:
feeds.append(('News', articles))
articles = self.fetch_ss_notes(self.LOCAL_NOTES)
if articles:
feeds.append(('Local Notes', articles))
articles = self.fetch_ss_articles(self.SPORT_INDEX, seen_titles)
if articles:
feeds.append(('Sport', articles))
articles = self.fetch_ss_notes(self.CLASSIFIEDS)
if articles:
feeds.append(('Classifieds', articles))
return feeds
def fetch_ss_articles(self, index, seen_titles):
articles = []
soup = self.index_to_soup(index)
ts = soup.find('div', {'class':'article'})
ds = self.tag_to_string(ts.find('strong'))
self.pubdate = ' ['+ds+']'
self.timefmt = ' [%s]'%ds
for post in ts.findAll('h1'):
a = post.find('a', href=True)
title = self.tag_to_string(a)
if title in seen_titles:
continue
seen_titles.add(title)
url = a['href']
if url.startswith('article'):
url = 'http://www.southernstar.ie/'+url
self.log('\tFound article:', title, 'at', url)
p = post.findNextSibling('p')
desc = None
if p is not None:
desc = str(p)
articles.append({'title':title, 'url':url, 'description':desc,
'date':self.pubdate})
return articles
def fetch_ss_notes(self, page):
articles = []
soup = self.index_to_soup(page)
ts = soup.find('div', {'class':'content'})
for post in ts.findAll('h1'):
title = self.tag_to_string(post)
self.log('\tFound note:', title)
f = tempfile.NamedTemporaryFile(suffix='.html',delete=False)
f.close()
f = codecs.open(f.name, 'w+b', self.encoding, 'replace')
url = "file://" + f.name
f.write(u'<html><head><meta http-equiv="Content-Type" content="text/html; charset='+
self.encoding+'"></head><body><h1>'+title+'</h1>')
f.write(str(post.findNextSibling('p')))
f.write(u'</body></html>')
self.log('\tWrote note to', f.name)
f.close()
self.tempfiles.append(f)
articles.append({'title':title, 'url':url, 'date':self.pubdate})
return articles
def postprocess_html(self, soup, first):
for table in soup.findAll('table', align='right'):
img = table.find('img')
if img is not None:
img.extract()
caption = self.tag_to_string(table).strip()
div = Tag(soup, 'div')
div['style'] = 'text-align:center'
div.insert(0, img)
div.insert(1, Tag(soup, 'br'))
if caption:
div.insert(2, NavigableString(caption))
table.replaceWith(div)
return soup
def image_url_processor(self, baseurl, url):
return url.replace(' ','%20')
def cleanup(self):
self.log('cleaning up')
for f in self.tempfiles:
os.unlink(f.name)
self.tempfiles = []
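
fetch_ss_notes() above works around the lack of per-note pages by rendering each note into a temporary HTML file and handing calibre a file:// URL, which cleanup() later deletes. The same idea in isolation (title and body content are illustrative):

import codecs, tempfile

def write_note(title, body_html, encoding='cp1252'):
    # Create a named temp file, close it, then reopen it through codecs with
    # an explicit encoding (as the recipe does) and return a file:// URL.
    f = tempfile.NamedTemporaryFile(suffix='.html', delete=False)
    f.close()
    f = codecs.open(f.name, 'w+b', encoding, 'replace')
    f.write(u'<html><head><meta http-equiv="Content-Type" '
            u'content="text/html; charset=%s"></head><body><h1>%s</h1>'
            % (encoding, title))
    f.write(body_html)
    f.write(u'</body></html>')
    f.close()
    return 'file://' + f.name

print(write_note(u'Town notes', u'<p>Example body</p>'))  # illustrative content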


@ -14,7 +14,7 @@ from setup.build_environment import msvc, MT, RC
from setup.installer.windows.wix import WixMixIn
OPENSSL_DIR = r'Q:\openssl'
QT_DIR = 'Q:\\Qt\\4.8.0'
QT_DIR = 'Q:\\Qt\\4.8.1'
QT_DLLS = ['Core', 'Gui', 'Network', 'Svg', 'WebKit', 'Xml', 'XmlPatterns']
LIBUNRAR = 'C:\\Program Files\\UnrarDLL\\unrar.dll'
SW = r'C:\cygwin\home\kovid\sw'


@ -32,6 +32,7 @@ class MOBIInput(InputFormatPlugin):
def convert(self, stream, options, file_ext, log,
accelerators):
self.is_kf8 = False
if os.environ.get('USE_MOBIUNPACK', None) is not None:
pos = stream.tell()
@ -62,6 +63,7 @@ class MOBIInput(InputFormatPlugin):
mr = Mobi8Reader(mr, log)
opf = os.path.abspath(mr())
self.encrypted_fonts = mr.encrypted_fonts
self.is_kf8 = True
return opf
raw = parse_cache.pop('calibre_raw_mobi_markup', False)


@ -535,7 +535,7 @@ class OPF(object): # {{{
series_index = MetadataField('series_index', is_dc=False,
formatter=float, none_is=1)
title_sort = TitleSortField('title_sort', is_dc=False)
rating = MetadataField('rating', is_dc=False, formatter=int)
rating = MetadataField('rating', is_dc=False, formatter=float)
pubdate = MetadataField('date', formatter=parse_date,
renderer=isoformat)
publication_type = MetadataField('publication_type', is_dc=False)
@ -883,6 +883,8 @@ class OPF(object): # {{{
val = etree.tostring(x, with_tail=False, encoding=unicode,
method='text').strip()
if val and typ not in ('calibre', 'uuid'):
if typ == 'isbn' and val.lower().startswith('urn:isbn:'):
val = val[len('urn:isbn:'):]
identifiers[typ] = val
found_scheme = True
break
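
The two added lines normalise ISBN identifiers that arrive as URNs (urn:isbn:...) down to the bare number before they are stored. In isolation (the ISBN value is made up):

val = 'urn:isbn:9780000000000'   # hypothetical identifier value
typ = 'isbn'
if typ == 'isbn' and val.lower().startswith('urn:isbn:'):
    val = val[len('urn:isbn:'):]
print(val)   # -> 9780000000000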


@ -0,0 +1,95 @@
#!/usr/bin/env python
# vim:fileencoding=UTF-8:ts=4:sw=4:sta:et:sts=4:ai
from __future__ import (unicode_literals, division, absolute_import,
print_function)
__license__ = 'GPL v3'
__copyright__ = '2012, Kovid Goyal <kovid@kovidgoyal.net>'
__docformat__ = 'restructuredtext en'
import os
from threading import Event
from io import BytesIO
from calibre.utils.date import as_utc
from calibre.ebooks.metadata.sources.identify import identify, msprefs
from calibre.ebooks.metadata.book.base import Metadata
from calibre.customize.ui import metadata_plugins
from calibre.ebooks.metadata.sources.covers import download_cover
from calibre.utils.logging import GUILog
from calibre.ebooks.metadata.opf2 import metadata_to_opf, OPF
def merge_result(oldmi, newmi, ensure_fields=None):
dummy = Metadata(_('Unknown'))
for f in msprefs['ignore_fields']:
if ':' in f or (ensure_fields and f in ensure_fields):
continue
setattr(newmi, f, getattr(dummy, f))
fields = set()
for plugin in metadata_plugins(['identify']):
fields |= plugin.touched_fields
def is_equal(x, y):
if hasattr(x, 'tzinfo'):
x = as_utc(x)
if hasattr(y, 'tzinfo'):
y = as_utc(y)
return x == y
for f in fields:
# Optimize so that set_metadata does not have to do extra work later
if not f.startswith('identifier:'):
if (not newmi.is_null(f) and is_equal(getattr(newmi, f),
getattr(oldmi, f))):
setattr(newmi, f, getattr(dummy, f))
return newmi
def main(do_identify, covers, metadata, ensure_fields):
failed_ids = set()
failed_covers = set()
all_failed = True
log = GUILog()
for book_id, mi in metadata.iteritems():
mi = OPF(BytesIO(mi), basedir=os.getcwdu(),
populate_spine=False).to_book_metadata()
title, authors, identifiers = mi.title, mi.authors, mi.identifiers
cdata = None
log.clear()
if do_identify:
results = []
try:
results = identify(log, Event(), title=title, authors=authors,
identifiers=identifiers)
except:
pass
if results:
all_failed = False
mi = merge_result(mi, results[0], ensure_fields=ensure_fields)
identifiers = mi.identifiers
if not mi.is_null('rating'):
# set_metadata expects a rating out of 10
mi.rating *= 2
with open('%d.mi'%book_id, 'wb') as f:
f.write(metadata_to_opf(mi, default_lang='und'))
else:
log.error('Failed to download metadata for', title)
failed_ids.add(book_id)
if covers:
cdata = download_cover(log, title=title, authors=authors,
identifiers=identifiers)
if cdata is None:
failed_covers.add(book_id)
else:
with open('%d.cover'%book_id, 'wb') as f:
f.write(cdata[-1])
all_failed = False
with open('%d.log'%book_id, 'wb') as f:
f.write(log.plain_text.encode('utf-8'))
return failed_ids, failed_covers, all_failed
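
This worker communicates its results back purely through files in its working directory: for each book id it may write <id>.mi (an OPF file) and <id>.cover (raw cover data), and it always writes <id>.log. A sketch of how a caller can walk that directory afterwards, mirroring the GUI-side code further down in this commit (directory and ids are illustrative):

import os

def collect_results(tdir, book_ids):
    # Map each book id to its (opf_path, cover_path), using None for any
    # file the worker did not produce.
    id_map = {}
    for book_id in book_ids:
        opf = os.path.join(tdir, '%d.mi' % book_id)
        cov = os.path.join(tdir, '%d.cover' % book_id)
        id_map[book_id] = (opf if os.path.exists(opf) else None,
                           cov if os.path.exists(cov) else None)
    return id_map

print(collect_results('.', [1, 2, 3]))   # in an empty dir: all (None, None)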


@ -217,6 +217,10 @@ class EbookIterator(object):
if hasattr(self.pathtoopf, 'manifest'):
self.pathtoopf = write_oebbook(self.pathtoopf, self.base)
self.book_format = os.path.splitext(self.pathtoebook)[1][1:].upper()
if getattr(plumber.input_plugin, 'is_kf8', False):
self.book_format = 'KF8'
self.opf = getattr(plumber.input_plugin, 'optimize_opf_parsing', None)
if self.opf is None:
self.opf = OPF(self.pathtoopf, os.path.dirname(self.pathtoopf))


@ -5,7 +5,7 @@ __license__ = 'GPL v3'
__copyright__ = '2010, Kovid Goyal <kovid@kovidgoyal.net>'
__docformat__ = 'restructuredtext en'
import os
import os, shutil
from functools import partial
from PyQt4.Qt import QMenu, QModelIndex, QTimer
@ -16,6 +16,7 @@ from calibre.gui2.dialogs.confirm_delete import confirm
from calibre.gui2.dialogs.device_category_editor import DeviceCategoryEditor
from calibre.gui2.actions import InterfaceAction
from calibre.ebooks.metadata import authors_to_string
from calibre.ebooks.metadata.opf2 import OPF
from calibre.utils.icu import sort_key
from calibre.db.errors import NoSuchFormat
@ -79,17 +80,27 @@ class EditMetadataAction(InterfaceAction):
Dispatcher(self.metadata_downloaded),
ensure_fields=ensure_fields)
def cleanup_bulk_download(self, tdir):
try:
shutil.rmtree(tdir, ignore_errors=True)
except:
pass
def metadata_downloaded(self, job):
if job.failed:
self.gui.job_exception(job, dialog_title=_('Failed to download metadata'))
return
from calibre.gui2.metadata.bulk_download import get_job_details
id_map, failed_ids, failed_covers, all_failed, det_msg = \
get_job_details(job)
(aborted, id_map, tdir, log_file, failed_ids, failed_covers, all_failed,
det_msg, lm_map) = get_job_details(job)
if aborted:
return self.cleanup_bulk_download(tdir)
if all_failed:
num = len(failed_ids | failed_covers)
self.cleanup_bulk_download(tdir)
return error_dialog(self.gui, _('Download failed'),
_('Failed to download metadata or covers for any of the %d'
' book(s).') % len(id_map), det_msg=det_msg, show=True)
' book(s).') % num, det_msg=det_msg, show=True)
self.gui.status_bar.show_message(_('Metadata download completed'), 3000)
@ -103,28 +114,27 @@ class EditMetadataAction(InterfaceAction):
msg += '<p>'+_('Could not download metadata and/or covers for %d of the books. Click'
' "Show details" to see which books.')%num
payload = (id_map, failed_ids, failed_covers)
payload = (id_map, tdir, log_file, lm_map)
from calibre.gui2.dialogs.message_box import ProceedNotification
p = ProceedNotification(self.apply_downloaded_metadata,
payload, job.html_details,
payload, log_file,
_('Download log'), _('Download complete'), msg,
det_msg=det_msg, show_copy_button=show_copy_button,
parent=self.gui)
cancel_callback=lambda x:self.cleanup_bulk_download(tdir),
parent=self.gui, log_is_file=True)
p.show()
def apply_downloaded_metadata(self, payload):
id_map, failed_ids, failed_covers = payload
id_map = dict([(k, v) for k, v in id_map.iteritems() if k not in
failed_ids])
if not id_map:
good_ids, tdir, log_file, lm_map = payload
if not good_ids:
return
modified = set()
db = self.gui.current_db
for i, mi in id_map.iteritems():
for i in good_ids:
lm = db.metadata_last_modified(i, index_is_id=True)
if lm > mi.last_modified:
if lm > lm_map[i]:
title = db.title(i, index_is_id=True)
authors = db.authors(i, index_is_id=True)
if authors:
@ -144,7 +154,18 @@ class EditMetadataAction(InterfaceAction):
'Do you want to proceed?'), det_msg='\n'.join(modified)):
return
self.apply_metadata_changes(id_map)
id_map = {}
for bid in good_ids:
opf = os.path.join(tdir, '%d.mi'%bid)
if not os.path.exists(opf):
opf = None
cov = os.path.join(tdir, '%d.cover'%bid)
if not os.path.exists(cov):
cov = None
id_map[bid] = (opf, cov)
self.apply_metadata_changes(id_map, callback=lambda x:
self.cleanup_bulk_download(tdir))
# }}}
@ -468,13 +489,18 @@ class EditMetadataAction(InterfaceAction):
callback can be either None or a function accepting a single argument,
in which case it is called after applying is complete with the list of
changed ids.
id_map can also be a mapping of ids to 2-tuple's where each 2-tuple
contains the absolute paths to an OPF and cover file respectively. If
either of the paths is None, then the corresponding metadata is not
updated.
'''
if title is None:
title = _('Applying changed metadata')
self.apply_id_map = list(id_map.iteritems())
self.apply_current_idx = 0
self.apply_failures = []
self.applied_ids = []
self.applied_ids = set()
self.apply_pd = None
self.apply_callback = callback
if len(self.apply_id_map) > 1:
@ -492,28 +518,49 @@ class EditMetadataAction(InterfaceAction):
return self.finalize_apply()
i, mi = self.apply_id_map[self.apply_current_idx]
if isinstance(mi, tuple):
opf, cover = mi
if opf:
mi = OPF(open(opf, 'rb'), basedir=os.path.dirname(opf),
populate_spine=False).to_book_metadata()
self.apply_mi(i, mi)
if cover:
self.gui.current_db.set_cover(i, open(cover, 'rb'),
notify=False, commit=False)
self.applied_ids.add(i)
else:
self.apply_mi(i, mi)
self.apply_current_idx += 1
if self.apply_pd is not None:
self.apply_pd.value += 1
QTimer.singleShot(50, self.do_one_apply)
def apply_mi(self, book_id, mi):
db = self.gui.current_db
try:
set_title = not mi.is_null('title')
set_authors = not mi.is_null('authors')
idents = db.get_identifiers(i, index_is_id=True)
idents = db.get_identifiers(book_id, index_is_id=True)
if mi.identifiers:
idents.update(mi.identifiers)
mi.identifiers = idents
if mi.is_null('series'):
mi.series_index = None
if self._am_merge_tags:
old_tags = db.tags(i, index_is_id=True)
old_tags = db.tags(book_id, index_is_id=True)
if old_tags:
tags = [x.strip() for x in old_tags.split(',')] + (
mi.tags if mi.tags else [])
mi.tags = list(set(tags))
db.set_metadata(i, mi, commit=False, set_title=set_title,
db.set_metadata(book_id, mi, commit=False, set_title=set_title,
set_authors=set_authors, notify=False)
self.applied_ids.append(i)
self.applied_ids.add(book_id)
except:
import traceback
self.apply_failures.append((i, traceback.format_exc()))
self.apply_failures.append((book_id, traceback.format_exc()))
try:
if mi.cover:
@ -521,11 +568,6 @@ class EditMetadataAction(InterfaceAction):
except:
pass
self.apply_current_idx += 1
if self.apply_pd is not None:
self.apply_pd.value += 1
QTimer.singleShot(50, self.do_one_apply)
def finalize_apply(self):
db = self.gui.current_db
db.commit()
@ -550,7 +592,7 @@ class EditMetadataAction(InterfaceAction):
if self.applied_ids:
cr = self.gui.library_view.currentIndex().row()
self.gui.library_view.model().refresh_ids(
self.applied_ids, cr)
list(self.applied_ids), cr)
if self.gui.cover_flow:
self.gui.cover_flow.dataChanged()
self.gui.tags_view.recount()
@ -559,7 +601,7 @@ class EditMetadataAction(InterfaceAction):
self.apply_pd = None
try:
if callable(self.apply_callback):
self.apply_callback(self.applied_ids)
self.apply_callback(list(self.applied_ids))
finally:
self.apply_callback = None


@ -160,7 +160,7 @@ class ProceedNotification(MessageBox): # {{{
def __init__(self, callback, payload, html_log, log_viewer_title, title, msg,
det_msg='', show_copy_button=False, parent=None,
cancel_callback=None):
cancel_callback=None, log_is_file=False):
'''
A non modal popup that notifies the user that a background task has
been completed.
@ -175,12 +175,15 @@ class ProceedNotification(MessageBox): # {{{
:param title: The title for this popup
:param msg: The msg to display
:param det_msg: Detailed message
:param log_is_file: If True the html_log parameter is interpreted as
the path to a file on disk containing the log encoded with utf-8
'''
MessageBox.__init__(self, MessageBox.QUESTION, title, msg,
det_msg=det_msg, show_copy_button=show_copy_button,
parent=parent)
self.payload = payload
self.html_log = html_log
self.log_is_file = log_is_file
self.log_viewer_title = log_viewer_title
self.vlb = self.bb.addButton(_('View log'), self.bb.ActionRole)
@ -192,7 +195,11 @@ class ProceedNotification(MessageBox): # {{{
_proceed_memory.append(self)
def show_log(self):
self.log_viewer = ViewLog(self.log_viewer_title, self.html_log,
log = self.html_log
if self.log_is_file:
with open(log, 'rb') as f:
log = f.read().decode('utf-8')
self.log_viewer = ViewLog(self.log_viewer_title, log,
parent=self)
def do_proceed(self, result):
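
With the new log_is_file flag, html_log is either an in-memory HTML string or the path to a utf-8 encoded log file; show_log() above branches on that. The file branch in isolation (argument values are illustrative):

def load_log(html_log, log_is_file=False):
    # When log_is_file is set, html_log is the path to a utf-8 encoded file
    # on disk; otherwise it is already the log text itself.
    if log_is_file:
        with open(html_log, 'rb') as f:
            return f.read().decode('utf-8')
    return html_log

print(load_log('<p>already in memory</p>'))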


@ -402,7 +402,8 @@ class DetailView(QDialog, Ui_Dialog): # {{{
self.setupUi(self)
self.setWindowTitle(job.description)
self.job = job
self.html_view = hasattr(job, 'html_details')
self.html_view = (hasattr(job, 'html_details') and not getattr(job,
'ignore_html_details', False))
if self.html_view:
self.log.setVisible(False)
else:


@ -7,22 +7,42 @@ __license__ = 'GPL v3'
__copyright__ = '2011, Kovid Goyal <kovid@kovidgoyal.net>'
__docformat__ = 'restructuredtext en'
import os, time, shutil
from functools import partial
from itertools import izip
from threading import Event
from threading import Thread
from PyQt4.Qt import (QIcon, QDialog,
QDialogButtonBox, QLabel, QGridLayout, QPixmap, Qt)
from calibre.gui2.threaded_jobs import ThreadedJob
from calibre.ebooks.metadata.sources.identify import identify, msprefs
from calibre.ebooks.metadata.sources.covers import download_cover
from calibre.ebooks.metadata.book.base import Metadata
from calibre.customize.ui import metadata_plugins
from calibre.ptempfile import PersistentTemporaryFile
from calibre.utils.date import as_utc
from calibre.ebooks.metadata.opf2 import metadata_to_opf
from calibre.utils.ipc.simple_worker import fork_job, WorkerError
from calibre.ptempfile import (PersistentTemporaryDirectory,
PersistentTemporaryFile)
# Start download {{{
class Job(ThreadedJob):
ignore_html_details = True
def consolidate_log(self):
self.consolidated_log = self.log.plain_text
self.log = None
def read_consolidated_log(self):
return self.consolidated_log
@property
def details(self):
if self.consolidated_log is None:
return self.log.plain_text
return self.read_consolidated_log()
@property
def log_file(self):
return open(self.download_debug_log, 'rb')
def show_config(gui, parent):
from calibre.gui2.preferences import show_config_widget
show_config_widget('Sharing', 'Metadata download', parent=parent,
@ -104,19 +124,22 @@ def start_download(gui, ids, callback, ensure_fields=None):
d.b.clicked.disconnect()
if ret != d.Accepted:
return
tf = PersistentTemporaryFile('_metadata_bulk.log')
tf.close()
for batch in split_jobs(ids):
job = ThreadedJob('metadata bulk download',
_('Download metadata for %d books')%len(batch),
download, (batch, gui.current_db, d.identify, d.covers,
ensure_fields), {}, callback)
gui.job_manager.run_threaded_job(job)
job = Job('metadata bulk download',
_('Download metadata for %d books')%len(ids),
download, (ids, tf.name, gui.current_db, d.identify, d.covers,
ensure_fields), {}, callback)
job.download_debug_log = tf.name
gui.job_manager.run_threaded_job(job)
gui.status_bar.show_message(_('Metadata download started'), 3000)
# }}}
def get_job_details(job):
id_map, failed_ids, failed_covers, title_map, all_failed = job.result
(aborted, good_ids, tdir, log_file, failed_ids, failed_covers, title_map,
lm_map, all_failed) = job.result
det_msg = []
for i in failed_ids | failed_covers:
title = title_map[i]
@ -126,92 +149,118 @@ def get_job_details(job):
title += (' ' + _('(Failed cover)'))
det_msg.append(title)
det_msg = '\n'.join(det_msg)
return id_map, failed_ids, failed_covers, all_failed, det_msg
return (aborted, good_ids, tdir, log_file, failed_ids, failed_covers,
all_failed, det_msg, lm_map)
def merge_result(oldmi, newmi, ensure_fields=None):
dummy = Metadata(_('Unknown'))
for f in msprefs['ignore_fields']:
if ':' in f or (ensure_fields and f in ensure_fields):
continue
setattr(newmi, f, getattr(dummy, f))
fields = set()
for plugin in metadata_plugins(['identify']):
fields |= plugin.touched_fields
class HeartBeat(object):
CHECK_INTERVAL = 300 # seconds
''' Check that the file count in tdir changes every five minutes '''
def is_equal(x, y):
if hasattr(x, 'tzinfo'):
x = as_utc(x)
if hasattr(y, 'tzinfo'):
y = as_utc(y)
return x == y
def __init__(self, tdir):
self.tdir = tdir
self.last_count = len(os.listdir(self.tdir))
self.last_time = time.time()
for f in fields:
# Optimize so that set_metadata does not have to do extra work later
if not f.startswith('identifier:'):
if (not newmi.is_null(f) and is_equal(getattr(newmi, f),
getattr(oldmi, f))):
setattr(newmi, f, getattr(dummy, f))
def __call__(self):
if time.time() - self.last_time > self.CHECK_INTERVAL:
c = len(os.listdir(self.tdir))
if c == self.last_count:
return False
self.last_count = c
self.last_time = time.time()
return True
newmi.last_modified = oldmi.last_modified
class Notifier(Thread):
return newmi
def __init__(self, notifications, title_map, tdir, total):
Thread.__init__(self)
self.daemon = True
self.notifications, self.title_map = notifications, title_map
self.tdir, self.total = tdir, total
self.seen = set()
self.keep_going = True
def download(ids, db, do_identify, covers, ensure_fields,
def run(self):
while self.keep_going:
try:
names = os.listdir(self.tdir)
except:
pass
else:
for x in names:
if x.endswith('.log'):
try:
book_id = int(x.partition('.')[0])
except:
continue
if book_id not in self.seen and book_id in self.title_map:
self.seen.add(book_id)
self.notifications.put((
float(len(self.seen))/self.total,
_('Processed %s')%self.title_map[book_id]))
time.sleep(1)
def download(all_ids, tf, db, do_identify, covers, ensure_fields,
log=None, abort=None, notifications=None):
ids = list(ids)
metadata = [db.get_metadata(i, index_is_id=True, get_user_categories=False)
for i in ids]
batch_size = 10
batches = split_jobs(all_ids, batch_size=batch_size)
tdir = PersistentTemporaryDirectory('_metadata_bulk')
heartbeat = HeartBeat(tdir)
failed_ids = set()
failed_covers = set()
title_map = {}
ans = {}
count = 0
lm_map = {}
ans = set()
all_failed = True
'''
# Test apply dialog
all_failed = do_identify = covers = False
'''
for i, mi in izip(ids, metadata):
if abort.is_set():
log.error('Aborting...')
break
title, authors, identifiers = mi.title, mi.authors, mi.identifiers
title_map[i] = title
if do_identify:
results = []
aborted = False
count = 0
notifier = Notifier(notifications, title_map, tdir, len(all_ids))
notifier.start()
try:
for ids in batches:
if abort.is_set():
log.error('Aborting...')
break
metadata = {i:db.get_metadata(i, index_is_id=True,
get_user_categories=False) for i in ids}
for i in ids:
title_map[i] = metadata[i].title
lm_map[i] = metadata[i].last_modified
metadata = {i:metadata_to_opf(mi, default_lang='und') for i, mi in
metadata.iteritems()}
try:
results = identify(log, Event(), title=title, authors=authors,
identifiers=identifiers)
except:
pass
if results:
ret = fork_job('calibre.ebooks.metadata.sources.worker', 'main',
(do_identify, covers, metadata, ensure_fields),
cwd=tdir, abort=abort, heartbeat=heartbeat, no_output=True)
except WorkerError as e:
if e.orig_tb:
raise Exception('Failed to download metadata. Original '
'traceback: \n\n'+e.orig_tb)
raise
count += batch_size
fids, fcovs, allf = ret['result']
if not allf:
all_failed = False
mi = merge_result(mi, results[0], ensure_fields=ensure_fields)
identifiers = mi.identifiers
if not mi.is_null('rating'):
# set_metadata expects a rating out of 10
mi.rating *= 2
else:
log.error('Failed to download metadata for', title)
failed_ids.add(i)
# We don't want set_metadata operating on anything but covers
mi = merge_result(mi, mi, ensure_fields=ensure_fields)
if covers:
cdata = download_cover(log, title=title, authors=authors,
identifiers=identifiers)
if cdata is not None:
with PersistentTemporaryFile('.jpg', 'downloaded-cover-') as f:
f.write(cdata[-1])
mi.cover = f.name
all_failed = False
else:
failed_covers.add(i)
ans[i] = mi
count += 1
notifications.put((count/len(ids),
_('Downloaded %(num)d of %(tot)d')%dict(num=count, tot=len(ids))))
log('Download complete, with %d failures'%len(failed_ids))
return (ans, failed_ids, failed_covers, title_map, all_failed)
failed_ids = failed_ids.union(fids)
failed_covers = failed_covers.union(fcovs)
ans = ans.union(set(ids) - fids)
for book_id in ids:
lp = os.path.join(tdir, '%d.log'%book_id)
if os.path.exists(lp):
with open(tf, 'ab') as dest, open(lp, 'rb') as src:
dest.write(('\n'+'#'*20 + ' Log for %s '%title_map[book_id] +
'#'*20+'\n').encode('utf-8'))
shutil.copyfileobj(src, dest)
if abort.is_set():
aborted = True
log('Download complete, with %d failures'%len(failed_ids))
return (aborted, ans, tdir, tf, failed_ids, failed_covers, title_map,
lm_map, all_failed)
finally:
notifier.keep_going = False
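
The Notifier thread above derives download progress purely from the worker's side effects: it polls the temporary directory once a second and counts each newly appeared <id>.log file as one more processed book. A self-contained sketch of that polling pattern (directory, total and the report callback are all illustrative):

import os, threading, time

class DirWatcher(threading.Thread):
    # Poll a directory for new '<book_id>.log' files and report progress.
    def __init__(self, tdir, total, report):
        threading.Thread.__init__(self)
        self.daemon = True
        self.tdir, self.total, self.report = tdir, total, report
        self.seen = set()
        self.keep_going = True

    def run(self):
        while self.keep_going:
            try:
                names = os.listdir(self.tdir)
            except OSError:
                names = []
            for name in names:
                base, sep, ext = name.partition('.')
                if ext == 'log' and base.isdigit():
                    book_id = int(base)
                    if book_id not in self.seen:
                        self.seen.add(book_id)
                        self.report(len(self.seen) / float(self.total), book_id)
            time.sleep(1)

def report(frac, book_id):
    print('%3.0f%% done (book %d)' % (frac * 100, book_id))

# watcher = DirWatcher('/tmp/metadata_bulk_tdir', 10, report)  # hypothetical dir
# watcher.start()  # ...run the downloads... then: watcher.keep_going = False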


@ -161,10 +161,10 @@ class MetadataSingleDialogBase(ResizableDialog):
self.manage_authors_button.clicked.connect(self.authors.manage_authors)
self.series = SeriesEdit(self)
self.remove_unused_series_button = QToolButton(self)
self.remove_unused_series_button.setToolTip(
_('Remove unused series (Series that have no books)') )
self.remove_unused_series_button.clicked.connect(self.remove_unused_series)
self.clear_series_button = QToolButton(self)
self.clear_series_button.setToolTip(
_('Clear series') )
self.clear_series_button.clicked.connect(self.series.clear)
self.series_index = SeriesIndexEdit(self, self.series)
self.basic_metadata_widgets.extend([self.series, self.series_index])
@ -198,6 +198,7 @@ class MetadataSingleDialogBase(ResizableDialog):
self.basic_metadata_widgets.append(self.identifiers)
self.clear_identifiers_button = QToolButton(self)
self.clear_identifiers_button.setIcon(QIcon(I('trash.png')))
self.clear_identifiers_button.setToolTip(_('Clear Ids'))
self.clear_identifiers_button.clicked.connect(self.identifiers.clear)
self.paste_isbn_button = QToolButton(self)
self.paste_isbn_button.setToolTip('<p>' +
@ -303,17 +304,6 @@ class MetadataSingleDialogBase(ResizableDialog):
self.title_sort.auto_generate()
self.author_sort.auto_generate()
def remove_unused_series(self, *args):
self.db.remove_unused_series()
idx = self.series.current_val
self.series.clear()
self.series.initialize(self.db, self.book_id)
if idx:
for i in range(self.series.count()):
if unicode(self.series.itemText(i)) == idx:
self.series.setCurrentIndex(i)
break
def tags_editor(self, *args):
self.tags.edit(self.db, self.book_id)
@ -591,7 +581,7 @@ class MetadataSingleDialog(MetadataSingleDialogBase): # {{{
sto(self.title_sort, self.authors)
create_row(1, self.authors, self.deduce_author_sort_button, self.author_sort)
sto(self.author_sort, self.series)
create_row(2, self.series, self.remove_unused_series_button,
create_row(2, self.series, self.clear_series_button,
self.series_index, icon='trash.png')
sto(self.series_index, self.swap_title_author_button)
sto(self.swap_title_author_button, self.manage_authors_button)
@ -756,7 +746,7 @@ class MetadataSingleDialogAlt1(MetadataSingleDialogBase): # {{{
span=2, icon='auto_author_sort.png')
create_row(3, self.author_sort, self.series)
create_row(4, self.series, self.series_index,
button=self.remove_unused_series_button, icon='trash.png')
button=self.clear_series_button, icon='trash.png')
create_row(5, self.series_index, self.tags)
create_row(6, self.tags, self.rating, button=self.tags_editor_button)
create_row(7, self.rating, self.pubdate)
@ -892,7 +882,7 @@ class MetadataSingleDialogAlt2(MetadataSingleDialogBase): # {{{
span=2, icon='auto_author_sort.png')
create_row(3, self.author_sort, self.series)
create_row(4, self.series, self.series_index,
button=self.remove_unused_series_button, icon='trash.png')
button=self.clear_series_button, icon='trash.png')
create_row(5, self.series_index, self.tags)
create_row(6, self.tags, self.rating, button=self.tags_editor_button)
create_row(7, self.rating, self.pubdate)


@ -35,9 +35,7 @@
<string>&lt;p&gt;If you leave the password blank, anyone will be able to
access your book collection using the web interface.
&lt;br&gt;
&lt;p&gt;Note that passwords do not work with Android devices.
Leave this blank if you intend to use the server with an
Android phone or tablet.</string>
&lt;p&gt;Some devices have browsers that do not support authentication. If you are having trouble downloading files from the content server, try removing the password.</string>
</property>
</widget>
</item>
@ -167,17 +165,13 @@ Leave this blank if you intend to use the server with an
</font>
</property>
<property name="toolTip">
<string>&lt;p&gt;Because of a bug in Google's Android, setting a password
will prevent the server from working with Android devices.
&lt;br&gt;
&lt;p&gt;Do not set a password if you plan to use the server with an
Android phone or tablet.</string>
<string>&lt;p&gt;Some devices have browsers that do not support authentication. If you are having trouble downloading files from the content server, try removing the password.</string>
</property>
<property name="styleSheet">
<string notr="true">QLabel {color:red}</string>
</property>
<property name="text">
<string>Password incompatible with Android devices</string>
<string>Password incompatible with some devices</string>
</property>
</widget>
</item>


@ -241,12 +241,6 @@ def fetch_scheduled_recipe(arg): # {{{
if 'output_profile' in ps:
recs.append(('output_profile', ps['output_profile'],
OptionRecommendation.HIGH))
# Disabled since apparently some people use
# K4PC and, surprise, surprise, it doesn't support
# indexed MOBIs.
#if ps['output_profile'] == 'kindle':
# recs.append(('no_inline_toc', True,
# OptionRecommendation.HIGH))
lf = load_defaults('look_and_feel')
if lf.get('base_font_size', 0.0) != 0.0:


@ -822,7 +822,8 @@ class EbookViewer(MainWindow, Ui_EbookViewer):
as_unicode(r), det_msg=worker.traceback, show=True)
self.close_progress_indicator()
else:
self.metadata.show_opf(self.iterator.opf, os.path.splitext(pathtoebook)[1][1:])
self.metadata.show_opf(self.iterator.opf,
self.iterator.book_format)
self.view.current_language = self.iterator.language
title = self.iterator.opf.title
if not title:
@ -849,7 +850,7 @@ class EbookViewer(MainWindow, Ui_EbookViewer):
self.current_book_has_toc = bool(self.iterator.toc)
self.current_title = title
self.setWindowTitle(self.base_window_title+' - '+title +
' [%s]'%os.path.splitext(pathtoebook)[1][1:].upper())
' [%s]'%self.iterator.book_format)
self.pos.setMaximum(sum(self.iterator.pages))
self.pos.setSuffix(' / %d'%sum(self.iterator.pages))
self.vertical_scrollbar.setMinimum(100)


@ -15,7 +15,7 @@ from cherrypy.process.plugins import SimplePlugin
from calibre.constants import __appname__, __version__
from calibre.utils.date import fromtimestamp
from calibre.library.server import listen_on, log_access_file, log_error_file
from calibre.library.server.utils import expose
from calibre.library.server.utils import expose, AuthController
from calibre.utils.mdns import publish as publish_zeroconf, \
stop_server as stop_zeroconf, get_external_ip
from calibre.library.server.content import ContentServer
@ -31,10 +31,11 @@ from calibre import prints, as_unicode
class DispatchController(object): # {{{
def __init__(self, prefix, wsgi=False):
def __init__(self, prefix, wsgi=False, auth_controller=None):
self.dispatcher = cherrypy.dispatch.RoutesDispatcher()
self.funcs = []
self.seen = set()
self.auth_controller = auth_controller
self.prefix = prefix if prefix else ''
if wsgi:
self.prefix = ''
@ -44,6 +45,7 @@ class DispatchController(object): # {{{
raise NameError('Route name: '+ repr(name) + ' already used')
self.seen.add(name)
kwargs['action'] = 'f_%d'%len(self.funcs)
aw = kwargs.pop('android_workaround', False)
if route != '/':
route = self.prefix + route
elif self.prefix:
@ -52,6 +54,8 @@ class DispatchController(object): # {{{
self.dispatcher.connect(name+'prefix_extra_trailing',
self.prefix+'/', self, **kwargs)
self.dispatcher.connect(name, route, self, **kwargs)
if self.auth_controller is not None:
func = self.auth_controller(func, aw)
self.funcs.append(expose(func))
def __getattr__(self, attr):
@ -156,6 +160,8 @@ class LibraryServer(ContentServer, MobileServer, XMLServer, OPDSServer, Cache,
self.config = {}
self.is_running = False
self.exception = None
auth_controller = None
self.users_dict = {}
#self.config['/'] = {
# 'tools.sessions.on' : True,
# 'tools.sessions.timeout': 60, # Session times out after 60 minutes
@ -171,15 +177,12 @@ class LibraryServer(ContentServer, MobileServer, XMLServer, OPDSServer, Cache,
}
if opts.password:
self.config['/'] = {
'tools.digest_auth.on' : True,
'tools.digest_auth.realm' : (
'Your calibre library. Username: '
+ opts.username.strip()),
'tools.digest_auth.users' : {opts.username.strip():opts.password.strip()},
}
self.users_dict[opts.username.strip()] = opts.password.strip()
auth_controller = AuthController('Your calibre library',
self.users_dict)
self.__dispatcher__ = DispatchController(self.opts.url_prefix, wsgi)
self.__dispatcher__ = DispatchController(self.opts.url_prefix,
wsgi=wsgi, auth_controller=auth_controller)
for x in self.__class__.__bases__:
if hasattr(x, 'add_routes'):
x.__init__(self)
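The two hunks above route every handler through an optional auth_controller at registration time; a per-route android_workaround flag (set on the /get route further down) decides whether cookie authentication may stand in for digest authentication. A minimal sketch of that registration-time wrapping pattern, using simplified, hypothetical names rather than the actual calibre classes:

    class SimpleDispatcher(object):
        def __init__(self, auth_controller=None):
            self.auth_controller = auth_controller
            self.funcs = []

        def connect(self, name, route, func, **kwargs):
            # Pop the per-route flag so it is not passed on to the router
            allow_cookie_auth = kwargs.pop('android_workaround', False)
            if self.auth_controller is not None:
                # Wrap the handler so credentials are checked on every request
                func = self.auth_controller(func, allow_cookie_auth)
            self.funcs.append(func)

    class DummyAuth(object):
        # Stand-in for AuthController: returns a wrapper that would verify
        # the cookie or digest credentials before calling the handler.
        def __call__(self, func, allow_cookie_auth):
            def wrapper(*args, **kwargs):
                return func(*args, **kwargs)
            return wrapper

    d = SimpleDispatcher(auth_controller=DummyAuth())
    d.connect('get', '/get/{what}/{id}', lambda what, id: 'book',
              android_workaround=True)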


@ -41,7 +41,8 @@ class ContentServer(object):
connect('root', '/', self.index)
connect('old', '/old', self.old)
connect('get', '/get/{what}/{id}', self.get,
conditions=dict(method=["GET", "HEAD"]))
conditions=dict(method=["GET", "HEAD"]),
android_workaround=True)
connect('static', '/static/{name:.*?}', self.static,
conditions=dict(method=["GET", "HEAD"]))
connect('favicon', '/favicon.png', self.favicon,


@ -5,10 +5,12 @@ __license__ = 'GPL v3'
__copyright__ = '2010, Kovid Goyal <kovid@kovidgoyal.net>'
__docformat__ = 'restructuredtext en'
import time, sys
import time, sys, uuid, hashlib
from urllib import quote as quote_, unquote as unquote_
from functools import wraps
import cherrypy
from cherrypy.lib.auth_digest import digest_auth, get_ha1_dict_plain
from calibre import strftime as _strftime, prints, isbytestring
from calibre.utils.date import now as nowf
@ -40,6 +42,7 @@ class Offsets(object):
def expose(func):
@wraps(func)
def do(*args, **kwargs):
self = func.im_self
if self.opts.develop:
@ -54,10 +57,87 @@ def expose(func):
prints('\tTime:', func.__name__, time.time()-start)
return ans
do.__name__ = func.__name__
return do
class AuthController(object):
'''
Implement Digest authentication for the content server. Android browsers
cannot handle HTTP AUTH when downloading files, as the download is handed
off to a separate process. So we use a cookie based authentication scheme
for some endpoints (/get) to allow downloads to work on android. Apparently,
cookies are passed to the download process. The cookie expires after
MAX_AGE seconds.
The android browser appears to send a GET request to the server and only if
that request succeeds is the download handed off to the download process.
Therefore, even if the user clicks Get after MAX_AGE, it should still work.
In fact, we could reduce MAX_AGE, but we leave it high as the download
process might have downloads queued and therefore not start the download
immediately.
Note that this makes the server vulnerable to session-hijacking (i.e. someone
can sniff the traffic and create their own requests to /get with the
appropriate cookie, for an hour). The fix is to use https, but since this
is usually run as a private server, that cannot be done. If you care about
this vulnerability, run the server behind a reverse proxy that uses HTTPS.
'''
MAX_AGE = 3600 # Number of seconds after a successful digest auth for which
# the cookie auth will be allowed
def __init__(self, realm, users_dict):
self.realm = realm
self.users_dict = users_dict
self.secret = bytes(uuid.uuid4().hex)
self.cookie_name = 'android_workaround'
def hashit(self, raw):
return hashlib.sha1(raw).hexdigest()
def __call__(self, func, allow_cookie_auth):
@wraps(func)
def authenticate(*args, **kwargs):
cookie = cherrypy.request.cookie.get(self.cookie_name, None)
if not (allow_cookie_auth and self.is_valid(cookie)):
digest_auth(self.realm, get_ha1_dict_plain(self.users_dict),
self.secret)
cookie = cherrypy.response.cookie
cookie[self.cookie_name] = self.generate_cookie()
cookie[self.cookie_name]['path'] = '/'
cookie[self.cookie_name]['version'] = '1'
return func(*args, **kwargs)
authenticate.im_self = func.im_self
return authenticate
def generate_cookie(self, timestamp=None):
'''
Generate a cookie. The cookie contains a plain text timestamp and a
hash of the timestamp and the server secret.
'''
timestamp = int(time.time()) if timestamp is None else timestamp
key = self.hashit('%d:%s'%(timestamp, self.secret))
return '%d:%s'%(timestamp, key)
def is_valid(self, cookie):
'''
Check that cookie has not been spoofed (i.e. verify the declared
timestamp against the hashed timestamp). If the timestamps match, check
that the cookie has not expired. Return True iff the cookie has not
been spoofed and has not expired.
'''
try:
timestamp, hashpart = cookie.value.split(':', 1)
timestamp = int(timestamp)
except:
return False
s_timestamp, s_hashpart = self.generate_cookie(timestamp).split(':', 1)
is_valid = s_hashpart == hashpart
return (is_valid and (time.time() - timestamp) < self.MAX_AGE)
def strftime(fmt='%Y/%m/%d %H:%M:%S', dt=None):
if not hasattr(dt, 'timetuple'):
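Since the utils.py hunk above is dense, here is a standalone sketch of the timestamped-cookie scheme that generate_cookie() and is_valid() implement: a plain-text timestamp paired with a SHA-1 of the timestamp and a per-run server secret. The names below are illustrative, not the calibre API:

    import hashlib, time, uuid

    SECRET = uuid.uuid4().hex   # regenerated on every server start
    MAX_AGE = 3600              # seconds the cookie stays valid after digest auth

    def make_cookie(timestamp=None):
        timestamp = int(time.time()) if timestamp is None else timestamp
        digest = hashlib.sha1(('%d:%s' % (timestamp, SECRET)).encode('utf-8')).hexdigest()
        return '%d:%s' % (timestamp, digest)

    def cookie_is_valid(value):
        try:
            timestamp, digest = value.split(':', 1)
            timestamp = int(timestamp)
        except Exception:
            return False
        # Recompute the hash for the declared timestamp: a mismatch means the
        # cookie was spoofed; a match older than MAX_AGE means it has expired.
        expected = make_cookie(timestamp).split(':', 1)[1]
        return digest == expected and (time.time() - timestamp) < MAX_AGE

    assert cookie_is_valid(make_cookie())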


@ -381,6 +381,18 @@ that allows you to create collections on your Kindle from the |app| metadata. It
.. note:: Amazon have removed the ability to manipulate collections completely in their newer models, like the Kindle Touch and Kindle Fire, making even the above plugin useless. If you really want the ability to manage collections on your Kindle via a USB connection, we encourage you to complain to Amazon about it, or get a reader where this is supported, like the SONY Readers.
I am getting an error when I try to use |app| with my Kobo Touch?
~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
The Kobo Touch has very buggy firmware. Connecting to it has been known to fail at random. Certain combinations of motherboard, USB ports/cables/hubs can exacerbate this tendency to fail. If you are getting an error when connecting to your Kobo Touch with |app|, try the following, each of which has solved the problem for *some* |app| users.
* Connect the Kobo directly to your computer, not via USB Hub
* Try a different USB cable and a different USB port on your computer
* Try a different computer (preferably an older model)
* Try upgrading the firmware on your Kobo Touch to the latest
* Try resetting the Kobo (sometimes this cures the problem for a little while, but then it re-appears, in which case you have to reset again and again)
* Try only putting one or two books onto the Kobo at a time and do not keep large collections on the Kobo
Library Management
------------------


@ -73,7 +73,7 @@ Edit metadata
|emii| The :guilabel:`Edit metadata` action has four variations which can be accessed by doing a right-click on the button.
1. **Edit metadata individually**: Allows you to edit the metadata of books one-by-one with the option of fetching metadata, including covers, from the Internet. It also allows you to add or remove particular ebook formats from a book.
1. **Edit metadata individually**: Allows you to edit the metadata of books one-by-one with the option of fetching metadata, including covers, from the Internet. It also allows you to add or remove particular ebook formats from a book.
2. **Edit metadata in bulk**: Allows you to edit common metadata fields for large numbers of books simultaneously. It operates on all the books you have selected in the :ref:`Library view <search_sort>`.
3. **Download metadata and covers**: Downloads metadata and covers (if available) for the books that are selected in the book list.
4. **Merge book records**: Gives you the capability of merging the metadata and formats of two or more book records. You can choose to either delete or keep the records that were not clicked first.
@ -117,7 +117,7 @@ View
|vi| The :guilabel:`View` action displays the book in an ebook viewer program. |app| has a built-in viewer for many ebook formats.
For other formats it uses the default operating system application. You can configure which formats should open with the internal viewer via
Preferences->Behavior. If a book has more than one format, you can view a particular format by doing a right-click on the button.
Preferences->Behavior. If a book has more than one format, you can view a particular format by doing a right-click on the button.
.. _send_to_device:
@ -175,7 +175,7 @@ Library
5. **<library name>**: Actions 5, 6 etc... give you immediate switch access between multiple libraries that you have created or attached to. This list contains only the 5 most frequently used libraries. For the complete list, use the Quick Switch menu.
6. **Library maintenance**: Allows you to check the current library for data consistency issues and restore the current library's database from backups.
.. note:: Metadata about your ebooks, e.g. title, author, and tags, is stored in a single file in your |app| library folder called metadata.db. If this file gets corrupted (a very rare event), you can lose the metadata. Fortunately, |app| automatically backs up the metadata for every individual book in the book's folder as an OPF file. By using the Restore Library action under Library Maintenance described above, you can have |app| rebuild the metadata.db file from the individual OPF files for you.
.. note:: Metadata about your ebooks, e.g. title, author, and tags, is stored in a single file in your |app| library folder called metadata.db. If this file gets corrupted (a very rare event), you can lose the metadata. Fortunately, |app| automatically backs up the metadata for every individual book in the book's folder as an OPF file. By using the Restore Library action under Library Maintenance described above, you can have |app| rebuild the metadata.db file from the individual OPF files for you.
You can copy or move books between different libraries (once you have more than one library setup) by right clicking on the book and selecting the action :guilabel:`Copy to library`.
@ -235,7 +235,7 @@ Connect/Share
1. **Connect to folder**: Allows you to connect to any folder on your computer as though it were a device and use all the facilities |app| has for devices with that folder. Useful if your device cannot be supported by |app| but is available as a USB disk.
2. **Connect to iTunes**: Allows you to connect to your iTunes books database as though it were a device. Once the books are sent to iTunes, you can use iTunes to make them available to your various iDevices. This is useful if you would rather not have |app| send books to your iDevice directly.
2. **Connect to iTunes**: Allows you to connect to your iTunes books database as though it were a device. Once the books are sent to iTunes, you can use iTunes to make them available to your various iDevices.
3. **Start Content Server**: Starts |app|'s built-in web server. When started, your |app| library will be accessible via a web browser from the Internet (if you choose). You can configure how the web server is accessed by setting preferences at :guilabel:`Preferences->Sharing->Sharing over the net`
@ -338,9 +338,9 @@ Two other kinds of searches are available: equality search and search using `reg
Equality searches are indicated by prefixing the search string with an equals sign (=). For example, the query
``tag:"=science"`` will match "science", but not "science fiction" or "hard science". Regular expression searches are
indicated by prefixing the search string with a tilde (~). Any `python-compatible regular expression <http://docs.python.org/library/re.html>`_ can
be used. Note that backslashes used to escape special characters in regular expressions must be doubled because single backslashes will be removed during query parsing. For example, to match a literal parenthesis you must enter ``\\(``. Regular expression searches are 'contains' searches unless the expression contains anchors.
be used. Note that backslashes used to escape special characters in regular expressions must be doubled because single backslashes will be removed during query parsing. For example, to match a literal parenthesis you must enter ``\\(``. Regular expression searches are 'contains' searches unless the expression contains anchors.
Should you need to search for a string with a leading equals or tilde, prefix the string with a backslash.
Should you need to search for a string with a leading equals or tilde, prefix the string with a backslash.
Enclose search strings with quotes (") if the string contains parenthesis or spaces. For example, to search
for the tag ``Science Fiction`` you would need to search for ``tag:"=science fiction"``. If you search for
@ -362,7 +362,7 @@ The syntax for searching for dates is::
If the date is ambiguous, the current locale is used for date comparison. For example, in an mm/dd/yyyy
locale 2/1/2009 is interpreted as 1 Feb 2009. In a dd/mm/yyyy locale it is interpreted as 2 Jan 2009. Some
special date strings are available. The string ``today`` translates to today's date, whatever it is. The
strings ``yesterday`` and ``thismonth`` (or the translated equivalent in the current language) also work.
strings ``yesterday`` and ``thismonth`` (or the translated equivalent in the current language) also work.
In addition, the string ``daysago`` (also translated) can be used to compare to a date some number of days ago.
For example::


@ -167,7 +167,8 @@ class Worker(object):
'''
exe = self.gui_executable if self.gui else self.executable
env = self.env
env['ORIGWD'] = cwd or os.path.abspath(os.getcwd())
env[b'ORIGWD'] = binascii.hexlify(cPickle.dumps(cwd or
os.path.abspath(os.getcwdu())))
_cwd = cwd
if priority is None:
priority = prefs['worker_process_priority']


@ -5,7 +5,7 @@ __license__ = 'GPL v3'
__copyright__ = '2010, Kovid Goyal <kovid@kovidgoyal.net>'
__docformat__ = 'restructuredtext en'
import sys, cPickle, os
import sys, cPickle, os, binascii
from code import InteractiveInterpreter
from Queue import Queue, Empty
from threading import Thread
@ -130,7 +130,7 @@ class Interpreter(InteractiveInterpreter): # {{{
# }}}
def connect():
os.chdir(os.environ['ORIGWD'])
os.chdir(cPickle.loads(binascii.unhexlify(os.environ['ORIGWD'])))
address = cPickle.loads(unhexlify(os.environ['CALIBRE_WORKER_ADDRESS']))
key = unhexlify(os.environ['CALIBRE_WORKER_KEY'])
return Client(address, authkey=key)
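The ORIGWD change in the two hunks above swaps a raw path in the environment for a hex-encoded pickle, apparently so that the (possibly non-ASCII, os.getcwdu()) working directory survives the trip through environment variables. A standard-library round-trip sketch of that encoding:

    import binascii, os
    try:
        import cPickle as pickle   # Python 2, as used in the diff
    except ImportError:
        import pickle              # Python 3 fallback, just for running the sketch

    cwd = os.path.abspath(os.getcwd())
    encoded = binascii.hexlify(pickle.dumps(cwd))        # what goes into ORIGWD
    decoded = pickle.loads(binascii.unhexlify(encoded))  # what connect() recovers
    assert decoded == cwd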


@ -648,7 +648,10 @@ class BasicNewsRecipe(Recipe):
'url' : URL of print version,
'date' : The publication date of the article as a string,
'description' : A summary of the article
'content' : The full article (can be an empty string). This is used by FullContentProfile
'content' : The full article (can be an empty string). Obsolete,
do not use; instead save the content to a temporary
file and pass a file:///path/to/temp/file.html as
the URL.
}
For an example, see the recipe for downloading `The Atlantic`.
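A hedged sketch of the replacement pattern the new wording describes: write the article HTML to a temporary file and point the article's 'url' at it. PersistentTemporaryFile is calibre's helper for temporary files that outlive the function; the helper name article_entry and the file:/// URL construction below are illustrative, and any NamedTemporaryFile(delete=False) would serve as well:

    import os
    from calibre.ptempfile import PersistentTemporaryFile

    def article_entry(title, date, description, html):
        tf = PersistentTemporaryFile('_article.html')
        tf.write(html)   # html is assumed to be a byte string here
        tf.close()
        return {
            'title'       : title,
            'date'        : date,
            'description' : description,
            # Convert OS path separators so the result is a usable file:/// URL
            'url'         : 'file:///' + tf.name.replace(os.sep, '/'),
        }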


@ -33,7 +33,8 @@ qop_auth = 'auth'
qop_auth_int = 'auth-int'
valid_qops = (qop_auth, qop_auth_int)
valid_algorithms = ('MD5', 'MD5-sess')
valid_algorithms = ('MD5', 'MD5-sess', 'md5', 'md5-sess') # Changed by Kovid to
# add lowercase
def TRACE(msg):
@ -67,7 +68,7 @@ def get_ha1_dict(user_ha1_dict):
argument to digest_auth().
"""
def get_ha1(realm, username):
return user_ha1_dict.get(user)
return user_ha1_dict.get(username) # Changed by Kovid to fix typo
return get_ha1
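For reference, with a plain-text credentials store the HA1 value that get_ha1 returns is defined by RFC 2617, section 3.2.2.2, as MD5("username:realm:password"). A one-function sketch:

    import hashlib

    def ha1_plain(realm, username, password):
        # HA1 = MD5(username ":" realm ":" password), per RFC 2617
        raw = '%s:%s:%s' % (username, realm, password)
        return hashlib.md5(raw.encode('utf-8')).hexdigest()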
@ -107,10 +108,10 @@ def synthesize_nonce(s, key, timestamp=None):
key
A secret string known only to the server.
timestamp
An integer seconds-since-the-epoch timestamp
"""
if timestamp is None:
timestamp = int(time.time())
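The nonce format this docstring describes can be sketched in a few lines; this mirrors what synthesize_nonce() is documented to do (a timestamp plus a hash binding it to the resource string and server key), though the real function's exact hashing details may differ:

    import hashlib, time

    def make_nonce(s, key, timestamp=None):
        if timestamp is None:
            timestamp = int(time.time())
        # The hash lets validate_nonce() later verify both origin and age
        h = hashlib.md5(('%s:%s:%s' % (timestamp, s, key)).encode('utf-8')).hexdigest()
        return '%s:%s' % (timestamp, h)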
@ -190,10 +191,10 @@ class HttpDigestAuthorization (object):
s
A string related to the resource, such as the hostname of the server.
key
A secret string known only to the server.
Both s and key must be the same values which were used to synthesize the nonce
we are trying to validate.
"""
@ -256,7 +257,7 @@ class HttpDigestAuthorization (object):
4.3. This refers to the entity the user agent sent in the request which
has the Authorization header. Typically GET requests don't have an entity,
and POST requests do.
"""
ha2 = self.HA2(entity_body)
# Request-Digest -- RFC 2617 3.2.2.1
@ -302,16 +303,16 @@ def www_authenticate(realm, key, algorithm='MD5', nonce=None, qop=qop_auth, stal
def digest_auth(realm, get_ha1, key, debug=False):
"""A CherryPy tool which hooks at before_handler to perform
HTTP Digest Access Authentication, as specified in :rfc:`2617`.
If the request has an 'authorization' header with a 'Digest' scheme, this
tool authenticates the credentials supplied in that header. If
the request has no 'authorization' header, or if it does but the scheme is
not "Digest", or if authentication fails, the tool sends a 401 response with
a 'WWW-Authenticate' Digest header.
realm
A string containing the authentication realm.
get_ha1
A callable which looks up a username in a credentials store
and returns the HA1 string, which is defined in the RFC to be
@ -320,13 +321,13 @@ def digest_auth(realm, get_ha1, key, debug=False):
where username is obtained from the request's 'authorization' header.
If username is not found in the credentials store, get_ha1() returns
None.
key
A secret string known only to the server, used in the synthesis of nonces.
"""
request = cherrypy.serving.request
auth_header = request.headers.get('authorization')
nonce_is_stale = False
if auth_header is not None:
@ -334,10 +335,10 @@ def digest_auth(realm, get_ha1, key, debug=False):
auth = HttpDigestAuthorization(auth_header, request.method, debug=debug)
except ValueError:
raise cherrypy.HTTPError(400, "The Authorization header could not be parsed.")
if debug:
TRACE(str(auth))
if auth.validate_nonce(realm, key):
ha1 = get_ha1(realm, auth.username)
if ha1 is not None:
@ -355,7 +356,7 @@ def digest_auth(realm, get_ha1, key, debug=False):
if debug:
TRACE("authentication of %s successful" % auth.username)
return
# Respond with 401 status and a WWW-Authenticate header
header = www_authenticate(realm, key, stale=nonce_is_stale)
if debug: