This commit is contained in:
GRiker 2012-04-02 04:11:11 -06:00
commit bf2850019e
26 changed files with 776 additions and 337 deletions

View File

@ -13,7 +13,7 @@ class HighCountryNews(BasicNewsRecipe):
__author__ = 'Armin Geller' # 2012-01-31
publisher = 'High Country News'
timefmt = ' [%a, %d %b %Y]'
language = 'en-Us'
language = 'en'
encoding = 'UTF-8'
publication_type = 'newspaper'
oldest_article = 7

View File

@ -1,45 +1,73 @@
# Talking Points is not grabbing everything.
# The look is right, but only the last one is added?
import re
import string, re
import time
import traceback
# above for debugging via stack
from calibre.web.feeds.recipes import BasicNewsRecipe
# Allows the Python soup converter, which makes parsing easier.
from calibre.ebooks.BeautifulSoup import BeautifulSoup
# strip ads and graphics
# Current Column lacks a title.
# Talking Points Memo - shorten title - Remove year and Bill's name
# The newsletter archive https://www.billoreilly.com/newsletterarchive is covered by other entries.
# Newsletters: Talking Points Memos covered by cat12
import os, time, traceback, re, urlparse, sys, cStringIO
from collections import defaultdict
from functools import partial
from contextlib import nested, closing
from calibre.web.feeds import feed_from_xml, templates, feeds_from_index, Feed
from calibre.utils.threadpool import WorkRequest, ThreadPool, NoResultsPending
# To Do: strip ads and graphics, Current Column lacks a title.
# The newsletter archive https://www.billoreilly.com/newsletterarchive is covered by other entries.
# Newsletters: Talking Points Memos covered by cat12
# ./ebook-convert --username xxx --password xxx
# This is derived from BasicNewsRecipe, so it can only override what that class defines.
# Some of what we need is otherwise only in the article pages, so we have more copying to do than we otherwise would.
class OReillyPremium(BasicNewsRecipe):
title = u'OReilly Premium'
__author__ = 'TMcN'
language = 'en'
description = 'Retrieves Premium and Newsletter content from BillOReilly.com. Requires a Bill OReilly Premium Membership.'
cover_url = 'http://images.billoreilly.com/images/headers/billgray_header.png'
custom_title = 'Bill O\'Reilly Premium - '+ time.strftime('%d %b %Y')
title = 'Bill O\'Reilly Premium'
auto_cleanup = True
conversion_options = {'linearize_tables': True}
encoding = 'utf8'
needs_subscription = True
language = 'en'
no_stylesheets = True
oldest_article = 20
needs_subscription = True
oldest_article = 31
remove_javascript = True
remove_tags = [dict(name='img', attrs={})]
# Don't go down
recursions = 0
max_articles_per_feed = 2000
max_articles_per_feed = 20
debugMessages = True
# Name, URL, Soup FindAll Attr if relevant (last two are special case), articleList
catList = [ ["TV Archives", 'https://www.billoreilly.com/show?action=tvShowArchive', 'a', {'class':['showLinks','homeLinks']}, []],
["No Spin Archives", 'https://www.billoreilly.com/blog?categoryID=7', True, {'class':['blogBody'], 'style':['padding-top:10px;']}, []],
["Daily Briefings", 'http://www.billoreilly.com/blog?categoryID=11', True, {'class':['defaultHeaderSmallLinks']}, []],
["Stratfor", 'http://www.billoreilly.com/blog?categoryID=5', 'a', {'class':['blogLinks']}, []],
["Talking Points Memo", 'https://www.billoreilly.com/blog?categoryID=12', 'td', {}, []],
# ["No Spin Archives", 'https://www.billoreilly.com/blog?categoryID=7', True, {'class':['blogBody'], 'style':['padding-top:10px;']}, []],
# ["Daily Briefings", 'http://www.billoreilly.com/blog?categoryID=11', True, {'class':['defaultHeaderSmallLinks']}, []],
# ["Stratfor", 'http://www.billoreilly.com/blog?categoryID=5', 'a', {'class':['blogLinks']}, []],
# ["Talking Points Memo", 'https://www.billoreilly.com/blog?categoryID=12', 'td', {}, []],
["Current Column", 'https://www.billoreilly.com/currentcolumn', 'span', {'class':['defaultHeader']}, []]
]
feeds = [
(u'No Spin', u'http://www.billoreilly.com/blog?action=blogArchive&rss=true&categoryID=7'),
(u'Daily Briefing', u'http://www.billoreilly.com/blog?action=blogArchive&rss=true&categoryID=11'),
(u'Talking Points', u'https://www.billoreilly.com/blog?action=blogArchive&rss=true&categoryID=12'),
(u'Blog', u'http://www.billoreilly.com/blog?action=blogArchive&rss=true&categoryID=0'),
(u'StratFor', u'http://www.billoreilly.com/blog?action=blogArchive&rss=true&categoryID=5')
]
# http://www.billoreilly.com/blog?action=blogArchive&rss=true&categoryID=8 is word for the day.
# Note: Talking Points is broken in the above model; the site changed to a more Ajax-heavy design.
# Now using RSS
def get_browser(self):
print("In get_browser")
br = BasicNewsRecipe.get_browser()
if self.username is not None and self.password is not None:
br.open('https://www.billoreilly.com/pg/jsp/member/membersignin.jsp')
@ -48,7 +76,7 @@ class OReillyPremium(BasicNewsRecipe):
br['formPasswordField'] = self.password
br.submit()
return br
# Returns the best-guess print url.
# The second parameter (pageURL) is returned if nothing is found.
def extractPrintURL(self, baseURL, pageURL, printString):
@ -62,17 +90,19 @@ class OReillyPremium(BasicNewsRecipe):
tag = printText.parent
tagURL = baseURL+tag['href']
return tagURL
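# Illustrative use (argument values hypothetical): ask for the print version of
# a blog entry and fall back to the original page URL if no such link exists:
# printURL = self.extractPrintURL(baseURL, articleURL, "Print this entry")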
def stripBadChars(self, inString) :
return inString.replace("\'", "")
def parseGeneric(self, baseURL):
# Does a generic parsing of the articles. There are six categories (0-5)
# Does a generic parsing of the articles. There are six categories (0-5)
# Name, URL, Soup FindAll Attr if relevant (last two are special case), articleList
# NoSpin and TV are generic
fullReturn = []
for i in range(len(self.catList)) :
for i in range(len(self.catList)) :
articleList = []
print("In "+self.catList[i][0]+", index: "+ str(i))
soup = self.index_to_soup(self.catList[i][1])
# Set defaults
description = 'None'
@ -80,15 +110,13 @@ class OReillyPremium(BasicNewsRecipe):
# Problem: 0-2 create many in an array
# 3-5 create one.
# So no for-div for 3-5
if i < 3 :
if i == 0 :
print("Starting TV Archives")
for div in soup.findAll(self.catList[i][2], self.catList[i][3]):
print("Next DIV:")
print(div)
if i == 1:
a = div.find('a', href=True)
else :
a = div
print(a)
a = div
summary = div.find(True, attrs={'class':'summary'})
if summary:
description = self.tag_to_string(summary, use_alt=False)
@ -96,82 +124,63 @@ class OReillyPremium(BasicNewsRecipe):
continue
# url = baseURL+re.sub(r'\?.*', '', a['href'])
url = baseURL+a['href']
if i < 2 :
url = self.extractPrintURL(baseURL, url, "Print this entry")
title = self.tag_to_string(a, use_alt=True).strip()
elif i == 2 :
# Daily Briefs
url = self.extractPrintURL(baseURL, url, "Print this entry")
title = div.contents[0]
if self.debugMessages :
print(title+" @ "+url)
url = self.extractPrintURL(baseURL, url, "Print this entry")
title = self.tag_to_string(a, use_alt=True).strip()
articleList.append(dict(title=title, url=url, date=pubdate, description=description, content=''))
elif i == 3 : # Stratfor
a = soup.find('a', self.catList[i][3])
if a is None :
continue
url = baseURL+a['href']
title = self.tag_to_string(a, use_alt=True).strip()
# Get Stratfor contents so we can get the real title.
stratSoup = self.index_to_soup(url)
title = stratSoup.html.head.title.string
stratIndex = title.find('Stratfor.com:', 0)
if (stratIndex > -1) :
title = title[stratIndex+14:-1]
# Look for first blogBody <td class="blogBody"
# Changed 12 Jan 2012 - new page format
#stratBlogTable = stratSoup.find('td', {'class':['blogBody']}).findParent('table')
#stratBody = stratSoup.find('td', {'class':['blogBody']})
elif i == 4 : # Talking Points
topDate = soup.find("td", "blogBody")
if not topDate :
print("Failed to find date in Talking Points")
# This page has the contents in double-wrapped tables!
myTable = topDate.findParents('table')[0]
if myTable is not None:
upOneTable = myTable.findParents('table')[0]
if upOneTable is not None:
upTwo = upOneTable.findParents('table')[0]
if upTwo is None:
continue
# Now navigate rows of upTwo
if self.debugMessages :
print("Entering rows")
for rows in upTwo.findChildren("tr", recursive=False):
# Inside top level table, each row is an article
rowTable = rows.find("table")
articleTable = rowTable.find("table")
# This looks wrong.
articleTable = rows.find("tr")
# The middle table is just for formatting the article buffer... but this means we can skip the inner table.
blogDate = articleTable.find("a","blogDate").contents[0]
# Skip to second blogBody for this.
blogTitle = articleTable.findAll("td", "blogBody")[1].contents[0]
blogURL = articleTable.find("a", "homeBlogReadMore bold")['href']
url = baseURL+re.sub(r'\?.*', '', blogURL)
title = blogDate+": "+self.stripBadChars(blogTitle.replace("Bill O'Reilly: ", ""))
if self.debugMessages :
print("Talking Points Memo title "+title+" at url: "+url)
pubdate = time.strftime('%a, %d %b')
articleList.append(dict(title=title, url=url, date=pubdate, description='None', content=''))
else : # Current Column
titleSpan = soup.find(self.catList[i][2], self.catList[i][3])
if titleSpan is None :
print("No Current Column Title Span")
print(soup)
continue
title = titleSpan.contents[0]
url = self.extractPrintURL(baseURL, self.catList[i][1], "Print This Article")
if i == 3 or i == 5 :
if i == 1 :
if self.debugMessages :
print(self.catList[i][0]+" Title:"+title+" at url: "+url)
summary = div.find(True, attrs={'class':'summary'})
if summary:
print("At Summary")
print(summary)
if summary is not None:
description = self.tag_to_string(summary, use_alt=False)
print("At append")
articleList.append(dict(title=title, url=url, date=pubdate, description=description, content=''))
self.catList[i][3] = articleList
fullReturn.append((self.catList[i][0], articleList))
print("Returning")
# print fullReturn
return fullReturn
# build_index() starts with:
# try:
# feeds = feeds_from_index(self.parse_index(), oldest_article=self.oldest_article,
# max_articles_per_feed=self.max_articles_per_feed,
# log=self.log)
# self.report_progress(0, _('Got feeds from index page'))
# except NotImplementedError:
# feeds = self.parse_feeds()
# which in turn is from __init__.py
#def feeds_from_index(index, oldest_article=7, max_articles_per_feed=100,
# log=default_log):
#'''
#@param index: A parsed index as returned by L{BasicNewsRecipe.parse_index}.
#@return: A list of L{Feed} objects.
#@rtype: list
#'''
#feeds = []
#for title, articles in index:
# pfeed = Feed(log=log)
# pfeed.populate_from_preparsed_feed(title, articles, oldest_article=oldest_article,
# max_articles_per_feed=max_articles_per_feed)
# feeds.append(pfeed)
# return feeds
# use_embedded_content defaults to None; in that case, if the content is > 2K, it is used as the article.
# calibre.web.feeds.news.BasicNewsRecipe.parse_index() fetches the list of articles.
# returns a list of tuple ('feed title', list of articles)
# {
@ -182,16 +191,148 @@ class OReillyPremium(BasicNewsRecipe):
# 'content' : The full article (can be an empty string). This is used by FullContentProfile
# }
# this is used instead of BasicNewsRecipe.parse_feeds().
# it is called by download
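# A minimal sketch of the expected return value (titles/URLs hypothetical):
# [('TV Archives',
# [{'title': 'Show recap', 'url': 'https://www.billoreilly.com/show?...',
# 'date': time.strftime('%a, %d %b'), 'description': 'None', 'content': ''}]),
# ('Current Column', [...])]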
def parse_index(self):
# Parse the page into Python Soup
print("Entering recipe print_index from:")
traceback.print_stack()
print("web")
baseURL = "https://www.billoreilly.com"
return self.parseGeneric(baseURL)
masterList = self.parseGeneric(baseURL)
#print(masterList)
return masterList
def preprocess_html(self, soup):
print("In preprocess_html")
refresh = soup.find('meta', {'http-equiv':'refresh'})
if refresh is None:
return soup
content = refresh.get('content').partition('=')[2]
raw = self.browser.open('https://www.billoreilly.com'+content).read()
return BeautifulSoup(raw.decode('cp1252', 'replace'))
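# Handles pages that bounce through a redirect such as (illustrative):
# <meta http-equiv="refresh" content="0;URL=/printarticle?id=123">
# content.partition('=')[2] above yields "/printarticle?id=123", which is then
# re-fetched and re-parsed (decoded as cp1252).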
def build_index(self):
print("In OReilly build_index()\n\n")
feedsRSS = []
self.report_progress(0, _('Fetching feeds...'))
#try:
feeds = feeds_from_index(self.parse_index(), oldest_article=self.oldest_article,
max_articles_per_feed=self.max_articles_per_feed,
log=self.log)
self.report_progress(0, _('Got feeds from index page'))
#except NotImplementedError:
# feeds = self.parse_feeds()
# Now add regular feeds.
feedsRSS = self.parse_feeds()
print ("feedsRSS is type "+feedsRSS.__class__.__name__)
for articles in feedsRSS:
print("articles is type "+articles.__class__.__name__)
print("Title:" + articles.title)
feeds.append(articles)
if not feeds:
raise ValueError('No articles found, aborting')
#feeds = FeedCollection(feeds)
self.report_progress(0, _('Trying to download cover...'))
self.download_cover()
self.report_progress(0, _('Generating masthead...'))
self.masthead_path = None
try:
murl = self.get_masthead_url()
except:
self.log.exception('Failed to get masthead url')
murl = None
if murl is not None:
# Try downloading the user-supplied masthead_url
# Failure sets self.masthead_path to None
self.download_masthead(murl)
if self.masthead_path is None:
self.log.info("Synthesizing mastheadImage")
self.masthead_path = os.path.join(self.output_dir, 'mastheadImage.jpg')
try:
self.default_masthead_image(self.masthead_path)
except:
self.log.exception('Failed to generate default masthead image')
self.masthead_path = None
if self.test:
feeds = feeds[:2]
self.has_single_feed = len(feeds) == 1
index = os.path.join(self.output_dir, 'index.html')
html = self.feeds2index(feeds)
with open(index, 'wb') as fi:
fi.write(html)
self.jobs = []
if self.reverse_article_order:
for feed in feeds:
if hasattr(feed, 'reverse'):
feed.reverse()
self.feed_objects = feeds
for f, feed in enumerate(feeds):
feed_dir = os.path.join(self.output_dir, 'feed_%d'%f)
if not os.path.isdir(feed_dir):
os.makedirs(feed_dir)
for a, article in enumerate(feed):
if a >= self.max_articles_per_feed:
break
art_dir = os.path.join(feed_dir, 'article_%d'%a)
if not os.path.isdir(art_dir):
os.makedirs(art_dir)
try:
url = self.print_version(article.url)
except NotImplementedError:
url = article.url
except:
self.log.exception('Failed to find print version for: '+article.url)
url = None
if not url:
continue
func, arg = (self.fetch_embedded_article, article) \
if self.use_embedded_content or (self.use_embedded_content == None and feed.has_embedded_content()) \
else \
((self.fetch_obfuscated_article if self.articles_are_obfuscated \
else self.fetch_article), url)
req = WorkRequest(func, (arg, art_dir, f, a, len(feed)),
{}, (f, a), self.article_downloaded,
self.error_in_article_download)
req.feed = feed
req.article = article
req.feed_dir = feed_dir
self.jobs.append(req)
self.jobs_done = 0
tp = ThreadPool(self.simultaneous_downloads)
for req in self.jobs:
tp.putRequest(req, block=True, timeout=0)
self.report_progress(0, _('Starting download [%d thread(s)]...')%self.simultaneous_downloads)
while True:
try:
tp.poll()
time.sleep(0.1)
except NoResultsPending:
break
for f, feed in enumerate(feeds):
print("Writing feeds for "+feed.title)
html = self.feed2index(f,feeds)
feed_dir = os.path.join(self.output_dir, 'feed_%d'%f)
with open(os.path.join(feed_dir, 'index.html'), 'wb') as fi:
fi.write(html)
self.create_opf(feeds)
self.report_progress(1, _('Feeds downloaded to %s')%index)
return index

View File

@ -1,7 +1,9 @@
# Test with "\Program Files\Calibre2\ebook-convert.exe" RealClear.recipe .epub --test -vv --debug-pipeline debug
import string, re
import time
from urlparse import urlparse
from calibre.web.feeds.recipes import BasicNewsRecipe
from calibre.ebooks.BeautifulSoup import NavigableString
from calibre.ebooks.BeautifulSoup import BeautifulSoup, NavigableString
class RealClear(BasicNewsRecipe):
title = u'Real Clear'
@ -20,12 +22,13 @@ class RealClear(BasicNewsRecipe):
# Don't go down
recursions = 0
max_articles_per_feed = 400
debugMessages = False
# Numeric parameter is type, controls whether we look for
debugMessages = True
# Numeric parameter is type, controls whether we look for
feedsets = [
["Politics", "http://www.realclearpolitics.com/index.xml", 0],
["Science", "http://www.realclearscience.com/index.xml", 0],
["Politics", "http://www.realclearpolitics.com/index.xml", 0],
["Policy", "http://www.realclearpolicy.com/index.xml", 0],
["Science", "http://www.realclearscience.com/index.xml", 0],
["Tech", "http://www.realcleartechnology.com/index.xml", 0],
# The feedburner is essentially the same as the top feed, politics.
# ["Politics Burner", "http://feeds.feedburner.com/realclearpolitics/qlMj", 1],
@ -37,22 +40,37 @@ class RealClear(BasicNewsRecipe):
]
# Hints to extractPrintURL.
# First column is the URL snippet. Then the string to search for as text, and the attributes to look for above it. Start with attributes and drill down.
printhints = [
phUrlSnip, phLinkText, phMainSearch, phHrefSearch = range(4)
printhints = [ ["realclear", "", '' , 'printpage'],
["billoreilly.com", "Print this entry", 'a', ''],
["billoreilly.com", "Print This Article", 'a', ''],
["politico.com", "Print", 'a', 'share-print'],
["politico.com", "Print", 'a', 'share-print'],
["nationalreview.com", ">Print<", 'a', ''],
["reason.com", "", 'a', 'printer']
# The following are not supported due to JavaScripting, and would require obfuscated_article to handle
# forbes,
# forbes,
# usatoday - just prints with all current crap anyhow
]
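# Illustrative walk-through of one row (see extractPrintURL below): for a
# realclearpolitics.com article the first row matches on the "realclear"
# snippet; phLinkText is empty and phHrefSearch is "printpage", so the lookup
# becomes soup.find(href=re.compile("printpage")). The billoreilly.com rows
# instead search by link text, e.g. soup.find('a', text="Print this entry").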
# RCP - look for a strange compound. See http://www.realclearpolitics.com/articles/2012/01/24/in_speech_obama_to_call_for_fairness_--_and_four_more_years_112879.html
# The print link isn't obvious, and only the end is needed (the -full append), so maybe try that first?
# http://www.realclearpolitics.com/printpage/?url=http://www.realclearpolitics.com/articles/2012/01/24/in_speech_obama_to_call_for_fairness_--_and_four_more_years_112879-full.html
# Single page articles don't have a _full; e.g. http://www.realclearpolitics.com/articles/2012/01/25/obamas_green_robber_barons_112897.html
# Use the FULL PRINTPAGE URL; it formats it better too!
#
# NYT - try single page...
# Need special code - is it one page or several? Which URL?
# from http://www.nytimes.com/2012/01/22/business/apple-america-and-a-squeezed-middle-class.html?_r=1
# to http://www.nytimes.com/2012/01/22/business/apple-america-and-a-squeezed-middle-class.html?_r=1&pagewanted=all
# which is at link rel="canonical" and at <meta property="og:url" or look for "Single Page"
# Returns the best-guess print url.
# The second parameter (pageURL) is returned if nothing is found.
def extractPrintURL(self, pageURL):
tagURL = pageURL
baseParse = urlparse(pageURL)
baseURL = baseParse[0]+"://"+baseParse[1]
hintsCount =len(self.printhints)
for x in range(0,hintsCount):
if pageURL.find(self.printhints[x][0])== -1 :
@ -62,23 +80,37 @@ class RealClear(BasicNewsRecipe):
soup = self.index_to_soup(pageURL)
if soup is None:
return pageURL
if len(self.printhints[x][3])>0 and len(self.printhints[x][1]) == 0:
if len(self.printhints[x][self.phHrefSearch])>0 and len(self.printhints[x][self.phLinkText]) == 0:
# e.g. RealClear
if self.debugMessages == True :
print("search1")
print("Search by href: "+self.printhints[x][self.phHrefSearch])
printFind = soup.find(href=re.compile(self.printhints[x][self.phHrefSearch]))
elif len(self.printhints[x][3])>0 and len(self.printhints[x][1]) == 0:
if self.debugMessages == True :
print("Search 1: "+self.printhints[x][2]+" Attributes: ")
print(self.printhints[x][3])
printFind = soup.find(self.printhints[x][2], attrs=self.printhints[x][3])
elif len(self.printhints[x][3])>0 :
if self.debugMessages == True :
print("search2")
printFind = soup.find(self.printhints[x][2], attrs=self.printhints[x][3], text=self.printhints[x][1])
else :
if self.debugMessages == True:
print("Default Search: "+self.printhints[x][2]+" Text: "+self.printhints[x][1])
printFind = soup.find(self.printhints[x][2], text=self.printhints[x][1])
if printFind is None:
if self.debugMessages == True :
print("Not Found")
# print(soup)
print("end soup\n\n");
continue
print(printFind)
if isinstance(printFind, NavigableString)==False:
if printFind['href'] is not None:
print("Check "+printFind['href']+" for base of "+baseURL)
if printFind['href'].find("http")!=0 :
return baseURL+printFind['href']
return printFind['href']
tag = printFind.parent
print(tag)
@ -98,7 +130,7 @@ class RealClear(BasicNewsRecipe):
print("In get_browser")
br = BasicNewsRecipe.get_browser()
return br
def parseRSS(self, index) :
if self.debugMessages == True :
print("\n\nStarting "+self.feedsets[index][0])
@ -128,7 +160,7 @@ class RealClear(BasicNewsRecipe):
pubDateEl = div.find("pubDate")
if pubDateEl is None :
pubDateEl = div.find("pubdate")
if pubDateEl is None :
if pubDateEl is None :
pubDate = time.strftime('%a, %d %b')
else :
pubDate = pubDateEl.contents[0]
@ -144,7 +176,7 @@ class RealClear(BasicNewsRecipe):
pubdate = time.strftime('%a, %d %b')
articleList.append(dict(title=title, url=url, date=pubdate, description=description, content=''))
return articleList
# calibre.web.feeds.news.BasicNewsRecipe.parse_index() fetches the list of articles.
# returns a list of tuple ('feed title', list of articles)
# {
@ -157,7 +189,8 @@ class RealClear(BasicNewsRecipe):
# this is used instead of BasicNewsRecipe.parse_feeds().
def parse_index(self):
# Parse the page into Python Soup
articleList = []
ans = []
feedsCount = len(self.feedsets)
for x in range(0,feedsCount): # should be ,4
@ -167,4 +200,5 @@ class RealClear(BasicNewsRecipe):
if self.debugMessages == True :
print(ans)
return ans

View File

@ -15,6 +15,8 @@ class Soldiers(BasicNewsRecipe):
max_articles_per_feed = 100
no_stylesheets = True
use_embedded_content = False
auto_cleanup = True
auto_cleanup_keep = '//div[@id="mediaWrapper"]'
simultaneous_downloads = 1
delay = 4
max_connections = 1
@ -31,14 +33,14 @@ class Soldiers(BasicNewsRecipe):
, 'language' : language
}
keep_only_tags = [dict(name='div', attrs={'id':['storyHeader','textArea']})]
#keep_only_tags = [dict(name='div', attrs={'id':['storyHeader','textArea']})]
remove_tags = [
dict(name='div', attrs={'id':['addThis','comment','articleFooter']})
,dict(name=['object','link'])
]
#remove_tags = [
#dict(name='div', attrs={'id':['addThis','comment','articleFooter']})
#,dict(name=['object','link'])
#]
feeds = [(u'Frontpage', u'http://www.army.mil/rss/feeds/soldiersfrontpage.xml' )]
feeds = [(u'Frontpage', u'http://www.army.mil/rss/2/' )]
def get_cover_url(self):

View File

@ -14,7 +14,7 @@ from setup.build_environment import msvc, MT, RC
from setup.installer.windows.wix import WixMixIn
OPENSSL_DIR = r'Q:\openssl'
QT_DIR = 'Q:\\Qt\\4.8.0'
QT_DIR = 'Q:\\Qt\\4.8.1'
QT_DLLS = ['Core', 'Gui', 'Network', 'Svg', 'WebKit', 'Xml', 'XmlPatterns']
LIBUNRAR = 'C:\\Program Files\\UnrarDLL\\unrar.dll'
SW = r'C:\cygwin\home\kovid\sw'

View File

@ -107,6 +107,7 @@ class ANDROID(USBMS):
0xc004 : [0x0226],
0x8801 : [0x0226, 0x0227],
0xe115 : [0x0216], # PocketBook A10
0xe107 : [0x326], # PocketBook 622
},
# Acer

View File

@ -0,0 +1,95 @@
#!/usr/bin/env python
# vim:fileencoding=UTF-8:ts=4:sw=4:sta:et:sts=4:ai
from __future__ import (unicode_literals, division, absolute_import,
print_function)
__license__ = 'GPL v3'
__copyright__ = '2012, Kovid Goyal <kovid@kovidgoyal.net>'
__docformat__ = 'restructuredtext en'
import os
from threading import Event
from io import BytesIO
from calibre.utils.date import as_utc
from calibre.ebooks.metadata.sources.identify import identify, msprefs
from calibre.ebooks.metadata.book.base import Metadata
from calibre.customize.ui import metadata_plugins
from calibre.ebooks.metadata.sources.covers import download_cover
from calibre.utils.logging import GUILog
from calibre.ebooks.metadata.opf2 import metadata_to_opf, OPF
def merge_result(oldmi, newmi, ensure_fields=None):
dummy = Metadata(_('Unknown'))
for f in msprefs['ignore_fields']:
if ':' in f or (ensure_fields and f in ensure_fields):
continue
setattr(newmi, f, getattr(dummy, f))
fields = set()
for plugin in metadata_plugins(['identify']):
fields |= plugin.touched_fields
def is_equal(x, y):
if hasattr(x, 'tzinfo'):
x = as_utc(x)
if hasattr(y, 'tzinfo'):
y = as_utc(y)
return x == y
for f in fields:
# Optimize so that set_metadata does not have to do extra work later
if not f.startswith('identifier:'):
if (not newmi.is_null(f) and is_equal(getattr(newmi, f),
getattr(oldmi, f))):
setattr(newmi, f, getattr(dummy, f))
return newmi
def main(do_identify, covers, metadata, ensure_fields):
failed_ids = set()
failed_covers = set()
all_failed = True
log = GUILog()
for book_id, mi in metadata.iteritems():
mi = OPF(BytesIO(mi), basedir=os.getcwdu(),
populate_spine=False).to_book_metadata()
title, authors, identifiers = mi.title, mi.authors, mi.identifiers
cdata = None
log.clear()
if do_identify:
results = []
try:
results = identify(log, Event(), title=title, authors=authors,
identifiers=identifiers)
except:
pass
if results:
all_failed = False
mi = merge_result(mi, results[0], ensure_fields=ensure_fields)
identifiers = mi.identifiers
if not mi.is_null('rating'):
# set_metadata expects a rating out of 10
mi.rating *= 2
with open('%d.mi'%book_id, 'wb') as f:
f.write(metadata_to_opf(mi, default_lang='und'))
else:
log.error('Failed to download metadata for', title)
failed_ids.add(book_id)
if covers:
cdata = download_cover(log, title=title, authors=authors,
identifiers=identifiers)
if cdata is None:
failed_covers.add(book_id)
else:
with open('%d.cover'%book_id, 'wb') as f:
f.write(cdata[-1])
all_failed = False
with open('%d.log'%book_id, 'wb') as f:
f.write(log.plain_text.encode('utf-8'))
return failed_ids, failed_covers, all_failed
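# Sketch of the on-disk protocol this worker follows (consumed by the
# bulk-download GUI code later in this commit): for every book_id it writes,
# into the job's working directory,
# <book_id>.mi - OPF-serialized merged metadata (only if identify succeeded)
# <book_id>.cover - raw cover bytes (only if a cover was downloaded)
# <book_id>.log - per-book log text, later concatenated into one download log
# and returns (failed_ids, failed_covers, all_failed) to the parent process.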

View File

@ -10,13 +10,19 @@ __docformat__ = 'restructuredtext en'
import struct, re, os, imghdr
from collections import namedtuple
from itertools import repeat
from urlparse import urldefrag
from lxml import etree
from calibre.ebooks.mobi.reader.headers import NULL_INDEX
from calibre.ebooks.mobi.reader.index import read_index
from calibre.ebooks.mobi.reader.ncx import read_ncx, build_toc
from calibre.ebooks.mobi.reader.markup import expand_mobi8_markup
from calibre.ebooks.metadata.opf2 import Guide, OPFCreator
from calibre.ebooks.metadata.toc import TOC
from calibre.ebooks.mobi.utils import read_font_record
from calibre.ebooks.oeb.parse_utils import parse_html
from calibre.ebooks.oeb.base import XPath, XHTML, xml2text
Part = namedtuple('Part',
'num type filename start end aid')
@ -383,6 +389,19 @@ class Mobi8Reader(object):
len(resource_map)):
mi.cover = resource_map[self.cover_offset]
if len(list(toc)) < 2:
self.log.warn('KF8 has no metadata Table of Contents')
for ref in guide:
if ref.type == 'toc':
href = ref.href()
href, frag = urldefrag(href)
if os.path.exists(href.replace('/', os.sep)):
try:
toc = self.read_inline_toc(href, frag)
except:
self.log.exception('Failed to read inline ToC')
opf = OPFCreator(os.getcwdu(), mi)
opf.guide = guide
@ -397,4 +416,70 @@ class Mobi8Reader(object):
opf.render(of, ncx, 'toc.ncx')
return 'metadata.opf'
def read_inline_toc(self, href, frag):
ans = TOC()
base_href = '/'.join(href.split('/')[:-1])
with open(href.replace('/', os.sep), 'rb') as f:
raw = f.read().decode(self.header.codec)
root = parse_html(raw, log=self.log)
body = XPath('//h:body')(root)
reached = False
if body:
start = body[0]
else:
start = None
reached = True
if frag:
elems = XPath('//*[@id="%s"]'%frag)
if elems:
start = elems[0]
def node_depth(elem):
ans = 0
parent = elem.getparent()
while parent is not None:
parent = parent.getparent()
ans += 1
return ans
# Layer the ToC based on nesting order in the source HTML
current_depth = None
parent = ans
seen = set()
links = []
for elem in root.iterdescendants(etree.Element):
if reached and elem.tag == XHTML('a') and elem.get('href',
False):
href = elem.get('href')
href, frag = urldefrag(href)
href = base_href + '/' + href
text = xml2text(elem).strip()
if (text, href, frag) in seen:
continue
seen.add((text, href, frag))
links.append((text, href, frag, node_depth(elem)))
elif elem is start:
reached = True
depths = sorted(set(x[-1] for x in links))
depth_map = {x:i for i, x in enumerate(depths)}
for text, href, frag, depth in links:
depth = depth_map[depth]
if current_depth is None:
current_depth = 0
parent.add_item(href, frag, text)
elif current_depth == depth:
parent.add_item(href, frag, text)
elif current_depth < depth:
parent = parent[-1] if len(parent) > 0 else parent
parent.add_item(href, frag, text)
current_depth += 1
else:
delta = current_depth - depth
while delta > 0 and parent.parent is not None:
parent = parent.parent
delta -= 1
parent.add_item(href, frag, text)
current_depth = depth
return ans
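# A minimal standalone sketch (not calibre code) of the layering idea above:
# raw DOM depths are compressed to 0, 1, 2, ... and each link is nested at most
# one level deeper than the previous one, or re-parented when it is shallower.
def layer_links(links):
    # links: iterable of (text, depth_in_source_html) in document order
    depths = sorted({d for _, d in links})
    depth_map = {d: i for i, d in enumerate(depths)}
    root = []          # each node is [text, children]
    stack = [root]     # stack[-1] is the list currently being appended to
    current = None
    for text, raw in links:
        depth = depth_map[raw]
        if current is not None and depth > current:
            if stack[-1]:
                stack.append(stack[-1][-1][1])  # descend into the last node
            current += 1
        elif current is not None and depth < current:
            for _ in range(min(current - depth, len(stack) - 1)):
                stack.pop()                     # climb back up the difference
            current = depth
        elif current is None:
            current = 0
        stack[-1].append([text, []])
    return root
# layer_links([('Part I', 2), ('Ch 1', 4), ('Ch 2', 4), ('Part II', 2)])
# -> [['Part I', [['Ch 1', []], ['Ch 2', []]]], ['Part II', []]]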

View File

@ -40,27 +40,34 @@ def get_custom_size(opts):
custom_size = None
return custom_size
def get_pdf_printer(opts, for_comic=False):
def get_pdf_printer(opts, for_comic=False, output_file_name=None):
from calibre.gui2 import is_ok_to_use_qt
if not is_ok_to_use_qt():
raise Exception('Not OK to use Qt')
printer = QPrinter(QPrinter.HighResolution)
custom_size = get_custom_size(opts)
if opts.output_profile.short_name == 'default' or \
opts.output_profile.width > 9999:
if custom_size is None:
printer.setPaperSize(paper_size(opts.paper_size))
else:
printer.setPaperSize(QSizeF(custom_size[0], custom_size[1]), unit(opts.unit))
if isosx and not for_comic:
# On OSX, the native engine can only produce a single page size
# (usually A4). The Qt engine on the other hand produces image based
# PDFs. If we set a custom page size using QSizeF the native engine
# produces unreadable output, so we just ignore the custom size
# settings.
printer.setPaperSize(paper_size(opts.paper_size))
else:
w = opts.output_profile.comic_screen_size[0] if for_comic else \
opts.output_profile.width
h = opts.output_profile.comic_screen_size[1] if for_comic else \
opts.output_profile.height
dpi = opts.output_profile.dpi
printer.setPaperSize(QSizeF(float(w) / dpi, float(h) / dpi), QPrinter.Inch)
if opts.output_profile.short_name == 'default' or \
opts.output_profile.width > 9999:
if custom_size is None:
printer.setPaperSize(paper_size(opts.paper_size))
else:
printer.setPaperSize(QSizeF(custom_size[0], custom_size[1]), unit(opts.unit))
else:
w = opts.output_profile.comic_screen_size[0] if for_comic else \
opts.output_profile.width
h = opts.output_profile.comic_screen_size[1] if for_comic else \
opts.output_profile.height
dpi = opts.output_profile.dpi
printer.setPaperSize(QSizeF(float(w) / dpi, float(h) / dpi), QPrinter.Inch)
if for_comic:
# Comic pages typically have their own margins, or their background
@ -72,6 +79,12 @@ def get_pdf_printer(opts, for_comic=False):
printer.setOrientation(orientation(opts.orientation))
printer.setOutputFormat(QPrinter.PdfFormat)
printer.setFullPage(for_comic)
if output_file_name:
printer.setOutputFileName(output_file_name)
if isosx and not for_comic:
# Ensure we are not generating enormous image based PDFs
printer.setOutputFormat(QPrinter.NativeFormat)
return printer
def get_printer_page_size(opts, for_comic=False):
@ -163,15 +176,7 @@ class PDFWriter(QObject): # {{{
if ok:
item_path = os.path.join(self.tmp_path, '%i.pdf' % len(self.combine_queue))
self.logger.debug('\tRendering item %s as %i.pdf' % (os.path.basename(str(self.view.url().toLocalFile())), len(self.combine_queue)))
printer = get_pdf_printer(self.opts)
printer.setOutputFileName(item_path)
# We have to set the engine to Native on OS X after the call to set
# filename. Setting a filename with .pdf as the extension causes
# Qt to set the format to use Qt's PDF engine even if native was
# previously set on the printer. Qt's PDF engine produces image
# based PDFs on OS X, so we cannot use it.
if isosx:
printer.setOutputFormat(QPrinter.NativeFormat)
printer = get_pdf_printer(self.opts, output_file_name=item_path)
self.view.page().mainFrame().evaluateJavaScript('''
document.body.style.backgroundColor = "white";
@ -193,10 +198,7 @@ class PDFWriter(QObject): # {{{
if self.cover_data is None:
return
item_path = os.path.join(self.tmp_path, 'cover.pdf')
printer = get_pdf_printer(self.opts)
printer.setOutputFileName(item_path)
if isosx:
printer.setOutputFormat(QPrinter.NativeFormat)
printer = get_pdf_printer(self.opts, output_file_name=item_path)
self.combine_queue.insert(0, item_path)
p = QPixmap()
p.loadFromData(self.cover_data)
@ -248,10 +250,8 @@ class ImagePDFWriter(object):
os.remove(f.name)
def render_images(self, outpath, mi, items):
printer = get_pdf_printer(self.opts, for_comic=True)
printer.setOutputFileName(outpath)
if isosx:
printer.setOutputFormat(QPrinter.NativeFormat)
printer = get_pdf_printer(self.opts, for_comic=True,
output_file_name=outpath)
printer.setDocName(mi.title)
printer.setCreator(u'%s [%s]'%(__appname__, __version__))
# Seems to be no way to set author

View File

@ -105,6 +105,7 @@ gprefs.defaults['show_files_after_save'] = True
gprefs.defaults['auto_add_path'] = None
gprefs.defaults['auto_add_check_for_duplicates'] = False
gprefs.defaults['blocked_auto_formats'] = []
gprefs.defaults['auto_add_auto_convert'] = True
# }}}
NONE = QVariant() #: Null value to return from the data function of item models

View File

@ -71,7 +71,7 @@ class AddAction(InterfaceAction):
ma('add-formats', _('Add files to selected book records'),
triggered=self.add_formats, shortcut=_('Shift+A'))
self.add_menu.addSeparator()
ma('add-config', _('Configure the adding of books'),
ma('add-config', _('Control the adding of books'),
triggered=self.add_config)
self.qaction.triggered.connect(self.add_books)

View File

@ -53,6 +53,24 @@ class ConvertAction(InterfaceAction):
self.queue_convert_jobs(jobs, changed, bad, rows, previous,
self.book_auto_converted, extra_job_args=[on_card])
def auto_convert_auto_add(self, book_ids):
previous = self.gui.library_view.currentIndex()
db = self.gui.current_db
needed = set()
of = prefs['output_format'].lower()
for book_id in book_ids:
fmts = db.formats(book_id, index_is_id=True)
fmts = set(x.lower() for x in fmts.split(',')) if fmts else set()
if of not in fmts:
needed.add(book_id)
if needed:
jobs, changed, bad = convert_single_ebook(self.gui,
self.gui.library_view.model().db, needed, True, of,
show_no_format_warning=False)
if not jobs: return
self.queue_convert_jobs(jobs, changed, bad, list(needed), previous,
self.book_converted, rows_are_ids=True)
def auto_convert_mail(self, to, fmts, delete_from_library, book_ids, format, subject):
previous = self.gui.library_view.currentIndex()
rows = [x.row() for x in \
@ -118,7 +136,7 @@ class ConvertAction(InterfaceAction):
num, 2000)
def queue_convert_jobs(self, jobs, changed, bad, rows, previous,
converted_func, extra_job_args=[]):
converted_func, extra_job_args=[], rows_are_ids=False):
for func, args, desc, fmt, id, temp_files in jobs:
func, _, same_fmt = func.partition(':')
same_fmt = same_fmt == 'same_fmt'
@ -140,7 +158,11 @@ class ConvertAction(InterfaceAction):
self.conversion_jobs[job] = tuple(args)
if changed:
self.gui.library_view.model().refresh_rows(rows)
m = self.gui.library_view.model()
if rows_are_ids:
m.refresh_ids(rows)
else:
m.refresh_rows(rows)
current = self.gui.library_view.currentIndex()
self.gui.library_view.model().current_changed(current, previous)

View File

@ -5,7 +5,7 @@ __license__ = 'GPL v3'
__copyright__ = '2010, Kovid Goyal <kovid@kovidgoyal.net>'
__docformat__ = 'restructuredtext en'
import os
import os, shutil
from functools import partial
from PyQt4.Qt import QMenu, QModelIndex, QTimer
@ -16,6 +16,7 @@ from calibre.gui2.dialogs.confirm_delete import confirm
from calibre.gui2.dialogs.device_category_editor import DeviceCategoryEditor
from calibre.gui2.actions import InterfaceAction
from calibre.ebooks.metadata import authors_to_string
from calibre.ebooks.metadata.opf2 import OPF
from calibre.utils.icu import sort_key
from calibre.db.errors import NoSuchFormat
@ -79,14 +80,23 @@ class EditMetadataAction(InterfaceAction):
Dispatcher(self.metadata_downloaded),
ensure_fields=ensure_fields)
def cleanup_bulk_download(self, tdir):
try:
shutil.rmtree(tdir, ignore_errors=True)
except:
pass
def metadata_downloaded(self, job):
if job.failed:
self.gui.job_exception(job, dialog_title=_('Failed to download metadata'))
return
from calibre.gui2.metadata.bulk_download import get_job_details
id_map, failed_ids, failed_covers, all_failed, det_msg = \
get_job_details(job)
(aborted, id_map, tdir, log_file, failed_ids, failed_covers, all_failed,
det_msg, lm_map) = get_job_details(job)
if aborted:
return self.cleanup_bulk_download(tdir)
if all_failed:
self.cleanup_bulk_download(tdir)
return error_dialog(self.gui, _('Download failed'),
_('Failed to download metadata or covers for any of the %d'
' book(s).') % len(id_map), det_msg=det_msg, show=True)
@ -103,28 +113,26 @@ class EditMetadataAction(InterfaceAction):
msg += '<p>'+_('Could not download metadata and/or covers for %d of the books. Click'
' "Show details" to see which books.')%num
payload = (id_map, failed_ids, failed_covers)
payload = (id_map, tdir, log_file, lm_map)
from calibre.gui2.dialogs.message_box import ProceedNotification
p = ProceedNotification(self.apply_downloaded_metadata,
payload, job.html_details,
payload, log_file,
_('Download log'), _('Download complete'), msg,
det_msg=det_msg, show_copy_button=show_copy_button,
parent=self.gui)
parent=self.gui, log_is_file=True)
p.show()
def apply_downloaded_metadata(self, payload):
id_map, failed_ids, failed_covers = payload
id_map = dict([(k, v) for k, v in id_map.iteritems() if k not in
failed_ids])
if not id_map:
good_ids, tdir, log_file, lm_map = payload
if not good_ids:
return
modified = set()
db = self.gui.current_db
for i, mi in id_map.iteritems():
for i in good_ids:
lm = db.metadata_last_modified(i, index_is_id=True)
if lm > mi.last_modified:
if lm > lm_map[i]:
title = db.title(i, index_is_id=True)
authors = db.authors(i, index_is_id=True)
if authors:
@ -144,7 +152,18 @@ class EditMetadataAction(InterfaceAction):
'Do you want to proceed?'), det_msg='\n'.join(modified)):
return
self.apply_metadata_changes(id_map)
id_map = {}
for bid in good_ids:
opf = os.path.join(tdir, '%d.mi'%bid)
if not os.path.exists(opf):
opf = None
cov = os.path.join(tdir, '%d.cover'%bid)
if not os.path.exists(cov):
cov = None
id_map[bid] = (opf, cov)
self.apply_metadata_changes(id_map, callback=lambda x:
self.cleanup_bulk_download(tdir))
# }}}
@ -468,6 +487,11 @@ class EditMetadataAction(InterfaceAction):
callback can be either None or a function accepting a single argument,
in which case it is called after applying is complete with the list of
changed ids.
id_map can also be a mapping of ids to 2-tuples where each 2-tuple
contains the absolute paths to an OPF and cover file respectively. If
either of the paths is None, then the corresponding metadata is not
updated.
'''
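# e.g. (illustrative paths only):
# id_map = {12: ('/tmp/bulk/12.mi', '/tmp/bulk/12.cover'),
# 13: ('/tmp/bulk/13.mi', None)}  # second book: metadata only, no new cover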
if title is None:
title = _('Applying changed metadata')
@ -492,28 +516,48 @@ class EditMetadataAction(InterfaceAction):
return self.finalize_apply()
i, mi = self.apply_id_map[self.apply_current_idx]
if isinstance(mi, tuple):
opf, cover = mi
if opf:
mi = OPF(open(opf, 'rb'), basedir=os.path.dirname(opf),
populate_spine=False).to_book_metadata()
self.apply_mi(i, mi)
if cover:
self.gui.current_db.set_cover(i, open(cover, 'rb'),
notify=False, commit=False)
else:
self.apply_mi(i, mi)
self.apply_current_idx += 1
if self.apply_pd is not None:
self.apply_pd.value += 1
QTimer.singleShot(50, self.do_one_apply)
def apply_mi(self, book_id, mi):
db = self.gui.current_db
try:
set_title = not mi.is_null('title')
set_authors = not mi.is_null('authors')
idents = db.get_identifiers(i, index_is_id=True)
idents = db.get_identifiers(book_id, index_is_id=True)
if mi.identifiers:
idents.update(mi.identifiers)
mi.identifiers = idents
if mi.is_null('series'):
mi.series_index = None
if self._am_merge_tags:
old_tags = db.tags(i, index_is_id=True)
old_tags = db.tags(book_id, index_is_id=True)
if old_tags:
tags = [x.strip() for x in old_tags.split(',')] + (
mi.tags if mi.tags else [])
mi.tags = list(set(tags))
db.set_metadata(i, mi, commit=False, set_title=set_title,
db.set_metadata(book_id, mi, commit=False, set_title=set_title,
set_authors=set_authors, notify=False)
self.applied_ids.append(i)
self.applied_ids.append(book_id)
except:
import traceback
self.apply_failures.append((i, traceback.format_exc()))
self.apply_failures.append((book_id, traceback.format_exc()))
try:
if mi.cover:
@ -521,11 +565,6 @@ class EditMetadataAction(InterfaceAction):
except:
pass
self.apply_current_idx += 1
if self.apply_pd is not None:
self.apply_pd.value += 1
QTimer.singleShot(50, self.do_one_apply)
def finalize_apply(self):
db = self.gui.current_db
db.commit()

View File

@ -113,6 +113,7 @@ class Worker(Thread):
class AutoAdder(QObject):
metadata_read = pyqtSignal(object)
auto_convert = pyqtSignal(object)
def __init__(self, path, parent):
QObject.__init__(self, parent)
@ -124,6 +125,8 @@ class AutoAdder(QObject):
self.metadata_read.connect(self.add_to_db,
type=Qt.QueuedConnection)
QTimer.singleShot(2000, self.initialize)
self.auto_convert.connect(self.do_auto_convert,
type=Qt.QueuedConnection)
elif path:
prints(path,
'is not a valid directory to watch for new ebooks, ignoring')
@ -163,6 +166,7 @@ class AutoAdder(QObject):
needs_rescan = False
duplicates = []
added_ids = set()
for fname, tdir in data.iteritems():
paths = [os.path.join(self.worker.path, fname)]
@ -187,9 +191,12 @@ class AutoAdder(QObject):
continue
mi = [OPF(open(mi, 'rb'), tdir,
populate_spine=False).to_book_metadata()]
dups, num = m.add_books(paths,
dups, ids = m.add_books(paths,
[os.path.splitext(fname)[1][1:].upper()], mi,
add_duplicates=not gprefs['auto_add_check_for_duplicates'])
add_duplicates=not gprefs['auto_add_check_for_duplicates'],
return_ids=True)
added_ids |= set(ids)
num = len(ids)
if dups:
path = dups[0][0]
with open(os.path.join(tdir, 'dup_cache.'+dups[1][0].lower()),
@ -217,8 +224,10 @@ class AutoAdder(QObject):
_('Books with the same title as the following already '
'exist in the database. Add them anyway?'),
'\n'.join(files)):
dups, num = m.add_books(paths, formats, metadata,
add_duplicates=True)
dups, ids = m.add_books(paths, formats, metadata,
add_duplicates=True, return_ids=True)
added_ids |= set(ids)
num = len(ids)
count += num
for tdir in data.itervalues():
@ -227,6 +236,9 @@ class AutoAdder(QObject):
except:
pass
if added_ids and gprefs['auto_add_auto_convert']:
self.auto_convert.emit(added_ids)
if count > 0:
m.books_added(count)
gui.status_bar.show_message(_(
@ -238,4 +250,7 @@ class AutoAdder(QObject):
if needs_rescan:
QTimer.singleShot(2000, self.dir_changed)
def do_auto_convert(self, added_ids):
gui = self.parent()
gui.iactions['Convert Books'].auto_convert_auto_add(added_ids)

View File

@ -160,7 +160,7 @@ class ProceedNotification(MessageBox): # {{{
def __init__(self, callback, payload, html_log, log_viewer_title, title, msg,
det_msg='', show_copy_button=False, parent=None,
cancel_callback=None):
cancel_callback=None, log_is_file=False):
'''
A non modal popup that notifies the user that a background task has
been completed.
@ -175,12 +175,15 @@ class ProceedNotification(MessageBox): # {{{
:param title: The title for this popup
:param msg: The msg to display
:param det_msg: Detailed message
:param log_is_file: If True the html_log parameter is interpreted as
the path to a file on disk containing the log encoded with utf-8
'''
MessageBox.__init__(self, MessageBox.QUESTION, title, msg,
det_msg=det_msg, show_copy_button=show_copy_button,
parent=parent)
self.payload = payload
self.html_log = html_log
self.log_is_file = log_is_file
self.log_viewer_title = log_viewer_title
self.vlb = self.bb.addButton(_('View log'), self.bb.ActionRole)
@ -192,7 +195,11 @@ class ProceedNotification(MessageBox): # {{{
_proceed_memory.append(self)
def show_log(self):
self.log_viewer = ViewLog(self.log_viewer_title, self.html_log,
log = self.html_log
if self.log_is_file:
with open(log, 'rb') as f:
log = f.read().decode('utf-8')
self.log_viewer = ViewLog(self.log_viewer_title, log,
parent=self)
def do_proceed(self, result):
@ -202,9 +209,9 @@ class ProceedNotification(MessageBox): # {{{
gui = get_gui()
gui.proceed_requested.emit(func, self.payload)
# Ensure this notification is garbage collected
self.vlb.clicked.disconnect()
self.callback = self.cancel_callback = self.payload = None
self.setParent(None)
self.vlb.clicked.disconnect()
_proceed_memory.remove(self)
def done(self, r):

View File

@ -140,34 +140,6 @@
</item>
</layout>
</item>
<item>
<widget class="QGroupBox" name="groupBox">
<property name="maximumSize">
<size>
<width>16777215</width>
<height>60</height>
</size>
</property>
<layout class="QHBoxLayout" name="horizontalLayout_5">
<item>
<widget class="QLabel" name="label_51">
<property name="sizePolicy">
<sizepolicy hsizetype="Preferred" vsizetype="Preferred">
<horstretch>40</horstretch>
<verstretch>0</verstretch>
</sizepolicy>
</property>
<property name="text">
<string/>
</property>
<property name="buddy">
<cstring>matchkind</cstring>
</property>
</widget>
</item>
</layout>
</widget>
</item>
<item>
<widget class="QLabel" name="label_6">
<property name="maximumSize">

View File

@ -402,7 +402,8 @@ class DetailView(QDialog, Ui_Dialog): # {{{
self.setupUi(self)
self.setWindowTitle(job.description)
self.job = job
self.html_view = hasattr(job, 'html_details')
self.html_view = (hasattr(job, 'html_details') and not getattr(job,
'ignore_html_details', False))
if self.html_view:
self.log.setVisible(False)
else:

View File

@ -187,9 +187,10 @@ class BooksModel(QAbstractTableModel): # {{{
self.db = None
self.reset()
def add_books(self, paths, formats, metadata, add_duplicates=False):
def add_books(self, paths, formats, metadata, add_duplicates=False,
return_ids=False):
ret = self.db.add_books(paths, formats, metadata,
add_duplicates=add_duplicates)
add_duplicates=add_duplicates, return_ids=return_ids)
self.count_changed()
return ret

View File

@ -7,22 +7,41 @@ __license__ = 'GPL v3'
__copyright__ = '2011, Kovid Goyal <kovid@kovidgoyal.net>'
__docformat__ = 'restructuredtext en'
import os, time, shutil
from functools import partial
from itertools import izip
from threading import Event
from PyQt4.Qt import (QIcon, QDialog,
QDialogButtonBox, QLabel, QGridLayout, QPixmap, Qt)
from calibre.gui2.threaded_jobs import ThreadedJob
from calibre.ebooks.metadata.sources.identify import identify, msprefs
from calibre.ebooks.metadata.sources.covers import download_cover
from calibre.ebooks.metadata.book.base import Metadata
from calibre.customize.ui import metadata_plugins
from calibre.ptempfile import PersistentTemporaryFile
from calibre.utils.date import as_utc
from calibre.ebooks.metadata.opf2 import metadata_to_opf
from calibre.utils.ipc.simple_worker import fork_job, WorkerError
from calibre.ptempfile import (PersistentTemporaryDirectory,
PersistentTemporaryFile)
# Start download {{{
class Job(ThreadedJob):
ignore_html_details = True
def consolidate_log(self):
self.consolidated_log = self.log.plain_text
self.log = None
def read_consolidated_log(self):
return self.consolidated_log
@property
def details(self):
if self.consolidated_log is None:
return self.log.plain_text
return self.read_consolidated_log()
@property
def log_file(self):
return open(self.download_debug_log, 'rb')
def show_config(gui, parent):
from calibre.gui2.preferences import show_config_widget
show_config_widget('Sharing', 'Metadata download', parent=parent,
@ -104,19 +123,22 @@ def start_download(gui, ids, callback, ensure_fields=None):
d.b.clicked.disconnect()
if ret != d.Accepted:
return
tf = PersistentTemporaryFile('_metadata_bulk_log_')
tf.close()
for batch in split_jobs(ids):
job = ThreadedJob('metadata bulk download',
_('Download metadata for %d books')%len(batch),
download, (batch, gui.current_db, d.identify, d.covers,
ensure_fields), {}, callback)
gui.job_manager.run_threaded_job(job)
job = Job('metadata bulk download',
_('Download metadata for %d books')%len(ids),
download, (ids, tf.name, gui.current_db, d.identify, d.covers,
ensure_fields), {}, callback)
job.download_debug_log = tf.name
gui.job_manager.run_threaded_job(job)
gui.status_bar.show_message(_('Metadata download started'), 3000)
# }}}
def get_job_details(job):
id_map, failed_ids, failed_covers, title_map, all_failed = job.result
(aborted, good_ids, tdir, log_file, failed_ids, failed_covers, title_map,
lm_map, all_failed) = job.result
det_msg = []
for i in failed_ids | failed_covers:
title = title_map[i]
@ -126,92 +148,89 @@ def get_job_details(job):
title += (' ' + _('(Failed cover)'))
det_msg.append(title)
det_msg = '\n'.join(det_msg)
return id_map, failed_ids, failed_covers, all_failed, det_msg
return (aborted, good_ids, tdir, log_file, failed_ids, failed_covers,
all_failed, det_msg, lm_map)
def merge_result(oldmi, newmi, ensure_fields=None):
dummy = Metadata(_('Unknown'))
for f in msprefs['ignore_fields']:
if ':' in f or (ensure_fields and f in ensure_fields):
continue
setattr(newmi, f, getattr(dummy, f))
fields = set()
for plugin in metadata_plugins(['identify']):
fields |= plugin.touched_fields
class HeartBeat(object):
CHECK_INTERVAL = 300 # seconds
''' Check that the file count in tdir changes every five minutes '''
def is_equal(x, y):
if hasattr(x, 'tzinfo'):
x = as_utc(x)
if hasattr(y, 'tzinfo'):
y = as_utc(y)
return x == y
def __init__(self, tdir):
self.tdir = tdir
self.last_count = len(os.listdir(self.tdir))
self.last_time = time.time()
for f in fields:
# Optimize so that set_metadata does not have to do extra work later
if not f.startswith('identifier:'):
if (not newmi.is_null(f) and is_equal(getattr(newmi, f),
getattr(oldmi, f))):
setattr(newmi, f, getattr(dummy, f))
def __call__(self):
if time.time() - self.last_time > self.CHECK_INTERVAL:
c = len(os.listdir(self.tdir))
if c == self.last_count:
return False
self.last_count = c
self.last_time = time.time()
return True
newmi.last_modified = oldmi.last_modified
# Fix log viewer, ratings
# Test: abort, covers only, metadata only, both, 200 entry download, memory
# consumption, all errors and on and on
return newmi
def download(ids, db, do_identify, covers, ensure_fields,
def download(all_ids, tf, db, do_identify, covers, ensure_fields,
log=None, abort=None, notifications=None):
ids = list(ids)
metadata = [db.get_metadata(i, index_is_id=True, get_user_categories=False)
for i in ids]
batch_size = 10
batches = split_jobs(all_ids, batch_size=batch_size)
tdir = PersistentTemporaryDirectory('_metadata_bulk_')
heartbeat = HeartBeat(tdir)
failed_ids = set()
failed_covers = set()
title_map = {}
ans = {}
count = 0
lm_map = {}
ans = set()
all_failed = True
'''
# Test apply dialog
all_failed = do_identify = covers = False
'''
for i, mi in izip(ids, metadata):
aborted = False
count = 0
for ids in batches:
if abort.is_set():
log.error('Aborting...')
break
title, authors, identifiers = mi.title, mi.authors, mi.identifiers
title_map[i] = title
if do_identify:
results = []
try:
results = identify(log, Event(), title=title, authors=authors,
identifiers=identifiers)
except:
pass
if results:
all_failed = False
mi = merge_result(mi, results[0], ensure_fields=ensure_fields)
identifiers = mi.identifiers
if not mi.is_null('rating'):
# set_metadata expects a rating out of 10
mi.rating *= 2
else:
log.error('Failed to download metadata for', title)
failed_ids.add(i)
# We don't want set_metadata operating on anything but covers
mi = merge_result(mi, mi, ensure_fields=ensure_fields)
if covers:
cdata = download_cover(log, title=title, authors=authors,
identifiers=identifiers)
if cdata is not None:
with PersistentTemporaryFile('.jpg', 'downloaded-cover-') as f:
f.write(cdata[-1])
mi.cover = f.name
all_failed = False
else:
failed_covers.add(i)
ans[i] = mi
count += 1
metadata = {i:db.get_metadata(i, index_is_id=True,
get_user_categories=False) for i in ids}
for i in ids:
title_map[i] = metadata[i].title
lm_map[i] = metadata[i].last_modified
metadata = {i:metadata_to_opf(mi, default_lang='und') for i, mi in
metadata.iteritems()}
try:
ret = fork_job('calibre.ebooks.metadata.sources.worker', 'main',
(do_identify, covers, metadata, ensure_fields),
cwd=tdir, abort=abort, heartbeat=heartbeat, no_output=True)
except WorkerError as e:
if e.orig_tb:
raise Exception('Failed to download metadata. Original '
'traceback: \n\n'+e.orig_tb)
raise
count += batch_size
notifications.put((count/len(ids),
_('Downloaded %(num)d of %(tot)d')%dict(num=count, tot=len(ids))))
_('Downloaded %(num)d of %(tot)d')%dict(
num=count, tot=len(all_ids))))
fids, fcovs, allf = ret['result']
if not allf:
all_failed = False
failed_ids = failed_ids.union(fids)
failed_covers = failed_covers.union(fcovs)
ans = ans.union(set(ids) - fids)
for book_id in ids:
lp = os.path.join(tdir, '%d.log'%book_id)
if os.path.exists(lp):
with open(tf, 'ab') as dest, open(lp, 'rb') as src:
dest.write(('\n'+'#'*20 + ' Log for %s '%title_map[book_id] +
'#'*20+'\n').encode('utf-8'))
shutil.copyfileobj(src, dest)
if abort.is_set():
aborted = True
log('Download complete, with %d failures'%len(failed_ids))
return (ans, failed_ids, failed_covers, title_map, all_failed)
return (aborted, ans, tdir, tf, failed_ids, failed_covers, title_map,
lm_map, all_failed)

View File

@ -161,10 +161,10 @@ class MetadataSingleDialogBase(ResizableDialog):
self.manage_authors_button.clicked.connect(self.authors.manage_authors)
self.series = SeriesEdit(self)
self.remove_unused_series_button = QToolButton(self)
self.remove_unused_series_button.setToolTip(
_('Remove unused series (Series that have no books)') )
self.remove_unused_series_button.clicked.connect(self.remove_unused_series)
self.clear_series_button = QToolButton(self)
self.clear_series_button.setToolTip(
_('Clear series') )
self.clear_series_button.clicked.connect(self.series.clear)
self.series_index = SeriesIndexEdit(self, self.series)
self.basic_metadata_widgets.extend([self.series, self.series_index])
@ -198,6 +198,7 @@ class MetadataSingleDialogBase(ResizableDialog):
self.basic_metadata_widgets.append(self.identifiers)
self.clear_identifiers_button = QToolButton(self)
self.clear_identifiers_button.setIcon(QIcon(I('trash.png')))
self.clear_identifiers_button.setToolTip(_('Clear Ids'))
self.clear_identifiers_button.clicked.connect(self.identifiers.clear)
self.paste_isbn_button = QToolButton(self)
self.paste_isbn_button.setToolTip('<p>' +
@ -303,17 +304,6 @@ class MetadataSingleDialogBase(ResizableDialog):
self.title_sort.auto_generate()
self.author_sort.auto_generate()
def remove_unused_series(self, *args):
self.db.remove_unused_series()
idx = self.series.current_val
self.series.clear()
self.series.initialize(self.db, self.book_id)
if idx:
for i in range(self.series.count()):
if unicode(self.series.itemText(i)) == idx:
self.series.setCurrentIndex(i)
break
def tags_editor(self, *args):
self.tags.edit(self.db, self.book_id)
@ -591,7 +581,7 @@ class MetadataSingleDialog(MetadataSingleDialogBase): # {{{
sto(self.title_sort, self.authors)
create_row(1, self.authors, self.deduce_author_sort_button, self.author_sort)
sto(self.author_sort, self.series)
create_row(2, self.series, self.remove_unused_series_button,
create_row(2, self.series, self.clear_series_button,
self.series_index, icon='trash.png')
sto(self.series_index, self.swap_title_author_button)
sto(self.swap_title_author_button, self.manage_authors_button)
@ -756,7 +746,7 @@ class MetadataSingleDialogAlt1(MetadataSingleDialogBase): # {{{
span=2, icon='auto_author_sort.png')
create_row(3, self.author_sort, self.series)
create_row(4, self.series, self.series_index,
button=self.remove_unused_series_button, icon='trash.png')
button=self.clear_series_button, icon='trash.png')
create_row(5, self.series_index, self.tags)
create_row(6, self.tags, self.rating, button=self.tags_editor_button)
create_row(7, self.rating, self.pubdate)
@ -892,7 +882,7 @@ class MetadataSingleDialogAlt2(MetadataSingleDialogBase): # {{{
span=2, icon='auto_author_sort.png')
create_row(3, self.author_sort, self.series)
create_row(4, self.series, self.series_index,
button=self.remove_unused_series_button, icon='trash.png')
button=self.clear_series_button, icon='trash.png')
create_row(5, self.series_index, self.tags)
create_row(6, self.tags, self.rating, button=self.tags_editor_button)
create_row(7, self.rating, self.pubdate)

View File

@ -36,6 +36,7 @@ class ConfigWidget(ConfigWidgetBase, Ui_Form):
r('new_book_tags', prefs, setting=CommaSeparatedList)
r('auto_add_path', gprefs, restart_required=True)
r('auto_add_check_for_duplicates', gprefs)
r('auto_add_auto_convert', gprefs)
self.filename_pattern = FilenamePattern(self)
self.metadata_box.layout().insertWidget(0, self.filename_pattern)

View File

@ -151,6 +151,19 @@ Author matching is exact.</string>
<string>&amp;Automatic Adding</string>
</attribute>
<layout class="QGridLayout" name="gridLayout_3">
<item row="3" column="0" colspan="2">
<widget class="QCheckBox" name="opt_auto_add_check_for_duplicates">
<property name="toolTip">
<string>If set, this option will cause calibre to check if a file
being auto-added is already in the calibre library.
If it is, a message will pop up asking you whether
you want to add it anyway.</string>
</property>
<property name="text">
<string>Check for &amp;duplicates when auto-adding files</string>
</property>
</widget>
</item>
<item row="0" column="0" colspan="2">
<widget class="QLabel" name="label">
<property name="text">
@ -168,7 +181,7 @@ Author matching is exact.</string>
</property>
</widget>
</item>
<item row="4" column="0">
<item row="5" column="0">
<widget class="QGroupBox" name="groupBox">
<property name="title">
<string>Ignore files with the following extensions when automatically adding </string>
@ -187,7 +200,7 @@ Author matching is exact.</string>
</layout>
</widget>
</item>
<item row="4" column="1">
<item row="5" column="1">
<spacer name="horizontalSpacer_2">
<property name="orientation">
<enum>Qt::Horizontal</enum>
@ -225,16 +238,10 @@ Author matching is exact.</string>
</item>
</layout>
</item>
<item row="3" column="0" colspan="2">
<widget class="QCheckBox" name="opt_auto_add_check_for_duplicates">
<property name="toolTip">
<string>If set, this option will cause calibre to check if a file
being auto-added is already in the calibre library.
If it is, a message will pop up asking you whether
you want to add it anyway.</string>
</property>
<item row="4" column="0">
<widget class="QCheckBox" name="opt_auto_add_auto_convert">
<property name="text">
<string>Check for &amp;duplicates when auto-adding files</string>
<string>Automatically &amp;convert added files to the current output format</string>
</property>
</widget>
</item>

View File

@ -73,11 +73,13 @@ class OpenSearchOPDSStore(StorePlugin):
type = link.get('type')
if rel and href and type:
if rel in ('http://opds-spec.org/thumbnail', 'http://opds-spec.org/image/thumbnail'):
if 'http://opds-spec.org/thumbnail' in rel:
s.cover_url = href
elif rel == u'http://opds-spec.org/acquisition/buy':
elif 'http://opds-spec.org/image/thumbnail' in rel:
s.cover_url = href
elif 'http://opds-spec.org/acquisition/buy' in rel:
s.detail_item = href
elif rel == u'http://opds-spec.org/acquisition':
elif 'http://opds-spec.org/acquisition' in rel:
if type:
ext = mimetypes.guess_extension(type)
if ext:

View File

@ -25,7 +25,7 @@ from calibre.ebooks.conversion.config import GuiRecommendations, \
from calibre.gui2.convert import bulk_defaults_for_input_format
def convert_single_ebook(parent, db, book_ids, auto_conversion=False, # {{{
out_format=None):
out_format=None, show_no_format_warning=True):
changed = False
jobs = []
bad = []
@ -91,7 +91,7 @@ def convert_single_ebook(parent, db, book_ids, auto_conversion=False, # {{{
except NoSupportedInputFormats:
bad.append(book_id)
if bad != []:
if bad and show_no_format_warning:
res = []
for id in bad:
title = db.title(id, True)

View File

@ -3243,7 +3243,8 @@ class LibraryDatabase2(LibraryDatabase, SchemaUpgrade, CustomColumns):
return id
def add_books(self, paths, formats, metadata, add_duplicates=True):
def add_books(self, paths, formats, metadata, add_duplicates=True,
return_ids=False):
'''
Add a book to the database. The result cache is not updated.
:param:`paths` List of paths to book files or file-like objects
@ -3289,7 +3290,7 @@ class LibraryDatabase2(LibraryDatabase, SchemaUpgrade, CustomColumns):
formats = list(duplicate[1] for duplicate in duplicates)
metadata = list(duplicate[2] for duplicate in duplicates)
return (paths, formats, metadata), len(ids)
return None, len(ids)
return None, (ids if return_ids else len(ids))
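# Illustrative call with the new flag (variables hypothetical):
# dups, ids = db.add_books(paths, formats, metadata,
# add_duplicates=False, return_ids=True)
# 'ids' is now the list of newly created book ids rather than just a count.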
def import_book(self, mi, formats, notify=True, import_hooks=True,
apply_import_tags=True, preserve_uuid=False):

View File

@ -648,7 +648,10 @@ class BasicNewsRecipe(Recipe):
'url' : URL of print version,
'date' : The publication date of the article as a string,
'description' : A summary of the article
'content' : The full article (can be an empty string). This is used by FullContentProfile
'content' : The full article (can be an empty string). Obsolete,
do not use; instead save the content to a temporary
file and pass a file:///path/to/temp/file.html as
the URL.
}
For an example, see the recipe for downloading `The Atlantic`.
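A minimal sketch of that temporary-file approach inside a recipe (PersistentTemporaryFile is calibre's helper used elsewhere in this commit; the function name and values are illustrative, not part of this change):

import os
from calibre.ptempfile import PersistentTemporaryFile

def articles_from_raw_html(raw_html, title):
    # Save the already-downloaded content and point the article at it via a
    # file:// URL instead of filling in 'content'.
    f = PersistentTemporaryFile('_article.html')
    f.write(raw_html.encode('utf-8'))
    f.close()
    url = 'file:///' + f.name.lstrip(os.sep).replace(os.sep, '/')
    return [{'title': title, 'url': url, 'date': '',
             'description': '', 'content': ''}]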