Merge from trunk

This commit is contained in:
Charles Haley 2010-12-06 21:28:25 +00:00
commit 0dac0ef3a0
23 changed files with 1535 additions and 315 deletions

View File

@ -4,6 +4,7 @@ __copyright__ = '2010, Hiroshi Miura <miurahr@linux.com>'
www.mainichi.jp
'''
import re
from calibre.web.feeds.news import BasicNewsRecipe
class MainichiDailyNews(BasicNewsRecipe):
@ -22,3 +23,18 @@ class MainichiDailyNews(BasicNewsRecipe):
remove_tags = [{'class':"RelatedArticle"}]
remove_tags_after = {'class':"Credit"}
def parse_feeds(self):
feeds = BasicNewsRecipe.parse_feeds(self)
for curfeed in feeds:
delList = []
for a,curarticle in enumerate(curfeed.articles):
if re.search(r'pheedo.jp', curarticle.url):
delList.append(curarticle)
if len(delList)>0:
for d in delList:
index = curfeed.articles.index(d)
curfeed.articles[index:index+1] = []
return feeds

View File

@ -14,5 +14,19 @@ class MainichiDailyITNews(BasicNewsRecipe):
remove_tags_before = {'class':"NewsTitle"}
remove_tags = [{'class':"RelatedArticle"}]
remove_tags_after = {'class':"Credit"}
def parse_feeds(self):
feeds = BasicNewsRecipe.parse_feeds(self)
for curfeed in feeds:
delList = []
for a,curarticle in enumerate(curfeed.articles):
if re.search(r'pheedo.jp', curarticle.url):
delList.append(curarticle)
if len(delList)>0:
for d in delList:
index = curfeed.articles.index(d)
curfeed.articles[index:index+1] = []
return feeds remove_tags_after = {'class':"Credit"}

View File

@ -32,12 +32,9 @@ class NikkeiNet_sub_life(BasicNewsRecipe):
remove_tags_after = {'class':"cmn-pr_list"}
feeds = [ (u'\u304f\u3089\u3057', u'http://www.zou3.net/php/rss/nikkei2rss.php?head=kurashi'),
(u'\u30b9\u30dd\u30fc\u30c4', u'http://www.zou3.net/php/rss/nikkei2rss.php?head=sports'),
(u'\u793e\u4f1a', u'http://www.zou3.net/php/rss/nikkei2rss.php?head=shakai'),
(u'\u30a8\u30b3', u'http://www.zou3.net/php/rss/nikkei2rss.php?head=eco'),
(u'\u5065\u5eb7', u'http://www.zou3.net/php/rss/nikkei2rss.php?head=kenkou'),
(u'\u7279\u96c6', u'http://www.zou3.net/php/rss/nikkei2rss.php?head=special'),
(u'\u30e9\u30f3\u30ad\u30f3\u30b0', u'http://www.zou3.net/php/rss/nikkei2rss.php?head=ranking')
(u'\u7279\u96c6', u'http://www.zou3.net/php/rss/nikkei2rss.php?head=special')
]
def get_browser(self):

View File

@ -0,0 +1,102 @@
__license__ = 'GPL v3'
__copyright__ = '2010, Hiroshi Miura <miurahr@linux.com>'
'''
www.nikkei.com
'''
import re
from calibre.web.feeds.recipes import BasicNewsRecipe
import mechanize
from calibre.ptempfile import PersistentTemporaryFile
class NikkeiNet_sub_life(BasicNewsRecipe):
title = u'\u65e5\u7d4c\u65b0\u805e\u96fb\u5b50\u7248(\u751f\u6d3b)'
__author__ = 'Hiroshi Miura'
description = 'News and current market affairs from Japan'
cover_url = 'http://parts.nikkei.com/parts/ds/images/common/logo_r1.svg'
masthead_url = 'http://parts.nikkei.com/parts/ds/images/common/logo_r1.svg'
needs_subscription = True
oldest_article = 2
max_articles_per_feed = 20
language = 'ja'
remove_javascript = False
temp_files = []
remove_tags_before = {'class':"cmn-section cmn-indent"}
remove_tags = [
{'class':"JSID_basePageMove JSID_baseAsyncSubmit cmn-form_area JSID_optForm_utoken"},
{'class':"cmn-article_keyword cmn-clearfix"},
{'class':"cmn-print_headline cmn-clearfix"},
]
remove_tags_after = {'class':"cmn-pr_list"}
feeds = [
(u'\u793e\u4f1a', u'http://www.zou3.net/php/rss/nikkei2rss.php?head=shakai')
]
def get_browser(self):
br = BasicNewsRecipe.get_browser()
cj = mechanize.LWPCookieJar()
br.set_cookiejar(cj)
#br.set_debug_http(True)
#br.set_debug_redirects(True)
#br.set_debug_responses(True)
if self.username is not None and self.password is not None:
#print "----------------------------get login form--------------------------------------------"
# open login form
br.open('https://id.nikkei.com/lounge/nl/base/LA0010.seam')
response = br.response()
#print "----------------------------get login form---------------------------------------------"
#print "----------------------------set login form---------------------------------------------"
# remove disabled input which brings error on mechanize
response.set_data(response.get_data().replace("<input id=\"j_id48\"", "<!-- "))
response.set_data(response.get_data().replace("gm_home_on.gif\" />", " -->"))
br.set_response(response)
br.select_form(name='LA0010Form01')
br['LA0010Form01:LA0010Email'] = self.username
br['LA0010Form01:LA0010Password'] = self.password
br.form.find_control(id='LA0010Form01:LA0010AutoLoginOn',type="checkbox").get(nr=0).selected = True
br.submit()
br.response()
#print "----------------------------send login form---------------------------------------------"
#print "----------------------------open news main page-----------------------------------------"
# open news site
br.open('http://www.nikkei.com/')
br.response()
#print "----------------------------www.nikkei.com BODY --------------------------------------"
#print response2.get_data()
#print "-------------------------^^-got auto redirect form----^^--------------------------------"
# forced redirect in default
br.select_form(nr=0)
br.submit()
response3 = br.response()
# return some cookie which should be set by Javascript
#print response3.geturl()
raw = response3.get_data()
#print "---------------------------response to form --------------------------------------------"
# grab cookie from JS and set it
redirectflag = re.search(r"var checkValue = '(\d+)';", raw, re.M).group(1)
br.select_form(nr=0)
self.temp_files.append(PersistentTemporaryFile('_fa.html'))
self.temp_files[-1].write("#LWP-Cookies-2.0\n")
self.temp_files[-1].write("Set-Cookie3: Cookie-dummy=Cookie-value; domain=\".nikkei.com\"; path=\"/\"; path_spec; secure; expires=\"2029-12-21 05:07:59Z\"; version=0\n")
self.temp_files[-1].write("Set-Cookie3: redirectFlag="+redirectflag+"; domain=\".nikkei.com\"; path=\"/\"; path_spec; secure; expires=\"2029-12-21 05:07:59Z\"; version=0\n")
self.temp_files[-1].close()
cj.load(self.temp_files[-1].name)
br.submit()
#br.set_debug_http(False)
#br.set_debug_redirects(False)
#br.set_debug_responses(False)
return br

View File

@ -21,7 +21,7 @@ class YOLNews(BasicNewsRecipe):
remove_javascript = True
masthead_title = u'YOMIURI ONLINE'
remove_tags_before = {'class':"article-def"}
keep_only_tags = [{'class':"article-def"}]
remove_tags = [{'class':"RelatedArticle"},
{'class':"sbtns"}
]

View File

@ -21,7 +21,7 @@ class YOLNews(BasicNewsRecipe):
remove_javascript = True
masthead_title = u"YOMIURI ONLINE"
remove_tags_before = {'class':"article-def"}
keep_only_tags = [{'class':"article-def"}]
remove_tags = [{'class':"RelatedArticle"},
{'class':"sbtns"}
]

View File

@ -21,7 +21,7 @@ class ANDROID(USBMS):
# HTC
0x0bb4 : { 0x0c02 : [0x100, 0x0227, 0x0226], 0x0c01 : [0x100, 0x0227], 0x0ff9
: [0x0100, 0x0227, 0x0226], 0x0c87: [0x0100, 0x0227, 0x0226],
0xc92 : [0x100]},
0xc92 : [0x100], 0xc97: [0x226]},
# Eken
0x040d : { 0x8510 : [0x0001] },
@ -63,7 +63,7 @@ class ANDROID(USBMS):
WINDOWS_MAIN_MEM = ['ANDROID_PHONE', 'A855', 'A853', 'INC.NEXUS_ONE',
'__UMS_COMPOSITE', '_MB200', 'MASS_STORAGE', '_-_CARD', 'SGH-I897',
'GT-I9000', 'FILE-STOR_GADGET', 'SGH-T959', 'SAMSUNG_ANDROID',
'SCH-I500_CARD']
'SCH-I500_CARD', 'SPH-D700_CARD']
WINDOWS_CARD_A_MEM = ['ANDROID_PHONE', 'GT-I9000_CARD', 'SGH-I897',
'FILE-STOR_GADGET', 'SGH-T959', 'SAMSUNG_ANDROID']

View File

@ -11,9 +11,9 @@ from calibre.ebooks.metadata.book.base import Metadata
from calibre.devices.mime import mime_type_ext
from calibre.devices.interface import BookList as _BookList
from calibre.constants import preferred_encoding
from calibre import isbytestring
from calibre import isbytestring, force_unicode
from calibre.utils.config import prefs, tweaks
from calibre.utils.icu import sort_key, strcmp as icu_strcmp
from calibre.utils.icu import strcmp
class Book(Metadata):
def __init__(self, prefix, lpath, size=None, other=None):
@ -241,7 +241,7 @@ class CollectionsBookList(BookList):
if y is None:
return -1
if isinstance(x, (unicode, str)):
c = strcmp(x, y)
c = strcmp(force_unicode(x), force_unicode(y))
else:
c = cmp(x, y)
if c != 0:

View File

@ -0,0 +1,516 @@
from __future__ import with_statement
__license__ = 'GPL 3'
__copyright__ = '2010, sengian <sengian1@gmail.com>'
import sys, textwrap, re, traceback
from urllib import urlencode
from math import ceil
from lxml import html
from lxml.html import soupparser
from calibre.utils.date import parse_date, utcnow, replace_months
from calibre.utils.cleantext import clean_ascii_chars
from calibre import browser, preferred_encoding
from calibre.ebooks.chardet import xml_to_unicode
from calibre.ebooks.metadata import MetaInformation, check_isbn, \
authors_to_sort_string
from calibre.ebooks.metadata.fetch import MetadataSource
from calibre.utils.config import OptionParser
from calibre.library.comments import sanitize_comments_html
class AmazonFr(MetadataSource):
name = 'Amazon French'
description = _('Downloads metadata from amazon.fr')
supported_platforms = ['windows', 'osx', 'linux']
author = 'Sengian'
version = (1, 0, 0)
has_html_comments = True
def fetch(self):
try:
self.results = search(self.title, self.book_author, self.publisher,
self.isbn, max_results=10, verbose=self.verbose, lang='fr')
except Exception, e:
self.exception = e
self.tb = traceback.format_exc()
class AmazonEs(MetadataSource):
name = 'Amazon Spanish'
description = _('Downloads metadata from amazon.com in spanish')
supported_platforms = ['windows', 'osx', 'linux']
author = 'Sengian'
version = (1, 0, 0)
has_html_comments = True
def fetch(self):
try:
self.results = search(self.title, self.book_author, self.publisher,
self.isbn, max_results=10, verbose=self.verbose, lang='es')
except Exception, e:
self.exception = e
self.tb = traceback.format_exc()
class AmazonEn(MetadataSource):
name = 'Amazon English'
description = _('Downloads metadata from amazon.com in english')
supported_platforms = ['windows', 'osx', 'linux']
author = 'Sengian'
version = (1, 0, 0)
has_html_comments = True
def fetch(self):
try:
self.results = search(self.title, self.book_author, self.publisher,
self.isbn, max_results=10, verbose=self.verbose, lang='en')
except Exception, e:
self.exception = e
self.tb = traceback.format_exc()
class AmazonDe(MetadataSource):
name = 'Amazon German'
description = _('Downloads metadata from amazon.de')
supported_platforms = ['windows', 'osx', 'linux']
author = 'Sengian'
version = (1, 0, 0)
has_html_comments = True
def fetch(self):
try:
self.results = search(self.title, self.book_author, self.publisher,
self.isbn, max_results=10, verbose=self.verbose, lang='de')
except Exception, e:
self.exception = e
self.tb = traceback.format_exc()
class Amazon(MetadataSource):
name = 'Amazon'
description = _('Downloads metadata from amazon.com')
supported_platforms = ['windows', 'osx', 'linux']
author = 'Kovid Goyal & Sengian'
version = (1, 1, 0)
has_html_comments = True
def fetch(self):
# if not self.site_customization:
# return
try:
self.results = search(self.title, self.book_author, self.publisher,
self.isbn, max_results=10, verbose=self.verbose, lang='all')
except Exception, e:
self.exception = e
self.tb = traceback.format_exc()
# @property
# def string_customization_help(self):
# return _('You can select here the language for metadata search with amazon.com')
def report(verbose):
if verbose:
traceback.print_exc()
class Query(object):
BASE_URL_ALL = 'http://www.amazon.com'
BASE_URL_FR = 'http://www.amazon.fr'
BASE_URL_DE = 'http://www.amazon.de'
def __init__(self, title=None, author=None, publisher=None, isbn=None, keywords=None,
max_results=20, rlang='all'):
assert not(title is None and author is None and publisher is None \
and isbn is None and keywords is None)
assert (max_results < 21)
self.max_results = int(max_results)
self.renbres = re.compile(u'\s*(\d+)\s*')
q = { 'search-alias' : 'stripbooks' ,
'unfiltered' : '1',
'field-keywords' : '',
'field-author' : '',
'field-title' : '',
'field-isbn' : '',
'field-publisher' : ''
#get to amazon detailed search page to get all options
# 'node' : '',
# 'field-binding' : '',
#before, during, after
# 'field-dateop' : '',
#month as number
# 'field-datemod' : '',
# 'field-dateyear' : '',
#french only
# 'field-collection' : '',
#many options available
}
if rlang =='all':
q['sort'] = 'relevanceexprank'
self.urldata = self.BASE_URL_ALL
elif rlang =='es':
q['sort'] = 'relevanceexprank'
q['field-language'] = 'Spanish'
self.urldata = self.BASE_URL_ALL
elif rlang =='en':
q['sort'] = 'relevanceexprank'
q['field-language'] = 'English'
self.urldata = self.BASE_URL_ALL
elif rlang =='fr':
q['sort'] = 'relevancerank'
self.urldata = self.BASE_URL_FR
elif rlang =='de':
q['sort'] = 'relevancerank'
self.urldata = self.BASE_URL_DE
self.baseurl = self.urldata
if isbn is not None:
q['field-isbn'] = isbn.replace('-', '')
else:
if title is not None:
q['field-title'] = title
if author is not None:
q['field-author'] = author
if publisher is not None:
q['field-publisher'] = publisher
if keywords is not None:
q['field-keywords'] = keywords
if isinstance(q, unicode):
q = q.encode('utf-8')
self.urldata += '/gp/search/ref=sr_adv_b/?' + urlencode(q)
def __call__(self, browser, verbose, timeout = 5.):
if verbose:
print 'Query:', self.urldata
try:
raw = browser.open_novisit(self.urldata, timeout=timeout).read()
except Exception, e:
report(verbose)
if callable(getattr(e, 'getcode', None)) and \
e.getcode() == 404:
return
raise
if '<title>404 - ' in raw:
return
raw = xml_to_unicode(raw, strip_encoding_pats=True,
resolve_entities=True)[0]
try:
feed = soupparser.fromstring(raw)
except:
try:
#remove ASCII invalid chars
return soupparser.fromstring(clean_ascii_chars(raw))
except:
return None, self.urldata
#nb of page
try:
nbresults = self.renbres.findall(feed.xpath("//*[@class='resultCount']")[0].text)
except:
return None, self.urldata
pages =[feed]
if len(nbresults) > 1:
nbpagetoquery = int(ceil(float(min(int(nbresults[2]), self.max_results))/ int(nbresults[1])))
for i in xrange(2, nbpagetoquery + 1):
try:
urldata = self.urldata + '&page=' + str(i)
raw = browser.open_novisit(urldata, timeout=timeout).read()
except Exception, e:
continue
if '<title>404 - ' in raw:
continue
raw = xml_to_unicode(raw, strip_encoding_pats=True,
resolve_entities=True)[0]
try:
feed = soupparser.fromstring(raw)
except:
try:
#remove ASCII invalid chars
return soupparser.fromstring(clean_ascii_chars(raw))
except:
continue
pages.append(feed)
results = []
for x in pages:
results.extend([i.getparent().get('href') \
for i in x.xpath("//a/span[@class='srTitle']")])
return results[:self.max_results], self.baseurl
class ResultList(list):
def __init__(self, baseurl, lang = 'all'):
self.baseurl = baseurl
self.lang = lang
self.repub = re.compile(u'\((.*)\)')
self.rerat = re.compile(u'([0-9.]+)')
self.reattr = re.compile(r'<([a-zA-Z0-9]+)\s[^>]+>')
self.reoutp = re.compile(r'(?s)<em>--This text ref.*?</em>')
self.recom = re.compile(r'(?s)<!--.*?-->')
self.republi = re.compile(u'(Editeur|Publisher|Verlag)', re.I)
self.reisbn = re.compile(u'(ISBN-10|ISBN-10|ASIN)', re.I)
self.relang = re.compile(u'(Language|Langue|Sprache)', re.I)
self.reratelt = re.compile(u'(Average\s*Customer\s*Review|Moyenne\s*des\s*commentaires\s*client|Durchschnittliche\s*Kundenbewertung)', re.I)
self.reprod = re.compile(u'(Product\s*Details|D.tails\s*sur\s*le\s*produit|Produktinformation)', re.I)
def strip_tags_etree(self, etreeobj, invalid_tags):
for (itag, rmv) in invalid_tags.iteritems():
if rmv:
for elts in etreeobj.getiterator(itag):
elts.drop_tree()
else:
for elts in etreeobj.getiterator(itag):
elts.drop_tag()
def clean_entry(self, entry, invalid_tags = {'script': True},
invalid_id = (), invalid_class=()):
#invalid_tags: remove tag and keep content if False else remove
#remove tags
if invalid_tags:
self.strip_tags_etree(entry, invalid_tags)
#remove id
if invalid_id:
for eltid in invalid_id:
elt = entry.get_element_by_id(eltid)
if elt is not None:
elt.drop_tree()
#remove class
if invalid_class:
for eltclass in invalid_class:
elts = entry.find_class(eltclass)
if elts is not None:
for elt in elts:
elt.drop_tree()
def get_title(self, entry):
title = entry.get_element_by_id('btAsinTitle')
if title is not None:
title = title.text
return unicode(title.replace('\n', '').strip())
def get_authors(self, entry):
author = entry.get_element_by_id('btAsinTitle')
while author.getparent().tag != 'div':
author = author.getparent()
author = author.getparent()
authortext = []
for x in author.getiterator('a'):
authortext.append(unicode(x.text_content().strip()))
return authortext
def get_description(self, entry, verbose):
try:
description = entry.get_element_by_id("productDescription").find("div[@class='content']")
inv_class = ('seeAll', 'emptyClear')
inv_tags ={'img': True, 'a': False}
self.clean_entry(description, invalid_tags=inv_tags, invalid_class=inv_class)
description = html.tostring(description, method='html', encoding=unicode).strip()
# remove all attributes from tags
description = self.reattr.sub(r'<\1>', description)
# Remove the notice about text referring to out of print editions
description = self.reoutp.sub('', description)
# Remove comments
description = self.recom.sub('', description)
return unicode(sanitize_comments_html(description))
except:
report(verbose)
return None
def get_tags(self, entry, browser, verbose):
try:
tags = entry.get_element_by_id('tagContentHolder')
testptag = tags.find_class('see-all')
if testptag:
for x in testptag:
alink = x.xpath('descendant-or-self::a')
if alink:
if alink[0].get('class') == 'tgJsActive':
continue
link = self.baseurl + alink[0].get('href')
entry = self.get_individual_metadata(browser, link, verbose)
tags = entry.get_element_by_id('tagContentHolder')
break
tags = [a.text for a in tags.getiterator('a') if a.get('rel') == 'tag']
except:
report(verbose)
tags = []
return tags
def get_book_info(self, entry, mi, verbose):
try:
entry = entry.get_element_by_id('SalesRank').getparent()
except:
try:
for z in entry.getiterator('h2'):
if self.reprod.search(z.text_content()):
entry = z.getparent().find("div[@class='content']/ul")
break
except:
report(verbose)
return mi
elts = entry.findall('li')
#pub & date
elt = filter(lambda x: self.republi.search(x.find('b').text), elts)
if elt:
pub = elt[0].find('b').tail
mi.publisher = unicode(self.repub.sub('', pub).strip())
d = self.repub.search(pub)
if d is not None:
d = d.group(1)
try:
default = utcnow().replace(day=15)
if self.lang != 'all':
d = replace_months(d, self.lang)
d = parse_date(d, assume_utc=True, default=default)
mi.pubdate = d
except:
report(verbose)
#ISBN
elt = filter(lambda x: self.reisbn.search(x.find('b').text), elts)
if elt:
isbn = elt[0].find('b').tail.replace('-', '').strip()
if check_isbn(isbn):
mi.isbn = unicode(isbn)
elif len(elt) > 1:
isbn = elt[1].find('b').tail.replace('-', '').strip()
if check_isbn(isbn):
mi.isbn = unicode(isbn)
#Langue
elt = filter(lambda x: self.relang.search(x.find('b').text), elts)
if elt:
langue = elt[0].find('b').tail.strip()
if langue:
mi.language = unicode(langue)
#ratings
elt = filter(lambda x: self.reratelt.search(x.find('b').text), elts)
if elt:
ratings = elt[0].find_class('swSprite')
if ratings:
ratings = self.rerat.findall(ratings[0].get('title'))
if len(ratings) == 2:
mi.rating = float(ratings[0])/float(ratings[1]) * 5
return mi
def fill_MI(self, entry, title, authors, browser, verbose):
mi = MetaInformation(title, authors)
mi.author_sort = authors_to_sort_string(authors)
mi.comments = self.get_description(entry, verbose)
mi = self.get_book_info(entry, mi, verbose)
mi.tags = self.get_tags(entry, browser, verbose)
return mi
def get_individual_metadata(self, browser, linkdata, verbose):
try:
raw = browser.open_novisit(linkdata).read()
except Exception, e:
report(verbose)
if callable(getattr(e, 'getcode', None)) and \
e.getcode() == 404:
return
raise
if '<title>404 - ' in raw:
report(verbose)
return
raw = xml_to_unicode(raw, strip_encoding_pats=True,
resolve_entities=True)[0]
try:
return soupparser.fromstring(raw)
except:
try:
#remove ASCII invalid chars
return soupparser.fromstring(clean_ascii_chars(raw))
except:
report(verbose)
return
def populate(self, entries, browser, verbose=False):
for x in entries:
try:
entry = self.get_individual_metadata(browser, x, verbose)
# clean results
# inv_ids = ('divsinglecolumnminwidth', 'sims.purchase', 'AutoBuyXGetY', 'A9AdsMiddleBoxTop')
# inv_class = ('buyingDetailsGrid', 'productImageGrid')
# inv_tags ={'script': True, 'style': True, 'form': False}
# self.clean_entry(entry, invalid_id=inv_ids)
title = self.get_title(entry)
authors = self.get_authors(entry)
except Exception, e:
if verbose:
print 'Failed to get all details for an entry'
print e
print 'URL who failed:', x
report(verbose)
continue
self.append(self.fill_MI(entry, title, authors, browser, verbose))
def search(title=None, author=None, publisher=None, isbn=None,
max_results=5, verbose=False, keywords=None, lang='all'):
br = browser()
entries, baseurl = Query(title=title, author=author, isbn=isbn, publisher=publisher,
keywords=keywords, max_results=max_results,rlang=lang)(br, verbose)
if entries is None or len(entries) == 0:
return
#List of entry
ans = ResultList(baseurl, lang)
ans.populate(entries, br, verbose)
return ans
def option_parser():
parser = OptionParser(textwrap.dedent(\
_('''\
%prog [options]
Fetch book metadata from Amazon. You must specify one of title, author,
ISBN, publisher or keywords. Will fetch a maximum of 10 matches,
so you should make your query as specific as possible.
You can chose the language for metadata retrieval:
All & english & french & german & spanish
'''
)))
parser.add_option('-t', '--title', help='Book title')
parser.add_option('-a', '--author', help='Book author(s)')
parser.add_option('-p', '--publisher', help='Book publisher')
parser.add_option('-i', '--isbn', help='Book ISBN')
parser.add_option('-k', '--keywords', help='Keywords')
parser.add_option('-m', '--max-results', default=10,
help='Maximum number of results to fetch')
parser.add_option('-l', '--lang', default='all',
help='Chosen language for metadata search (all, en, fr, es, de)')
parser.add_option('-v', '--verbose', default=0, action='count',
help='Be more verbose about errors')
return parser
def main(args=sys.argv):
parser = option_parser()
opts, args = parser.parse_args(args)
try:
results = search(opts.title, opts.author, isbn=opts.isbn, publisher=opts.publisher,
keywords=opts.keywords, verbose=opts.verbose, max_results=opts.max_results,
lang=opts.lang)
except AssertionError:
report(True)
parser.print_help()
return 1
if results is None or len(results) == 0:
print 'No result found for this search!'
return 0
for result in results:
print unicode(result).encode(preferred_encoding, 'replace')
print
if __name__ == '__main__':
sys.exit(main())

View File

@ -0,0 +1,390 @@
from __future__ import with_statement
__license__ = 'GPL 3'
__copyright__ = '2010, sengian <sengian1@gmail.com>'
__docformat__ = 'restructuredtext en'
import sys, textwrap, re, traceback, socket
from urllib import urlencode
from lxml.html import soupparser, tostring
from calibre import browser, preferred_encoding
from calibre.ebooks.chardet import xml_to_unicode
from calibre.ebooks.metadata import MetaInformation, check_isbn, \
authors_to_sort_string
from calibre.library.comments import sanitize_comments_html
from calibre.ebooks.metadata.fetch import MetadataSource
from calibre.utils.config import OptionParser
from calibre.utils.date import parse_date, utcnow
from calibre.utils.cleantext import clean_ascii_chars
class Fictionwise(MetadataSource): # {{{
author = 'Sengian'
name = 'Fictionwise'
description = _('Downloads metadata from Fictionwise')
has_html_comments = True
def fetch(self):
try:
self.results = search(self.title, self.book_author, self.publisher,
self.isbn, max_results=10, verbose=self.verbose)
except Exception, e:
self.exception = e
self.tb = traceback.format_exc()
# }}}
class FictionwiseError(Exception):
pass
def report(verbose):
if verbose:
traceback.print_exc()
class Query(object):
BASE_URL = 'http://www.fictionwise.com/servlet/mw'
def __init__(self, title=None, author=None, publisher=None, keywords=None, max_results=20):
assert not(title is None and author is None and publisher is None and keywords is None)
assert (max_results < 21)
self.max_results = int(max_results)
q = { 'template' : 'searchresults_adv.htm' ,
'searchtitle' : '',
'searchauthor' : '',
'searchpublisher' : '',
'searchkeyword' : '',
#possibilities startoflast, fullname, lastfirst
'searchauthortype' : 'startoflast',
'searchcategory' : '',
'searchcategory2' : '',
'searchprice_s' : '0',
'searchprice_e' : 'ANY',
'searchformat' : '',
'searchgeo' : 'US',
'searchfwdatetype' : '',
#maybe use dates fields if needed?
#'sortorder' : 'DESC',
#many options available: b.SortTitle, a.SortName,
#b.DateFirstPublished, b.FWPublishDate
'sortby' : 'b.SortTitle'
}
if title is not None:
q['searchtitle'] = title
if author is not None:
q['searchauthor'] = author
if publisher is not None:
q['searchpublisher'] = publisher
if keywords is not None:
q['searchkeyword'] = keywords
if isinstance(q, unicode):
q = q.encode('utf-8')
self.urldata = urlencode(q)
def __call__(self, browser, verbose, timeout = 5.):
if verbose:
print _('Query: %s') % self.BASE_URL+self.urldata
try:
raw = browser.open_novisit(self.BASE_URL, self.urldata, timeout=timeout).read()
except Exception, e:
report(verbose)
if callable(getattr(e, 'getcode', None)) and \
e.getcode() == 404:
return
if isinstance(getattr(e, 'args', [None])[0], socket.timeout):
raise FictionwiseError(_('Fictionwise timed out. Try again later.'))
raise FictionwiseError(_('Fictionwise encountered an error.'))
if '<title>404 - ' in raw:
return
raw = xml_to_unicode(raw, strip_encoding_pats=True,
resolve_entities=True)[0]
try:
feed = soupparser.fromstring(raw)
except:
try:
#remove ASCII invalid chars
feed = soupparser.fromstring(clean_ascii_chars(raw))
except:
return None
# get list of results as links
results = feed.xpath("//table[3]/tr/td[2]/table/tr/td/p/table[2]/tr[@valign]")
results = results[:self.max_results]
results = [i.xpath('descendant-or-self::a')[0].get('href') for i in results]
#return feed if no links ie normally a single book or nothing
if not results:
results = [feed]
return results
class ResultList(list):
BASE_URL = 'http://www.fictionwise.com'
COLOR_VALUES = {'BLUE': 4, 'GREEN': 3, 'YELLOW': 2, 'RED': 1, 'NA': 0}
def __init__(self):
self.retitle = re.compile(r'\[[^\[\]]+\]')
self.rechkauth = re.compile(r'.*book\s*by', re.I)
self.redesc = re.compile(r'book\s*description\s*:\s*(<br[^>]+>)*(?P<desc>.*)<br[^>]*>.{,15}publisher\s*:', re.I)
self.repub = re.compile(r'.*publisher\s*:\s*', re.I)
self.redate = re.compile(r'.*release\s*date\s*:\s*', re.I)
self.retag = re.compile(r'.*book\s*category\s*:\s*', re.I)
self.resplitbr = re.compile(r'<br[^>]*>', re.I)
self.recomment = re.compile(r'(?s)<!--.*?-->')
self.reimg = re.compile(r'<img[^>]*>', re.I)
self.resanitize = re.compile(r'\[HTML_REMOVED\]\s*', re.I)
self.renbcom = re.compile('(?P<nbcom>\d+)\s*Reader Ratings:')
self.recolor = re.compile('(?P<ncolor>[^/]+).gif')
self.resplitbrdiv = re.compile(r'(<br[^>]+>|</?div[^>]*>)', re.I)
self.reisbn = re.compile(r'.*ISBN\s*:\s*', re.I)
def strip_tags_etree(self, etreeobj, invalid_tags):
for (itag, rmv) in invalid_tags.iteritems():
if rmv:
for elts in etreeobj.getiterator(itag):
elts.drop_tree()
else:
for elts in etreeobj.getiterator(itag):
elts.drop_tag()
def clean_entry(self, entry, invalid_tags = {'script': True},
invalid_id = (), invalid_class=(), invalid_xpath = ()):
#invalid_tags: remove tag and keep content if False else remove
#remove tags
if invalid_tags:
self.strip_tags_etree(entry, invalid_tags)
#remove xpath
if invalid_xpath:
for eltid in invalid_xpath:
elt = entry.xpath(eltid)
for el in elt:
el.drop_tree()
#remove id
if invalid_id:
for eltid in invalid_id:
elt = entry.get_element_by_id(eltid)
if elt is not None:
elt.drop_tree()
#remove class
if invalid_class:
for eltclass in invalid_class:
elts = entry.find_class(eltclass)
if elts is not None:
for elt in elts:
elt.drop_tree()
def output_entry(self, entry, prettyout = True, htmlrm="\d+"):
out = tostring(entry, pretty_print=prettyout)
#try to work around tostring to remove this encoding for exemle
reclean = re.compile('(\n+|\t+|\r+|&#'+htmlrm+';)')
return reclean.sub('', out)
def get_title(self, entry):
title = entry.findtext('./')
return self.retitle.sub('', title).strip()
def get_authors(self, entry):
authortext = entry.find('./br').tail
if not self.rechkauth.search(authortext):
return []
authortext = self.rechkauth.sub('', authortext)
return [a.strip() for a in authortext.split('&')]
def get_rating(self, entrytable, verbose):
nbcomment = tostring(entrytable.getprevious())
try:
nbcomment = self.renbcom.search(nbcomment).group("nbcom")
except:
report(verbose)
return None
hval = dict((self.COLOR_VALUES[self.recolor.search(image.get('src', default='NA.gif')).group("ncolor")],
float(image.get('height', default=0))) \
for image in entrytable.getiterator('img'))
#ratings as x/5
return float(1.25*sum(k*v for (k, v) in hval.iteritems())/sum(hval.itervalues()))
def get_description(self, entry):
description = self.output_entry(entry.xpath('./p')[1],htmlrm="")
description = self.redesc.search(description)
if not description or not description.group("desc"):
return None
#remove invalid tags
description = self.reimg.sub('', description.group("desc"))
description = self.recomment.sub('', description)
description = self.resanitize.sub('', sanitize_comments_html(description))
return _('SUMMARY:\n %s') % re.sub(r'\n\s+</p>','\n</p>', description)
def get_publisher(self, entry):
publisher = self.output_entry(entry.xpath('./p')[1])
publisher = filter(lambda x: self.repub.search(x) is not None,
self.resplitbr.split(publisher))
if not len(publisher):
return None
publisher = self.repub.sub('', publisher[0])
return publisher.split(',')[0].strip()
def get_tags(self, entry):
tag = self.output_entry(entry.xpath('./p')[1])
tag = filter(lambda x: self.retag.search(x) is not None,
self.resplitbr.split(tag))
if not len(tag):
return []
return map(lambda x: x.strip(), self.retag.sub('', tag[0]).split('/'))
def get_date(self, entry, verbose):
date = self.output_entry(entry.xpath('./p')[1])
date = filter(lambda x: self.redate.search(x) is not None,
self.resplitbr.split(date))
if not len(date):
return None
try:
d = self.redate.sub('', date[0])
if d:
default = utcnow().replace(day=15)
d = parse_date(d, assume_utc=True, default=default)
else:
d = None
except:
report(verbose)
d = None
return d
def get_ISBN(self, entry):
isbns = self.output_entry(entry.xpath('./p')[2])
isbns = filter(lambda x: self.reisbn.search(x) is not None,
self.resplitbrdiv.split(isbns))
if not len(isbns):
return None
isbns = [self.reisbn.sub('', x) for x in isbns if check_isbn(self.reisbn.sub('', x))]
return sorted(isbns, cmp=lambda x,y:cmp(len(x), len(y)))[-1]
def fill_MI(self, entry, title, authors, ratings, verbose):
mi = MetaInformation(title, authors)
mi.rating = ratings
mi.comments = self.get_description(entry)
mi.publisher = self.get_publisher(entry)
mi.tags = self.get_tags(entry)
mi.pubdate = self.get_date(entry, verbose)
mi.isbn = self.get_ISBN(entry)
mi.author_sort = authors_to_sort_string(authors)
return mi
def get_individual_metadata(self, browser, linkdata, verbose):
try:
raw = browser.open_novisit(self.BASE_URL + linkdata).read()
except Exception, e:
report(verbose)
if callable(getattr(e, 'getcode', None)) and \
e.getcode() == 404:
return
if isinstance(getattr(e, 'args', [None])[0], socket.timeout):
raise FictionwiseError(_('Fictionwise timed out. Try again later.'))
raise FictionwiseError(_('Fictionwise encountered an error.'))
if '<title>404 - ' in raw:
report(verbose)
return
raw = xml_to_unicode(raw, strip_encoding_pats=True,
resolve_entities=True)[0]
try:
return soupparser.fromstring(raw)
except:
try:
#remove ASCII invalid chars
return soupparser.fromstring(clean_ascii_chars(raw))
except:
return None
def populate(self, entries, browser, verbose=False):
inv_tags ={'script': True, 'a': False, 'font': False, 'strong': False, 'b': False,
'ul': False, 'span': False}
inv_xpath =('./table',)
#single entry
if len(entries) == 1 and not isinstance(entries[0], str):
try:
entry = entries.xpath("//table[3]/tr/td[2]/table[1]/tr/td/font/table/tr/td")
self.clean_entry(entry, invalid_tags=inv_tags, invalid_xpath=inv_xpath)
title = self.get_title(entry)
#maybe strenghten the search
ratings = self.get_rating(entry.xpath("./p/table")[1], verbose)
authors = self.get_authors(entry)
except Exception, e:
if verbose:
print _('Failed to get all details for an entry')
print e
return
self.append(self.fill_MI(entry, title, authors, ratings, verbose))
else:
#multiple entries
for x in entries:
try:
entry = self.get_individual_metadata(browser, x, verbose)
entry = entry.xpath("//table[3]/tr/td[2]/table[1]/tr/td/font/table/tr/td")[0]
self.clean_entry(entry, invalid_tags=inv_tags, invalid_xpath=inv_xpath)
title = self.get_title(entry)
#maybe strenghten the search
ratings = self.get_rating(entry.xpath("./p/table")[1], verbose)
authors = self.get_authors(entry)
except Exception, e:
if verbose:
print _('Failed to get all details for an entry')
print e
continue
self.append(self.fill_MI(entry, title, authors, ratings, verbose))
def search(title=None, author=None, publisher=None, isbn=None,
min_viewability='none', verbose=False, max_results=5,
keywords=None):
br = browser()
entries = Query(title=title, author=author, publisher=publisher,
keywords=keywords, max_results=max_results)(br, verbose, timeout = 15.)
#List of entry
ans = ResultList()
ans.populate(entries, br, verbose)
return ans
def option_parser():
parser = OptionParser(textwrap.dedent(\
_('''\
%prog [options]
Fetch book metadata from Fictionwise. You must specify one of title, author,
or keywords. No ISBN specification possible. Will fetch a maximum of 20 matches,
so you should make your query as specific as possible.
''')
))
parser.add_option('-t', '--title', help=_('Book title'))
parser.add_option('-a', '--author', help=_('Book author(s)'))
parser.add_option('-p', '--publisher', help=_('Book publisher'))
parser.add_option('-k', '--keywords', help=_('Keywords'))
parser.add_option('-m', '--max-results', default=20,
help=_('Maximum number of results to fetch'))
parser.add_option('-v', '--verbose', default=0, action='count',
help=_('Be more verbose about errors'))
return parser
def main(args=sys.argv):
parser = option_parser()
opts, args = parser.parse_args(args)
try:
results = search(opts.title, opts.author, publisher=opts.publisher,
keywords=opts.keywords, verbose=opts.verbose, max_results=opts.max_results)
except AssertionError:
report(True)
parser.print_help()
return 1
if results is None or len(results) == 0:
print _('No result found for this search!')
return 0
for result in results:
print unicode(result).encode(preferred_encoding, 'replace')
print
if __name__ == '__main__':
sys.exit(main())

View File

@ -10,7 +10,8 @@ from copy import deepcopy
from lxml.html import soupparser
from calibre.utils.date import parse_date, utcnow
from calibre.utils.date import parse_date, utcnow, replace_months
from calibre.utils.cleantext import clean_ascii_chars
from calibre import browser, preferred_encoding
from calibre.ebooks.chardet import xml_to_unicode
from calibre.ebooks.metadata import MetaInformation, check_isbn, \
@ -71,31 +72,16 @@ class NiceBooksCovers(CoverDownload):
traceback.format_exc(), self.name))
class NiceBooksError(Exception):
pass
class ISBNNotFound(NiceBooksError):
pass
def report(verbose):
if verbose:
import traceback
traceback.print_exc()
def replace_monthsfr(datefr):
# Replace french months by english equivalent for parse_date
frtoen = {
u'[jJ]anvier': u'jan',
u'[fF].vrier': u'feb',
u'[mM]ars': u'mar',
u'[aA]vril': u'apr',
u'[mM]ai': u'may',
u'[jJ]uin': u'jun',
u'[jJ]uillet': u'jul',
u'[aA]o.t': u'aug',
u'[sS]eptembre': u'sep',
u'[Oo]ctobre': u'oct',
u'[nN]ovembre': u'nov',
u'[dD].cembre': u'dec' }
for k in frtoen.iterkeys():
tmp = re.sub(k, frtoen[k], datefr)
if tmp <> datefr: break
return tmp
class Query(object):
BASE_URL = 'http://fr.nicebooks.com/'
@ -119,7 +105,7 @@ class Query(object):
def __call__(self, browser, verbose, timeout = 5.):
if verbose:
print 'Query:', self.BASE_URL+self.urldata
print _('Query: %s') % self.BASE_URL+self.urldata
try:
raw = browser.open_novisit(self.BASE_URL+self.urldata, timeout=timeout).read()
@ -128,7 +114,9 @@ class Query(object):
if callable(getattr(e, 'getcode', None)) and \
e.getcode() == 404:
return
raise
if isinstance(getattr(e, 'args', [None])[0], socket.timeout):
raise NiceBooksError(_('Nicebooks timed out. Try again later.'))
raise NiceBooksError(_('Nicebooks encountered an error.'))
if '<title>404 - ' in raw:
return
raw = xml_to_unicode(raw, strip_encoding_pats=True,
@ -136,7 +124,11 @@ class Query(object):
try:
feed = soupparser.fromstring(raw)
except:
return
try:
#remove ASCII invalid chars
feed = soupparser.fromstring(clean_ascii_chars(raw))
except:
return None
#nb of page to call
try:
@ -161,7 +153,11 @@ class Query(object):
try:
feed = soupparser.fromstring(raw)
except:
continue
try:
#remove ASCII invalid chars
feed = soupparser.fromstring(clean_ascii_chars(raw))
except:
continue
pages.append(feed)
results = []
@ -180,14 +176,12 @@ class ResultList(list):
self.reautclean = re.compile(u'\s*\(.*\)\s*')
def get_title(self, entry):
# title = deepcopy(entry.find("div[@id='book-info']"))
title = deepcopy(entry)
title.remove(title.find("dl[@title='Informations sur le livre']"))
title = ' '.join([i.text_content() for i in title.iterchildren()])
return unicode(title.replace('\n', ''))
def get_authors(self, entry):
# author = entry.find("div[@id='book-info']/dl[@title='Informations sur le livre']")
author = entry.find("dl[@title='Informations sur le livre']")
authortext = []
for x in author.getiterator('dt'):
@ -223,7 +217,7 @@ class ResultList(list):
d = x.getnext().text_content()
try:
default = utcnow().replace(day=15)
d = replace_monthsfr(d)
d = replace_months(d, 'fr')
d = parse_date(d, assume_utc=True, default=default)
mi.pubdate = d
except:
@ -234,11 +228,6 @@ class ResultList(list):
mi = MetaInformation(title, authors)
mi.author_sort = authors_to_sort_string(authors)
mi.comments = self.get_description(entry, verbose)
# entry = entry.find("dl[@title='Informations sur le livre']")
# mi.publisher = self.get_publisher(entry)
# mi.pubdate = self.get_date(entry, verbose)
# mi.isbn = self.get_ISBN(entry)
# mi.language = self.get_language(entry)
return self.get_book_info(entry, mi, verbose)
def get_individual_metadata(self, browser, linkdata, verbose):
@ -249,7 +238,9 @@ class ResultList(list):
if callable(getattr(e, 'getcode', None)) and \
e.getcode() == 404:
return
raise
if isinstance(getattr(e, 'args', [None])[0], socket.timeout):
raise NiceBooksError(_('Nicebooks timed out. Try again later.'))
raise NiceBooksError(_('Nicebooks encountered an error.'))
if '<title>404 - ' in raw:
report(verbose)
return
@ -258,7 +249,11 @@ class ResultList(list):
try:
feed = soupparser.fromstring(raw)
except:
return
try:
#remove ASCII invalid chars
feed = soupparser.fromstring(clean_ascii_chars(raw))
except:
return None
# get results
return feed.xpath("//div[@id='container']")[0]
@ -292,13 +287,6 @@ class ResultList(list):
continue
self.append(self.fill_MI(entry, title, authors, verbose))
class NiceBooksError(Exception):
pass
class ISBNNotFound(NiceBooksError):
pass
class Covers(object):
def __init__(self, isbn = None):
@ -329,11 +317,10 @@ class Covers(object):
return cover, ext if ext else 'jpg'
except Exception, err:
if isinstance(getattr(err, 'args', [None])[0], socket.timeout):
err = NiceBooksError(_('Nicebooks timed out. Try again later.'))
raise err
raise NiceBooksError(_('Nicebooks timed out. Try again later.'))
if not len(self.urlimg):
if not self.isbnf:
raise ISBNNotFound('ISBN: '+self.isbn+_(' not found.'))
raise ISBNNotFound(_('ISBN: %s not found.') % self.isbn)
raise NiceBooksError(_('An errror occured with Nicebooks cover fetcher'))
@ -341,10 +328,10 @@ def search(title=None, author=None, publisher=None, isbn=None,
max_results=5, verbose=False, keywords=None):
br = browser()
entries = Query(title=title, author=author, isbn=isbn, publisher=publisher,
keywords=keywords, max_results=max_results)(br, verbose)
keywords=keywords, max_results=max_results)(br, verbose,timeout = 10.)
if entries is None or len(entries) == 0:
return
return None
#List of entry
ans = ResultList()
@ -364,28 +351,28 @@ def cover_from_isbn(isbn, timeout = 5.):
def option_parser():
parser = OptionParser(textwrap.dedent(\
'''\
_('''\
%prog [options]
Fetch book metadata from Nicebooks. You must specify one of title, author,
ISBN, publisher or keywords. Will fetch a maximum of 20 matches,
so you should make your query as specific as possible.
It can also get covers if the option is activated.
'''
''')
))
parser.add_option('-t', '--title', help='Book title')
parser.add_option('-a', '--author', help='Book author(s)')
parser.add_option('-p', '--publisher', help='Book publisher')
parser.add_option('-i', '--isbn', help='Book ISBN')
parser.add_option('-k', '--keywords', help='Keywords')
parser.add_option('-t', '--title', help=_('Book title'))
parser.add_option('-a', '--author', help=_('Book author(s)'))
parser.add_option('-p', '--publisher', help=_('Book publisher'))
parser.add_option('-i', '--isbn', help=_('Book ISBN'))
parser.add_option('-k', '--keywords', help=_('Keywords'))
parser.add_option('-c', '--covers', default=0,
help='Covers: 1-Check/ 2-Download')
help=_('Covers: 1-Check/ 2-Download'))
parser.add_option('-p', '--coverspath', default='',
help='Covers files path')
help=_('Covers files path'))
parser.add_option('-m', '--max-results', default=20,
help='Maximum number of results to fetch')
help=_('Maximum number of results to fetch'))
parser.add_option('-v', '--verbose', default=0, action='count',
help='Be more verbose about errors')
help=_('Be more verbose about errors'))
return parser
def main(args=sys.argv):
@ -400,15 +387,15 @@ def main(args=sys.argv):
parser.print_help()
return 1
if results is None or len(results) == 0:
print 'No result found for this search!'
print _('No result found for this search!')
return 0
for result in results:
print unicode(result).encode(preferred_encoding, 'replace')
covact = int(opts.covers)
if covact == 1:
textcover = 'No cover found!'
textcover = _('No cover found!')
if check_for_cover(result.isbn):
textcover = 'A cover was found for this book'
textcover = _('A cover was found for this book')
print textcover
elif covact == 2:
cover_data, ext = cover_from_isbn(result.isbn)
@ -417,7 +404,7 @@ def main(args=sys.argv):
cpath = os.path.normpath(opts.coverspath + '/' + result.isbn)
oname = os.path.abspath(cpath+'.'+ext)
open(oname, 'wb').write(cover_data)
print 'Cover saved to file ', oname
print _('Cover saved to file '), oname
print
if __name__ == '__main__':

View File

@ -8,12 +8,12 @@ __docformat__ = 'restructuredtext en'
from threading import Thread
from Queue import Empty
import os, time, sys, shutil
import os, time, sys, shutil, json
from calibre.utils.ipc.job import ParallelJob
from calibre.utils.ipc.server import Server
from calibre.ptempfile import PersistentTemporaryDirectory, TemporaryDirectory
from calibre import prints
from calibre import prints, isbytestring
from calibre.constants import filesystem_encoding
@ -194,14 +194,42 @@ class SaveWorker(Thread):
self.daemon = True
self.path, self.opts = path, opts
self.ids = ids
self.library_path = db.library_path
self.db = db
self.canceled = False
self.result_queue = result_queue
self.error = None
self.spare_server = spare_server
self.start()
def collect_data(self, ids):
from calibre.ebooks.metadata.opf2 import metadata_to_opf
data = {}
for i in set(ids):
mi = self.db.get_metadata(i, index_is_id=True, get_cover=True)
opf = metadata_to_opf(mi)
if isbytestring(opf):
opf = opf.decode('utf-8')
cpath = None
if mi.cover:
cpath = mi.cover
if isbytestring(cpath):
cpath = cpath.decode(filesystem_encoding)
formats = {}
if mi.formats:
for fmt in mi.formats:
fpath = self.db.format_abspath(i, fmt, index_is_id=True)
if fpath is not None:
if isbytestring(fpath):
fpath = fpath.decode(filesystem_encoding)
formats[fmt.lower()] = fpath
data[i] = [opf, cpath, formats]
return data
def run(self):
with TemporaryDirectory('save_to_disk_data') as tdir:
self._run(tdir)
def _run(self, tdir):
from calibre.library.save_to_disk import config
server = Server() if self.spare_server is None else self.spare_server
ids = set(self.ids)
@ -212,12 +240,19 @@ class SaveWorker(Thread):
for pref in c.preferences:
recs[pref.name] = getattr(self.opts, pref.name)
plugboards = self.db.prefs.get('plugboards', {})
for i, task in enumerate(tasks):
tids = [x[-1] for x in task]
data = self.collect_data(tids)
dpath = os.path.join(tdir, '%d.json'%i)
with open(dpath, 'wb') as f:
f.write(json.dumps(data, ensure_ascii=False).encode('utf-8'))
job = ParallelJob('save_book',
'Save books (%d of %d)'%(i, len(tasks)),
lambda x,y:x,
args=[tids, self.library_path, self.path, recs])
args=[tids, dpath, plugboards, self.path, recs])
jobs.add(job)
server.add_job(job)
@ -226,21 +261,21 @@ class SaveWorker(Thread):
time.sleep(0.2)
running = False
for job in jobs:
job.update(consume_notifications=False)
while True:
try:
id, title, ok, tb = job.notifications.get_nowait()[0]
if id in ids:
self.result_queue.put((id, title, ok, tb))
ids.remove(id)
except Empty:
break
self.get_notifications(job, ids)
if not job.is_finished:
running = True
if not running:
break
for job in jobs:
if not job.result:
continue
for id_, title, ok, tb in job.result:
if id_ in ids:
self.result_queue.put((id_, title, ok, tb))
ids.remove(id_)
server.close()
time.sleep(1)
@ -257,21 +292,39 @@ class SaveWorker(Thread):
except:
pass
def get_notifications(self, job, ids):
job.update(consume_notifications=False)
while True:
try:
id, title, ok, tb = job.notifications.get_nowait()[0]
if id in ids:
self.result_queue.put((id, title, ok, tb))
ids.remove(id)
except Empty:
break
def save_book(task, library_path, path, recs, notification=lambda x,y:x):
from calibre.library.database2 import LibraryDatabase2
db = LibraryDatabase2(library_path)
from calibre.library.save_to_disk import config, save_to_disk
def save_book(ids, dpath, plugboards, path, recs, notification=lambda x,y:x):
from calibre.library.save_to_disk import config, save_serialized_to_disk
from calibre.customize.ui import apply_null_metadata
opts = config().parse()
for name in recs:
setattr(opts, name, recs[name])
results = []
def callback(id, title, failed, tb):
results.append((id, title, not failed, tb))
notification((id, title, not failed, tb))
return True
with apply_null_metadata:
save_to_disk(db, task, path, opts, callback)
data_ = json.loads(open(dpath, 'rb').read().decode('utf-8'))
data = {}
for k, v in data_.iteritems():
data[int(k)] = v
with apply_null_metadata:
save_serialized_to_disk(ids, data, plugboards, path, opts, callback)
return results

View File

@ -123,6 +123,8 @@ def _config():
help=_('Download social metadata (tags/rating/etc.)'))
c.add_opt('overwrite_author_title_metadata', default=True,
help=_('Overwrite author and title with new metadata'))
c.add_opt('auto_download_cover', default=False,
help=_('Automatically download the cover, if available'))
c.add_opt('enforce_cpu_limit', default=True,
help=_('Limit max simultaneous jobs to number of CPUs'))
c.add_opt('tag_browser_hidden_categories', default=set(),

View File

@ -427,11 +427,27 @@ class Saver(QObject): # {{{
if not self.ids or not self.worker.is_alive():
self.timer.stop()
self.pd.hide()
while self.ids:
before = len(self.ids)
self.get_result()
if before == len(self.ids):
for i in list(self.ids):
self.failures.add(('id:%d'%i, 'Unknown error'))
self.ids.remove(i)
break
if not self.callback_called:
try:
self.worker.join(1.5)
except:
pass # The worker was not yet started
self.callback(self.worker.path, self.failures, self.worker.error)
self.callback_called = True
return
self.get_result()
def get_result(self):
try:
id, title, ok, tb = self.rq.get_nowait()
except Empty:
@ -441,6 +457,7 @@ class Saver(QObject): # {{{
if not isinstance(title, unicode):
title = str(title).decode(preferred_encoding, 'replace')
self.pd.set_msg(_('Saved')+' '+title)
if not ok:
self.failures.add((title, tb))
# }}}

View File

@ -9,7 +9,7 @@ from threading import Thread
from PyQt4.QtCore import Qt, QObject, SIGNAL, QVariant, pyqtSignal, \
QAbstractTableModel, QCoreApplication, QTimer
from PyQt4.QtGui import QDialog, QItemSelectionModel
from PyQt4.QtGui import QDialog, QItemSelectionModel, QIcon
from calibre.gui2.dialogs.fetch_metadata_ui import Ui_FetchMetadata
from calibre.gui2 import error_dialog, NONE, info_dialog, config
@ -42,13 +42,14 @@ class Matches(QAbstractTableModel):
def __init__(self, matches):
self.matches = matches
self.yes_icon = QVariant(QIcon(I('ok.png')))
QAbstractTableModel.__init__(self)
def rowCount(self, *args):
return len(self.matches)
def columnCount(self, *args):
return 6
return 8
def headerData(self, section, orientation, role):
if role != Qt.DisplayRole:
@ -61,6 +62,8 @@ class Matches(QAbstractTableModel):
elif section == 3: text = _("Publisher")
elif section == 4: text = _("ISBN")
elif section == 5: text = _("Published")
elif section == 6: text = _("Has Cover")
elif section == 7: text = _("Has Summary")
return QVariant(text)
else:
@ -71,8 +74,8 @@ class Matches(QAbstractTableModel):
def data(self, index, role):
row, col = index.row(), index.column()
book = self.matches[row]
if role == Qt.DisplayRole:
book = self.matches[row]
res = None
if col == 0:
res = book.title
@ -90,6 +93,11 @@ class Matches(QAbstractTableModel):
if not res:
return NONE
return QVariant(res)
elif role == Qt.DecorationRole:
if col == 6 and book.has_cover:
return self.yes_icon
if col == 7 and book.comments:
return self.yes_icon
return NONE
class FetchMetadata(QDialog, Ui_FetchMetadata):
@ -131,7 +139,7 @@ class FetchMetadata(QDialog, Ui_FetchMetadata):
self.fetch_metadata()
self.opt_get_social_metadata.setChecked(config['get_social_metadata'])
self.opt_overwrite_author_title_metadata.setChecked(config['overwrite_author_title_metadata'])
self.opt_auto_download_cover.setChecked(config['auto_download_cover'])
def show_summary(self, current, *args):
row = current.row()
@ -213,6 +221,12 @@ class FetchMetadata(QDialog, Ui_FetchMetadata):
_hung_fetchers.add(self.fetcher)
if hasattr(self, '_hangcheck') and self._hangcheck.isActive():
self._hangcheck.stop()
# Save value of auto_download_cover, since this is the only place it can
# be set. The values of the other options can be set in
# Preferences->Behavior and should not be set here as they affect bulk
# downloading as well.
if self.opt_auto_download_cover.isChecked() != config['auto_download_cover']:
config.set('auto_download_cover', self.opt_auto_download_cover.isChecked())
def __enter__(self, *args):
return self

View File

@ -9,7 +9,7 @@
<rect>
<x>0</x>
<y>0</y>
<width>830</width>
<width>890</width>
<height>642</height>
</rect>
</property>
@ -109,6 +109,13 @@
</layout>
</widget>
</item>
<item>
<widget class="QCheckBox" name="opt_overwrite_author_title_metadata">
<property name="text">
<string>Overwrite author and title with author and title of selected book</string>
</property>
</widget>
</item>
<item>
<widget class="QCheckBox" name="opt_get_social_metadata">
<property name="text">
@ -117,9 +124,9 @@
</widget>
</item>
<item>
<widget class="QCheckBox" name="opt_overwrite_author_title_metadata">
<widget class="QCheckBox" name="opt_auto_download_cover">
<property name="text">
<string>Overwrite author and title with author and title of selected book</string>
<string>Automatically download the cover, if available</string>
</property>
</widget>
</item>

View File

@ -760,8 +760,8 @@ class MetadataSingleDialog(ResizableDialog, Ui_MetadataSingleDialog):
if book.publisher: self.publisher.setEditText(book.publisher)
if book.isbn: self.isbn.setText(book.isbn)
if book.pubdate:
d = book.pubdate
self.pubdate.setDate(QDate(d.year, d.month, d.day))
dt = book.pubdate
self.pubdate.setDate(QDate(dt.year, dt.month, dt.day))
summ = book.comments
if summ:
prefix = unicode(self.comments.toPlainText())
@ -777,8 +777,11 @@ class MetadataSingleDialog(ResizableDialog, Ui_MetadataSingleDialog):
self.series.setText(book.series)
if book.series_index is not None:
self.series_index.setValue(book.series_index)
# Needed because of Qt focus bug on OS X
self.fetch_cover_button.setFocus(Qt.OtherFocusReason)
if book.has_cover:
if d.opt_auto_download_cover.isChecked() and book.has_cover:
self.fetch_cover()
else:
self.fetch_cover_button.setFocus(Qt.OtherFocusReason)
else:
error_dialog(self, _('Cannot fetch metadata'),
_('You must specify at least one of ISBN, Title, '

View File

@ -6,7 +6,7 @@ __license__ = 'GPL v3'
__copyright__ = '2009, Kovid Goyal <kovid@kovidgoyal.net>'
__docformat__ = 'restructuredtext en'
import os, traceback, cStringIO, re
import os, traceback, cStringIO, re, shutil
from calibre.constants import DEBUG
from calibre.utils.config import Config, StringConfig, tweaks
@ -203,31 +203,49 @@ def get_components(template, mi, id, timefmt='%b %Y', length=250,
return shorten_components_to(length, components)
def save_book_to_disk(id, db, root, opts, length):
mi = db.get_metadata(id, index_is_id=True)
def save_book_to_disk(id_, db, root, opts, length):
mi = db.get_metadata(id_, index_is_id=True)
cover = db.cover(id_, index_is_id=True, as_path=True)
plugboards = db.prefs.get('plugboards', {})
available_formats = db.formats(id, index_is_id=True)
available_formats = db.formats(id_, index_is_id=True)
if not available_formats:
available_formats = []
else:
available_formats = [x.lower().strip() for x in
available_formats.split(',')]
formats = {}
fmts = db.formats(id_, index_is_id=True, verify_formats=False)
if fmts:
fmts = fmts.split(',')
for fmt in fmts:
fpath = db.format_abspath(id_, fmt, index_is_id=True)
if fpath is not None:
formats[fmt.lower()] = fpath
return do_save_book_to_disk(id_, mi, cover, plugboards,
formats, root, opts, length)
def do_save_book_to_disk(id_, mi, cover, plugboards,
format_map, root, opts, length):
available_formats = [x.lower().strip() for x in format_map.keys()]
if opts.formats == 'all':
asked_formats = available_formats
else:
asked_formats = [x.lower().strip() for x in opts.formats.split(',')]
formats = set(available_formats).intersection(set(asked_formats))
if not formats:
return True, id, mi.title
return True, id_, mi.title
components = get_components(opts.template, mi, id, opts.timefmt, length,
components = get_components(opts.template, mi, id_, opts.timefmt, length,
ascii_filename if opts.asciiize else sanitize_file_name,
to_lowercase=opts.to_lowercase,
replace_whitespace=opts.replace_whitespace)
base_path = os.path.join(root, *components)
base_name = os.path.basename(base_path)
dirpath = os.path.dirname(base_path)
# Don't test for existence first are the test could fail but
# Don't test for existence first as the test could fail but
# another worker process could create the directory before
# the call to makedirs
try:
@ -236,29 +254,23 @@ def save_book_to_disk(id, db, root, opts, length):
if not os.path.exists(dirpath):
raise
cdata = db.cover(id, index_is_id=True)
if opts.save_cover:
if cdata is not None:
with open(base_path+'.jpg', 'wb') as f:
f.write(cdata)
mi.cover = base_name+'.jpg'
else:
mi.cover = None
if opts.save_cover and cover and os.access(cover, os.R_OK):
with open(base_path+'.jpg', 'wb') as f:
with open(cover, 'rb') as s:
shutil.copyfileobj(s, f)
mi.cover = base_name+'.jpg'
else:
mi.cover = None
if opts.write_opf:
opf = metadata_to_opf(mi)
with open(base_path+'.opf', 'wb') as f:
f.write(opf)
if cdata is not None:
mi.cover_data = ('jpg', cdata)
mi.cover = None
written = False
for fmt in formats:
global plugboard_save_to_disk_value, plugboard_any_format_value
dev_name = plugboard_save_to_disk_value
plugboards = db.prefs.get('plugboards', {})
cpb = None
if fmt in plugboards:
cpb = plugboards[fmt]
@ -275,11 +287,12 @@ def save_book_to_disk(id, db, root, opts, length):
# Leave this here for a while, in case problems arise.
if cpb is not None:
prints('Save-to-disk using plugboard:', fmt, cpb)
data = db.format(id, fmt, index_is_id=True)
if data is None:
fp = format_map.get(fmt, None)
if fp is None:
continue
else:
written = True
with open(fp, 'rb') as f:
data = f.read()
written = True
if opts.update_metadata:
stream = cStringIO.StringIO()
stream.write(data)
@ -300,9 +313,21 @@ def save_book_to_disk(id, db, root, opts, length):
with open(fmt_path, 'wb') as f:
f.write(data)
return not written, id, mi.title
return not written, id_, mi.title
def _sanitize_args(root, opts):
if opts is None:
opts = config().parse()
if isinstance(root, unicode):
root = root.encode(filesystem_encoding)
root = os.path.abspath(root)
opts.template = preprocess_template(opts.template)
length = 1000 if supports_long_names(root) else 250
length -= len(root)
if length < 5:
raise ValueError('%r is too long.'%root)
return root, opts, length
def save_to_disk(db, ids, root, opts=None, callback=None):
'''
@ -316,17 +341,7 @@ def save_to_disk(db, ids, root, opts=None, callback=None):
:return: A list of failures. Each element of the list is a tuple
(id, title, traceback)
'''
if opts is None:
opts = config().parse()
if isinstance(root, unicode):
root = root.encode(filesystem_encoding)
root = os.path.abspath(root)
opts.template = preprocess_template(opts.template)
length = 1000 if supports_long_names(root) else 250
length -= len(root)
if length < 5:
raise ValueError('%r is too long.'%root)
root, opts, length = _sanitize_args(root, opts)
failures = []
for x in ids:
tb = ''
@ -343,4 +358,28 @@ def save_to_disk(db, ids, root, opts=None, callback=None):
break
return failures
def save_serialized_to_disk(ids, data, plugboards, root, opts, callback):
from calibre.ebooks.metadata.opf2 import OPF
root, opts, length = _sanitize_args(root, opts)
failures = []
for x in ids:
opf, cover, format_map = data[x]
if isinstance(opf, unicode):
opf = opf.encode('utf-8')
mi = OPF(cStringIO.StringIO(opf)).to_book_metadata()
tb = ''
try:
failed, id, title = do_save_book_to_disk(x, mi, cover, plugboards,
format_map, root, opts, length)
tb = _('Requested formats not available')
except:
failed, id, title = True, x, mi.title
tb = traceback.format_exc()
if failed:
failures.append((id, title, tb))
if callable(callback):
if not callback(int(id), title, failed, tb):
break
return failures

View File

@ -0,0 +1,23 @@
from __future__ import with_statement
__license__ = 'GPL 3'
__copyright__ = '2010, sengian <sengian1@gmail.com>'
__docformat__ = 'restructuredtext en'
import re
_ascii_pat = None
def clean_ascii_chars(txt, charlist=None):
'remove ASCII invalid chars : 0 to 8 and 11-14 to 24-26-27 by default'
global _ascii_pat
if _ascii_pat is None:
chars = list(range(8)) + [0x0B, 0x0E, 0x0F] + list(range(0x10, 0x19)) \
+ [0x1A, 0x1B]
_ascii_pat = re.compile(u'|'.join(map(unichr, chars)))
if charlist is None:
pat = _ascii_pat
else:
pat = re.compile(u'|'.join(map(unichr, charlist)))
return pat.sub('', txt)

View File

@ -151,3 +151,45 @@ def format_date(dt, format, assume_utc=False, as_utc=False):
format = re.sub('d{1,4}', format_day, format)
format = re.sub('M{1,4}', format_month, format)
return re.sub('yyyy|yy', format_year, format)
def replace_months(datestr, clang):
# Replace months by english equivalent for parse_date
frtoen = {
u'[jJ]anvier': u'jan',
u'[fF].vrier': u'feb',
u'[mM]ars': u'mar',
u'[aA]vril': u'apr',
u'[mM]ai': u'may',
u'[jJ]uin': u'jun',
u'[jJ]uillet': u'jul',
u'[aA]o.t': u'aug',
u'[sS]eptembre': u'sep',
u'[Oo]ctobre': u'oct',
u'[nN]ovembre': u'nov',
u'[dD].cembre': u'dec' }
detoen = {
u'[jJ]anuar': u'jan',
u'[fF]ebruar': u'feb',
u'[mM].rz': u'mar',
u'[aA]pril': u'apr',
u'[mM]ai': u'may',
u'[jJ]uni': u'jun',
u'[jJ]uli': u'jul',
u'[aA]ugust': u'aug',
u'[sS]eptember': u'sep',
u'[Oo]ktober': u'oct',
u'[nN]ovember': u'nov',
u'[dD]ezember': u'dec' }
if clang == 'fr':
dictoen = frtoen
elif clang == 'de':
dictoen = detoen
else:
return datestr
for k in dictoen.iterkeys():
tmp = re.sub(k, dictoen[k], datestr)
if tmp != datestr: break
return tmp

View File

@ -237,8 +237,6 @@ static PyTypeObject icu_CollatorType = { // {{{
// }}
// }}}
// }}}
// Module initialization {{{
@ -286,7 +284,7 @@ icu_upper(PyObject *self, PyObject *args) {
PyMem_Free(input);
return ret;
}
} // }}}
// lower {{{
static PyObject *

View File

@ -56,7 +56,7 @@ def py_sort_key(obj):
def icu_sort_key(collator, obj):
if not obj:
return _none2
return collator.sort_key(obj.lower())
return collator.sort_key(lower(obj))
def py_case_sensitive_sort_key(obj):
if not obj:

View File

@ -1227,7 +1227,7 @@ class ZipFile:
self.fp.flush()
if zinfo.flag_bits & 0x08:
# Write CRC and file sizes after the file data
self.fp.write(struct.pack("<lLL", zinfo.CRC, zinfo.compress_size,
self.fp.write(struct.pack("<LLL", zinfo.CRC, zinfo.compress_size,
zinfo.file_size))
self.filelist.append(zinfo)
self.NameToInfo[zinfo.filename] = zinfo