diff --git a/recipes/high_country_news.recipe b/recipes/high_country_news.recipe index 15db60a957..91602d950b 100644 --- a/recipes/high_country_news.recipe +++ b/recipes/high_country_news.recipe @@ -13,7 +13,7 @@ class HighCountryNews(BasicNewsRecipe): __author__ = 'Armin Geller' # 2012-01-31 publisher = 'High Country News' timefmt = ' [%a, %d %b %Y]' - language = 'en-Us' + language = 'en' encoding = 'UTF-8' publication_type = 'newspaper' oldest_article = 7 diff --git a/recipes/oreilly_premium.recipe b/recipes/oreilly_premium.recipe index 9dc11059c4..4a9b9e54c3 100644 --- a/recipes/oreilly_premium.recipe +++ b/recipes/oreilly_premium.recipe @@ -1,45 +1,73 @@ -# Talking Points is not grabbing everything. -# The look is right, but only the last one added? -import re +import string, re import time +import traceback +# above for debugging via stack from calibre.web.feeds.recipes import BasicNewsRecipe # Allows the Python soup converter, which makes parsing easier. from calibre.ebooks.BeautifulSoup import BeautifulSoup -# strip ads and graphics -# Current Column lacks a title. -# Talking Points Memo - shorten title - Remove year and Bill's name -# The News letter archive https://www.billoreilly.com/newsletterarchive is covered by other entries. -# Newsletters: Talking Points Memos covered by cat12 +import os, time, traceback, re, urlparse, sys, cStringIO +from collections import defaultdict +from functools import partial +from contextlib import nested, closing + + +from calibre.web.feeds import feed_from_xml, templates, feeds_from_index, Feed +from calibre.utils.threadpool import WorkRequest, ThreadPool, NoResultsPending + + +# To Do: strip ads and graphics, Current Column lacks a title. +# The newsletter archive https://www.billoreilly.com/newsletterarchive is covered by other entries. +# Newsletters: Talking Points Memos covered by cat12 +# ./ebook-convert --username xxx --password xxx + +# this is derived from BasicNewsRecipe, so it can only overload BasicNewsRecipe's methods. +# Some of what we need is otherwise in the article, so we have more copy to do than otherwise. class OReillyPremium(BasicNewsRecipe): title = u'OReilly Premium' __author__ = 'TMcN' - language = 'en' description = 'Retrieves Premium and News Letter content from BillOReilly.com. Requires a Bill OReilly Premium Membership.'
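# (A hedged editorial aside, not patch code: these are the standard
# BasicNewsRecipe overload points this recipe relies on, as they appear
# further down in this patch:
#   get_browser()     - returns the mechanize browser, signed in to
#                       billoreilly.com with the subscription credentials
#   parse_index()     - returns [(feed_title, [article dicts])] for the
#                       scraped categories in catList
#   preprocess_html() - per-page soup fixup; here it follows meta-refresh
#                       redirects by hand
#   build_index()     - overridden below to merge parse_index() results
#                       with the RSS feeds list)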
cover_url = 'http://images.billoreilly.com/images/headers/billgray_header.png' + custom_title = 'Bill O\'Reilly Premium - '+ time.strftime('%d %b %Y') + title = 'Bill O\'Reilly Premium' auto_cleanup = True + conversion_options = {'linearize_tables': True} encoding = 'utf8' - needs_subscription = True + language = 'en' no_stylesheets = True - oldest_article = 20 + needs_subscription = True + oldest_article = 31 remove_javascript = True remove_tags = [dict(name='img', attrs={})] # Don't go down recursions = 0 - max_articles_per_feed = 2000 - + max_articles_per_feed = 20 + debugMessages = True - + # Name, URL, Soup FindAll Attr if relevant (last two are special case), articleList catList = [ ["TV Archives", 'https://www.billoreilly.com/show?action=tvShowArchive', 'a', {'class':['showLinks','homeLinks']}, []], - ["No Spin Archives", 'https://www.billoreilly.com/blog?categoryID=7', True, {'class':['blogBody'], 'style':['padding-top:10px;']}, []], - ["Daily Briefings", 'http://www.billoreilly.com/blog?categoryID=11', True, {'class':['defaultHeaderSmallLinks']}, []], - ["Stratfor", 'http://www.billoreilly.com/blog?categoryID=5', 'a', {'class':['blogLinks']}, []], - ["Talking Points Memo", 'https://www.billoreilly.com/blog?categoryID=12', 'td', {}, []], + # ["No Spin Archives", 'https://www.billoreilly.com/blog?categoryID=7', True, {'class':['blogBody'], 'style':['padding-top:10px;']}, []], + # ["Daily Briefings", 'http://www.billoreilly.com/blog?categoryID=11', True, {'class':['defaultHeaderSmallLinks']}, []], + # ["Stratfor", 'http://www.billoreilly.com/blog?categoryID=5', 'a', {'class':['blogLinks']}, []], + # ["Talking Points Memo", 'https://www.billoreilly.com/blog?categoryID=12', 'td', {}, []], ["Current Column", 'https://www.billoreilly.com/currentcolumn', 'span', {'class':['defaultHeader']}, []] ] - + + feeds = [ + (u'No Spin', u'http://www.billoreilly.com/blog?action=blogArchive&rss=true&categoryID=7'), + (u'Daily Briefing', u'http://www.billoreilly.com/blog?action=blogArchive&rss=true&categoryID=11'), + (u'Talking Points', u'https://www.billoreilly.com/blog?action=blogArchive&rss=true&categoryID=12'), + (u'Blog', u'http://www.billoreilly.com/blog?action=blogArchive&rss=true&categoryID=0'), + (u'StratFor', u'http://www.billoreilly.com/blog?action=blogArchive&rss=true&categoryID=5') + ] + # http://www.billoreilly.com/blog?action=blogArchive&rss=true&categoryID=8 is word for the day. + + # Note: Talking Points is broken in the above model; the site changed to more Ajax-y. + # Now using RSS + def get_browser(self): + print("In get_browser") br = BasicNewsRecipe.get_browser() if self.username is not None and self.password is not None: br.open('https://www.billoreilly.com/pg/jsp/member/membersignin.jsp') @@ -48,7 +76,7 @@ class OReillyPremium(BasicNewsRecipe): br['formPasswordField'] = self.password br.submit() return br - + # Returns the best-guess print url. # The second parameter (pageURL) is returned if nothing is found. def extractPrintURL(self, baseURL, pageURL, printString): @@ -62,17 +90,19 @@ class OReillyPremium(BasicNewsRecipe): tag = printText.parent tagURL = baseURL+tag['href'] return tagURL - + def stripBadChars(self, inString) : return inString.replace("\'", "") - + + def parseGeneric(self, baseURL): - # Does a generic parsing of the articles. There are six categories (0-5) + # Does a generic parsing of the articles. 
There are six categories (0-5) # Name, URL, Soup FindAll Attr if relevant (last two are special case), articleList # NoSpin and TV are generic fullReturn = [] - for i in range(len(self.catList)) : + for i in range(len(self.catList)) : articleList = [] + print("In "+self.catList[i][0]+", index: "+ str(i)) soup = self.index_to_soup(self.catList[i][1]) # Set defaults description = 'None' @@ -80,15 +110,13 @@ class OReillyPremium(BasicNewsRecipe): # Problem: 0-2 create many in an array # 3-5 create one. # So no for-div for 3-5 - - if i < 3 : + + if i == 0 : + print("Starting TV Archives") for div in soup.findAll(self.catList[i][2], self.catList[i][3]): + print("Next DIV:") print(div) - if i == 1: - a = div.find('a', href=True) - else : - a = div - print(a) + a = div summary = div.find(True, attrs={'class':'summary'}) if summary: description = self.tag_to_string(summary, use_alt=False) @@ -96,82 +124,63 @@ class OReillyPremium(BasicNewsRecipe): continue # url = baseURL+re.sub(r'\?.*', '', a['href']) url = baseURL+a['href'] - if i < 2 : - url = self.extractPrintURL(baseURL, url, "Print this entry") - title = self.tag_to_string(a, use_alt=True).strip() - elif i == 2 : - # Daily Briefs - url = self.extractPrintURL(baseURL, url, "Print this entry") - title = div.contents[0] - if self.debugMessages : - print(title+" @ "+url) + url = self.extractPrintURL(baseURL, url, "Print this entry") + title = self.tag_to_string(a, use_alt=True).strip() articleList.append(dict(title=title, url=url, date=pubdate, description=description, content='')) - elif i == 3 : # Stratfor - a = soup.find('a', self.catList[i][3]) - if a is None : - continue - url = baseURL+a['href'] - title = self.tag_to_string(a, use_alt=True).strip() - # Get Stratfor contents so we can get the real title. - stratSoup = self.index_to_soup(url) - title = stratSoup.html.head.title.string - stratIndex = title.find('Stratfor.com:', 0) - if (stratIndex > -1) : - title = title[stratIndex+14:-1] - # Look for first blogBody 2K, it is used as the article. + + # calibre.web.feeds.news.BasicNewsRecipe.parse_index() fetches the list of articles. # returns a list of tuple ('feed title', list of articles) # { @@ -182,16 +191,148 @@ class OReillyPremium(BasicNewsRecipe): # 'content' : The full article (can be an empty string). This is used by FullContentProfile # } # this is used instead of BasicNewsRecipe.parse_feeds(). + # it is called by download def parse_index(self): # Parse the page into Python Soup + print("Entering recipe parse_index from:") + traceback.print_stack() + print("web") baseURL = "https://www.billoreilly.com" - return self.parseGeneric(baseURL) - + masterList = self.parseGeneric(baseURL) + #print(masterList) + return masterList + def preprocess_html(self, soup): + print("In preprocess_html") refresh = soup.find('meta', {'http-equiv':'refresh'}) if refresh is None: return soup content = refresh.get('content').partition('=')[2] raw = self.browser.open('https://www.billoreilly.com'+content).read() return BeautifulSoup(raw.decode('cp1252', 'replace')) + + def build_index(self): + print("In OReilly build_index()\n\n") + feedsRSS = [] + self.report_progress(0, _('Fetching feeds...')) + #try: + feeds = feeds_from_index(self.parse_index(), oldest_article=self.oldest_article, + max_articles_per_feed=self.max_articles_per_feed, + log=self.log) + self.report_progress(0, _('Got feeds from index page')) + #except NotImplementedError: + # feeds = self.parse_feeds() + # Now add regular feeds.
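# (A hedged sketch of the merge happening below: Feed objects produced by
# feeds_from_index() and by parse_feeds() are interchangeable, so the RSS
# categories can simply be appended to the scraped ones, roughly:
#     feeds = feeds_from_index(self.parse_index(), ...)  # scraped categories
#     feeds.extend(self.parse_feeds())                    # RSS categories
# the loop that follows does the same thing with some extra type logging.)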
+ feedsRSS = self.parse_feeds() + print ("feedsRSS is type "+feedsRSS.__class__.__name__) + + for articles in feedsRSS: + print("articles is type "+articles.__class__.__name__) + print("Title:" + articles.title) + feeds.append(articles) + if not feeds: + raise ValueError('No articles found, aborting') + + #feeds = FeedCollection(feeds) + + self.report_progress(0, _('Trying to download cover...')) + self.download_cover() + self.report_progress(0, _('Generating masthead...')) + self.masthead_path = None + + try: + murl = self.get_masthead_url() + except: + self.log.exception('Failed to get masthead url') + murl = None + + if murl is not None: + # Try downloading the user-supplied masthead_url + # Failure sets self.masthead_path to None + self.download_masthead(murl) + if self.masthead_path is None: + self.log.info("Synthesizing mastheadImage") + self.masthead_path = os.path.join(self.output_dir, 'mastheadImage.jpg') + try: + self.default_masthead_image(self.masthead_path) + except: + self.log.exception('Failed to generate default masthead image') + self.masthead_path = None + + if self.test: + feeds = feeds[:2] + self.has_single_feed = len(feeds) == 1 + + index = os.path.join(self.output_dir, 'index.html') + + html = self.feeds2index(feeds) + with open(index, 'wb') as fi: + fi.write(html) + + self.jobs = [] + + if self.reverse_article_order: + for feed in feeds: + if hasattr(feed, 'reverse'): + feed.reverse() + + self.feed_objects = feeds + for f, feed in enumerate(feeds): + feed_dir = os.path.join(self.output_dir, 'feed_%d'%f) + if not os.path.isdir(feed_dir): + os.makedirs(feed_dir) + + for a, article in enumerate(feed): + if a >= self.max_articles_per_feed: + break + art_dir = os.path.join(feed_dir, 'article_%d'%a) + if not os.path.isdir(art_dir): + os.makedirs(art_dir) + try: + url = self.print_version(article.url) + except NotImplementedError: + url = article.url + except: + self.log.exception('Failed to find print version for: '+article.url) + url = None + if not url: + continue + func, arg = (self.fetch_embedded_article, article) \ + if self.use_embedded_content or (self.use_embedded_content == None and feed.has_embedded_content()) \ + else \ + ((self.fetch_obfuscated_article if self.articles_are_obfuscated \ + else self.fetch_article), url) + req = WorkRequest(func, (arg, art_dir, f, a, len(feed)), + {}, (f, a), self.article_downloaded, + self.error_in_article_download) + req.feed = feed + req.article = article + req.feed_dir = feed_dir + self.jobs.append(req) + + + self.jobs_done = 0 + tp = ThreadPool(self.simultaneous_downloads) + for req in self.jobs: + tp.putRequest(req, block=True, timeout=0) + + + self.report_progress(0, _('Starting download [%d thread(s)]...')%self.simultaneous_downloads) + while True: + try: + tp.poll() + time.sleep(0.1) + except NoResultsPending: + break + for f, feed in enumerate(feeds): + print("Writing feeds for "+feed.title) + html = self.feed2index(f,feeds) + feed_dir = os.path.join(self.output_dir, 'feed_%d'%f) + with open(os.path.join(feed_dir, 'index.html'), 'wb') as fi: + fi.write(html) + self.create_opf(feeds) + self.report_progress(1, _('Feeds downloaded to %s')%index) + + return index + diff --git a/recipes/real_clear.recipe b/recipes/real_clear.recipe index 19add74fcd..2dfe56d207 100644 --- a/recipes/real_clear.recipe +++ b/recipes/real_clear.recipe @@ -1,7 +1,9 @@ # Test with "\Program Files\Calibre2\ebook-convert.exe" RealClear.recipe .epub --test -vv --debug-pipeline debug +import string, re import time +from urlparse import urlparse from 
calibre.web.feeds.recipes import BasicNewsRecipe -from calibre.ebooks.BeautifulSoup import NavigableString +from calibre.ebooks.BeautifulSoup import BeautifulSoup, NavigableString class RealClear(BasicNewsRecipe): title = u'Real Clear' @@ -20,12 +22,13 @@ class RealClear(BasicNewsRecipe): # Don't go down recursions = 0 max_articles_per_feed = 400 - debugMessages = False - - # Numeric parameter is type, controls whether we look for + debugMessages = True + + # Numeric parameter is type, controls whether we look for feedsets = [ - ["Politics", "http://www.realclearpolitics.com/index.xml", 0], - ["Science", "http://www.realclearscience.com/index.xml", 0], + ["Politics", "http://www.realclearpolitics.com/index.xml", 0], + ["Policy", "http://www.realclearpolicy.com/index.xml", 0], + ["Science", "http://www.realclearscience.com/index.xml", 0], ["Tech", "http://www.realcleartechnology.com/index.xml", 0], # The feedburner is essentially the same as the top feed, politics. # ["Politics Burner", "http://feeds.feedburner.com/realclearpolitics/qlMj", 1], @@ -37,22 +40,37 @@ class RealClear(BasicNewsRecipe): ] # Hints to extractPrintURL. # First column is the URL snippet. Then the string to search for as text, and the attributes to look for above it. Start with attributes and drill down. - printhints = [ + phUrlSnip, phLinkText, phMainSearch, phHrefSearch = range(4) + + printhints = [ ["realclear", "", '' , 'printpage'], ["billoreilly.com", "Print this entry", 'a', ''], ["billoreilly.com", "Print This Article", 'a', ''], - ["politico.com", "Print", 'a', 'share-print'], + ["politico.com", "Print", 'a', 'share-print'], ["nationalreview.com", ">Print<", 'a', ''], ["reason.com", "", 'a', 'printer'] # The following are not supported due to JavaScripting, and would require obfuscated_article to handle - # forbes, + # forbes, # usatoday - just prints with all current crap anyhow - + ] - + # RCP - look for a strange compound. See http://www.realclearpolitics.com/articles/2012/01/24/in_speech_obama_to_call_for_fairness_--_and_four_more_years_112879.html + # The print link isn't obvious, and only the end is needed (the -full append.) So maybe try that first? + # http://www.realclearpolitics.com/printpage/?url=http://www.realclearpolitics.com/articles/2012/01/24/in_speech_obama_to_call_for_fairness_--_and_four_more_years_112879-full.html + # Single page articles don't have a _full; e.g. http://www.realclearpolitics.com/articles/2012/01/25/obamas_green_robber_barons_112897.html + # Use the FULL PRINTPAGE URL; it formats it better too! + # + # NYT - try single page... + # Need special code - is it one page or several? Which URL? + # from http://www.nytimes.com/2012/01/22/business/apple-america-and-a-squeezed-middle-class.html?_r=1 + # to http://www.nytimes.com/2012/01/22/business/apple-america-and-a-squeezed-middle-class.html?_r=1&pagewanted=all + # which is at link rel="canonical" - if len(self.printhints[x][3])>0 and len(self.printhints[x][1]) == 0: + if len(self.printhints[x][self.phHrefSearch])>0 and len(self.printhints[x][self.phLinkText]) == 0: + # e.g.
RealClear if self.debugMessages == True : - print("search1") + print("Search by href: "+self.printhints[x][self.phHrefSearch]) + printFind = soup.find(href=re.compile(self.printhints[x][self.phHrefSearch])) + elif len(self.printhints[x][3])>0 and len(self.printhints[x][1]) == 0: + if self.debugMessages == True : + print("Search 1: "+self.printhints[x][2]+" Attributes: ") + print(self.printhints[x][3]) printFind = soup.find(self.printhints[x][2], attrs=self.printhints[x][3]) elif len(self.printhints[x][3])>0 : if self.debugMessages == True : print("search2") printFind = soup.find(self.printhints[x][2], attrs=self.printhints[x][3], text=self.printhints[x][1]) else : + if self.debugMessages == True: + print("Default Search: "+self.printhints[x][2]+" Text: "+self.printhints[x][1]) printFind = soup.find(self.printhints[x][2], text=self.printhints[x][1]) if printFind is None: if self.debugMessages == True : print("Not Found") + # print(soup) + print("end soup\n\n"); continue + print(printFind) if isinstance(printFind, NavigableString)==False: if printFind['href'] is not None: + print("Check "+printFind['href']+" for base of "+baseURL) + if printFind['href'].find("http")!=0 : + return baseURL+printFind['href'] return printFind['href'] tag = printFind.parent print(tag) @@ -98,7 +130,7 @@ class RealClear(BasicNewsRecipe): print("In get_browser") br = BasicNewsRecipe.get_browser() return br - + def parseRSS(self, index) : if self.debugMessages == True : print("\n\nStarting "+self.feedsets[index][0]) @@ -128,7 +160,7 @@ class RealClear(BasicNewsRecipe): pubDateEl = div.find("pubDate") if pubDateEl is None : pubDateEl = div.find("pubdate") - if pubDateEl is None : + if pubDateEl is None : pubDate = time.strftime('%a, %d %b') else : pubDate = pubDateEl.contents[0] @@ -144,7 +176,7 @@ class RealClear(BasicNewsRecipe): pubdate = time.strftime('%a, %d %b') articleList.append(dict(title=title, url=url, date=pubdate, description=description, content='')) return articleList - + # calibre.web.feeds.news.BasicNewsRecipe.parse_index() fetches the list of articles. # returns a list of tuple ('feed title', list of articles) # { @@ -157,7 +189,8 @@ class RealClear(BasicNewsRecipe): # this is used instead of BasicNewsRecipe.parse_feeds(). 
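# (A condensed, hedged restatement of the printhints dispatch inside
# extractPrintURL above, using the phUrlSnip..phHrefSearch index names; the
# fourth column doubles as an attrs/class filter or an href regex:
#     if hint[phHrefSearch] and not hint[phLinkText]:
#         found = soup.find(href=re.compile(hint[phHrefSearch]))  # href regex
#     elif hint[phHrefSearch]:
#         found = soup.find(hint[phMainSearch], attrs=hint[phHrefSearch],
#                           text=hint[phLinkText])
#     else:
#         found = soup.find(hint[phMainSearch], text=hint[phLinkText])
# note that the second elif in the code above tests the same condition as
# the first if (column 3 non-empty, column 1 empty), so it can never fire.)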
def parse_index(self): # Parse the page into Python Soup - + + articleList = [] ans = [] feedsCount = len(self.feedsets) for x in range(0,feedsCount): # should be ,4 @@ -167,4 +200,5 @@ class RealClear(BasicNewsRecipe): if self.debugMessages == True : print(ans) return ans + diff --git a/recipes/soldiers.recipe b/recipes/soldiers.recipe index fb96e5a2ed..a1e9e5ca23 100644 --- a/recipes/soldiers.recipe +++ b/recipes/soldiers.recipe @@ -15,6 +15,8 @@ class Soldiers(BasicNewsRecipe): max_articles_per_feed = 100 no_stylesheets = True use_embedded_content = False + auto_cleanup = True + auto_cleanup_keep = '//div[@id="mediaWrapper"]' simultaneous_downloads = 1 delay = 4 max_connections = 1 @@ -31,14 +33,14 @@ class Soldiers(BasicNewsRecipe): , 'language' : language } - keep_only_tags = [dict(name='div', attrs={'id':['storyHeader','textArea']})] + #keep_only_tags = [dict(name='div', attrs={'id':['storyHeader','textArea']})] - remove_tags = [ - dict(name='div', attrs={'id':['addThis','comment','articleFooter']}) - ,dict(name=['object','link']) - ] + #remove_tags = [ + #dict(name='div', attrs={'id':['addThis','comment','articleFooter']}) + #,dict(name=['object','link']) + #] - feeds = [(u'Frontpage', u'http://www.army.mil/rss/feeds/soldiersfrontpage.xml' )] + feeds = [(u'Frontpage', u'http://www.army.mil/rss/2/' )] def get_cover_url(self): diff --git a/setup/installer/windows/freeze.py b/setup/installer/windows/freeze.py index 3e251d2dcf..c5ea18e2e9 100644 --- a/setup/installer/windows/freeze.py +++ b/setup/installer/windows/freeze.py @@ -14,7 +14,7 @@ from setup.build_environment import msvc, MT, RC from setup.installer.windows.wix import WixMixIn OPENSSL_DIR = r'Q:\openssl' -QT_DIR = 'Q:\\Qt\\4.8.0' +QT_DIR = 'Q:\\Qt\\4.8.1' QT_DLLS = ['Core', 'Gui', 'Network', 'Svg', 'WebKit', 'Xml', 'XmlPatterns'] LIBUNRAR = 'C:\\Program Files\\UnrarDLL\\unrar.dll' SW = r'C:\cygwin\home\kovid\sw' diff --git a/src/calibre/devices/android/driver.py b/src/calibre/devices/android/driver.py index ce5a076fdf..e213b50bd2 100644 --- a/src/calibre/devices/android/driver.py +++ b/src/calibre/devices/android/driver.py @@ -107,6 +107,7 @@ class ANDROID(USBMS): 0xc004 : [0x0226], 0x8801 : [0x0226, 0x0227], 0xe115 : [0x0216], # PocketBook A10 + 0xe107 : [0x326], # PocketBook 622 }, # Acer diff --git a/src/calibre/ebooks/metadata/sources/worker.py b/src/calibre/ebooks/metadata/sources/worker.py new file mode 100644 index 0000000000..f2db60e01f --- /dev/null +++ b/src/calibre/ebooks/metadata/sources/worker.py @@ -0,0 +1,95 @@ +#!/usr/bin/env python +# vim:fileencoding=UTF-8:ts=4:sw=4:sta:et:sts=4:ai +from __future__ import (unicode_literals, division, absolute_import, + print_function) + +__license__ = 'GPL v3' +__copyright__ = '2012, Kovid Goyal ' +__docformat__ = 'restructuredtext en' + +import os +from threading import Event +from io import BytesIO + +from calibre.utils.date import as_utc +from calibre.ebooks.metadata.sources.identify import identify, msprefs +from calibre.ebooks.metadata.book.base import Metadata +from calibre.customize.ui import metadata_plugins +from calibre.ebooks.metadata.sources.covers import download_cover +from calibre.utils.logging import GUILog +from calibre.ebooks.metadata.opf2 import metadata_to_opf, OPF + +def merge_result(oldmi, newmi, ensure_fields=None): + dummy = Metadata(_('Unknown')) + for f in msprefs['ignore_fields']: + if ':' in f or (ensure_fields and f in ensure_fields): + continue + setattr(newmi, f, getattr(dummy, f)) + fields = set() + for plugin in 
metadata_plugins(['identify']): + fields |= plugin.touched_fields + + def is_equal(x, y): + if hasattr(x, 'tzinfo'): + x = as_utc(x) + if hasattr(y, 'tzinfo'): + y = as_utc(y) + return x == y + + for f in fields: + # Optimize so that set_metadata does not have to do extra work later + if not f.startswith('identifier:'): + if (not newmi.is_null(f) and is_equal(getattr(newmi, f), + getattr(oldmi, f))): + setattr(newmi, f, getattr(dummy, f)) + + return newmi + +def main(do_identify, covers, metadata, ensure_fields): + failed_ids = set() + failed_covers = set() + all_failed = True + log = GUILog() + + for book_id, mi in metadata.iteritems(): + mi = OPF(BytesIO(mi), basedir=os.getcwdu(), + populate_spine=False).to_book_metadata() + title, authors, identifiers = mi.title, mi.authors, mi.identifiers + cdata = None + log.clear() + + if do_identify: + results = [] + try: + results = identify(log, Event(), title=title, authors=authors, + identifiers=identifiers) + except: + pass + if results: + all_failed = False + mi = merge_result(mi, results[0], ensure_fields=ensure_fields) + identifiers = mi.identifiers + if not mi.is_null('rating'): + # set_metadata expects a rating out of 10 + mi.rating *= 2 + with open('%d.mi'%book_id, 'wb') as f: + f.write(metadata_to_opf(mi, default_lang='und')) + else: + log.error('Failed to download metadata for', title) + failed_ids.add(book_id) + + if covers: + cdata = download_cover(log, title=title, authors=authors, + identifiers=identifiers) + if cdata is None: + failed_covers.add(book_id) + else: + with open('%d.cover'%book_id, 'wb') as f: + f.write(cdata[-1]) + all_failed = False + + with open('%d.log'%book_id, 'wb') as f: + f.write(log.plain_text.encode('utf-8')) + + return failed_ids, failed_covers, all_failed + diff --git a/src/calibre/ebooks/mobi/reader/mobi8.py b/src/calibre/ebooks/mobi/reader/mobi8.py index d2254e00d8..0ca5341780 100644 --- a/src/calibre/ebooks/mobi/reader/mobi8.py +++ b/src/calibre/ebooks/mobi/reader/mobi8.py @@ -10,13 +10,19 @@ __docformat__ = 'restructuredtext en' import struct, re, os, imghdr from collections import namedtuple from itertools import repeat +from urlparse import urldefrag + +from lxml import etree from calibre.ebooks.mobi.reader.headers import NULL_INDEX from calibre.ebooks.mobi.reader.index import read_index from calibre.ebooks.mobi.reader.ncx import read_ncx, build_toc from calibre.ebooks.mobi.reader.markup import expand_mobi8_markup from calibre.ebooks.metadata.opf2 import Guide, OPFCreator +from calibre.ebooks.metadata.toc import TOC from calibre.ebooks.mobi.utils import read_font_record +from calibre.ebooks.oeb.parse_utils import parse_html +from calibre.ebooks.oeb.base import XPath, XHTML, xml2text Part = namedtuple('Part', 'num type filename start end aid') @@ -383,6 +389,19 @@ class Mobi8Reader(object): len(resource_map)): mi.cover = resource_map[self.cover_offset] + if len(list(toc)) < 2: + self.log.warn('KF8 has no metadata Table of Contents') + + for ref in guide: + if ref.type == 'toc': + href = ref.href() + href, frag = urldefrag(href) + if os.path.exists(href.replace('/', os.sep)): + try: + toc = self.read_inline_toc(href, frag) + except: + self.log.exception('Failed to read inline ToC') + opf = OPFCreator(os.getcwdu(), mi) opf.guide = guide @@ -397,4 +416,70 @@ class Mobi8Reader(object): opf.render(of, ncx, 'toc.ncx') return 'metadata.opf' + def read_inline_toc(self, href, frag): + ans = TOC() + base_href = '/'.join(href.split('/')[:-1]) + with open(href.replace('/', os.sep), 'rb') as f: + raw = 
f.read().decode(self.header.codec) + root = parse_html(raw, log=self.log) + body = XPath('//h:body')(root) + reached = False + if body: + start = body[0] + else: + start = None + reached = True + if frag: + elems = XPath('//*[@id="%s"]'%frag) + if elems: + start = elems[0] + + def node_depth(elem): + ans = 0 + parent = elem.getparent() + while parent is not None: + parent = parent.getparent() + ans += 1 + return ans + + # Layer the ToC based on nesting order in the source HTML + current_depth = None + parent = ans + seen = set() + links = [] + for elem in root.iterdescendants(etree.Element): + if reached and elem.tag == XHTML('a') and elem.get('href', + False): + href = elem.get('href') + href, frag = urldefrag(href) + href = base_href + '/' + href + text = xml2text(elem).strip() + if (text, href, frag) in seen: + continue + seen.add((text, href, frag)) + links.append((text, href, frag, node_depth(elem))) + elif elem is start: + reached = True + + depths = sorted(set(x[-1] for x in links)) + depth_map = {x:i for i, x in enumerate(depths)} + for text, href, frag, depth in links: + depth = depth_map[depth] + if current_depth is None: + current_depth = 0 + parent.add_item(href, frag, text) + elif current_depth == depth: + parent.add_item(href, frag, text) + elif current_depth < depth: + parent = parent[-1] if len(parent) > 0 else parent + parent.add_item(href, frag, text) + current_depth += 1 + else: + delta = current_depth - depth + while delta > 0 and parent.parent is not None: + parent = parent.parent + delta -= 1 + parent.add_item(href, frag, text) + current_depth = depth + return ans diff --git a/src/calibre/ebooks/pdf/writer.py b/src/calibre/ebooks/pdf/writer.py index beeb31f3c5..a680d61188 100644 --- a/src/calibre/ebooks/pdf/writer.py +++ b/src/calibre/ebooks/pdf/writer.py @@ -40,27 +40,34 @@ def get_custom_size(opts): custom_size = None return custom_size -def get_pdf_printer(opts, for_comic=False): +def get_pdf_printer(opts, for_comic=False, output_file_name=None): from calibre.gui2 import is_ok_to_use_qt if not is_ok_to_use_qt(): raise Exception('Not OK to use Qt') printer = QPrinter(QPrinter.HighResolution) custom_size = get_custom_size(opts) - - if opts.output_profile.short_name == 'default' or \ - opts.output_profile.width > 9999: - if custom_size is None: - printer.setPaperSize(paper_size(opts.paper_size)) - else: - printer.setPaperSize(QSizeF(custom_size[0], custom_size[1]), unit(opts.unit)) + if isosx and not for_comic: + # On OSX, the native engine can only produce a single page size + # (usually A4). The Qt engine on the other hand produces image based + # PDFs. If we set a custom page size using QSizeF the native engine + # produces unreadable output, so we just ignore the custom size + # settings. 
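# (Illustrative arithmetic, not patch code: in the non-OSX branch below the
# page size is the output profile's pixel dimensions divided by its dpi,
# e.g. for a hypothetical 600x800 px profile at 166 dpi:
#     w_in, h_in = 600 / 166.0, 800 / 166.0   # ~3.61in x 4.82in
# which is what QSizeF(float(w) / dpi, float(h) / dpi) with QPrinter.Inch
# expresses.)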
+ printer.setPaperSize(paper_size(opts.paper_size)) else: - w = opts.output_profile.comic_screen_size[0] if for_comic else \ - opts.output_profile.width - h = opts.output_profile.comic_screen_size[1] if for_comic else \ - opts.output_profile.height - dpi = opts.output_profile.dpi - printer.setPaperSize(QSizeF(float(w) / dpi, float(h) / dpi), QPrinter.Inch) + if opts.output_profile.short_name == 'default' or \ + opts.output_profile.width > 9999: + if custom_size is None: + printer.setPaperSize(paper_size(opts.paper_size)) + else: + printer.setPaperSize(QSizeF(custom_size[0], custom_size[1]), unit(opts.unit)) + else: + w = opts.output_profile.comic_screen_size[0] if for_comic else \ + opts.output_profile.width + h = opts.output_profile.comic_screen_size[1] if for_comic else \ + opts.output_profile.height + dpi = opts.output_profile.dpi + printer.setPaperSize(QSizeF(float(w) / dpi, float(h) / dpi), QPrinter.Inch) if for_comic: # Comic pages typically have their own margins, or their background @@ -72,6 +79,12 @@ def get_pdf_printer(opts, for_comic=False): printer.setOrientation(orientation(opts.orientation)) printer.setOutputFormat(QPrinter.PdfFormat) printer.setFullPage(for_comic) + if output_file_name: + printer.setOutputFileName(output_file_name) + if isosx and not for_comic: + # Ensure we are not generating enormous image based PDFs + printer.setOutputFormat(QPrinter.NativeFormat) + return printer def get_printer_page_size(opts, for_comic=False): @@ -163,15 +176,7 @@ class PDFWriter(QObject): # {{{ if ok: item_path = os.path.join(self.tmp_path, '%i.pdf' % len(self.combine_queue)) self.logger.debug('\tRendering item %s as %i.pdf' % (os.path.basename(str(self.view.url().toLocalFile())), len(self.combine_queue))) - printer = get_pdf_printer(self.opts) - printer.setOutputFileName(item_path) - # We have to set the engine to Native on OS X after the call to set - # filename. Setting a filename with .pdf as the extension causes - # Qt to set the format to use Qt's PDF engine even if native was - # previously set on the printer. Qt's PDF engine produces image - # based PDFs on OS X, so we cannot use it. 
- if isosx: - printer.setOutputFormat(QPrinter.NativeFormat) + printer = get_pdf_printer(self.opts, output_file_name=item_path) self.view.page().mainFrame().evaluateJavaScript(''' document.body.style.backgroundColor = "white"; @@ -193,10 +198,7 @@ class PDFWriter(QObject): # {{{ if self.cover_data is None: return item_path = os.path.join(self.tmp_path, 'cover.pdf') - printer = get_pdf_printer(self.opts) - printer.setOutputFileName(item_path) - if isosx: - printer.setOutputFormat(QPrinter.NativeFormat) + printer = get_pdf_printer(self.opts, output_file_name=item_path) self.combine_queue.insert(0, item_path) p = QPixmap() p.loadFromData(self.cover_data) @@ -248,10 +250,8 @@ class ImagePDFWriter(object): os.remove(f.name) def render_images(self, outpath, mi, items): - printer = get_pdf_printer(self.opts, for_comic=True) - printer.setOutputFileName(outpath) - if isosx: - printer.setOutputFormat(QPrinter.NativeFormat) + printer = get_pdf_printer(self.opts, for_comic=True, + output_file_name=outpath) printer.setDocName(mi.title) printer.setCreator(u'%s [%s]'%(__appname__, __version__)) # Seems to be no way to set author diff --git a/src/calibre/gui2/__init__.py b/src/calibre/gui2/__init__.py index e6d4ccaac0..d334816985 100644 --- a/src/calibre/gui2/__init__.py +++ b/src/calibre/gui2/__init__.py @@ -105,6 +105,7 @@ gprefs.defaults['show_files_after_save'] = True gprefs.defaults['auto_add_path'] = None gprefs.defaults['auto_add_check_for_duplicates'] = False gprefs.defaults['blocked_auto_formats'] = [] +gprefs.defaults['auto_add_auto_convert'] = True # }}} NONE = QVariant() #: Null value to return from the data function of item models diff --git a/src/calibre/gui2/actions/add.py b/src/calibre/gui2/actions/add.py index bbdef5b1b5..cf47684063 100644 --- a/src/calibre/gui2/actions/add.py +++ b/src/calibre/gui2/actions/add.py @@ -71,7 +71,7 @@ class AddAction(InterfaceAction): ma('add-formats', _('Add files to selected book records'), triggered=self.add_formats, shortcut=_('Shift+A')) self.add_menu.addSeparator() - ma('add-config', _('Configure the adding of books'), + ma('add-config', _('Control the adding of books'), triggered=self.add_config) self.qaction.triggered.connect(self.add_books) diff --git a/src/calibre/gui2/actions/convert.py b/src/calibre/gui2/actions/convert.py index fc1d166685..34e03dc275 100644 --- a/src/calibre/gui2/actions/convert.py +++ b/src/calibre/gui2/actions/convert.py @@ -53,6 +53,24 @@ class ConvertAction(InterfaceAction): self.queue_convert_jobs(jobs, changed, bad, rows, previous, self.book_auto_converted, extra_job_args=[on_card]) + def auto_convert_auto_add(self, book_ids): + previous = self.gui.library_view.currentIndex() + db = self.gui.current_db + needed = set() + of = prefs['output_format'].lower() + for book_id in book_ids: + fmts = db.formats(book_id, index_is_id=True) + fmts = set(x.lower() for x in fmts.split(',')) if fmts else set() + if of not in fmts: + needed.add(book_id) + if needed: + jobs, changed, bad = convert_single_ebook(self.gui, + self.gui.library_view.model().db, needed, True, of, + show_no_format_warning=False) + if not jobs: return + self.queue_convert_jobs(jobs, changed, bad, list(needed), previous, + self.book_converted, rows_are_ids=True) + def auto_convert_mail(self, to, fmts, delete_from_library, book_ids, format, subject): previous = self.gui.library_view.currentIndex() rows = [x.row() for x in \ @@ -118,7 +136,7 @@ class ConvertAction(InterfaceAction): num, 2000) def queue_convert_jobs(self, jobs, changed, bad, rows, previous, - 
converted_func, extra_job_args=[]): + converted_func, extra_job_args=[], rows_are_ids=False): for func, args, desc, fmt, id, temp_files in jobs: func, _, same_fmt = func.partition(':') same_fmt = same_fmt == 'same_fmt' @@ -140,7 +158,11 @@ class ConvertAction(InterfaceAction): self.conversion_jobs[job] = tuple(args) if changed: - self.gui.library_view.model().refresh_rows(rows) + m = self.gui.library_view.model() + if rows_are_ids: + m.refresh_ids(rows) + else: + m.refresh_rows(rows) current = self.gui.library_view.currentIndex() self.gui.library_view.model().current_changed(current, previous) diff --git a/src/calibre/gui2/actions/edit_metadata.py b/src/calibre/gui2/actions/edit_metadata.py index 527beae0ab..4a0d12e3d3 100644 --- a/src/calibre/gui2/actions/edit_metadata.py +++ b/src/calibre/gui2/actions/edit_metadata.py @@ -5,7 +5,7 @@ __license__ = 'GPL v3' __copyright__ = '2010, Kovid Goyal ' __docformat__ = 'restructuredtext en' -import os +import os, shutil from functools import partial from PyQt4.Qt import QMenu, QModelIndex, QTimer @@ -16,6 +16,7 @@ from calibre.gui2.dialogs.confirm_delete import confirm from calibre.gui2.dialogs.device_category_editor import DeviceCategoryEditor from calibre.gui2.actions import InterfaceAction from calibre.ebooks.metadata import authors_to_string +from calibre.ebooks.metadata.opf2 import OPF from calibre.utils.icu import sort_key from calibre.db.errors import NoSuchFormat @@ -79,14 +80,23 @@ class EditMetadataAction(InterfaceAction): Dispatcher(self.metadata_downloaded), ensure_fields=ensure_fields) + def cleanup_bulk_download(self, tdir): + try: + shutil.rmtree(tdir, ignore_errors=True) + except: + pass + def metadata_downloaded(self, job): if job.failed: self.gui.job_exception(job, dialog_title=_('Failed to download metadata')) return from calibre.gui2.metadata.bulk_download import get_job_details - id_map, failed_ids, failed_covers, all_failed, det_msg = \ - get_job_details(job) + (aborted, id_map, tdir, log_file, failed_ids, failed_covers, all_failed, + det_msg, lm_map) = get_job_details(job) + if aborted: + return self.cleanup_bulk_download(tdir) if all_failed: + self.cleanup_bulk_download(tdir) return error_dialog(self.gui, _('Download failed'), _('Failed to download metadata or covers for any of the %d' ' book(s).') % len(id_map), det_msg=det_msg, show=True) @@ -103,28 +113,26 @@ class EditMetadataAction(InterfaceAction): msg += '
<p>
'+_('Could not download metadata and/or covers for %d of the books. Click' ' "Show details" to see which books.')%num - payload = (id_map, failed_ids, failed_covers) + payload = (id_map, tdir, log_file, lm_map) from calibre.gui2.dialogs.message_box import ProceedNotification p = ProceedNotification(self.apply_downloaded_metadata, - payload, job.html_details, + payload, log_file, _('Download log'), _('Download complete'), msg, det_msg=det_msg, show_copy_button=show_copy_button, - parent=self.gui) + parent=self.gui, log_is_file=True) p.show() def apply_downloaded_metadata(self, payload): - id_map, failed_ids, failed_covers = payload - id_map = dict([(k, v) for k, v in id_map.iteritems() if k not in - failed_ids]) - if not id_map: + good_ids, tdir, log_file, lm_map = payload + if not good_ids: return modified = set() db = self.gui.current_db - for i, mi in id_map.iteritems(): + for i in good_ids: lm = db.metadata_last_modified(i, index_is_id=True) - if lm > mi.last_modified: + if lm > lm_map[i]: title = db.title(i, index_is_id=True) authors = db.authors(i, index_is_id=True) if authors: @@ -144,7 +152,18 @@ class EditMetadataAction(InterfaceAction): 'Do you want to proceed?'), det_msg='\n'.join(modified)): return - self.apply_metadata_changes(id_map) + id_map = {} + for bid in good_ids: + opf = os.path.join(tdir, '%d.mi'%bid) + if not os.path.exists(opf): + opf = None + cov = os.path.join(tdir, '%d.cover'%bid) + if not os.path.exists(cov): + cov = None + id_map[bid] = (opf, cov) + + self.apply_metadata_changes(id_map, callback=lambda x: + self.cleanup_bulk_download(tdir)) # }}} @@ -468,6 +487,11 @@ class EditMetadataAction(InterfaceAction): callback can be either None or a function accepting a single argument, in which case it is called after applying is complete with the list of changed ids. + + id_map can also be a mapping of ids to 2-tuples where each 2-tuple + contains the absolute paths to an OPF and cover file respectively. If + either of the paths is None, then the corresponding metadata is not + updated.
''' if title is None: title = _('Applying changed metadata') @@ -492,28 +516,48 @@ class EditMetadataAction(InterfaceAction): return self.finalize_apply() i, mi = self.apply_id_map[self.apply_current_idx] + if isinstance(mi, tuple): + opf, cover = mi + if opf: + mi = OPF(open(opf, 'rb'), basedir=os.path.dirname(opf), + populate_spine=False).to_book_metadata() + self.apply_mi(i, mi) + if cover: + self.gui.current_db.set_cover(i, open(cover, 'rb'), + notify=False, commit=False) + else: + self.apply_mi(i, mi) + + self.apply_current_idx += 1 + if self.apply_pd is not None: + self.apply_pd.value += 1 + QTimer.singleShot(50, self.do_one_apply) + + + def apply_mi(self, book_id, mi): db = self.gui.current_db + try: set_title = not mi.is_null('title') set_authors = not mi.is_null('authors') - idents = db.get_identifiers(i, index_is_id=True) + idents = db.get_identifiers(book_id, index_is_id=True) if mi.identifiers: idents.update(mi.identifiers) mi.identifiers = idents if mi.is_null('series'): mi.series_index = None if self._am_merge_tags: - old_tags = db.tags(i, index_is_id=True) + old_tags = db.tags(book_id, index_is_id=True) if old_tags: tags = [x.strip() for x in old_tags.split(',')] + ( mi.tags if mi.tags else []) mi.tags = list(set(tags)) - db.set_metadata(i, mi, commit=False, set_title=set_title, + db.set_metadata(book_id, mi, commit=False, set_title=set_title, set_authors=set_authors, notify=False) - self.applied_ids.append(i) + self.applied_ids.append(book_id) except: import traceback - self.apply_failures.append((i, traceback.format_exc())) + self.apply_failures.append((book_id, traceback.format_exc())) try: if mi.cover: @@ -521,11 +565,6 @@ class EditMetadataAction(InterfaceAction): except: pass - self.apply_current_idx += 1 - if self.apply_pd is not None: - self.apply_pd.value += 1 - QTimer.singleShot(50, self.do_one_apply) - def finalize_apply(self): db = self.gui.current_db db.commit() diff --git a/src/calibre/gui2/auto_add.py b/src/calibre/gui2/auto_add.py index a0be1b72fb..033d7124d5 100644 --- a/src/calibre/gui2/auto_add.py +++ b/src/calibre/gui2/auto_add.py @@ -113,6 +113,7 @@ class Worker(Thread): class AutoAdder(QObject): metadata_read = pyqtSignal(object) + auto_convert = pyqtSignal(object) def __init__(self, path, parent): QObject.__init__(self, parent) @@ -124,6 +125,8 @@ class AutoAdder(QObject): self.metadata_read.connect(self.add_to_db, type=Qt.QueuedConnection) QTimer.singleShot(2000, self.initialize) + self.auto_convert.connect(self.do_auto_convert, + type=Qt.QueuedConnection) elif path: prints(path, 'is not a valid directory to watch for new ebooks, ignoring') @@ -163,6 +166,7 @@ class AutoAdder(QObject): needs_rescan = False duplicates = [] + added_ids = set() for fname, tdir in data.iteritems(): paths = [os.path.join(self.worker.path, fname)] @@ -187,9 +191,12 @@ class AutoAdder(QObject): continue mi = [OPF(open(mi, 'rb'), tdir, populate_spine=False).to_book_metadata()] - dups, num = m.add_books(paths, + dups, ids = m.add_books(paths, [os.path.splitext(fname)[1][1:].upper()], mi, - add_duplicates=not gprefs['auto_add_check_for_duplicates']) + add_duplicates=not gprefs['auto_add_check_for_duplicates'], + return_ids=True) + added_ids |= set(ids) + num = len(ids) if dups: path = dups[0][0] with open(os.path.join(tdir, 'dup_cache.'+dups[1][0].lower()), @@ -217,8 +224,10 @@ class AutoAdder(QObject): _('Books with the same title as the following already ' 'exist in the database. 
Add them anyway?'), '\n'.join(files)): - dups, num = m.add_books(paths, formats, metadata, - add_duplicates=True) + dups, ids = m.add_books(paths, formats, metadata, + add_duplicates=True, return_ids=True) + added_ids |= set(ids) + num = len(ids) count += num for tdir in data.itervalues(): @@ -227,6 +236,9 @@ class AutoAdder(QObject): except: pass + if added_ids and gprefs['auto_add_auto_convert']: + self.auto_convert.emit(added_ids) + if count > 0: m.books_added(count) gui.status_bar.show_message(_( @@ -238,4 +250,7 @@ class AutoAdder(QObject): if needs_rescan: QTimer.singleShot(2000, self.dir_changed) + def do_auto_convert(self, added_ids): + gui = self.parent() + gui.iactions['Convert Books'].auto_convert_auto_add(added_ids) diff --git a/src/calibre/gui2/dialogs/message_box.py b/src/calibre/gui2/dialogs/message_box.py index cd1e38682e..64c8bf75ba 100644 --- a/src/calibre/gui2/dialogs/message_box.py +++ b/src/calibre/gui2/dialogs/message_box.py @@ -160,7 +160,7 @@ class ProceedNotification(MessageBox): # {{{ def __init__(self, callback, payload, html_log, log_viewer_title, title, msg, det_msg='', show_copy_button=False, parent=None, - cancel_callback=None): + cancel_callback=None, log_is_file=False): ''' A non modal popup that notifies the user that a background task has been completed. @@ -175,12 +175,15 @@ class ProceedNotification(MessageBox): # {{{ :param title: The title for this popup :param msg: The msg to display :param det_msg: Detailed message + :param log_is_file: If True the html_log parameter is interpreted as + the path to a file on disk containing the log encoded with utf-8 ''' MessageBox.__init__(self, MessageBox.QUESTION, title, msg, det_msg=det_msg, show_copy_button=show_copy_button, parent=parent) self.payload = payload self.html_log = html_log + self.log_is_file = log_is_file self.log_viewer_title = log_viewer_title self.vlb = self.bb.addButton(_('View log'), self.bb.ActionRole) @@ -192,7 +195,11 @@ class ProceedNotification(MessageBox): # {{{ _proceed_memory.append(self) def show_log(self): - self.log_viewer = ViewLog(self.log_viewer_title, self.html_log, + log = self.html_log + if self.log_is_file: + with open(log, 'rb') as f: + log = f.read().decode('utf-8') + self.log_viewer = ViewLog(self.log_viewer_title, log, parent=self) def do_proceed(self, result): @@ -202,9 +209,9 @@ class ProceedNotification(MessageBox): # {{{ gui = get_gui() gui.proceed_requested.emit(func, self.payload) # Ensure this notification is garbage collected + self.vlb.clicked.disconnect() self.callback = self.cancel_callback = self.payload = None self.setParent(None) - self.vlb.clicked.disconnect() _proceed_memory.remove(self) def done(self, r): diff --git a/src/calibre/gui2/dialogs/search.ui b/src/calibre/gui2/dialogs/search.ui index f3f96547bd..0a536010ef 100644 --- a/src/calibre/gui2/dialogs/search.ui +++ b/src/calibre/gui2/dialogs/search.ui @@ -140,34 +140,6 @@ - - - - - 16777215 - 60 - - - - - - - - 40 - 0 - - - - - - - matchkind - - - - - - diff --git a/src/calibre/gui2/jobs.py b/src/calibre/gui2/jobs.py index 8c1b5388d7..c0d61332ab 100644 --- a/src/calibre/gui2/jobs.py +++ b/src/calibre/gui2/jobs.py @@ -402,7 +402,8 @@ class DetailView(QDialog, Ui_Dialog): # {{{ self.setupUi(self) self.setWindowTitle(job.description) self.job = job - self.html_view = hasattr(job, 'html_details') + self.html_view = (hasattr(job, 'html_details') and not getattr(job, + 'ignore_html_details', False)) if self.html_view: self.log.setVisible(False) else: diff --git a/src/calibre/gui2/library/models.py 
b/src/calibre/gui2/library/models.py index 0b3c048a2e..e0047c2a70 100644 --- a/src/calibre/gui2/library/models.py +++ b/src/calibre/gui2/library/models.py @@ -187,9 +187,10 @@ class BooksModel(QAbstractTableModel): # {{{ self.db = None self.reset() - def add_books(self, paths, formats, metadata, add_duplicates=False): + def add_books(self, paths, formats, metadata, add_duplicates=False, + return_ids=False): ret = self.db.add_books(paths, formats, metadata, - add_duplicates=add_duplicates) + add_duplicates=add_duplicates, return_ids=return_ids) self.count_changed() return ret diff --git a/src/calibre/gui2/metadata/bulk_download.py b/src/calibre/gui2/metadata/bulk_download.py index 976dfad2bb..3487ffd8f2 100644 --- a/src/calibre/gui2/metadata/bulk_download.py +++ b/src/calibre/gui2/metadata/bulk_download.py @@ -7,22 +7,41 @@ __license__ = 'GPL v3' __copyright__ = '2011, Kovid Goyal ' __docformat__ = 'restructuredtext en' +import os, time, shutil from functools import partial -from itertools import izip -from threading import Event from PyQt4.Qt import (QIcon, QDialog, QDialogButtonBox, QLabel, QGridLayout, QPixmap, Qt) from calibre.gui2.threaded_jobs import ThreadedJob -from calibre.ebooks.metadata.sources.identify import identify, msprefs -from calibre.ebooks.metadata.sources.covers import download_cover -from calibre.ebooks.metadata.book.base import Metadata -from calibre.customize.ui import metadata_plugins -from calibre.ptempfile import PersistentTemporaryFile -from calibre.utils.date import as_utc +from calibre.ebooks.metadata.opf2 import metadata_to_opf +from calibre.utils.ipc.simple_worker import fork_job, WorkerError +from calibre.ptempfile import (PersistentTemporaryDirectory, + PersistentTemporaryFile) # Start download {{{ + +class Job(ThreadedJob): + + ignore_html_details = True + + def consolidate_log(self): + self.consolidated_log = self.log.plain_text + self.log = None + + def read_consolidated_log(self): + return self.consolidated_log + + @property + def details(self): + if self.consolidated_log is None: + return self.log.plain_text + return self.read_consolidated_log() + + @property + def log_file(self): + return open(self.download_debug_log, 'rb') + def show_config(gui, parent): from calibre.gui2.preferences import show_config_widget show_config_widget('Sharing', 'Metadata download', parent=parent, @@ -104,19 +123,22 @@ def start_download(gui, ids, callback, ensure_fields=None): d.b.clicked.disconnect() if ret != d.Accepted: return + tf = PersistentTemporaryFile('_metadata_bulk_log_') + tf.close() - for batch in split_jobs(ids): - job = ThreadedJob('metadata bulk download', - _('Download metadata for %d books')%len(batch), - download, (batch, gui.current_db, d.identify, d.covers, - ensure_fields), {}, callback) - gui.job_manager.run_threaded_job(job) + job = Job('metadata bulk download', + _('Download metadata for %d books')%len(ids), + download, (ids, tf.name, gui.current_db, d.identify, d.covers, + ensure_fields), {}, callback) + job.download_debug_log = tf.name + gui.job_manager.run_threaded_job(job) gui.status_bar.show_message(_('Metadata download started'), 3000) # }}} def get_job_details(job): - id_map, failed_ids, failed_covers, title_map, all_failed = job.result + (aborted, good_ids, tdir, log_file, failed_ids, failed_covers, title_map, + lm_map, all_failed) = job.result det_msg = [] for i in failed_ids | failed_covers: title = title_map[i] @@ -126,92 +148,89 @@ def get_job_details(job): title += (' ' + _('(Failed cover)')) det_msg.append(title) det_msg = 
'\n'.join(det_msg) - return id_map, failed_ids, failed_covers, all_failed, det_msg + return (aborted, good_ids, tdir, log_file, failed_ids, failed_covers, + all_failed, det_msg, lm_map) -def merge_result(oldmi, newmi, ensure_fields=None): - dummy = Metadata(_('Unknown')) - for f in msprefs['ignore_fields']: - if ':' in f or (ensure_fields and f in ensure_fields): - continue - setattr(newmi, f, getattr(dummy, f)) - fields = set() - for plugin in metadata_plugins(['identify']): - fields |= plugin.touched_fields +class HeartBeat(object): + CHECK_INTERVAL = 300 # seconds + ''' Check that the file count in tdir changes every five minutes ''' - def is_equal(x, y): - if hasattr(x, 'tzinfo'): - x = as_utc(x) - if hasattr(y, 'tzinfo'): - y = as_utc(y) - return x == y + def __init__(self, tdir): + self.tdir = tdir + self.last_count = len(os.listdir(self.tdir)) + self.last_time = time.time() - for f in fields: - # Optimize so that set_metadata does not have to do extra work later - if not f.startswith('identifier:'): - if (not newmi.is_null(f) and is_equal(getattr(newmi, f), - getattr(oldmi, f))): - setattr(newmi, f, getattr(dummy, f)) + def __call__(self): + if time.time() - self.last_time > self.CHECK_INTERVAL: + c = len(os.listdir(self.tdir)) + if c == self.last_count: + return False + self.last_count = c + self.last_time = time.time() + return True - newmi.last_modified = oldmi.last_modified +# Fix log viewer, ratings +# Test: abort, covers only, metadata only, both, 200 entry download, memory +# consumption, all errors and on and on - return newmi - -def download(ids, db, do_identify, covers, ensure_fields, +def download(all_ids, tf, db, do_identify, covers, ensure_fields, log=None, abort=None, notifications=None): - ids = list(ids) - metadata = [db.get_metadata(i, index_is_id=True, get_user_categories=False) - for i in ids] + batch_size = 10 + batches = split_jobs(all_ids, batch_size=batch_size) + tdir = PersistentTemporaryDirectory('_metadata_bulk_') + heartbeat = HeartBeat(tdir) + failed_ids = set() failed_covers = set() title_map = {} - ans = {} - count = 0 + lm_map = {} + ans = set() all_failed = True - ''' - # Test apply dialog - all_failed = do_identify = covers = False - ''' - for i, mi in izip(ids, metadata): + aborted = False + count = 0 + + for ids in batches: if abort.is_set(): log.error('Aborting...') break - title, authors, identifiers = mi.title, mi.authors, mi.identifiers - title_map[i] = title - if do_identify: - results = [] - try: - results = identify(log, Event(), title=title, authors=authors, - identifiers=identifiers) - except: - pass - if results: - all_failed = False - mi = merge_result(mi, results[0], ensure_fields=ensure_fields) - identifiers = mi.identifiers - if not mi.is_null('rating'): - # set_metadata expects a rating out of 10 - mi.rating *= 2 - else: - log.error('Failed to download metadata for', title) - failed_ids.add(i) - # We don't want set_metadata operating on anything but covers - mi = merge_result(mi, mi, ensure_fields=ensure_fields) - if covers: - cdata = download_cover(log, title=title, authors=authors, - identifiers=identifiers) - if cdata is not None: - with PersistentTemporaryFile('.jpg', 'downloaded-cover-') as f: - f.write(cdata[-1]) - mi.cover = f.name - all_failed = False - else: - failed_covers.add(i) - ans[i] = mi - count += 1 + metadata = {i:db.get_metadata(i, index_is_id=True, + get_user_categories=False) for i in ids} + for i in ids: + title_map[i] = metadata[i].title + lm_map[i] = metadata[i].last_modified + metadata = 
{i:metadata_to_opf(mi, default_lang='und') for i, mi in + metadata.iteritems()} + try: + ret = fork_job('calibre.ebooks.metadata.sources.worker', 'main', + (do_identify, covers, metadata, ensure_fields), + cwd=tdir, abort=abort, heartbeat=heartbeat, no_output=True) + except WorkerError as e: + if e.orig_tb: + raise Exception('Failed to download metadata. Original ' + 'traceback: \n\n'+e.orig_tb) + raise + count += batch_size notifications.put((count/len(ids), - _('Downloaded %(num)d of %(tot)d')%dict(num=count, tot=len(ids)))) + _('Downloaded %(num)d of %(tot)d')%dict( + num=count, tot=len(all_ids)))) + + fids, fcovs, allf = ret['result'] + if not allf: + all_failed = False + failed_ids = failed_ids.union(fids) + failed_covers = failed_covers.union(fcovs) + ans = ans.union(set(ids) - fids) + for book_id in ids: + lp = os.path.join(tdir, '%d.log'%book_id) + if os.path.exists(lp): + with open(tf, 'ab') as dest, open(lp, 'rb') as src: + dest.write(('\n'+'#'*20 + ' Log for %s '%title_map[book_id] + + '#'*20+'\n').encode('utf-8')) + shutil.copyfileobj(src, dest) + + if abort.is_set(): + aborted = True log('Download complete, with %d failures'%len(failed_ids)) - return (ans, failed_ids, failed_covers, title_map, all_failed) - - + return (aborted, ans, tdir, tf, failed_ids, failed_covers, title_map, + lm_map, all_failed) diff --git a/src/calibre/gui2/metadata/single.py b/src/calibre/gui2/metadata/single.py index 840753c706..23728b5901 100644 --- a/src/calibre/gui2/metadata/single.py +++ b/src/calibre/gui2/metadata/single.py @@ -161,10 +161,10 @@ class MetadataSingleDialogBase(ResizableDialog): self.manage_authors_button.clicked.connect(self.authors.manage_authors) self.series = SeriesEdit(self) - self.remove_unused_series_button = QToolButton(self) - self.remove_unused_series_button.setToolTip( - _('Remove unused series (Series that have no books)') ) - self.remove_unused_series_button.clicked.connect(self.remove_unused_series) + self.clear_series_button = QToolButton(self) + self.clear_series_button.setToolTip( + _('Clear series') ) + self.clear_series_button.clicked.connect(self.series.clear) self.series_index = SeriesIndexEdit(self, self.series) self.basic_metadata_widgets.extend([self.series, self.series_index]) @@ -198,6 +198,7 @@ class MetadataSingleDialogBase(ResizableDialog): self.basic_metadata_widgets.append(self.identifiers) self.clear_identifiers_button = QToolButton(self) self.clear_identifiers_button.setIcon(QIcon(I('trash.png'))) + self.clear_identifiers_button.setToolTip(_('Clear Ids')) self.clear_identifiers_button.clicked.connect(self.identifiers.clear) self.paste_isbn_button = QToolButton(self) self.paste_isbn_button.setToolTip('
<p>
' + @@ -303,17 +304,6 @@ class MetadataSingleDialogBase(ResizableDialog): self.title_sort.auto_generate() self.author_sort.auto_generate() - def remove_unused_series(self, *args): - self.db.remove_unused_series() - idx = self.series.current_val - self.series.clear() - self.series.initialize(self.db, self.book_id) - if idx: - for i in range(self.series.count()): - if unicode(self.series.itemText(i)) == idx: - self.series.setCurrentIndex(i) - break - def tags_editor(self, *args): self.tags.edit(self.db, self.book_id) @@ -591,7 +581,7 @@ class MetadataSingleDialog(MetadataSingleDialogBase): # {{{ sto(self.title_sort, self.authors) create_row(1, self.authors, self.deduce_author_sort_button, self.author_sort) sto(self.author_sort, self.series) - create_row(2, self.series, self.remove_unused_series_button, + create_row(2, self.series, self.clear_series_button, self.series_index, icon='trash.png') sto(self.series_index, self.swap_title_author_button) sto(self.swap_title_author_button, self.manage_authors_button) @@ -756,7 +746,7 @@ class MetadataSingleDialogAlt1(MetadataSingleDialogBase): # {{{ span=2, icon='auto_author_sort.png') create_row(3, self.author_sort, self.series) create_row(4, self.series, self.series_index, - button=self.remove_unused_series_button, icon='trash.png') + button=self.clear_series_button, icon='trash.png') create_row(5, self.series_index, self.tags) create_row(6, self.tags, self.rating, button=self.tags_editor_button) create_row(7, self.rating, self.pubdate) @@ -892,7 +882,7 @@ class MetadataSingleDialogAlt2(MetadataSingleDialogBase): # {{{ span=2, icon='auto_author_sort.png') create_row(3, self.author_sort, self.series) create_row(4, self.series, self.series_index, - button=self.remove_unused_series_button, icon='trash.png') + button=self.clear_series_button, icon='trash.png') create_row(5, self.series_index, self.tags) create_row(6, self.tags, self.rating, button=self.tags_editor_button) create_row(7, self.rating, self.pubdate) diff --git a/src/calibre/gui2/preferences/adding.py b/src/calibre/gui2/preferences/adding.py index 1e8395b4f3..fafc5b5a1c 100644 --- a/src/calibre/gui2/preferences/adding.py +++ b/src/calibre/gui2/preferences/adding.py @@ -36,6 +36,7 @@ class ConfigWidget(ConfigWidgetBase, Ui_Form): r('new_book_tags', prefs, setting=CommaSeparatedList) r('auto_add_path', gprefs, restart_required=True) r('auto_add_check_for_duplicates', gprefs) + r('auto_add_auto_convert', gprefs) self.filename_pattern = FilenamePattern(self) self.metadata_box.layout().insertWidget(0, self.filename_pattern) diff --git a/src/calibre/gui2/preferences/adding.ui b/src/calibre/gui2/preferences/adding.ui index 900ed62103..f04d55ff28 100644 --- a/src/calibre/gui2/preferences/adding.ui +++ b/src/calibre/gui2/preferences/adding.ui @@ -151,6 +151,19 @@ Author matching is exact. &Automatic Adding + + + + If set, this option will cause calibre to check if a file + being auto-added is already in the calibre library. + If it is, a message will pop up asking you whether + you want to add it anyway. + + + Check for &duplicates when auto-adding files + + + @@ -168,7 +181,7 @@ Author matching is exact. - + Ignore files with the following extensions when automatically adding - - - @@ -187,7 +200,7 @@ - + Qt::Horizontal @@ -225,16 +238,10 @@ Author matching is exact. - - - - If set, this option will causes calibre to check if a file - being auto-added is already in the calibre library. - If it is, a meesage will pop up asking you whether - you want to add it anyway.
- + + - Check for &duplicates when auto-adding files + Automatically &convert added files to the current output format diff --git a/src/calibre/gui2/store/opensearch_store.py b/src/calibre/gui2/store/opensearch_store.py index bcc92b25f1..a66418aa77 100644 --- a/src/calibre/gui2/store/opensearch_store.py +++ b/src/calibre/gui2/store/opensearch_store.py @@ -73,11 +73,13 @@ class OpenSearchOPDSStore(StorePlugin): type = link.get('type') if rel and href and type: - if rel in ('http://opds-spec.org/thumbnail', 'http://opds-spec.org/image/thumbnail'): + if 'http://opds-spec.org/thumbnail' in rel: s.cover_url = href - elif rel == u'http://opds-spec.org/acquisition/buy': + elif 'http://opds-spec.org/image/thumbnail' in rel: + s.cover_url = href + elif 'http://opds-spec.org/acquisition/buy' in rel: s.detail_item = href - elif rel == u'http://opds-spec.org/acquisition': + elif 'http://opds-spec.org/acquisition' in rel: if type: ext = mimetypes.guess_extension(type) if ext: diff --git a/src/calibre/gui2/tools.py b/src/calibre/gui2/tools.py index f1df707ad4..242cac5d79 100644 --- a/src/calibre/gui2/tools.py +++ b/src/calibre/gui2/tools.py @@ -25,7 +25,7 @@ from calibre.ebooks.conversion.config import GuiRecommendations, \ from calibre.gui2.convert import bulk_defaults_for_input_format def convert_single_ebook(parent, db, book_ids, auto_conversion=False, # {{{ - out_format=None): + out_format=None, show_no_format_warning=True): changed = False jobs = [] bad = [] @@ -91,7 +91,7 @@ def convert_single_ebook(parent, db, book_ids, auto_conversion=False, # {{{ except NoSupportedInputFormats: bad.append(book_id) - if bad != []: + if bad and show_no_format_warning: res = [] for id in bad: title = db.title(id, True) diff --git a/src/calibre/library/database2.py b/src/calibre/library/database2.py index 1b4e8390f1..72ff9cd08d 100644 --- a/src/calibre/library/database2.py +++ b/src/calibre/library/database2.py @@ -3243,7 +3243,8 @@ class LibraryDatabase2(LibraryDatabase, SchemaUpgrade, CustomColumns): return id - def add_books(self, paths, formats, metadata, add_duplicates=True): + def add_books(self, paths, formats, metadata, add_duplicates=True, + return_ids=False): ''' Add a book to the database. The result cache is not updated. :param:`paths` List of paths to book files or file-like objects @@ -3289,7 +3290,7 @@ class LibraryDatabase2(LibraryDatabase, SchemaUpgrade, CustomColumns): formats = list(duplicate[1] for duplicate in duplicates) metadata = list(duplicate[2] for duplicate in duplicates) return (paths, formats, metadata), len(ids) - return None, len(ids) + return None, (ids if return_ids else len(ids)) def import_book(self, mi, formats, notify=True, import_hooks=True, apply_import_tags=True, preserve_uuid=False): diff --git a/src/calibre/web/feeds/news.py b/src/calibre/web/feeds/news.py index 77428e4c07..75e9d03d6e 100644 --- a/src/calibre/web/feeds/news.py +++ b/src/calibre/web/feeds/news.py @@ -648,7 +648,10 @@ class BasicNewsRecipe(Recipe): 'url' : URL of print version, 'date' : The publication date of the article as a string, 'description' : A summary of the article - 'content' : The full article (can be an empty string). This is used by FullContentProfile + 'content' : The full article (can be an empty string). Obsolete, + do not use; instead save the content to a temporary + file and pass a file:///path/to/temp/file.html as + the URL. } For an example, see the recipe for downloading `The Atlantic`.
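A minimal sketch of the pattern the note above recommends, with a hypothetical helper name; PersistentTemporaryFile is the calibre temp-file class already used elsewhere in this patch:

    import os
    from calibre.ptempfile import PersistentTemporaryFile

    def article_with_saved_content(title, html, date, description):
        # Persist the full article HTML instead of stuffing it into 'content',
        # then point 'url' at the saved file so the downloader fetches that.
        pt = PersistentTemporaryFile('.html')
        pt.write(html.encode('utf-8'))
        pt.close()
        url = 'file:///' + pt.name.replace(os.sep, '/').lstrip('/')
        return dict(title=title, url=url, date=date,
                    description=description, content='')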