Merge branch 'kovidgoyal/master'

Charles Haley 2013-07-10 17:44:59 +02:00
commit a04c4159f7
20 changed files with 505 additions and 154 deletions

View File

@@ -772,9 +772,11 @@ size. By default, |app| uses a page size defined by the current
 :guilabel:`Output profile`. So if your output profile is set to Kindle, |app|
 will create a PDF with page size suitable for viewing on the small Kindle
 screen. However, if you view this PDF file on a computer screen, then it will
-appear to have too large fonts. To create "normal" sized PDFs, use the override
-page size option under :guilabel:`PDF Output` in the conversion dialog.
+appear to have too large fonts. To create "normal" sized PDFs, use the
+:guilabel:`Override page size` option under :guilabel:`PDF Output` in the conversion dialog.
 
+Headers and Footers
+^^^^^^^^^^^^^^^^^^^^
+
 You can insert arbitrary headers and footers on each page of the PDF by
 specifying header and footer templates. Templates are just snippets of HTML
 code that get rendered in the header and footer locations. For example, to

@@ -813,6 +815,9 @@ the page will be used.
 bottom margins to large enough values, under the Page Setup section of the
 conversion dialog.
 
+Printable Table of Contents
+^^^^^^^^^^^^^^^^^^^^^^^^^^^^
+
 You can also insert a printable Table of Contents at the end of the PDF that
 lists the page numbers for every section. This is very useful if you intend to
 print out the PDF to paper. If you wish to use the PDF on an electronic device,
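
Note: header and footer templates are plain HTML snippets with substitution placeholders; a typical footer template is <p style="text-align:center">Page _PAGENUM_</p>, where _PAGENUM_ is the page-number placeholder described in calibre's PDF Output documentation (an assumption from that documentation, not from the lines shown in this hunk).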

View File

@@ -776,6 +776,31 @@ The only way to find the culprit is to eliminate the programs one by one and
 see which one is causing the issue. Basically, stop a program, run calibre,
 check for crashes. If they still happen, stop another program and repeat.
 
+Using the viewer or doing any conversions results in a permission denied error on Windows
+~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
+
+Something on your computer is preventing calibre from accessing its own
+temporary files. Most likely the permissions on your Temp folder are incorrect.
+Go to the folder :file:`C:\\Users\\USERNAME\\AppData\\Local` in Windows
+Explorer, right click on the :file:`Temp` folder, select Properties and go to
+the Security tab. Make sure that your user account has full control for this
+folder.
+
+Some users have reported that running the following command in an Administrator
+Command Prompt fixed their permissions. To get an Administrator Command Prompt,
+search for cmd.exe in the start menu, then right click on the command prompt
+entry and select Run as Administrator. At the command prompt, type the following
+command and press Enter::
+
+    icacls "%appdata%\..\Local\Temp" /reset /T
+
+Alternatively, you can run calibre as Administrator, but doing so will cause
+some functionality, such as drag and drop, to stop working.
+
+Finally, some users have reported that disabling UAC fixes the problem.
+
 |app| is not starting on OS X?
 ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~

View File

@@ -1,63 +1,55 @@
 from calibre.web.feeds.news import BasicNewsRecipe
 
 class Cracked(BasicNewsRecipe):
     title = u'Cracked.com'
     __author__ = 'UnWeave'
     language = 'en'
     description = "America's Only HumorSite since 1958"
     publisher = 'Cracked'
     category = 'comedy, lists'
-    oldest_article = 3 #days
+    oldest_article = 3  # days
     max_articles_per_feed = 100
     no_stylesheets = True
     encoding = 'ascii'
     remove_javascript = True
     use_embedded_content = False
+    # auto_cleanup = True
 
-    feeds = [ (u'Articles', u'http://feeds.feedburner.com/CrackedRSS/') ]
+    feeds = [(u'Articles', u'http://feeds.feedburner.com/CrackedRSS/')]
 
     conversion_options = {
-        'comment' : description
-        , 'tags' : category
-        , 'publisher' : publisher
-        , 'language' : language
+        'comment': description, 'tags': category, 'publisher': publisher, 'language': language
     }
 
-    remove_tags_before = dict(id='PrimaryContent')
-
-    remove_tags_after = dict(name='div', attrs={'class':'shareBar'})
-
-    remove_tags = [ dict(name='div', attrs={'class':['social',
-                                                     'FacebookLike',
-                                                     'shareBar']}),
-                    dict(name='div', attrs={'id':['inline-share-buttons']}),
-                    dict(name='span', attrs={'class':['views',
-                                                      'KonaFilter']}),
-                    #dict(name='img'),
-                  ]
+    # remove_tags_before = dict(id='PrimaryContent')
+    keep_only_tags = dict(name='article', attrs={
+        'class': 'module article dropShadowBottomCurved'})
+    # remove_tags_after = dict(name='div', attrs={'class':'shareBar'})
+
+    remove_tags = [
+        dict(name='section', attrs={'class': ['socialTools', 'quickFixModule']})]
 
     def appendPage(self, soup, appendTag, position):
         # Check if article has multiple pages
-        pageNav = soup.find('nav', attrs={'class':'PaginationContent'})
+        pageNav = soup.find('nav', attrs={'class': 'PaginationContent'})
         if pageNav:
             # Check not at last page
-            nextPage = pageNav.find('a', attrs={'class':'next'})
+            nextPage = pageNav.find('a', attrs={'class': 'next'})
             if nextPage:
                 nextPageURL = nextPage['href']
                 nextPageSoup = self.index_to_soup(nextPageURL)
-                # 8th <section> tag contains article content
-                nextPageContent = nextPageSoup.findAll('section')[7]
+                # first <article> tag contains the article content
+                nextPageContent = nextPageSoup.findAll('article')[0]
                 newPosition = len(nextPageContent.contents)
-                self.appendPage(nextPageSoup,nextPageContent,newPosition)
+                self.appendPage(nextPageSoup, nextPageContent, newPosition)
                 nextPageContent.extract()
                 pageNav.extract()
-                appendTag.insert(position,nextPageContent)
+                appendTag.insert(position, nextPageContent)
 
     def preprocess_html(self, soup):
         self.appendPage(soup, soup.body, 3)
         return soup
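
Note: the documented way to exercise a modified recipe like this one is calibre's test mode, e.g. "ebook-convert Cracked.recipe .epub --test -vv" (the recipe filename is illustrative); --test fetches only a couple of articles per feed so the run stays fast.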

View File

@@ -1,41 +1,206 @@
 #!/usr/bin/env python
-# vim:fileencoding=UTF-8:ts=4:sw=4:sta:et:sts=4:ai
-from calibre.web.feeds.news import BasicNewsRecipe
+# -*- coding: utf-8 -*-
+__license__ = 'GPL v3'
+__copyright__ = '2013, Dale Furrow dkfurrow@gmail.com'
+'''
+chron.com
+'''
+import re, time
+from calibre.web.feeds.recipes import BasicNewsRecipe
+from calibre.utils.date import dt_factory, local_tz
+from datetime import datetime, timedelta, date
+from lxml import html
 
 class HoustonChronicle(BasicNewsRecipe):
 
     title = u'The Houston Chronicle'
     description = 'News from Houston, Texas'
-    __author__ = 'Kovid Goyal'
+    __author__ = 'Dale Furrow'
     language = 'en'
-    timefmt = ' [%a, %d %b, %Y]'
     no_stylesheets = True
-    use_embedded_content = False
+    # use_embedded_content = False
     remove_attributes = ['style']
-    auto_cleanup = True
-
-    oldest_article = 3.0
-
-    #keep_only_tags = {'class':lambda x: x and ('hst-articletitle' in x or
-    #'hst-articletext' in x or 'hst-galleryitem' in x)}
+    remove_empty_feeds = True
+    timefmt = '[%a, %d %b %Y]'
+    timestampfmt = '%Y%m%d%H%M%S'
+    ignore_duplicate_articles = {'url'}
     remove_attributes = ['xmlns']
 
-    feeds = [
-        ('News', "http://www.chron.com/rss/feed/News-270.php"),
-        ('Sports',
-         'http://www.chron.com/sports/headlines/collectionRss/Sports-Headlines-Staff-Stories-10767.php'),
-        ('Neighborhood',
-         'http://www.chron.com/rss/feed/Neighborhood-305.php'),
-        ('Business', 'http://www.chron.com/rss/feed/Business-287.php'),
-        ('Entertainment',
-         'http://www.chron.com/rss/feed/Entertainment-293.php'),
-        ('Editorials',
-         'http://www.chron.com/opinion/editorials/collectionRss/Opinion-Editorials-Headline-List-10567.php'),
-        ('Life', 'http://www.chron.com/rss/feed/Life-297.php'),
-        ('Science & Tech',
-         'http://www.chron.com/rss/feed/AP-Technology-and-Science-266.php'),
-    ]
+    remove_tags = [dict(name='div', attrs={'class':'socialBar'}),
+                   dict(name='div', attrs={'class':re.compile('post-commentmeta')}),
+                   dict(name='div', attrs={'class':re.compile('slideshow_wrapper')}),
+                   dict(name='div', attrs={'class':'entry-summary'}),
+                   dict(name='a', attrs={'rel':'item-license'})]
+
+    baseUrl = 'http://www.chron.com'
+
+    oldest_web_article = 7.0
+
+    if oldest_web_article is None:
+        earliest_date = date.today()
+    else:
+        earliest_date = date.today() - timedelta(days=oldest_web_article)
+
+    pages = [('news', '/news/houston-texas/'),
+             ('business', '/business/'),
+             ('opinion', '/opinion/'),
+             ('sports', '/sports/')]
+
+    def getLinksFromSectionPage(self, sectionUrl):
+        pageDoc = html.parse(sectionUrl)
+        els = pageDoc.xpath("""//div[contains(@class, 'scp-item')
+            or @class='scp-feature' or contains(@class, 'simplelist')
+            or contains(@class, 'scp-blogpromo')]
+            //a[@href and not(@target) and not(child::img)]""")
+        elList = []
+        for el in els:
+            link = el.get('href')
+            title = el.text
+            if link[:4] != 'http':
+                link = self.baseUrl + link
+            if title is not None:
+                elList.append((link, el.text))
+        return elList
+
+    def getArticleDescriptionFromDoc(self, pageDoc):
+        descriptionCharsBreak = 140
+        descriptionMaxChars = 300
+        descXpath = """//div[contains(@class, 'article-body') or
+            contains(@class, 'resource-content') or contains(@class, 'post')]//p"""
+        sentenceRegex = re.compile("(\S.+?[.!?])(?=\s+|$)")
+
+        def stringify_children(node):
+            return ''.join([x for x in node.itertext()])
+
+        try:
+            els = pageDoc.xpath(descXpath)
+            outText = ""
+            ellipsis = ""
+            for el in els:
+                sentences = re.findall(sentenceRegex, stringify_children(el))
+                for sentence in sentences:
+                    if len(outText) < descriptionCharsBreak:
+                        outText += sentence + " "
+                    else:
+                        if len(outText) > descriptionMaxChars:
+                            ellipsis = "..."
+                        return outText[:descriptionMaxChars] + ellipsis
+            return outText
+        except:
+            self.log('Error on Article Description')
+            return ""
+
+    def getPublishedTimeFromDoc(self, pageDoc):
+        regexDateOnly = re.compile("""(?:January|February|March|April|
+            May|June|July|August|September|October|November|
+            December)\s[0-9]{1,2},\s20[01][0-9]""")
+        regextTimeOnly = re.compile("""[0-9]{1,2}:[0-9]{1,2} \w{2}""")
+
+        def getRegularTimestamp(dateString):
+            try:
+                outDate = datetime.strptime(dateString, "%Y-%m-%dT%H:%M:%SZ")
+                return outDate
+            except:
+                return None
+
+        def getDateFromString(inText):
+            match = re.findall(regexDateOnly, inText)
+            if match:
+                try:
+                    outDate = datetime.strptime(match[0], "%B %d, %Y")
+                    match = re.findall(regextTimeOnly, inText)
+                    if match:
+                        outTime = datetime.strptime(match[0], "%I:%M %p")
+                        return datetime.combine(outDate.date(), outTime.time())
+                    return outDate
+                except:
+                    return None
+            else:
+                return None
+
+        el = pageDoc.xpath("//*[@class='timestamp'][1]")
+        if len(el) == 1:
+            return getRegularTimestamp(el[0].get('title'))
+        else:
+            el = pageDoc.xpath("//*[@class='entry-date' or @class='post-date'][1]")
+            if len(el) == 1:
+                return getDateFromString(el[0].text_content())
+            else:
+                return None
+
+    def getAllFeedDataFromPage(self, page):
+        articles = []
+        linkList = self.getLinksFromSectionPage(self.baseUrl + page[1])
+        self.log('from section: ', page[0], " found ", len(linkList), " links")
+        for link in linkList:
+            try:
+                articleDoc = html.parse(link[0])
+                description = self.getArticleDescriptionFromDoc(articleDoc)
+                articleDate = self.getPublishedTimeFromDoc(articleDoc)
+                if articleDate is not None and description is not None and articleDate.date() > self.earliest_date:
+                    dateText = articleDate.strftime('%a, %d %b')
+                    author = articleDate.strftime(self.timestampfmt)
+                    articles.append({'title':link[1], 'url':link[0],
+                                     'description':description, 'date':dateText, 'author':author})
+                    self.log(page[0] + ": " + link[1] + ', from ' + dateText +
+                             " description of " + str(len(description)) + ' characters at ' + link[0])
+                else:
+                    msg = ""
+                    if articleDate is None:
+                        msg = " No Timestamp Found"
+                    else:
+                        msg = " article older than " + str(self.oldest_web_article) + ' days...'
+                    self.log("Skipping article: ", link[0], msg)
+            except:
+                print 'error on fetching ' + link[0]
+                continue
+        return articles
+
+    def parse_index(self):
+        self.timefmt = ' [%a, %d %b, %Y]'
+        self.log('starting parse_index: ', time.strftime(self.timestampfmt))
+        feeds = []
+        for page in self.pages:
+            articles = []
+            articles = self.getAllFeedDataFromPage(page)
+            if articles:
+                feeds.append((page[0], articles))
+        self.log('finished parse_index: ', time.strftime(self.timestampfmt))
+        return feeds
+
+    def preprocess_html(self, thisSoup):
+        baseTags = []
+        baseTags.extend(thisSoup.findAll(name='div', attrs={'id':re.compile('post-\d+')}))
+        baseTags.extend(thisSoup.findAll(name='div', attrs={'class':'hnews hentry item'}))
+        allTags = []
+        allTags.extend(baseTags)
+        if len(baseTags) > 0:
+            for tag in baseTags:
+                allTags.extend(tag.findAll(True))
+        paragraphs = thisSoup.findAll(name='p')
+        for paragraph in paragraphs:
+            if paragraph not in allTags:
+                allTags.append(paragraph)
+        for tag in baseTags:
+            while tag.parent is not None:
+                allTags.append(tag)
+                tag = tag.parent
+        for tag in thisSoup.findAll(True):
+            if tag not in allTags:
+                tag.extract()
+        return thisSoup
+
+    def populate_article_metadata(self, article, soup, first):
+        if not first:
+            return
+        try:
+            article.date = time.strptime(article.author, self.timestampfmt)
+            article.utctime = dt_factory(article.date, assume_utc=False, as_utc=False)
+            article.localtime = article.utctime.astimezone(local_tz)
+        except Exception as inst:  # remove after debug
+            self.log('Exception: ', article.title)  # remove after debug
+            self.log(type(inst))  # remove after debug
+            self.log(inst)  # remove after debug
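
Note on the recipe's design: getAllFeedDataFromPage() smuggles each article's exact timestamp through the article's author field, formatted with timestampfmt, and populate_article_metadata() later parses it back out to set precise dates. A minimal sketch of that round trip::

    import time

    timestampfmt = '%Y%m%d%H%M%S'
    stamp = time.strftime(timestampfmt)          # e.g. '20130710174459'
    parsed = time.strptime(stamp, timestampfmt)  # recovers the full struct_time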

View File

@@ -1,17 +1,18 @@
 __license__ = 'GPL v3'
-__copyright__ = '2009, Mathieu Godlewski <mathieu at godlewski.fr>; 2010-2012, Louis Gesbert <meta at antislash dot info>'
+__copyright__ = '2009, Mathieu Godlewski <mathieu at godlewski.fr>; 2010-2012, Louis Gesbert <meta at antislash dot info>; 2013, Malah <malah at neuf.fr>'
 '''
 Mediapart
 '''
-__author__ = '2009, Mathieu Godlewski <mathieu at godlewski.fr>; 2010-2012, Louis Gesbert <meta at antislash dot info>'
+__author__ = '2009, Mathieu Godlewski <mathieu at godlewski.fr>; 2010-2012, Louis Gesbert <meta at antislash dot info>; 2013, Malah <malah at neuf.fr>'
 
+import re
 from calibre.ebooks.BeautifulSoup import BeautifulSoup, Tag
 from calibre.web.feeds.news import BasicNewsRecipe
 
 class Mediapart(BasicNewsRecipe):
     title = 'Mediapart'
-    __author__ = 'Mathieu Godlewski, Louis Gesbert'
+    __author__ = 'Mathieu Godlewski, Louis Gesbert, Malah'
     description = 'Global news in french from news site Mediapart'
     oldest_article = 7
     language = 'fr'

@@ -21,6 +22,7 @@ class Mediapart(BasicNewsRecipe):
     use_embedded_content = False
     no_stylesheets = True
 
+    masthead_url = 'https://upload.wikimedia.org/wikipedia/fr/2/23/Mediapart.png'
     cover_url = 'http://static.mediapart.fr/files/pave_mediapart.jpg'
 
     feeds = [

@@ -36,18 +38,18 @@ class Mediapart(BasicNewsRecipe):
     def print_version(self, url):
         raw = self.browser.open(url).read()
         soup = BeautifulSoup(raw.decode('utf8', 'replace'))
-        link = soup.find('a', {'title':'Imprimer'})
+        link = soup.find('a', {'href':re.compile('^/print/[0-9]+')})
         if link is None:
             return None
-        return link['href']
+        return 'http://www.mediapart.fr' + link['href']
 
     # -- Handle login
 
     def get_browser(self):
         br = BasicNewsRecipe.get_browser(self)
         if self.username is not None and self.password is not None:
-            br.open('http://www.mediapart.fr/')
-            br.select_form(nr=0)
+            br.open('http://blogs.mediapart.fr/editions/guide-du-coordonnateur-d-edition')
+            br.select_form(nr=1)
             br['name'] = self.username
             br['pass'] = self.password
             br.submit()
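
A sketch of the revised print_version() matching (the markup is illustrative): the print link is now located by its URL shape rather than by the localized title attribute, and the relative href is made absolute::

    import re
    from calibre.ebooks.BeautifulSoup import BeautifulSoup

    soup = BeautifulSoup('<p><a href="/print/12345" title="Imprimer">Imprimer</a></p>')
    link = soup.find('a', {'href': re.compile('^/print/[0-9]+')})
    print 'http://www.mediapart.fr' + link['href']  # http://www.mediapart.fr/print/12345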

View File

@@ -537,3 +537,10 @@ many_libraries = 10
 # highlight with this tweak. Set it to 'transparent' to disable highlighting.
 highlight_virtual_library = 'yellow'
 
+#: Choose available output formats for conversion
+# Restrict the list of available output formats in the conversion dialogs.
+# For example, if you only want to convert to EPUB and AZW3, change this to
+# restrict_output_formats = ['EPUB', 'AZW3']. The default value of None causes
+# all available output formats to be present.
+restrict_output_formats = None

View File

@@ -436,13 +436,21 @@ def fit_image(width, height, pwidth, pheight):
 class CurrentDir(object):
 
-    def __init__(self, path):
+    def __init__(self, path, workaround_temp_folder_permissions=False):
         self.path = path
         self.cwd = None
+        self.workaround_temp_folder_permissions = workaround_temp_folder_permissions
 
     def __enter__(self, *args):
         self.cwd = os.getcwdu()
-        os.chdir(self.path)
+        try:
+            os.chdir(self.path)
+        except OSError:
+            if not self.workaround_temp_folder_permissions:
+                raise
+            from calibre.ptempfile import reset_temp_folder_permissions
+            reset_temp_folder_permissions()
+            os.chdir(self.path)
         return self.cwd
 
     def __exit__(self, *args):
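
A minimal usage sketch for the extended context manager (the path is hypothetical): entry changes into the target directory, retrying once after a permissions reset when the new flag is set, and exit restores the saved working directory::

    from calibre import CurrentDir

    with CurrentDir('/tmp/book-extraction', workaround_temp_folder_permissions=True):
        pass  # work relative to the target directory; the previous cwd is restored on exit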

View File

@@ -282,3 +282,8 @@ def get_windows_user_locale_name():
         return None
     return u'_'.join(buf.value.split(u'-')[:2])
 
+def is_modern_webkit():
+    # Check if we are using QtWebKit >= 2.3
+    from PyQt4.QtWebKit import qWebKitMajorVersion
+    return qWebKitMajorVersion() >= 537

View File

@@ -233,7 +233,7 @@ class InputFormatPlugin(Plugin):
             # In case stdout is broken
             pass
 
-        with CurrentDir(output_dir):
+        with CurrentDir(output_dir, workaround_temp_folder_permissions=True):
             for x in os.listdir('.'):
                 shutil.rmtree(x) if os.path.isdir(x) else os.remove(x)

View File

@@ -107,7 +107,7 @@ class ANDROID(USBMS):
             0x0ff9 : [0x0226],
             0xc91  : HTC_BCDS,
             0xdddd : [0x216],
-            0xdeed : [0x231],
+            0xdeed : [0x231, 0x226],
         },
 
         # Samsung

@@ -241,7 +241,7 @@ class ANDROID(USBMS):
         'S5830I_CARD', 'MID7042', 'LINK-CREATE', '7035', 'VIEWPAD_7E',
         'NOVO7', 'MB526', '_USB#WYK7MSF8KE', 'TABLET_PC', 'F', 'MT65XX_MS',
         'ICS', 'E400', '__FILE-STOR_GADG', 'ST80208-1', 'GT-S5660M_CARD', 'XT894', '_USB',
-        'PROD_TAB13-201', 'URFPAD2',
+        'PROD_TAB13-201', 'URFPAD2', 'MID1126',
     ]
 
     WINDOWS_CARD_A_MEM = ['ANDROID_PHONE', 'GT-I9000_CARD', 'SGH-I897',
         'FILE-STOR_GADGET', 'SGH-T959_CARD', 'SGH-T959', 'SAMSUNG_ANDROID', 'GT-P1000_CARD',

@@ -254,7 +254,7 @@ class ANDROID(USBMS):
         'UMS_COMPOSITE', 'PRO', '.KOBO_VOX', 'SGH-T989_CARD', 'SGH-I727',
         'USB_FLASH_DRIVER', 'ANDROID', 'MID7042', '7035', 'VIEWPAD_7E',
         'NOVO7', 'ADVANCED', 'TABLET_PC', 'F', 'E400_SD_CARD', 'ST80208-1', 'XT894',
-        '_USB', 'PROD_TAB13-201', 'URFPAD2'
+        '_USB', 'PROD_TAB13-201', 'URFPAD2', 'MID1126',
     ]
 
     OSX_MAIN_MEM = 'Android Device Main Memory'

View File

@@ -283,11 +283,17 @@ class CollectionsBookList(BookList):
                 return -1
             if isinstance(x, basestring) and isinstance(y, basestring):
                 x, y = sort_key(force_unicode(x)), sort_key(force_unicode(y))
-            c = cmp(x, y)
+            try:
+                c = cmp(x, y)
+            except TypeError:
+                c = 0
             if c != 0:
                 return c
             # same as above -- no sort_key needed here
-            return cmp(xx[2], yy[2])
+            try:
+                return cmp(xx[2], yy[2])
+            except TypeError:
+                return 0
 
         for category, lpaths in collections.items():
             books = lpaths.values()
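
Why the new guards are needed (a sketch; the datetime example is one assumed way mixed values can reach this comparison): in Python 2, ordering comparisons between certain type pairs raise TypeError, which the guards translate into "treat as equal"::

    from datetime import datetime

    try:
        c = cmp(datetime.now(), None)  # Python 2: TypeError, can't compare datetime to NoneType
    except TypeError:
        c = 0  # fall back to "equal", as the guarded sorter above now does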

View File

@@ -19,6 +19,11 @@ from calibre.ebooks.metadata.sources.base import (Source, Option, fixcase,
 from calibre.ebooks.metadata.book.base import Metadata
 from calibre.utils.localization import canonicalize_lang
 
+def CSSSelect(expr):
+    from cssselect import HTMLTranslator
+    from lxml.etree import XPath
+    return XPath(HTMLTranslator().css_to_xpath(expr))
+
 class Worker(Thread): # Get details {{{
 
     '''

@@ -142,6 +147,8 @@ class Worker(Thread): # Get details {{{
                 starts-with(text(), "Editora:") or \
                 starts-with(text(), "出版社:")]
             '''
+        self.publisher_names = {'Publisher', 'Verlag', 'Editore', 'Editeur', 'Editor', 'Editora', '出版社'}
+
         self.language_xpath = '''
             descendant::*[
                 starts-with(text(), "Language:") \

@@ -153,6 +160,7 @@ class Worker(Thread): # Get details {{{
                 or starts-with(text(), "言語") \
                 ]
             '''
+        self.language_names = {'Language', 'Sprache', 'Lingua', 'Idioma', 'Langue', '言語'}
 
         self.ratings_pat = re.compile(
             r'([0-9.]+) ?(out of|von|su|étoiles sur|つ星のうち|de un máximo de|de) ([\d\.]+)( (stars|Sternen|stelle|estrellas|estrelas)){0,1}')

@@ -310,36 +318,44 @@ class Worker(Thread): # Get details {{{
             self.log.exception('Error parsing cover for url: %r'%self.url)
         mi.has_cover = bool(self.cover_url)
 
-        pd = root.xpath(self.pd_xpath)
-        if pd:
-            pd = pd[0]
-
-            try:
-                isbn = self.parse_isbn(pd)
-                if isbn:
-                    self.isbn = mi.isbn = isbn
-            except:
-                self.log.exception('Error parsing ISBN for url: %r'%self.url)
-
-            try:
-                mi.publisher = self.parse_publisher(pd)
-            except:
-                self.log.exception('Error parsing publisher for url: %r'%self.url)
-
-            try:
-                mi.pubdate = self.parse_pubdate(pd)
-            except:
-                self.log.exception('Error parsing publish date for url: %r'%self.url)
-
-            try:
-                lang = self.parse_language(pd)
-                if lang:
-                    mi.language = lang
-            except:
-                self.log.exception('Error parsing language for url: %r'%self.url)
-
+        non_hero = CSSSelect('div#bookDetails_container_div div#nonHeroSection')(root)
+        if non_hero:
+            # New style markup
+            try:
+                self.parse_new_details(root, mi, non_hero[0])
+            except:
+                self.log.exception('Failed to parse new-style book details section')
         else:
-            self.log.warning('Failed to find product description for url: %r'%self.url)
+            pd = root.xpath(self.pd_xpath)
+            if pd:
+                pd = pd[0]
+
+                try:
+                    isbn = self.parse_isbn(pd)
+                    if isbn:
+                        self.isbn = mi.isbn = isbn
+                except:
+                    self.log.exception('Error parsing ISBN for url: %r'%self.url)
+
+                try:
+                    mi.publisher = self.parse_publisher(pd)
+                except:
+                    self.log.exception('Error parsing publisher for url: %r'%self.url)
+
+                try:
+                    mi.pubdate = self.parse_pubdate(pd)
+                except:
+                    self.log.exception('Error parsing publish date for url: %r'%self.url)
+
+                try:
+                    lang = self.parse_language(pd)
+                    if lang:
+                        mi.language = lang
+                except:
+                    self.log.exception('Error parsing language for url: %r'%self.url)
+            else:
+                self.log.warning('Failed to find product description for url: %r'%self.url)
 
         mi.source_relevance = self.relevance

@@ -359,7 +375,13 @@ class Worker(Thread): # Get details {{{
         for l in link:
             return l.get('href').rpartition('/')[-1]
 
+    def totext(self, elem):
+        return self.tostring(elem, encoding=unicode, method='text').strip()
+
     def parse_title(self, root):
+        h1 = root.xpath('//h1[@id="title"]')
+        if h1:
+            return self.totext(h1[0])
         tdiv = root.xpath('//h1[contains(@class, "parseasinTitle")]')[0]
         actual_title = tdiv.xpath('descendant::*[@id="btAsinTitle"]')
         if actual_title:

@@ -373,6 +395,11 @@ class Worker(Thread): # Get details {{{
         return ans
 
     def parse_authors(self, root):
+        matches = CSSSelect('#byline .author .contributorNameID')(root)
+        if matches:
+            authors = [self.totext(x) for x in matches]
+            return [a for a in authors if a]
+
         x = '//h1[contains(@class, "parseasinTitle")]/following-sibling::span/*[(name()="a" and @href) or (name()="span" and @class="contributorNameTrigger")]'
         aname = root.xpath(x)
         if not aname:

@@ -420,8 +447,8 @@ class Worker(Thread): # Get details {{{
         # remove all attributes from tags
         desc = re.sub(r'<([a-zA-Z0-9]+)\s[^>]+>', r'<\1>', desc)
         # Collapse whitespace
-        #desc = re.sub('\n+', '\n', desc)
-        #desc = re.sub(' +', ' ', desc)
+        # desc = re.sub('\n+', '\n', desc)
+        # desc = re.sub(' +', ' ', desc)
         # Remove the notice about text referring to out of print editions
         desc = re.sub(r'(?s)<em>--This text ref.*?</em>', '', desc)
         # Remove comments

@@ -429,6 +456,17 @@ class Worker(Thread): # Get details {{{
         return sanitize_comments_html(desc)
 
     def parse_comments(self, root):
+        ns = CSSSelect('#bookDescription_feature_div noscript')(root)
+        if ns:
+            ns = ns[0]
+            if len(ns) == 0 and ns.text:
+                import html5lib
+                # html5lib parsed noscript as CDATA
+                ns = html5lib.parseFragment('<div>%s</div>' % (ns.text), treebuilder='lxml', namespaceHTMLElements=False)[0]
+            else:
+                ns.tag = 'div'
+            return self._render_comments(ns)
+
         ans = ''
         desc = root.xpath('//div[@id="ps-content"]/div[@class="content"]')
         if desc:

@@ -472,6 +510,37 @@ class Worker(Thread): # Get details {{{
         bn = re.sub(r'\.\.jpg$', '.jpg', (sparts[0] + sparts[-1]))
         return ('/'.join(parts[:-1]))+'/'+bn
 
+    def parse_new_details(self, root, mi, non_hero):
+        table = non_hero.xpath('descendant::table')[0]
+        for tr in table.xpath('descendant::tr'):
+            cells = tr.xpath('descendant::td')
+            if len(cells) == 2:
+                name = self.totext(cells[0])
+                val = self.totext(cells[1])
+                if not val:
+                    continue
+                if name in self.language_names:
+                    ans = self.lang_map.get(val, None)
+                    if not ans:
+                        ans = canonicalize_lang(val)
+                    if ans:
+                        mi.language = ans
+                elif name in self.publisher_names:
+                    pub = val.partition(';')[0].partition('(')[0].strip()
+                    if pub:
+                        mi.publisher = pub
+                    date = val.rpartition('(')[-1].replace(')', '').strip()
+                    try:
+                        from calibre.utils.date import parse_only_date
+                        date = self.delocalize_datestr(date)
+                        mi.pubdate = parse_only_date(date, assume_utc=True)
+                    except:
+                        self.log.exception('Failed to parse pubdate: %s' % val)
+                elif name in {'ISBN', 'ISBN-10', 'ISBN-13'}:
+                    ans = check_isbn(val)
+                    if ans:
+                        self.isbn = mi.isbn = ans
+
     def parse_isbn(self, pd):
         items = pd.xpath(
             'descendant::*[starts-with(text(), "ISBN")]')

@@ -721,9 +790,9 @@ class Amazon(Source):
 
         def title_ok(title):
             title = title.lower()
-            bad = ['bulk pack', '[audiobook]', '[audio cd]']
+            bad = ['bulk pack', '[audiobook]', '[audio cd]', '(a book companion)', '( slipcase with door )']
             if self.domain == 'com':
-                bad.append('(spanish edition)')
+                bad.extend(['(%s edition)' % x for x in ('spanish', 'german')])
             for x in bad:
                 if x in title:
                     return False

@@ -901,14 +970,9 @@ if __name__ == '__main__': # tests {{{
     # To run these test use: calibre-debug -e
     # src/calibre/ebooks/metadata/sources/amazon.py
     from calibre.ebooks.metadata.sources.test import (test_identify_plugin,
-            isbn_test, title_test, authors_test, comments_test, series_test)
+            isbn_test, title_test, authors_test, comments_test)
     com_tests = [  # {{{
 
-        (   # Has a spanish edition
-            {'title':'11/22/63'},
-            [title_test('11/22/63: A Novel', exact=True), authors_test(['Stephen King']),]
-        ),
-
         (   # + in title and uses id="main-image" for cover
             {'title':'C++ Concurrency in Action'},
             [title_test('C++ Concurrency in Action: Practical Multithreading',

@@ -916,11 +980,10 @@ if __name__ == '__main__': # tests {{{
             ]
         ),
 
-        (   # Series
+        (   # noscript description
             {'identifiers':{'amazon':'0756407117'}},
             [title_test(
-                "Throne of the Crescent Moon",
-                exact=True), series_test('Crescent Moon Kingdoms', 1),
+                "Throne of the Crescent Moon"),
                 comments_test('Makhslood'),
             ]
         ),

@@ -1054,3 +1117,4 @@ if __name__ == '__main__': # tests {{{
 # }}}
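
A sketch of what the new CSSSelect() helper compiles down to (selector and markup are illustrative): cssselect translates the CSS expression to XPath once, and the compiled XPath can then be applied to any lxml tree::

    from cssselect import HTMLTranslator
    from lxml import html
    from lxml.etree import XPath

    sel = XPath(HTMLTranslator().css_to_xpath('#byline .author .contributorNameID'))
    root = html.fromstring(
        '<div id="byline"><span class="author">'
        '<a class="contributorNameID">A. Author</a></span></div>')
    print [e.text for e in sel(root)]  # ['A. Author']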

View File

@@ -34,7 +34,7 @@ def astext(node):
     return etree.tostring(node, method='text', encoding=unicode,
                           with_tail=False).strip()
 
-class Worker(Thread): # {{{
+class Worker(Thread):  # {{{
 
     def __init__(self, sku, url, relevance, result_queue, br, timeout, log, plugin):
         Thread.__init__(self)

@@ -154,8 +154,8 @@ class Worker(Thread): # {{{
         # remove all attributes from tags
         desc = re.sub(r'<([a-zA-Z0-9]+)\s[^>]+>', r'<\1>', desc)
         # Collapse whitespace
-        #desc = re.sub('\n+', '\n', desc)
-        #desc = re.sub(' +', ' ', desc)
+        # desc = re.sub('\n+', '\n', desc)
+        # desc = re.sub(' +', ' ', desc)
         # Remove comments
         desc = re.sub(r'(?s)<!--.*?-->', '', desc)
         return sanitize_comments_html(desc)

@@ -183,14 +183,14 @@ class Edelweiss(Source):
         if sku:
             return 'http://edelweiss.abovethetreeline.com/ProductDetailPage.aspx?sku=%s'%sku
 
-    def get_book_url(self, identifiers): # {{{
+    def get_book_url(self, identifiers):  # {{{
         sku = identifiers.get('edelweiss', None)
         if sku:
             return 'edelweiss', sku, self._get_book_url(sku)
     # }}}
 
-    def get_cached_cover_url(self, identifiers): # {{{
+    def get_cached_cover_url(self, identifiers):  # {{{
         sku = identifiers.get('edelweiss', None)
         if not sku:
             isbn = identifiers.get('isbn', None)

@@ -199,7 +199,7 @@ class Edelweiss(Source):
         return self.cached_identifier_to_cover_url(sku)
     # }}}
 
-    def create_query(self, log, title=None, authors=None, identifiers={}): # {{{
+    def create_query(self, log, title=None, authors=None, identifiers={}):  # {{{
         from urllib import urlencode
         BASE_URL = 'http://edelweiss.abovethetreeline.com/CatalogOverview.aspx?'
         params = {

@@ -239,9 +239,40 @@ class Edelweiss(Source):
             params[k] = v.encode('utf-8')
         return BASE_URL+urlencode(params)
 
+    def create_query2(self, log, title=None, authors=None, identifiers={}):
+        ''' The edelweiss advanced search appears to be broken, use the keyword search instead, until it is fixed. '''
+        from urllib import urlencode
+        BASE_URL = 'http://edelweiss.abovethetreeline.com/CatalogOverview.aspx?'
+        params = {
+            'group':'search',
+            'section':'CatalogOverview',
+            'searchType':1,
+            'searchOrgID':'',
+            'searchCatalogID': '',
+            'searchMailingID': '',
+            'searchSelect':1,
+        }
+        keywords = []
+        isbn = check_isbn(identifiers.get('isbn', None))
+        if isbn is not None:
+            keywords.append(isbn)
+        elif title or authors:
+            title_tokens = list(self.get_title_tokens(title))
+            if title_tokens:
+                keywords.extend(title_tokens)
+            author_tokens = self.get_author_tokens(authors,
+                    only_first_author=True)
+            if author_tokens:
+                keywords.extend(author_tokens)
+        if not keywords:
+            return None
+        params['keywords'] = (' '.join(keywords)).encode('utf-8')
+        return BASE_URL+urlencode(params)
+
     # }}}
 
     def identify(self, log, result_queue, abort, title=None, authors=None, # {{{
             identifiers={}, timeout=30):
         from urlparse import parse_qs

@@ -251,11 +282,12 @@ class Edelweiss(Source):
             entries = [(book_url, identifiers['edelweiss'])]
         else:
             entries = []
-            query = self.create_query(log, title=title, authors=authors,
+            query = self.create_query2(log, title=title, authors=authors,
                     identifiers=identifiers)
             if not query:
                 log.error('Insufficient metadata to construct query')
                 return
+            log('Using query URL:', query)
             try:
                 raw = br.open_novisit(query, timeout=timeout).read()
             except Exception as e:

@@ -270,7 +302,8 @@ class Edelweiss(Source):
             for entry in CSSSelect('div.listRow div.listRowMain')(root):
                 a = entry.xpath('descendant::a[contains(@href, "sku=") and contains(@href, "ProductDetailPage.aspx")]')
-                if not a: continue
+                if not a:
+                    continue
                 href = a[0].get('href')
                 prefix, qs = href.partition('?')[0::2]
                 sku = parse_qs(qs).get('sku', None)

@@ -288,7 +321,7 @@ class Edelweiss(Source):
                 div = CSSSelect('div.format.attGroup')(entry)
                 text = astext(div[0]).lower()
-                if 'audio' in text or 'mp3' in text: # Audio-book, ignore
+                if 'audio' in text or 'mp3' in text:  # Audio-book, ignore
                     continue
                 entries.append((self._get_book_url(sku), sku))

@@ -321,7 +354,7 @@ class Edelweiss(Source):
     # }}}
 
-    def download_cover(self, log, result_queue, abort, # {{{
+    def download_cover(self, log, result_queue, abort,  # {{{
             title=None, authors=None, identifiers={}, timeout=30, get_best_cover=False):
         cached_url = self.get_cached_cover_url(identifiers)
         if cached_url is None:

@@ -381,7 +414,7 @@ if __name__ == '__main__':
         ),
 
         ( # Pubdate
             {'title':'The Great Gatsby', 'authors':['F. Scott Fitzgerald']},
             [title_test('The great gatsby', exact=True),
                 authors_test(['F. Scott Fitzgerald']), pubdate_test(2004, 9, 29)]

@@ -395,3 +428,5 @@ if __name__ == '__main__':
     test_identify_plugin(Edelweiss.name, tests)

View File

@@ -106,7 +106,6 @@ def single_identify(title, authors, identifiers):
             r in results], dump_caches(), log.dump()
 
 def single_covers(title, authors, identifiers, caches, tdir):
-    os.chdir(tdir)
     load_caches(caches)
     log = GUILog()
     results = Queue()

@@ -126,9 +125,9 @@ def single_covers(title, authors, identifiers, caches, tdir):
             name += '{%d}'%c[plugin.name]
         c[plugin.name] += 1
         name = '%s,,%s,,%s,,%s.cover'%(name, width, height, fmt)
-        with open(name, 'wb') as f:
+        with open(os.path.join(tdir, name), 'wb') as f:
             f.write(data)
-        os.mkdir(name+'.done')
+        os.mkdir(os.path.join(tdir, name+'.done'))
 
     return log.dump()

View File

@@ -110,6 +110,19 @@ class AddAction(InterfaceAction):
             return
         db = view.model().db
 
+        if len(ids) == 1:
+            formats = db.formats(ids[0], index_is_id=True)
+            if formats:
+                formats = {x.upper() for x in formats.split(',')}
+                nformats = {f.rpartition('.')[-1].upper() for f in books}
+                override = formats.intersection(nformats)
+                if override:
+                    title = db.title(ids[0], index_is_id=True)
+                    msg = _('The {0} format(s) will be replaced in the book {1}. Are you sure?').format(
+                        ', '.join(override), title)
+                    if not confirm(msg, 'confirm_format_override_on_add', title=_('Are you sure'), parent=self.gui):
+                        return
+
         for id_ in ids:
             for fpath in books:
                 fmt = os.path.splitext(fpath)[1][1:].upper()
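
A sketch of the duplicate-format check added above (the values are illustrative): the book's existing formats arrive as one comma-separated string, the incoming files contribute their extensions, and any overlap triggers the confirmation dialog::

    formats = {x.upper() for x in 'EPUB,MOBI'.split(',')}  # formats already in the book
    nformats = {f.rpartition('.')[-1].upper() for f in ['/tmp/new.epub', '/tmp/new.azw3']}
    print formats.intersection(nformats)  # the formats that would be replaced: EPUB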

View File

@@ -9,8 +9,7 @@ import shutil
 from PyQt4.Qt import QString, SIGNAL
 
 from calibre.gui2.convert.single import (Config, sort_formats_by_preference,
-        GroupModel, gprefs)
-from calibre.customize.ui import available_output_formats
+        GroupModel, gprefs, get_output_formats)
 from calibre.gui2 import ResizableDialog
 from calibre.gui2.convert.look_and_feel import LookAndFeelWidget
 from calibre.gui2.convert.heuristics import HeuristicsWidget

@@ -43,7 +42,6 @@ class BulkConfig(Config):
             'values saved in a previous conversion (if they exist) instead '
             'of using the defaults specified in the Preferences'))
 
-
         self.connect(self.output_formats, SIGNAL('currentIndexChanged(QString)'),
                 self.setup_pipeline)
         self.connect(self.groups, SIGNAL('activated(QModelIndex)'),

@@ -96,7 +94,8 @@ class BulkConfig(Config):
         while True:
             c = self.stack.currentWidget()
-            if not c: break
+            if not c:
+                break
             self.stack.removeWidget(c)
 
         widgets = [lf, hw, ps, sd, toc, sr]

@@ -118,17 +117,14 @@ class BulkConfig(Config):
         except:
             pass
 
     def setup_output_formats(self, db, preferred_output_format):
         if preferred_output_format:
             preferred_output_format = preferred_output_format.lower()
-        output_formats = sorted(available_output_formats(),
-                key=lambda x:{'EPUB':'!A', 'MOBI':'!B'}.get(x.upper(), x))
-        output_formats.remove('oeb')
+        output_formats = get_output_formats(preferred_output_format)
         preferred_output_format = preferred_output_format if \
             preferred_output_format and preferred_output_format \
             in output_formats else sort_formats_by_preference(output_formats,
-                    prefs['output_format'])[0]
+                    [prefs['output_format']])[0]
         self.output_formats.addItems(list(map(QString, [x.upper() for x in
             output_formats])))
         self.output_formats.setCurrentIndex(output_formats.index(preferred_output_format))

@@ -149,3 +145,4 @@ class BulkConfig(Config):
             bytearray(self.saveGeometry())
         return ResizableDialog.done(self, r)

View File

@@ -29,7 +29,7 @@ from calibre.ebooks.conversion.plumber import (Plumber,
 from calibre.ebooks.conversion.config import delete_specifics
 from calibre.customize.ui import available_output_formats
 from calibre.customize.conversion import OptionRecommendation
-from calibre.utils.config import prefs
+from calibre.utils.config import prefs, tweaks
 from calibre.utils.logging import Log
 
 class NoSupportedInputFormats(Exception):

@@ -48,6 +48,20 @@ def sort_formats_by_preference(formats, prefs):
             return len(prefs)
     return sorted(formats, key=key)
 
+def get_output_formats(preferred_output_format):
+    all_formats = {x.upper() for x in available_output_formats()}
+    all_formats.discard('OEB')
+    pfo = preferred_output_format.upper() if preferred_output_format else ''
+    restrict = tweaks['restrict_output_formats']
+    if restrict:
+        fmts = [x.upper() for x in restrict]
+        if pfo and pfo not in fmts and pfo in all_formats:
+            fmts.append(pfo)
+    else:
+        fmts = list(sorted(all_formats,
+            key=lambda x:{'EPUB':'!A', 'MOBI':'!B'}.get(x.upper(), x)))
+    return fmts
+
 class GroupModel(QAbstractListModel):
 
     def __init__(self, widgets):

@@ -239,15 +253,13 @@ class Config(ResizableDialog, Ui_Dialog):
             preferred_output_format):
         if preferred_output_format:
             preferred_output_format = preferred_output_format.lower()
-        output_formats = sorted(available_output_formats(),
-                key=lambda x:{'EPUB':'!A', 'MOBI':'!B'}.get(x.upper(), x))
-        output_formats.remove('oeb')
+        output_formats = get_output_formats(preferred_output_format)
         input_format, input_formats = get_input_format_for_book(db, book_id,
                 preferred_input_format)
         preferred_output_format = preferred_output_format if \
             preferred_output_format in output_formats else \
             sort_formats_by_preference(output_formats,
-                    prefs['output_format'])[0]
+                    [prefs['output_format']])[0]
         self.input_formats.addItems(list(map(QString, [x.upper() for x in
             input_formats])))
         self.output_formats.addItems(list(map(QString, [x.upper() for x in
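
A behavior sketch for the new get_output_formats() (a standalone re-implementation for illustration; format names are examples): with the tweak set, only the listed formats are offered, except that a book's preferred output format is appended rather than hidden::

    def sketch(all_formats, restrict, preferred):
        # mirrors get_output_formats() above, without calibre's plugin registry
        pfo = preferred.upper() if preferred else ''
        if restrict:
            fmts = [x.upper() for x in restrict]
            if pfo and pfo not in fmts and pfo in all_formats:
                fmts.append(pfo)
            return fmts
        return sorted(all_formats, key=lambda x: {'EPUB': '!A', 'MOBI': '!B'}.get(x, x))

    print sketch({'EPUB', 'MOBI', 'PDF', 'AZW3'}, ['EPUB', 'AZW3'], 'pdf')
    # ['EPUB', 'AZW3', 'PDF']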

View File

@@ -34,6 +34,19 @@ def app_prefix(prefix):
         return '%s_'%__appname__
     return '%s_%s_%s'%(__appname__, __version__, prefix)
 
+def reset_temp_folder_permissions():
+    # There are some broken windows installs where the permissions for the temp
+    # folder are set to not be executable, which means chdir() into temp
+    # folders fails. Try to fix that by resetting the permissions on the temp
+    # folder.
+    global _base_dir
+    if iswindows and _base_dir:
+        import subprocess
+        from calibre import prints
+        parent = os.path.dirname(_base_dir)
+        retcode = subprocess.Popen(['icacls.exe', parent, '/reset', '/Q', '/T']).wait()
+        prints('Trying to reset permissions of temp folder', parent, 'return code:', retcode)
+
 def base_dir():
     global _base_dir
     if _base_dir is not None and not os.path.exists(_base_dir):

View File

@@ -145,8 +145,11 @@ def download_resources(browser, resource_cache, output_dir):
             elem.removeFromDocument()
 
 def save_html(browser, output_dir, postprocess_html, url, recursion_level):
-    html = strip_encoding_declarations(browser.html)
     import html5lib
+    from calibre.utils.cleantext import clean_xml_chars
+    html = strip_encoding_declarations(browser.html)
+    if isinstance(html, unicode):
+        html = clean_xml_chars(html)
     root = html5lib.parse(html, treebuilder='lxml', namespaceHTMLElements=False).getroot()
     root = postprocess_html(root, url, recursion_level)
    if root is None:
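
Why the cleaning step is needed (a sketch, independent of calibre's clean_xml_chars() implementation): lxml refuses text containing characters that are illegal in XML, so a single stray control character in a scraped page would abort tree building::

    from lxml import etree

    el = etree.Element('p')
    try:
        el.text = u'bad \x00 char'  # NULL is illegal in XML 1.0; lxml raises ValueError
    except ValueError:
        el.text = u'bad char'       # once such characters are stripped, assignment succeeds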

View File

@@ -571,7 +571,7 @@ class Browser(QObject, FormsMixin):
                 ans[url] = raw
                 urls.discard(url)
 
-        while urls and time.time() - start_time > timeout and self.page.ready_state not in {'complete', 'completed'}:
+        while urls and time.time() - start_time < timeout and self.page.ready_state not in {'complete', 'completed'}:
             get_resources()
             if urls:
                 self.run_for_a_time(0.1)
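
The one-character fix turns the condition into the standard poll-until-deadline pattern; with ">" the loop could not start until the timeout had already expired. A generic sketch (done and poll are hypothetical callables)::

    import time

    def wait_until(done, poll, timeout=10.0):
        start_time = time.time()
        while time.time() - start_time < timeout and not done():
            poll()  # keep pumping work until completion or the deadline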