Mirror of https://github.com/kovidgoyal/calibre.git (synced 2025-07-07 10:14:46 -04:00)

commit a04c4159f7
Merge branch 'kovidgoyal/master'
@@ -772,9 +772,11 @@ size. By default, |app| uses a page size defined by the current
 :guilabel:`Output profile`. So if your output profile is set to Kindle, |app|
 will create a PDF with page size suitable for viewing on the small kindle
 screen. However, if you view this PDF file on a computer screen, then it will
-appear to have too large fonts. To create "normal" sized PDFs, use the override
-page size option under :guilabel:`PDF Output` in the conversion dialog.
+appear to have too large fonts. To create "normal" sized PDFs, use the
+:guilabel:`Override page size` option under :guilabel:`PDF Output` in the conversion dialog.
 
+Headers and Footers
+^^^^^^^^^^^^^^^^^^^^
 You can insert arbitrary headers and footers on each page of the PDF by
 specifying header and footer templates. Templates are just snippets of HTML
 code that get rendered in the header and footer locations. For example, to

@@ -813,6 +815,9 @@ the page will be used.
 bottom margins to large enough values, under the Page Setup section of the
 conversion dialog.
 
+Printable Table of Contents
+^^^^^^^^^^^^^^^^^^^^^^^^^^^^
+
 You can also insert a printable Table of Contents at the end of the PDF that
 lists the page numbers for every section. This is very useful if you intend to
 print out the PDF to paper. If you wish to use the PDF on an electronic device,
@@ -776,6 +776,31 @@ The only way to find the culprit is to eliminate the programs one by one and
 see which one is causing the issue. Basically, stop a program, run calibre,
 check for crashes. If they still happen, stop another program and repeat.
 
+
+Using the viewer or doing any conversions results in a permission denied error on windows
+~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
+
+Something on your computer is preventing calibre from accessing its own
+temporary files. Most likely the permissions on your Temp folder are incorrect.
+Go to the folder :file:`C:\\Users\\USERNAME\\AppData\\Local` in Windows
+Explorer and then right click on the :file:`Temp` folder, select Properties and go to
+the Security tab. Make sure that your user account has full control for this
+folder.
+
+Some users have reported that running the following command in an Administrator
+Command Prompt fixed their permissions. To get an Administrator Command Prompt
+search for cmd.exe in the start menu, then right click on the command prompt
+entry and select Run as Administrator. At the command prompt type the following
+command and press Enter::
+
+    icacls "%appdata%\..\Local\Temp" /reset /T
+
+Alternately, you can run calibre as Administrator, but doing so will cause
+some functionality, such as drag and drop to not work.
+
+Finally, some users have reported that disabling UAC fixes the problem.
+
+
 |app| is not starting on OS X?
 ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
 
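The icacls reset can also be scripted. A minimal Python sketch of the same repair (assumes Windows; the LOCALAPPDATA-based path is an assumption standing in for %appdata%\..\Local\Temp):

    # Reset ACLs on the per-user Temp folder; icacls.exe ships with Windows.
    import os
    import subprocess

    temp = os.path.join(os.environ['LOCALAPPDATA'], 'Temp')  # assumed Temp location
    retcode = subprocess.call(['icacls.exe', temp, '/reset', '/T'])
    print('icacls returned ' + str(retcode))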
@@ -1,63 +1,55 @@
 from calibre.web.feeds.news import BasicNewsRecipe
 
-class Cracked(BasicNewsRecipe):
-    title = u'Cracked.com'
-    __author__ = 'UnWeave'
-    language = 'en'
-    description = "America's Only HumorSite since 1958"
-    publisher = 'Cracked'
-    category = 'comedy, lists'
-    oldest_article = 3 #days
-    max_articles_per_feed = 100
-    no_stylesheets = True
-    encoding = 'ascii'
-    remove_javascript = True
-    use_embedded_content = False
-
-    feeds = [ (u'Articles', u'http://feeds.feedburner.com/CrackedRSS/') ]
+
+class Cracked(BasicNewsRecipe):
+    title = u'Cracked.com'
+    __author__ = 'UnWeave'
+    language = 'en'
+    description = "America's Only HumorSite since 1958"
+    publisher = 'Cracked'
+    category = 'comedy, lists'
+    oldest_article = 3  # days
+    max_articles_per_feed = 100
+    no_stylesheets = True
+    encoding = 'ascii'
+    remove_javascript = True
+    use_embedded_content = False
+    # auto_cleanup = True
+
+    feeds = [(u'Articles', u'http://feeds.feedburner.com/CrackedRSS/')]
 
     conversion_options = {
-        'comment'   : description
-        , 'tags'      : category
-        , 'publisher' : publisher
-        , 'language'  : language
-    }
+        'comment': description, 'tags': category, 'publisher': publisher, 'language': language
+    }
 
-    remove_tags_before = dict(id='PrimaryContent')
+    # remove_tags_before = dict(id='PrimaryContent')
 
-    remove_tags_after = dict(name='div', attrs={'class':'shareBar'})
+    keep_only_tags = dict(name='article', attrs={
+        'class': 'module article dropShadowBottomCurved'})
 
-    remove_tags = [ dict(name='div', attrs={'class':['social',
-                                                     'FacebookLike',
-                                                     'shareBar'
-                                                    ]}),
-
-                    dict(name='div', attrs={'id':['inline-share-buttons',
-                                                 ]}),
-
-                    dict(name='span', attrs={'class':['views',
-                                                      'KonaFilter'
-                                                     ]}),
-                    #dict(name='img'),
-                  ]
+    # remove_tags_after = dict(name='div', attrs={'class':'shareBar'})
+
+    remove_tags = [
+        dict(name='section', attrs={'class': ['socialTools', 'quickFixModule']})]
 
     def appendPage(self, soup, appendTag, position):
         # Check if article has multiple pages
-        pageNav = soup.find('nav', attrs={'class':'PaginationContent'})
+        pageNav = soup.find('nav', attrs={'class': 'PaginationContent'})
         if pageNav:
             # Check not at last page
-            nextPage = pageNav.find('a', attrs={'class':'next'})
+            nextPage = pageNav.find('a', attrs={'class': 'next'})
             if nextPage:
                 nextPageURL = nextPage['href']
                 nextPageSoup = self.index_to_soup(nextPageURL)
                 # 8th <section> tag contains article content
-                nextPageContent = nextPageSoup.findAll('section')[7]
+                nextPageContent = nextPageSoup.findAll('article')[0]
                 newPosition = len(nextPageContent.contents)
-                self.appendPage(nextPageSoup,nextPageContent,newPosition)
+                self.appendPage(nextPageSoup, nextPageContent, newPosition)
                 nextPageContent.extract()
                 pageNav.extract()
-                appendTag.insert(position,nextPageContent)
+                appendTag.insert(position, nextPageContent)
 
     def preprocess_html(self, soup):
         self.appendPage(soup, soup.body, 3)
         return soup
@@ -1,41 +1,206 @@
 #!/usr/bin/env python
-# vim:fileencoding=UTF-8:ts=4:sw=4:sta:et:sts=4:ai
-
-from calibre.web.feeds.news import BasicNewsRecipe
+# -*- coding: utf-8 -*-
+__license__ = 'GPL v3'
+__copyright__ = '2013, Dale Furrow dkfurrow@gmail.com'
+'''
+chron.com
+'''
+import re, time
+from calibre.web.feeds.recipes import BasicNewsRecipe
+from calibre.utils.date import dt_factory, local_tz
+from datetime import datetime, timedelta, date
+from lxml import html
 
 
 class HoustonChronicle(BasicNewsRecipe):
 
     title = u'The Houston Chronicle'
     description = 'News from Houston, Texas'
-    __author__ = 'Kovid Goyal'
+    __author__ = 'Dale Furrow'
     language = 'en'
-    timefmt = ' [%a, %d %b, %Y]'
     no_stylesheets = True
-    use_embedded_content = False
+    # use_embedded_content = False
     remove_attributes = ['style']
-    auto_cleanup = True
-    oldest_article = 3.0
-    #keep_only_tags = {'class':lambda x: x and ('hst-articletitle' in x or
-    #'hst-articletext' in x or 'hst-galleryitem' in x)}
+    remove_empty_feeds = True
+    timefmt = '[%a, %d %b %Y]'
+    timestampfmt = '%Y%m%d%H%M%S'
+    ignore_duplicate_articles = {'url'}
     remove_attributes = ['xmlns']
 
-    feeds = [
-        ('News', "http://www.chron.com/rss/feed/News-270.php"),
-        ('Sports',
-         'http://www.chron.com/sports/headlines/collectionRss/Sports-Headlines-Staff-Stories-10767.php'),
-        ('Neighborhood',
-         'http://www.chron.com/rss/feed/Neighborhood-305.php'),
-        ('Business', 'http://www.chron.com/rss/feed/Business-287.php'),
-        ('Entertainment',
-         'http://www.chron.com/rss/feed/Entertainment-293.php'),
-        ('Editorials',
-         'http://www.chron.com/opinion/editorials/collectionRss/Opinion-Editorials-Headline-List-10567.php'),
-        ('Life', 'http://www.chron.com/rss/feed/Life-297.php'),
-        ('Science & Tech',
-         'http://www.chron.com/rss/feed/AP-Technology-and-Science-266.php'),
-    ]
+    remove_tags = [dict(name='div', attrs={'class':'socialBar'}),
+                   dict(name='div', attrs={'class':re.compile('post-commentmeta')}),
+                   dict(name='div', attrs={'class':re.compile('slideshow_wrapper')}),
+                   dict(name='div', attrs={'class':'entry-summary'}),
+                   dict(name='a', attrs={'rel':'item-license'})]
+
+    baseUrl = 'http://www.chron.com'
+
+    oldest_web_article = 7.0
+
+    if oldest_web_article is None:
+        earliest_date = date.today()
+    else:
+        earliest_date = date.today() - timedelta(days=oldest_web_article)
+
+    pages = [('news', '/news/houston-texas/'),
+             ('business', '/business/'),
+             ('opinion', '/opinion/'),
+             ('sports', '/sports/')]
+
+    def getLinksFromSectionPage(self, sectionUrl):
+        pageDoc = html.parse(sectionUrl)
+        els = pageDoc.xpath("""//div[contains(@class, 'scp-item')
+            or @class='scp-feature' or contains(@class, 'simplelist')
+            or contains(@class, 'scp-blogpromo')]
+            //a[@href and not(@target) and not(child::img)]""")
+        elList = []
+        for el in els:
+            link = el.get('href')
+            title = el.text
+            if link[:4] != 'http':
+                link = self.baseUrl + link
+            if title is not None:
+                elList.append((link, el.text))
+        return elList
+
+    def getArticleDescriptionFromDoc(self, pageDoc):
+        descriptionCharsBreak = 140
+        descriptionMaxChars = 300
+        descXpath = """//div[contains(@class, 'article-body') or
+            contains(@class, 'resource-content') or contains(@class, 'post')]//p"""
+        sentenceRegex = re.compile("(\S.+?[.!?])(?=\s+|$)")
+
+        def stringify_children(node):
+            return ''.join([x for x in node.itertext()])
+
+        try:
+            els = pageDoc.xpath(descXpath)
+            outText = ""
+            ellipsis = ""
+            for el in els:
+                sentences = re.findall(sentenceRegex, stringify_children(el))
+                for sentence in sentences:
+                    if len(outText) < descriptionCharsBreak:
+                        outText += sentence + " "
+                    else:
+                        if len(outText) > descriptionMaxChars:
+                            ellipsis = "..."
+                        return outText[:descriptionMaxChars] + ellipsis
+            return outText
+        except:
+            self.log('Error on Article Description')
+            return ""
+
+    def getPublishedTimeFromDoc(self, pageDoc):
+        regexDateOnly = re.compile("""(?:January|February|March|April|
+            May|June|July|August|September|October|November|
+            December)\s[0-9]{1,2},\s20[01][0-9]""")
+        regextTimeOnly = re.compile("""[0-9]{1,2}:[0-9]{1,2} \w{2}""")
+
+        def getRegularTimestamp(dateString):
+            try:
+                outDate = datetime.strptime(dateString, "%Y-%m-%dT%H:%M:%SZ")
+                return outDate
+            except:
+                return None
+
+        def getDateFromString(inText):
+            match = re.findall(regexDateOnly, inText)
+            if match:
+                try:
+                    outDate = datetime.strptime(match[0], "%B %d, %Y")
+                    match = re.findall(regextTimeOnly, inText)
+                    if match:
+                        outTime = datetime.strptime(match[0], "%I:%M %p")
+                        return datetime.combine(outDate.date(), outTime.time())
+                    return outDate
+                except:
+                    return None
+            else:
+                return None
+
+        el = pageDoc.xpath("//*[@class='timestamp'][1]")
+        if len(el) == 1:
+            return getRegularTimestamp(el[0].get('title'))
+        else:
+            el = pageDoc.xpath("//*[@class='entry-date' or @class='post-date'][1]")
+            if len(el) == 1:
+                return getDateFromString(el[0].text_content())
+            else:
+                return None
+
+    def getAllFeedDataFromPage(self, page):
+        articles = []
+        linkList = self.getLinksFromSectionPage(self.baseUrl + page[1])
+        self.log('from section: ', page[0], " found ", len(linkList), " links")
+        for link in linkList:
+            try:
+                articleDoc = html.parse(link[0])
+                description = self.getArticleDescriptionFromDoc(articleDoc)
+                articleDate = self.getPublishedTimeFromDoc(articleDoc)
+                if articleDate is not None and description is not None and articleDate.date() > self.earliest_date:
+                    dateText = articleDate.strftime('%a, %d %b')
+                    author = articleDate.strftime(self.timestampfmt)
+                    articles.append({'title':link[1], 'url':link[0],
+                                     'description':description, 'date':dateText, 'author':author})
+                    self.log(page[0] + ": " + link[1] + ', from ' + dateText +
+                             " description of " + str(len(description)) + ' characters at ' + link[0])
+                else:
+                    msg = ""
+                    if articleDate is None:
+                        msg = " No Timestamp Found"
+                    else:
+                        msg = " article older than " + str(self.oldest_web_article) + ' days...'
+                    self.log("Skipping article: ", link[0], msg)
+            except:
+                print 'error on fetching ' + link[0]
+                continue
+        return articles
+
+    def parse_index(self):
+
+        self.timefmt = ' [%a, %d %b, %Y]'
+        self.log('starting parse_index: ', time.strftime(self.timestampfmt))
+        feeds = []
+        for page in self.pages:
+            articles = []
+            articles = self.getAllFeedDataFromPage(page)
+            if articles:
+                feeds.append((page[0], articles))
+        self.log('finished parse_index: ', time.strftime(self.timestampfmt))
+        return feeds
+
+    def preprocess_html(self, thisSoup):
+        baseTags = []
+        baseTags.extend(thisSoup.findAll(name='div', attrs={'id':re.compile('post-\d+')}))
+        baseTags.extend(thisSoup.findAll(name='div', attrs={'class':'hnews hentry item'}))
+        allTags = []
+        allTags.extend(baseTags)
+        if len(baseTags) > 0:
+            for tag in baseTags:
+                allTags.extend(tag.findAll(True))
+        paragraphs = thisSoup.findAll(name='p')
+        for paragraph in paragraphs:
+            if paragraph not in allTags:
+                allTags.append(paragraph)
+        for tag in baseTags:
+            while tag.parent is not None:
+                allTags.append(tag)
+                tag = tag.parent
+        for tag in thisSoup.findAll(True):
+            if tag not in allTags:
+                tag.extract()
+        return thisSoup
+
+    def populate_article_metadata(self, article, soup, first):
+        if not first:
+            return
+        try:
+            article.date = time.strptime(article.author, self.timestampfmt)
+            article.utctime = dt_factory(article.date, assume_utc=False, as_utc=False)
+            article.localtime = article.utctime.astimezone(local_tz)
+        except Exception as inst:  # remove after debug
+            self.log('Exception: ', article.title)  # remove after debug
+            self.log(type(inst))  # remove after debug
+            self.log(inst)  # remove after debug
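A note on the date handling above: getPublishedTimeFromDoc stitches a date-only match and a time-only match together with datetime.combine. The core of that trick in isolation (sample strings made up to match the recipe's regexes):

    from datetime import datetime

    out_date = datetime.strptime('February 12, 2013', '%B %d, %Y')
    out_time = datetime.strptime('4:32 pm', '%I:%M %p')
    print(datetime.combine(out_date.date(), out_time.time()))  # 2013-02-12 16:32:00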
@@ -1,17 +1,18 @@
 __license__ = 'GPL v3'
-__copyright__ = '2009, Mathieu Godlewski <mathieu at godlewski.fr>; 2010-2012, Louis Gesbert <meta at antislash dot info>'
+__copyright__ = '2009, Mathieu Godlewski <mathieu at godlewski.fr>; 2010-2012, Louis Gesbert <meta at antislash dot info>; 2013, Malah <malah at neuf.fr>'
 '''
 Mediapart
 '''
 
-__author__ = '2009, Mathieu Godlewski <mathieu at godlewski.fr>; 2010-2012, Louis Gesbert <meta at antislash dot info>'
+__author__ = '2009, Mathieu Godlewski <mathieu at godlewski.fr>; 2010-2012, Louis Gesbert <meta at antislash dot info>; 2013, Malah <malah at neuf.fr>'
 
+import re
 from calibre.ebooks.BeautifulSoup import BeautifulSoup, Tag
 from calibre.web.feeds.news import BasicNewsRecipe
 
 class Mediapart(BasicNewsRecipe):
     title = 'Mediapart'
-    __author__ = 'Mathieu Godlewski, Louis Gesbert'
+    __author__ = 'Mathieu Godlewski, Louis Gesbert, Malah'
     description = 'Global news in french from news site Mediapart'
     oldest_article = 7
     language = 'fr'

@@ -21,6 +22,7 @@ class Mediapart(BasicNewsRecipe):
     use_embedded_content = False
     no_stylesheets = True
 
+    masthead_url = 'https://upload.wikimedia.org/wikipedia/fr/2/23/Mediapart.png'
     cover_url = 'http://static.mediapart.fr/files/pave_mediapart.jpg'
 
     feeds = [

@@ -36,18 +38,18 @@ class Mediapart(BasicNewsRecipe):
     def print_version(self, url):
         raw = self.browser.open(url).read()
         soup = BeautifulSoup(raw.decode('utf8', 'replace'))
-        link = soup.find('a', {'title':'Imprimer'})
+        link = soup.find('a', {'href':re.compile('^/print/[0-9]+')})
         if link is None:
             return None
-        return link['href']
+        return 'http://www.mediapart.fr' + link['href']
 
     # -- Handle login
 
     def get_browser(self):
         br = BasicNewsRecipe.get_browser(self)
         if self.username is not None and self.password is not None:
-            br.open('http://www.mediapart.fr/')
-            br.select_form(nr=0)
+            br.open('http://blogs.mediapart.fr/editions/guide-du-coordonnateur-d-edition')
+            br.select_form(nr=1)
             br['name'] = self.username
             br['pass'] = self.password
             br.submit()
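The login change is worth decoding: select_form(nr=1) targets the page's second form, since mechanize counts forms from zero. The generic shape of the dance outside the recipe (URL from the diff; credentials and field presence assumed from the recipe):

    import mechanize

    br = mechanize.Browser()
    br.open('http://blogs.mediapart.fr/editions/guide-du-coordonnateur-d-edition')
    br.select_form(nr=1)              # nr is 0-based: this is the second form on the page
    br['name'] = 'user@example.com'   # hypothetical credentials
    br['pass'] = 'secret'
    br.submit()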
|
@ -537,3 +537,10 @@ many_libraries = 10
|
|||||||
# highlight with this tweak. Set it to 'transparent' to disable highlighting.
|
# highlight with this tweak. Set it to 'transparent' to disable highlighting.
|
||||||
highlight_virtual_library = 'yellow'
|
highlight_virtual_library = 'yellow'
|
||||||
|
|
||||||
|
#: Choose available output formats for conversion
|
||||||
|
# Restrict the list of available output formats in the conversion dialogs.
|
||||||
|
# For example, if you only want to convert to EPUB and AZW3, change this to
|
||||||
|
# restrict_output_formats = ['EPUB', 'AZW3']. The default value of None causes
|
||||||
|
# all available output formats to be present.
|
||||||
|
restrict_output_formats = None
|
||||||
|
|
||||||
|
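To use the new tweak, a user edits it in place in their tweaks file; for example, to offer only EPUB and AZW3 in the conversion dialogs (straight from the tweak's own comment):

    restrict_output_formats = ['EPUB', 'AZW3']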
@@ -436,13 +436,21 @@ def fit_image(width, height, pwidth, pheight):
 
 class CurrentDir(object):
 
-    def __init__(self, path):
+    def __init__(self, path, workaround_temp_folder_permissions=False):
         self.path = path
         self.cwd = None
+        self.workaround_temp_folder_permissions = workaround_temp_folder_permissions
 
     def __enter__(self, *args):
        self.cwd = os.getcwdu()
-        os.chdir(self.path)
+        try:
+            os.chdir(self.path)
+        except OSError:
+            if not self.workaround_temp_folder_permissions:
+                raise
+            from calibre.ptempfile import reset_temp_folder_permissions
+            reset_temp_folder_permissions()
+            os.chdir(self.path)
         return self.cwd
 
     def __exit__(self, *args):
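Callers opt in to the workaround per use; a sketch of the intended call pattern (directory path made up, import location assumed):

    from calibre import CurrentDir

    with CurrentDir('/tmp/conversion-output', workaround_temp_folder_permissions=True):
        pass  # on Windows, chdir is retried here after resetting Temp folder ACLs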
@@ -282,3 +282,8 @@ def get_windows_user_locale_name():
         return None
     return u'_'.join(buf.value.split(u'-')[:2])
 
+def is_modern_webkit():
+    # Check if we are using QtWebKit >= 2.3
+    from PyQt4.QtWebKit import qWebKitMajorVersion
+    return qWebKitMajorVersion() >= 537
+
@@ -233,7 +233,7 @@ class InputFormatPlugin(Plugin):
             # In case stdout is broken
             pass
 
-        with CurrentDir(output_dir):
+        with CurrentDir(output_dir, workaround_temp_folder_permissions=True):
             for x in os.listdir('.'):
                 shutil.rmtree(x) if os.path.isdir(x) else os.remove(x)
 
@@ -107,7 +107,7 @@ class ANDROID(USBMS):
             0x0ff9 : [0x0226],
             0xc91  : HTC_BCDS,
             0xdddd : [0x216],
-            0xdeed : [0x231],
+            0xdeed : [0x231, 0x226],
         },
 
     # Samsung

@@ -241,7 +241,7 @@ class ANDROID(USBMS):
         'S5830I_CARD', 'MID7042', 'LINK-CREATE', '7035', 'VIEWPAD_7E',
         'NOVO7', 'MB526', '_USB#WYK7MSF8KE', 'TABLET_PC', 'F', 'MT65XX_MS',
         'ICS', 'E400', '__FILE-STOR_GADG', 'ST80208-1', 'GT-S5660M_CARD', 'XT894', '_USB',
-        'PROD_TAB13-201', 'URFPAD2',
+        'PROD_TAB13-201', 'URFPAD2', 'MID1126',
         ]
     WINDOWS_CARD_A_MEM = ['ANDROID_PHONE', 'GT-I9000_CARD', 'SGH-I897',
         'FILE-STOR_GADGET', 'SGH-T959_CARD', 'SGH-T959', 'SAMSUNG_ANDROID', 'GT-P1000_CARD',

@@ -254,7 +254,7 @@ class ANDROID(USBMS):
         'UMS_COMPOSITE', 'PRO', '.KOBO_VOX', 'SGH-T989_CARD', 'SGH-I727',
         'USB_FLASH_DRIVER', 'ANDROID', 'MID7042', '7035', 'VIEWPAD_7E',
         'NOVO7', 'ADVANCED', 'TABLET_PC', 'F', 'E400_SD_CARD', 'ST80208-1', 'XT894',
-        '_USB', 'PROD_TAB13-201', 'URFPAD2'
+        '_USB', 'PROD_TAB13-201', 'URFPAD2', 'MID1126',
         ]
 
     OSX_MAIN_MEM = 'Android Device Main Memory'
@@ -283,11 +283,17 @@ class CollectionsBookList(BookList):
                 return -1
             if isinstance(x, basestring) and isinstance(y, basestring):
                 x, y = sort_key(force_unicode(x)), sort_key(force_unicode(y))
-            c = cmp(x, y)
+            try:
+                c = cmp(x, y)
+            except TypeError:
+                c = 0
             if c != 0:
                 return c
             # same as above -- no sort_key needed here
-            return cmp(xx[2], yy[2])
+            try:
+                return cmp(xx[2], yy[2])
+            except TypeError:
+                return 0
 
         for category, lpaths in collections.items():
             books = lpaths.values()
@@ -19,6 +19,11 @@ from calibre.ebooks.metadata.sources.base import (Source, Option, fixcase,
 from calibre.ebooks.metadata.book.base import Metadata
 from calibre.utils.localization import canonicalize_lang
 
+def CSSSelect(expr):
+    from cssselect import HTMLTranslator
+    from lxml.etree import XPath
+    return XPath(HTMLTranslator().css_to_xpath(expr))
+
 class Worker(Thread):  # Get details {{{
 
     '''
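CSSSelect is just glue between two libraries: cssselect translates a CSS selector into an XPath expression, and lxml compiles that expression for fast reuse. The same idea standalone (sample markup made up):

    from cssselect import HTMLTranslator
    from lxml import etree, html

    def CSSSelect(expr):
        # Compile a CSS selector into a reusable lxml XPath matcher
        return etree.XPath(HTMLTranslator().css_to_xpath(expr))

    root = html.fromstring('<div id="byline"><span class="author">A. Writer</span></div>')
    print(CSSSelect('#byline .author')(root))  # -> [<Element span at 0x...>]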
@@ -142,6 +147,8 @@ class Worker(Thread):  # Get details {{{
                 starts-with(text(), "Editora:") or \
                 starts-with(text(), "出版社:")]
             '''
+        self.publisher_names = {'Publisher', 'Verlag', 'Editore', 'Editeur', 'Editor', 'Editora', '出版社'}
+
         self.language_xpath = '''
             descendant::*[
                 starts-with(text(), "Language:") \

@@ -153,6 +160,7 @@ class Worker(Thread):  # Get details {{{
                 or starts-with(text(), "言語") \
                 ]
             '''
+        self.language_names = {'Language', 'Sprache', 'Lingua', 'Idioma', 'Langue', '言語'}
 
         self.ratings_pat = re.compile(
             r'([0-9.]+) ?(out of|von|su|étoiles sur|つ星のうち|de un máximo de|de) ([\d\.]+)( (stars|Sternen|stelle|estrellas|estrelas)){0,1}')

@@ -310,36 +318,44 @@ class Worker(Thread):  # Get details {{{
                 self.log.exception('Error parsing cover for url: %r'%self.url)
         mi.has_cover = bool(self.cover_url)
 
-        pd = root.xpath(self.pd_xpath)
-        if pd:
-            pd = pd[0]
-
-            try:
-                isbn = self.parse_isbn(pd)
-                if isbn:
-                    self.isbn = mi.isbn = isbn
-            except:
-                self.log.exception('Error parsing ISBN for url: %r'%self.url)
-
-            try:
-                mi.publisher = self.parse_publisher(pd)
-            except:
-                self.log.exception('Error parsing publisher for url: %r'%self.url)
-
-            try:
-                mi.pubdate = self.parse_pubdate(pd)
-            except:
-                self.log.exception('Error parsing publish date for url: %r'%self.url)
-
-            try:
-                lang = self.parse_language(pd)
-                if lang:
-                    mi.language = lang
-            except:
-                self.log.exception('Error parsing language for url: %r'%self.url)
-
-        else:
-            self.log.warning('Failed to find product description for url: %r'%self.url)
+        non_hero = CSSSelect('div#bookDetails_container_div div#nonHeroSection')(root)
+        if non_hero:
+            # New style markup
+            try:
+                self.parse_new_details(root, mi, non_hero[0])
+            except:
+                self.log.exception('Failed to parse new-style book details section')
+        else:
+            pd = root.xpath(self.pd_xpath)
+            if pd:
+                pd = pd[0]
+
+                try:
+                    isbn = self.parse_isbn(pd)
+                    if isbn:
+                        self.isbn = mi.isbn = isbn
+                except:
+                    self.log.exception('Error parsing ISBN for url: %r'%self.url)
+
+                try:
+                    mi.publisher = self.parse_publisher(pd)
+                except:
+                    self.log.exception('Error parsing publisher for url: %r'%self.url)
+
+                try:
+                    mi.pubdate = self.parse_pubdate(pd)
+                except:
+                    self.log.exception('Error parsing publish date for url: %r'%self.url)
+
+                try:
+                    lang = self.parse_language(pd)
+                    if lang:
+                        mi.language = lang
+                except:
+                    self.log.exception('Error parsing language for url: %r'%self.url)
+
+            else:
+                self.log.warning('Failed to find product description for url: %r'%self.url)
 
         mi.source_relevance = self.relevance
 

@@ -359,7 +375,13 @@ class Worker(Thread):  # Get details {{{
         for l in link:
             return l.get('href').rpartition('/')[-1]
 
+    def totext(self, elem):
+        return self.tostring(elem, encoding=unicode, method='text').strip()
+
     def parse_title(self, root):
+        h1 = root.xpath('//h1[@id="title"]')
+        if h1:
+            return self.totext(h1[0])
         tdiv = root.xpath('//h1[contains(@class, "parseasinTitle")]')[0]
         actual_title = tdiv.xpath('descendant::*[@id="btAsinTitle"]')
         if actual_title:

@@ -373,6 +395,11 @@ class Worker(Thread):  # Get details {{{
         return ans
 
     def parse_authors(self, root):
+        matches = CSSSelect('#byline .author .contributorNameID')(root)
+        if matches:
+            authors = [self.totext(x) for x in matches]
+            return [a for a in authors if a]
+
         x = '//h1[contains(@class, "parseasinTitle")]/following-sibling::span/*[(name()="a" and @href) or (name()="span" and @class="contributorNameTrigger")]'
         aname = root.xpath(x)
         if not aname:

@@ -420,8 +447,8 @@ class Worker(Thread):  # Get details {{{
         # remove all attributes from tags
         desc = re.sub(r'<([a-zA-Z0-9]+)\s[^>]+>', r'<\1>', desc)
         # Collapse whitespace
-        #desc = re.sub('\n+', '\n', desc)
-        #desc = re.sub(' +', ' ', desc)
+        # desc = re.sub('\n+', '\n', desc)
+        # desc = re.sub(' +', ' ', desc)
         # Remove the notice about text referring to out of print editions
         desc = re.sub(r'(?s)<em>--This text ref.*?</em>', '', desc)
         # Remove comments

@@ -429,6 +456,17 @@ class Worker(Thread):  # Get details {{{
         return sanitize_comments_html(desc)
 
     def parse_comments(self, root):
+        ns = CSSSelect('#bookDescription_feature_div noscript')(root)
+        if ns:
+            ns = ns[0]
+            if len(ns) == 0 and ns.text:
+                import html5lib
+                # html5lib parsed noscript as CDATA
+                ns = html5lib.parseFragment('<div>%s</div>' % (ns.text), treebuilder='lxml', namespaceHTMLElements=False)[0]
+            else:
+                ns.tag = 'div'
+            return self._render_comments(ns)
+
         ans = ''
         desc = root.xpath('//div[@id="ps-content"]/div[@class="content"]')
         if desc:

@@ -472,6 +510,37 @@ class Worker(Thread):  # Get details {{{
             bn = re.sub(r'\.\.jpg$', '.jpg', (sparts[0] + sparts[-1]))
             return ('/'.join(parts[:-1]))+'/'+bn
 
+    def parse_new_details(self, root, mi, non_hero):
+        table = non_hero.xpath('descendant::table')[0]
+        for tr in table.xpath('descendant::tr'):
+            cells = tr.xpath('descendant::td')
+            if len(cells) == 2:
+                name = self.totext(cells[0])
+                val = self.totext(cells[1])
+                if not val:
+                    continue
+                if name in self.language_names:
+                    ans = self.lang_map.get(val, None)
+                    if not ans:
+                        ans = canonicalize_lang(val)
+                    if ans:
+                        mi.language = ans
+                elif name in self.publisher_names:
+                    pub = val.partition(';')[0].partition('(')[0].strip()
+                    if pub:
+                        mi.publisher = pub
+                    date = val.rpartition('(')[-1].replace(')', '').strip()
+                    try:
+                        from calibre.utils.date import parse_only_date
+                        date = self.delocalize_datestr(date)
+                        mi.pubdate = parse_only_date(date, assume_utc=True)
+                    except:
+                        self.log.exception('Failed to parse pubdate: %s' % val)
+                elif name in {'ISBN', 'ISBN-10', 'ISBN-13'}:
+                    ans = check_isbn(val)
+                    if ans:
+                        self.isbn = mi.isbn = ans
+
     def parse_isbn(self, pd):
         items = pd.xpath(
             'descendant::*[starts-with(text(), "ISBN")]')

@@ -721,9 +790,9 @@ class Amazon(Source):
 
         def title_ok(title):
             title = title.lower()
-            bad = ['bulk pack', '[audiobook]', '[audio cd]']
+            bad = ['bulk pack', '[audiobook]', '[audio cd]', '(a book companion)', '( slipcase with door )']
             if self.domain == 'com':
-                bad.append('(spanish edition)')
+                bad.extend(['(%s edition)' % x for x in ('spanish', 'german')])
             for x in bad:
                 if x in title:
                     return False

@@ -901,14 +970,9 @@ if __name__ == '__main__':  # tests {{{
     # To run these test use: calibre-debug -e
     # src/calibre/ebooks/metadata/sources/amazon.py
     from calibre.ebooks.metadata.sources.test import (test_identify_plugin,
-            isbn_test, title_test, authors_test, comments_test, series_test)
+            isbn_test, title_test, authors_test, comments_test)
     com_tests = [  # {{{
 
-            (  # Has a spanish edition
-                {'title':'11/22/63'},
-                [title_test('11/22/63: A Novel', exact=True), authors_test(['Stephen King']),]
-            ),
-
             (  # + in title and uses id="main-image" for cover
                 {'title':'C++ Concurrency in Action'},
                 [title_test('C++ Concurrency in Action: Practical Multithreading',

@@ -916,11 +980,10 @@ if __name__ == '__main__':  # tests {{{
                 ]
             ),
 
-            (  # Series
+            (  # noscript description
                 {'identifiers':{'amazon':'0756407117'}},
                 [title_test(
-                    "Throne of the Crescent Moon",
-                    exact=True), series_test('Crescent Moon Kingdoms', 1),
+                    "Throne of the Crescent Moon"),
                     comments_test('Makhslood'),
                 ]
             ),

@@ -1054,3 +1117,4 @@ if __name__ == '__main__':  # tests {{{
 
 # }}}
 
+
@@ -34,7 +34,7 @@ def astext(node):
     return etree.tostring(node, method='text', encoding=unicode,
             with_tail=False).strip()
 
-class Worker(Thread): # {{{
+class Worker(Thread):  # {{{
 
     def __init__(self, sku, url, relevance, result_queue, br, timeout, log, plugin):
         Thread.__init__(self)

@@ -154,8 +154,8 @@ class Worker(Thread):  # {{{
         # remove all attributes from tags
         desc = re.sub(r'<([a-zA-Z0-9]+)\s[^>]+>', r'<\1>', desc)
         # Collapse whitespace
-        #desc = re.sub('\n+', '\n', desc)
-        #desc = re.sub(' +', ' ', desc)
+        # desc = re.sub('\n+', '\n', desc)
+        # desc = re.sub(' +', ' ', desc)
         # Remove comments
         desc = re.sub(r'(?s)<!--.*?-->', '', desc)
         return sanitize_comments_html(desc)

@@ -183,14 +183,14 @@ class Edelweiss(Source):
         if sku:
             return 'http://edelweiss.abovethetreeline.com/ProductDetailPage.aspx?sku=%s'%sku
 
-    def get_book_url(self, identifiers): # {{{
+    def get_book_url(self, identifiers):  # {{{
         sku = identifiers.get('edelweiss', None)
         if sku:
             return 'edelweiss', sku, self._get_book_url(sku)
 
     # }}}
 
-    def get_cached_cover_url(self, identifiers): # {{{
+    def get_cached_cover_url(self, identifiers):  # {{{
         sku = identifiers.get('edelweiss', None)
         if not sku:
             isbn = identifiers.get('isbn', None)

@@ -199,7 +199,7 @@ class Edelweiss(Source):
         return self.cached_identifier_to_cover_url(sku)
     # }}}
 
-    def create_query(self, log, title=None, authors=None, identifiers={}): # {{{
+    def create_query(self, log, title=None, authors=None, identifiers={}):  # {{{
         from urllib import urlencode
         BASE_URL = 'http://edelweiss.abovethetreeline.com/CatalogOverview.aspx?'
         params = {

@@ -239,9 +239,40 @@ class Edelweiss(Source):
             params[k] = v.encode('utf-8')
 
         return BASE_URL+urlencode(params)
 
+    def create_query2(self, log, title=None, authors=None, identifiers={}):
+        ''' The edelweiss advanced search appears to be broken, use the keyword search instead, until it is fixed. '''
+        from urllib import urlencode
+        BASE_URL = 'http://edelweiss.abovethetreeline.com/CatalogOverview.aspx?'
+        params = {
+            'group':'search',
+            'section':'CatalogOverview',
+            'searchType':1,
+            'searchOrgID':'',
+            'searchCatalogID': '',
+            'searchMailingID': '',
+            'searchSelect':1,
+        }
+        keywords = []
+        isbn = check_isbn(identifiers.get('isbn', None))
+        if isbn is not None:
+            keywords.append(isbn)
+        elif title or authors:
+            title_tokens = list(self.get_title_tokens(title))
+            if title_tokens:
+                keywords.extend(title_tokens)
+            author_tokens = self.get_author_tokens(authors,
+                    only_first_author=True)
+            if author_tokens:
+                keywords.extend(author_tokens)
+        if not keywords:
+            return None
+        params['keywords'] = (' '.join(keywords)).encode('utf-8')
+        return BASE_URL+urlencode(params)
+
     # }}}
 
     def identify(self, log, result_queue, abort, title=None, authors=None,  # {{{
             identifiers={}, timeout=30):
         from urlparse import parse_qs
 

@@ -251,11 +282,12 @@ class Edelweiss(Source):
             entries = [(book_url, identifiers['edelweiss'])]
         else:
             entries = []
-            query = self.create_query(log, title=title, authors=authors,
+            query = self.create_query2(log, title=title, authors=authors,
                     identifiers=identifiers)
             if not query:
                 log.error('Insufficient metadata to construct query')
                 return
+            log('Using query URL:', query)
             try:
                 raw = br.open_novisit(query, timeout=timeout).read()
             except Exception as e:

@@ -270,7 +302,8 @@ class Edelweiss(Source):
 
             for entry in CSSSelect('div.listRow div.listRowMain')(root):
                 a = entry.xpath('descendant::a[contains(@href, "sku=") and contains(@href, "ProductDetailPage.aspx")]')
-                if not a: continue
+                if not a:
+                    continue
                 href = a[0].get('href')
                 prefix, qs = href.partition('?')[0::2]
                 sku = parse_qs(qs).get('sku', None)

@@ -288,7 +321,7 @@ class Edelweiss(Source):
 
                 div = CSSSelect('div.format.attGroup')(entry)
                 text = astext(div[0]).lower()
-                if 'audio' in text or 'mp3' in text: # Audio-book, ignore
+                if 'audio' in text or 'mp3' in text:  # Audio-book, ignore
                     continue
                 entries.append((self._get_book_url(sku), sku))
 

@@ -321,7 +354,7 @@ class Edelweiss(Source):
 
     # }}}
 
-    def download_cover(self, log, result_queue, abort, # {{{
+    def download_cover(self, log, result_queue, abort,  # {{{
             title=None, authors=None, identifiers={}, timeout=30, get_best_cover=False):
         cached_url = self.get_cached_cover_url(identifiers)
         if cached_url is None:

@@ -381,7 +414,7 @@ if __name__ == '__main__':
 
         ),
 
-        ( # Pubdate
+        (  # Pubdate
             {'title':'The Great Gatsby', 'authors':['F. Scott Fitzgerald']},
             [title_test('The great gatsby', exact=True),
                 authors_test(['F. Scott Fitzgerald']), pubdate_test(2004, 9, 29)]

@@ -395,3 +428,5 @@ if __name__ == '__main__':
     test_identify_plugin(Edelweiss.name, tests)
 
 
+
+
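Extracting the sku from a result link, as identify() does above, is stock urlparse work (Python 2 module paths, matching the source; the href is a made-up sample):

    from urlparse import parse_qs

    href = 'ProductDetailPage.aspx?sku=12345678&group=search'
    prefix, qs = href.partition('?')[0::2]   # indices 0 and 2: text before and after '?'
    print(parse_qs(qs).get('sku', None))     # -> ['12345678']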
@@ -106,7 +106,6 @@ def single_identify(title, authors, identifiers):
             r in results], dump_caches(), log.dump()
 
 def single_covers(title, authors, identifiers, caches, tdir):
-    os.chdir(tdir)
     load_caches(caches)
     log = GUILog()
     results = Queue()

@@ -126,9 +125,9 @@ def single_covers(title, authors, identifiers, caches, tdir):
             name += '{%d}'%c[plugin.name]
         c[plugin.name] += 1
         name = '%s,,%s,,%s,,%s.cover'%(name, width, height, fmt)
-        with open(name, 'wb') as f:
+        with open(os.path.join(tdir, name), 'wb') as f:
             f.write(data)
-        os.mkdir(name+'.done')
+        os.mkdir(os.path.join(tdir, name+'.done'))
 
     return log.dump()
@@ -110,6 +110,19 @@ class AddAction(InterfaceAction):
             return
 
         db = view.model().db
+        if len(ids) == 1:
+            formats = db.formats(ids[0], index_is_id=True)
+            if formats:
+                formats = {x.upper() for x in formats.split(',')}
+                nformats = {f.rpartition('.')[-1].upper() for f in books}
+                override = formats.intersection(nformats)
+                if override:
+                    title = db.title(ids[0], index_is_id=True)
+                    msg = _('The {0} format(s) will be replaced in the book {1}. Are you sure?').format(
+                        ', '.join(override), title)
+                    if not confirm(msg, 'confirm_format_override_on_add', title=_('Are you sure'), parent=self.gui):
+                        return
+
         for id_ in ids:
             for fpath in books:
                 fmt = os.path.splitext(fpath)[1][1:].upper()
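The new override check is plain set arithmetic: formats already on the book record versus extensions of the incoming files. A toy run with hypothetical values:

    formats = {x.upper() for x in 'epub,mobi'.split(',')}                    # already in the book
    nformats = {f.rpartition('.')[-1].upper() for f in ['a.epub', 'b.pdf']}  # files being added
    print(formats.intersection(nformats))                                    # -> {'EPUB'}: would be replaced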
@@ -9,8 +9,7 @@ import shutil
 from PyQt4.Qt import QString, SIGNAL
 
 from calibre.gui2.convert.single import (Config, sort_formats_by_preference,
-        GroupModel, gprefs)
-from calibre.customize.ui import available_output_formats
+        GroupModel, gprefs, get_output_formats)
 from calibre.gui2 import ResizableDialog
 from calibre.gui2.convert.look_and_feel import LookAndFeelWidget
 from calibre.gui2.convert.heuristics import HeuristicsWidget

@@ -43,7 +42,6 @@ class BulkConfig(Config):
                 'values saved in a previous conversion (if they exist) instead '
                 'of using the defaults specified in the Preferences'))
 
-
         self.connect(self.output_formats, SIGNAL('currentIndexChanged(QString)'),
                 self.setup_pipeline)
         self.connect(self.groups, SIGNAL('activated(QModelIndex)'),

@@ -96,7 +94,8 @@ class BulkConfig(Config):
 
         while True:
             c = self.stack.currentWidget()
-            if not c: break
+            if not c:
+                break
             self.stack.removeWidget(c)
 
         widgets = [lf, hw, ps, sd, toc, sr]

@@ -118,17 +117,14 @@ class BulkConfig(Config):
         except:
             pass
 
-
     def setup_output_formats(self, db, preferred_output_format):
         if preferred_output_format:
             preferred_output_format = preferred_output_format.lower()
-        output_formats = sorted(available_output_formats(),
-                key=lambda x:{'EPUB':'!A', 'MOBI':'!B'}.get(x.upper(), x))
-        output_formats.remove('oeb')
+        output_formats = get_output_formats(preferred_output_format)
         preferred_output_format = preferred_output_format if \
             preferred_output_format and preferred_output_format \
             in output_formats else sort_formats_by_preference(output_formats,
-                    prefs['output_format'])[0]
+                    [prefs['output_format']])[0]
         self.output_formats.addItems(list(map(QString, [x.upper() for x in
             output_formats])))
         self.output_formats.setCurrentIndex(output_formats.index(preferred_output_format))

@@ -149,3 +145,4 @@ class BulkConfig(Config):
             bytearray(self.saveGeometry())
         return ResizableDialog.done(self, r)
 
+
|
|||||||
from calibre.ebooks.conversion.config import delete_specifics
|
from calibre.ebooks.conversion.config import delete_specifics
|
||||||
from calibre.customize.ui import available_output_formats
|
from calibre.customize.ui import available_output_formats
|
||||||
from calibre.customize.conversion import OptionRecommendation
|
from calibre.customize.conversion import OptionRecommendation
|
||||||
from calibre.utils.config import prefs
|
from calibre.utils.config import prefs, tweaks
|
||||||
from calibre.utils.logging import Log
|
from calibre.utils.logging import Log
|
||||||
|
|
||||||
class NoSupportedInputFormats(Exception):
|
class NoSupportedInputFormats(Exception):
|
||||||
@ -48,6 +48,20 @@ def sort_formats_by_preference(formats, prefs):
|
|||||||
return len(prefs)
|
return len(prefs)
|
||||||
return sorted(formats, key=key)
|
return sorted(formats, key=key)
|
||||||
|
|
||||||
|
def get_output_formats(preferred_output_format):
|
||||||
|
all_formats = {x.upper() for x in available_output_formats()}
|
||||||
|
all_formats.discard('OEB')
|
||||||
|
pfo = preferred_output_format.upper() if preferred_output_format else ''
|
||||||
|
restrict = tweaks['restrict_output_formats']
|
||||||
|
if restrict:
|
||||||
|
fmts = [x.upper() for x in restrict]
|
||||||
|
if pfo and pfo not in fmts and pfo in all_formats:
|
||||||
|
fmts.append(pfo)
|
||||||
|
else:
|
||||||
|
fmts = list(sorted(all_formats,
|
||||||
|
key=lambda x:{'EPUB':'!A', 'MOBI':'!B'}.get(x.upper(), x)))
|
||||||
|
return fmts
|
||||||
|
|
||||||
class GroupModel(QAbstractListModel):
|
class GroupModel(QAbstractListModel):
|
||||||
|
|
||||||
def __init__(self, widgets):
|
def __init__(self, widgets):
|
||||||
@ -239,15 +253,13 @@ class Config(ResizableDialog, Ui_Dialog):
|
|||||||
preferred_output_format):
|
preferred_output_format):
|
||||||
if preferred_output_format:
|
if preferred_output_format:
|
||||||
preferred_output_format = preferred_output_format.lower()
|
preferred_output_format = preferred_output_format.lower()
|
||||||
output_formats = sorted(available_output_formats(),
|
output_formats = get_output_formats(preferred_output_format)
|
||||||
key=lambda x:{'EPUB':'!A', 'MOBI':'!B'}.get(x.upper(), x))
|
|
||||||
output_formats.remove('oeb')
|
|
||||||
input_format, input_formats = get_input_format_for_book(db, book_id,
|
input_format, input_formats = get_input_format_for_book(db, book_id,
|
||||||
preferred_input_format)
|
preferred_input_format)
|
||||||
preferred_output_format = preferred_output_format if \
|
preferred_output_format = preferred_output_format if \
|
||||||
preferred_output_format in output_formats else \
|
preferred_output_format in output_formats else \
|
||||||
sort_formats_by_preference(output_formats,
|
sort_formats_by_preference(output_formats,
|
||||||
prefs['output_format'])[0]
|
[prefs['output_format']])[0]
|
||||||
self.input_formats.addItems(list(map(QString, [x.upper() for x in
|
self.input_formats.addItems(list(map(QString, [x.upper() for x in
|
||||||
input_formats])))
|
input_formats])))
|
||||||
self.output_formats.addItems(list(map(QString, [x.upper() for x in
|
self.output_formats.addItems(list(map(QString, [x.upper() for x in
|
||||||
|
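The sort key used for the default ordering deserves a note: '!' sorts before every ASCII letter, so mapping EPUB to '!A' and MOBI to '!B' floats those two to the front while everything else stays alphabetical:

    fmts = ['AZW3', 'PDF', 'MOBI', 'EPUB', 'DOCX']
    print(sorted(fmts, key=lambda x: {'EPUB': '!A', 'MOBI': '!B'}.get(x.upper(), x)))
    # -> ['EPUB', 'MOBI', 'AZW3', 'DOCX', 'PDF']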
@@ -34,6 +34,19 @@ def app_prefix(prefix):
         return '%s_'%__appname__
     return '%s_%s_%s'%(__appname__, __version__, prefix)
 
+def reset_temp_folder_permissions():
+    # There are some broken windows installs where the permissions for the temp
+    # folder are set to not be executable, which means chdir() into temp
+    # folders fails. Try to fix that by resetting the permissions on the temp
+    # folder.
+    global _base_dir
+    if iswindows and _base_dir:
+        import subprocess
+        from calibre import prints
+        parent = os.path.dirname(_base_dir)
+        retcode = subprocess.Popen(['icacls.exe', parent, '/reset', '/Q', '/T']).wait()
+        prints('Trying to reset permissions of temp folder', parent, 'return code:', retcode)
+
 def base_dir():
     global _base_dir
     if _base_dir is not None and not os.path.exists(_base_dir):
@@ -145,8 +145,11 @@ def download_resources(browser, resource_cache, output_dir):
             elem.removeFromDocument()
 
 def save_html(browser, output_dir, postprocess_html, url, recursion_level):
-    html = strip_encoding_declarations(browser.html)
     import html5lib
+    from calibre.utils.cleantext import clean_xml_chars
+    html = strip_encoding_declarations(browser.html)
+    if isinstance(html, unicode):
+        html = clean_xml_chars(html)
     root = html5lib.parse(html, treebuilder='lxml', namespaceHTMLElements=False).getroot()
     root = postprocess_html(root, url, recursion_level)
     if root is None:
@@ -571,7 +571,7 @@ class Browser(QObject, FormsMixin):
                 ans[url] = raw
                 urls.discard(url)
 
-        while urls and time.time() - start_time > timeout and self.page.ready_state not in {'complete', 'completed'}:
+        while urls and time.time() - start_time < timeout and self.page.ready_state not in {'complete', 'completed'}:
             get_resources()
             if urls:
                 self.run_for_a_time(0.1)
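The one-character fix above is the standard deadline idiom: keep polling while elapsed time is still below the timeout (with '>' the loop exited immediately and resources were never fetched). In miniature:

    import time

    start_time = time.time()
    timeout = 5.0
    while time.time() - start_time < timeout:   # '<', not '>': run only until the deadline
        time.sleep(0.1)                         # stand-in for get_resources()/run_for_a_time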
|
Loading…
x
Reference in New Issue
Block a user