Merge from trunk

Charles Haley 2012-11-07 18:16:15 +01:00
commit 9868fffb02
29 changed files with 1287 additions and 578 deletions

View File

@@ -35,3 +35,7 @@ nbproject/
 .settings/
 *.DS_Store
 calibre_plugins/
+recipes/.git
+recipes/.gitignore
+recipes/README
+recipes/katalog_egazeciarz.recipe

View File

@@ -327,9 +327,8 @@ You can browse your |app| collection on your Android device by using the
 calibre content server, which makes your collection available over the net.
 First perform the following steps in |app|
-  * Set the :guilabel:`Preferred Output Format` in |app| to EPUB (The output format can be set under :guilabel:`Preferences->Interface->Behavior`)
-  * Set the output profile to Tablet (this will work for phones as well), under :guilabel:`Preferences->Conversion->Common Options->Page Setup`
-  * Convert the books you want to read on your device to EPUB format by selecting them and clicking the Convert button.
+  * Set the :guilabel:`Preferred Output Format` in |app| to EPUB for normal Android devices or MOBI for Kindles (The output format can be set under :guilabel:`Preferences->Interface->Behavior`)
+  * Convert the books you want to read on your device to EPUB/MOBI format by selecting them and clicking the Convert button.
   * Turn on the Content Server in |app|'s preferences and leave |app| running.
 Now on your Android device, open the browser and browse to

View File

@@ -2,7 +2,9 @@ import re
 from calibre.web.feeds.news import BasicNewsRecipe
 
 class FocusRecipe(BasicNewsRecipe):
     __license__ = 'GPL v3'
     __author__ = u'intromatyk <intromatyk@gmail.com>'
     language = 'pl'
@@ -12,10 +14,10 @@ class FocusRecipe(BasicNewsRecipe):
     publisher = u'Gruner + Jahr Polska'
     category = u'News'
     description = u'Newspaper'
-    category='magazine'
-    cover_url=''
-    remove_empty_feeds= True
-    no_stylesheets=True
+    category = 'magazine'
+    cover_url = ''
+    remove_empty_feeds = True
+    no_stylesheets = True
     oldest_article = 7
     max_articles_per_feed = 100000
     recursions = 0
@@ -27,15 +29,15 @@ class FocusRecipe(BasicNewsRecipe):
     simultaneous_downloads = 5
     r = re.compile('.*(?P<url>http:\/\/(www.focus.pl)|(rss.feedsportal.com\/c)\/.*\.html?).*')
-    keep_only_tags =[]
-    keep_only_tags.append(dict(name = 'div', attrs = {'id' : 'cll'}))
-    remove_tags =[]
-    remove_tags.append(dict(name = 'div', attrs = {'class' : 'ulm noprint'}))
-    remove_tags.append(dict(name = 'div', attrs = {'class' : 'txb'}))
-    remove_tags.append(dict(name = 'div', attrs = {'class' : 'h2'}))
-    remove_tags.append(dict(name = 'ul', attrs = {'class' : 'txu'}))
-    remove_tags.append(dict(name = 'div', attrs = {'class' : 'ulc'}))
+    keep_only_tags = []
+    keep_only_tags.append(dict(name='div', attrs={'id': 'cll'}))
+    remove_tags = []
+    remove_tags.append(dict(name='div', attrs={'class': 'ulm noprint'}))
+    remove_tags.append(dict(name='div', attrs={'class': 'txb'}))
+    remove_tags.append(dict(name='div', attrs={'class': 'h2'}))
+    remove_tags.append(dict(name='ul', attrs={'class': 'txu'}))
+    remove_tags.append(dict(name='div', attrs={'class': 'ulc'}))
 
     extra_css = '''
         body {font-family: verdana, arial, helvetica, geneva, sans-serif ;}
@@ -44,18 +46,17 @@ class FocusRecipe(BasicNewsRecipe):
         p.lead {font-weight: bold; text-align: left;}
         .authordate {font-size: small; color: #696969;}
         .fot{font-size: x-small; color: #666666;}
     '''
-
-    feeds = [
-        ('Nauka', 'http://focus.pl.feedsportal.com/c/32992/f/532693/index.rss'),
-        ('Historia', 'http://focus.pl.feedsportal.com/c/32992/f/532694/index.rss'),
-        ('Cywilizacja', 'http://focus.pl.feedsportal.com/c/32992/f/532695/index.rss'),
-        ('Sport', 'http://focus.pl.feedsportal.com/c/32992/f/532696/index.rss'),
-        ('Technika', 'http://focus.pl.feedsportal.com/c/32992/f/532697/index.rss'),
-        ('Przyroda', 'http://focus.pl.feedsportal.com/c/32992/f/532698/index.rss'),
-        ('Technologie', 'http://focus.pl.feedsportal.com/c/32992/f/532699/index.rss'),
-    ]
+    feeds = [
+        ('Nauka', 'http://www.focus.pl/nauka/rss/'),
+        ('Historia', 'http://www.focus.pl/historia/rss/'),
+        ('Cywilizacja', 'http://www.focus.pl/cywilizacja/rss/'),
+        ('Sport', 'http://www.focus.pl/sport/rss/'),
+        ('Technika', 'http://www.focus.pl/technika/rss/'),
+        ('Przyroda', 'http://www.focus.pl/przyroda/rss/'),
+        ('Technologie', 'http://www.focus.pl/gadzety/rss/')
+    ]
 
     def skip_ad_pages(self, soup):
         if ('advertisement' in soup.find('title').string.lower()):
@@ -65,20 +66,20 @@ class FocusRecipe(BasicNewsRecipe):
             return None
 
     def get_cover_url(self):
-        soup=self.index_to_soup('http://www.focus.pl/magazyn/')
-        tag=soup.find(name='div', attrs={'class':'clr fl'})
+        soup = self.index_to_soup('http://www.focus.pl/magazyn/')
+        tag = soup.find(name='div', attrs={'class': 'clr fl'})
         if tag:
-            self.cover_url='http://www.focus.pl/' + tag.a['href']
+            self.cover_url = 'http://www.focus.pl/' + tag.a['href']
         return getattr(self, 'cover_url', self.cover_url)
 
     def print_version(self, url):
-        if url.count ('focus.pl.feedsportal.com'):
+        if url.count('focus.pl.feedsportal.com'):
             u = url.find('focus0Bpl')
             u = 'http://www.focus.pl/' + url[u + 11:]
             u = u.replace('0C', '/')
             u = u.replace('A', '')
-            u = u.replace ('0E','-')
+            u = u.replace('0E', '-')
             u = u.replace('/nc/1//story01.htm', '/do-druku/1')
         else:
-            u = url.replace('/nc/1','/do-druku/1')
+            u = url.replace('/nc/1', '/do-druku/1')
         return u

View File

@@ -1,104 +1,107 @@
 # -*- coding: utf-8 -*-
 from calibre.web.feeds.news import BasicNewsRecipe
 
 class Gazeta_Wyborcza(BasicNewsRecipe):
     title = u'Gazeta Wyborcza'
-    __author__ = 'fenuks'
+    __author__ = 'fenuks, Artur Stachecki'
     language = 'pl'
-    description ='news from gazeta.pl'
-    category='newspaper'
+    description = 'news from gazeta.pl'
+    category = 'newspaper'
     publication_type = 'newspaper'
-    masthead_url='http://bi.gazeta.pl/im/5/10285/z10285445AA.jpg'
-    INDEX='http://wyborcza.pl'
-    remove_empty_feeds= True
+    masthead_url = 'http://bi.gazeta.pl/im/5/10285/z10285445AA.jpg'
+    INDEX = 'http://wyborcza.pl'
+    remove_empty_feeds = True
     oldest_article = 3
     max_articles_per_feed = 100
-    remove_javascript=True
-    no_stylesheets=True
-    ignore_duplicate_articles = {'title', 'url'}
-    keep_only_tags = dict(id=['gazeta_article', 'article'])
-    remove_tags_after = dict(id='gazeta_article_share')
-    remove_tags = [dict(attrs={'class':['artReadMore', 'gazeta_article_related_new', 'txt_upl']}), dict(id=['gazeta_article_likes', 'gazeta_article_tools', 'rel', 'gazeta_article_tags', 'gazeta_article_share', 'gazeta_article_brand', 'gazeta_article_miniatures'])]
-
-    feeds = [(u'Kraj', u'http://rss.feedsportal.com/c/32739/f/530266/index.rss'), (u'\u015awiat', u'http://rss.feedsportal.com/c/32739/f/530270/index.rss'),
-             (u'Wyborcza.biz', u'http://wyborcza.biz/pub/rss/wyborcza_biz_wiadomosci.htm'),
-             (u'Komentarze', u'http://rss.feedsportal.com/c/32739/f/530312/index.rss'),
-             (u'Kultura', u'http://rss.gazeta.pl/pub/rss/gazetawyborcza_kultura.xml'),
-             (u'Nauka', u'http://rss.feedsportal.com/c/32739/f/530269/index.rss'),
-             (u'Opinie', u'http://rss.gazeta.pl/pub/rss/opinie.xml'),
-             (u'Gazeta \u015awi\u0105teczna', u'http://rss.feedsportal.com/c/32739/f/530431/index.rss'),
-             #(u'Du\u017cy Format', u'http://rss.feedsportal.com/c/32739/f/530265/index.rss'),
-             (u'Witamy w Polsce', u'http://rss.feedsportal.com/c/32739/f/530476/index.rss'),
-             (u'M\u0119ska Muzyka', u'http://rss.feedsportal.com/c/32739/f/530337/index.rss'),
-             (u'Lata Lec\u0105', u'http://rss.feedsportal.com/c/32739/f/530326/index.rss'),
-             (u'Solidarni z Tybetem', u'http://rss.feedsportal.com/c/32739/f/530461/index.rss'),
-             (u'W pon. - \u017bakowski', u'http://rss.feedsportal.com/c/32739/f/530491/index.rss'),
-             (u'We wt. - Kolenda-Zalewska', u'http://rss.feedsportal.com/c/32739/f/530310/index.rss'),
-             (u'\u015aroda w \u015brod\u0119', u'http://rss.feedsportal.com/c/32739/f/530428/index.rss'),
-             (u'W pi\u0105tek - Olejnik', u'http://rss.feedsportal.com/c/32739/f/530364/index.rss')
-             ]
+    remove_javascript = True
+    no_stylesheets = True
+    remove_tags_before = dict(id='k0')
+    remove_tags_after = dict(id='banP4')
+    remove_tags = [dict(name='div', attrs={'class':'rel_box'}), dict(attrs={'class':['date', 'zdjP', 'zdjM', 'pollCont', 'rel_video', 'brand', 'txt_upl']}), dict(name='div', attrs={'id':'footer'})]
+    feeds = [(u'Kraj', u'http://rss.feedsportal.com/c/32739/f/530266/index.rss'), (u'\u015awiat', u'http://rss.feedsportal.com/c/32739/f/530270/index.rss'),
+             (u'Wyborcza.biz', u'http://wyborcza.biz/pub/rss/wyborcza_biz_wiadomosci.htm'),
+             (u'Komentarze', u'http://rss.feedsportal.com/c/32739/f/530312/index.rss'),
+             (u'Kultura', u'http://rss.gazeta.pl/pub/rss/gazetawyborcza_kultura.xml'),
+             (u'Nauka', u'http://rss.feedsportal.com/c/32739/f/530269/index.rss'), (u'Opinie', u'http://rss.gazeta.pl/pub/rss/opinie.xml'), (u'Gazeta \u015awi\u0105teczna', u'http://rss.feedsportal.com/c/32739/f/530431/index.rss'), (u'Du\u017cy Format', u'http://rss.feedsportal.com/c/32739/f/530265/index.rss'), (u'Witamy w Polsce', u'http://rss.feedsportal.com/c/32739/f/530476/index.rss'), (u'M\u0119ska Muzyka', u'http://rss.feedsportal.com/c/32739/f/530337/index.rss'), (u'Lata Lec\u0105', u'http://rss.feedsportal.com/c/32739/f/530326/index.rss'), (u'Solidarni z Tybetem', u'http://rss.feedsportal.com/c/32739/f/530461/index.rss'), (u'W pon. - \u017bakowski', u'http://rss.feedsportal.com/c/32739/f/530491/index.rss'), (u'We wt. - Kolenda-Zalewska', u'http://rss.feedsportal.com/c/32739/f/530310/index.rss'), (u'\u015aroda w \u015brod\u0119', u'http://rss.feedsportal.com/c/32739/f/530428/index.rss'), (u'W pi\u0105tek - Olejnik', u'http://rss.feedsportal.com/c/32739/f/530364/index.rss'), (u'Nekrologi', u'http://rss.feedsportal.com/c/32739/f/530358/index.rss')
+             ]
 
     def skip_ad_pages(self, soup):
-        tag=soup.find(name='a', attrs={'class':'btn'})
+        tag = soup.find(name='a', attrs={'class': 'btn'})
         if tag:
-            new_soup=self.index_to_soup(tag['href'], raw=True)
+            new_soup = self.index_to_soup(tag['href'], raw=True)
             return new_soup
 
     def append_page(self, soup, appendtag):
-        loop=False
-        tag = soup.find('div', attrs={'id':'Str'})
-        if appendtag.find('div', attrs={'id':'Str'}):
-            nexturl=tag.findAll('a')
-            appendtag.find('div', attrs={'id':'Str'}).extract()
-            loop=True
+        loop = False
+        tag = soup.find('div', attrs={'id': 'Str'})
+        if appendtag.find('div', attrs={'id': 'Str'}):
+            nexturl = tag.findAll('a')
+            appendtag.find('div', attrs={'id': 'Str'}).extract()
+            loop = True
         if appendtag.find(id='source'):
            appendtag.find(id='source').extract()
        while loop:
-            loop=False
+            loop = False
            for link in nexturl:
                if u'następne' in link.string:
-                    url= self.INDEX + link['href']
+                    url = self.INDEX + link['href']
                    soup2 = self.index_to_soup(url)
                    pagetext = soup2.find(id='artykul')
                    pos = len(appendtag.contents)
                    appendtag.insert(pos, pagetext)
-                    tag = soup2.find('div', attrs={'id':'Str'})
-                    nexturl=tag.findAll('a')
-                    loop=True
+                    tag = soup2.find('div', attrs={'id': 'Str'})
+                    nexturl = tag.findAll('a')
+                    loop = True
 
     def gallery_article(self, appendtag):
-        tag=appendtag.find(id='container_gal')
+        tag = appendtag.find(id='container_gal')
         if tag:
-            nexturl=appendtag.find(id='gal_btn_next').a['href']
+            nexturl = appendtag.find(id='gal_btn_next').a['href']
             appendtag.find(id='gal_navi').extract()
             while nexturl:
-                soup2=self.index_to_soup(nexturl)
-                pagetext=soup2.find(id='container_gal')
-                nexturl=pagetext.find(id='gal_btn_next')
+                soup2 = self.index_to_soup(nexturl)
+                pagetext = soup2.find(id='container_gal')
+                nexturl = pagetext.find(id='gal_btn_next')
                 if nexturl:
-                    nexturl=nexturl.a['href']
+                    nexturl = nexturl.a['href']
                 pos = len(appendtag.contents)
                 appendtag.insert(pos, pagetext)
-            rem=appendtag.find(id='gal_navi')
+            rem = appendtag.find(id='gal_navi')
             if rem:
                 rem.extract()
 
     def preprocess_html(self, soup):
-        self.append_page(soup, soup.body)
-        if soup.find(id='container_gal'):
-            self.gallery_article(soup.body)
-        return soup
+        if soup.find(attrs={'class': 'piano_btn_1'}):
+            return None
+        else:
+            self.append_page(soup, soup.body)
+            if soup.find(id='container_gal'):
+                self.gallery_article(soup.body)
+            return soup
 
     def print_version(self, url):
-        if 'http://wyborcza.biz/biznes/' not in url:
-            return url
+        if url.count('rss.feedsportal.com'):
+            u = url.find('wyborcza0Bpl')
+            u = 'http://www.wyborcza.pl/' + url[u + 11:]
+            u = u.replace('0C', '/')
+            u = u.replace('A', '')
+            u = u.replace('0E', '-')
+            u = u.replace('0H', ',')
+            u = u.replace('0I', '_')
+            u = u.replace('0B', '.')
+            u = u.replace('/1,', '/2029020,')
+            u = u.replace('/story01.htm', '')
+            print(u)
+            return u
+        elif 'http://wyborcza.pl/1' in url:
+            return url.replace('http://wyborcza.pl/1', 'http://wyborcza.pl/2029020')
         else:
             return url.replace('http://wyborcza.biz/biznes/1', 'http://wyborcza.biz/biznes/2029020')
 
     def get_cover_url(self):
         soup = self.index_to_soup('http://wyborcza.pl/0,76762,3751429.html')
-        cover=soup.find(id='GWmini2')
-        soup = self.index_to_soup('http://wyborcza.pl/'+ cover.contents[3].a['href'])
-        self.cover_url='http://wyborcza.pl' + soup.img['src']
+        cover = soup.find(id='GWmini2')
+        soup = self.index_to_soup('http://wyborcza.pl/' + cover.contents[3].a['href'])
+        self.cover_url = 'http://wyborcza.pl' + soup.img['src']
         return getattr(self, 'cover_url', self.cover_url)

View File

@@ -4,7 +4,7 @@ from calibre.web.feeds.news import BasicNewsRecipe
 class FocusRecipe(BasicNewsRecipe):
     __license__ = 'GPL v3'
-    __author__ = u'intromatyk <intromatyk@gmail.com>'
+    __author__ = u'Artur Stachecki <artur.stachecki@gmail.com>'
     language = 'pl'
     version = 1

View File

@@ -34,16 +34,20 @@ class RzeczpospolitaRecipe(BasicNewsRecipe):
     keep_only_tags.append(dict(name = 'div', attrs = {'id' : 'story'}))
 
     remove_tags =[]
+    remove_tags.append(dict(name = 'div', attrs = {'class' : 'articleLeftBox'}))
+    remove_tags.append(dict(name = 'div', attrs = {'class' : 'socialNewTools'}))
     remove_tags.append(dict(name = 'div', attrs = {'id' : 'socialTools'}))
     remove_tags.append(dict(name = 'div', attrs = {'class' : 'articleToolBoxTop'}))
     remove_tags.append(dict(name = 'div', attrs = {'class' : 'clr'}))
     remove_tags.append(dict(name = 'div', attrs = {'id' : 'recommendations'}))
-    remove_tags.append(dict(name = 'div', attrs = {'id' : 'editorPicks'}))
+    remove_tags.append(dict(name = 'div', attrs = {'class' : 'editorPicks'}))
+    remove_tags.append(dict(name = 'div', attrs = {'class' : 'editorPicks editorPicksFirst'}))
     remove_tags.append(dict(name = 'div', attrs = {'id' : 'articleCopyrightText'}))
     remove_tags.append(dict(name = 'div', attrs = {'id' : 'articleCopyrightButton'}))
     remove_tags.append(dict(name = 'div', attrs = {'class' : 'articleToolBoxBottom'}))
     remove_tags.append(dict(name = 'div', attrs = {'class' : 'more'}))
     remove_tags.append(dict(name = 'div', attrs = {'class' : 'addRecommendation'}))
+    remove_tags.append(dict(name = 'h3', attrs = {'id' : 'tags'}))
 
     extra_css = '''
         body {font-family: verdana, arial, helvetica, geneva, sans-serif ;}
@@ -67,3 +71,4 @@ class RzeczpospolitaRecipe(BasicNewsRecipe):
         return start + '/' + index + '?print=tak'

View File

@@ -1,34 +1,55 @@
 from calibre.web.feeds.news import BasicNewsRecipe
+from calibre.utils.magick import Image
 
 class tvn24(BasicNewsRecipe):
     title = u'TVN24'
     oldest_article = 7
     max_articles_per_feed = 100
-    __author__ = 'fenuks'
+    __author__ = 'fenuks, Artur Stachecki'
     description = u'Sport, Biznes, Gospodarka, Informacje, Wiadomości Zawsze aktualne wiadomości z Polski i ze świata'
     category = 'news'
     language = 'pl'
-    #masthead_url= 'http://www.tvn24.pl/_d/topmenu/logo2.gif'
-    cover_url= 'http://www.userlogos.org/files/logos/Struna/TVN24.jpg'
-    extra_css = 'ul {list-style:none;} \
-                li {list-style:none; float: left; margin: 0 0.15em;} \
-                h2 {font-size: medium} \
-                .date60m {float: left; margin: 0 10px 0 5px;}'
+    masthead_url= 'http://www.tvn24.pl/_d/topmenu/logo2.gif'
+    cover_url= 'http://www.tvn24.pl/_d/topmenu/logo2.gif'
+    extra_css= 'ul {list-style: none; padding: 0; margin: 0;} li {float: left;margin: 0 0.15em;}'
     remove_empty_feeds = True
     remove_javascript = True
     no_stylesheets = True
-    use_embedded_content = False
-    ignore_duplicate_articles = {'title', 'url'}
-    keep_only_tags=[dict(name='h1', attrs={'class':['size30 mt10 pb10', 'size38 mt10 pb15']}), dict(name='figure', attrs={'class':'articleMainPhoto articleMainPhotoWide'}), dict(name='article', attrs={'class':['mb20', 'mb20 textArticleDefault']}), dict(name='ul', attrs={'class':'newsItem'})]
-    remove_tags = [dict(name='aside', attrs={'class':['innerArticleModule onRight cols externalContent', 'innerArticleModule center']}), dict(name='div', attrs={'class':['thumbsGallery', 'articleTools', 'article right rd7', 'heading', 'quizContent']}), dict(name='a', attrs={'class':'watchMaterial text'}), dict(name='section', attrs={'class':['quiz toCenter', 'quiz toRight']})]
-
-    feeds = [(u'Najnowsze', u'http://www.tvn24.pl/najnowsze.xml'),
-             (u'Polska', u'www.tvn24.pl/polska.xml'), (u'\u015awiat', u'http://www.tvn24.pl/swiat.xml'), (u'Sport', u'http://www.tvn24.pl/sport.xml'), (u'Biznes', u'http://www.tvn24.pl/biznes.xml'), (u'Meteo', u'http://www.tvn24.pl/meteo.xml'), (u'Micha\u0142ki', u'http://www.tvn24.pl/michalki.xml'), (u'Kultura', u'http://www.tvn24.pl/kultura.xml')]
+    keep_only_tags=[
+    #        dict(name='h1', attrs={'class':'size38 mt20 pb20'}),
+            dict(name='div', attrs={'class':'mainContainer'}),
+    #        dict(name='p'),
+    #        dict(attrs={'class':['size18 mt10 mb15', 'bold topicSize1', 'fromUsers content', 'textArticleDefault']})
+            ]
+    remove_tags=[
+            dict(attrs={'class':['commentsInfo', 'textSize', 'related newsNews align-right', 'box', 'watchMaterial text', 'related galleryGallery align-center', 'advert block-alignment-right', 'userActions', 'socialBookmarks', 'im yourArticle fl', 'dynamicButton addComment fl', 'innerArticleModule onRight cols externalContent', 'thumbsGallery', 'relatedObject customBlockquote align-right', 'lead', 'mainRightColumn', 'articleDateContainer borderGreyBottom', 'socialMediaContainer onRight loaded', 'quizContent', 'twitter', 'facebook', 'googlePlus', 'share', 'voteResult', 'reportTitleBar bgBlue_v4 mb15', 'innerVideoModule center']}),
+            dict(name='article', attrs={'class':['singleArtPhotoCenter', 'singleArtPhotoRight', 'singleArtPhotoLeft']}),
+            dict(name='section', attrs={'id':['forum', 'innerArticle', 'quiz toCenter', 'mb20']}),
+            dict(name='div', attrs={'class':'socialMediaContainer big p20 mb20 borderGrey loaded'})
+            ]
+    remove_tags_after=[dict(name='li', attrs={'class':'share'})]
+    feeds = [(u'Najnowsze', u'http://www.tvn24.pl/najnowsze.xml'), ]
+    #(u'Polska', u'www.tvn24.pl/polska.xml'), (u'\u015awiat', u'http://www.tvn24.pl/swiat.xml'), (u'Sport', u'http://www.tvn24.pl/sport.xml'), (u'Biznes', u'http://www.tvn24.pl/biznes.xml'), (u'Meteo', u'http://www.tvn24.pl/meteo.xml'), (u'Micha\u0142ki', u'http://www.tvn24.pl/michalki.xml'), (u'Kultura', u'http://www.tvn24.pl/kultura.xml')]
 
     def preprocess_html(self, soup):
         for item in soup.findAll(style=True):
             del item['style']
-        tag = soup.find(name='ul', attrs={'class':'newsItem'})
-        if tag:
-            tag.name='div'
-            tag.li.name='div'
+        return soup
+
+    def preprocess_html(self, soup):
+        for alink in soup.findAll('a'):
+            if alink.string is not None:
+                tstr = alink.string
+                alink.replaceWith(tstr)
+        return soup
+
+    def postprocess_html(self, soup, first):
+        #process all the images
+        for tag in soup.findAll(lambda tag: tag.name.lower()=='img' and tag.has_key('src')):
+            iurl = tag['src']
+            img = Image()
+            img.open(iurl)
+            if img < 0:
+                raise RuntimeError('Out of memory')
+            img.type = "GrayscaleType"
+            img.save(iurl)
         return soup

View File

@@ -3,6 +3,8 @@
 __license__ = 'GPL v3'
 __copyright__ = '2010, matek09, matek09@gmail.com'
 __copyright__ = 'Modified 2011, Mariusz Wolek <mariusz_dot_wolek @ gmail dot com>'
+__copyright__ = 'Modified 2012, Artur Stachecki <artur.stachecki@gmail.com>'
 
 from calibre.web.feeds.news import BasicNewsRecipe
 import re
@@ -11,7 +13,7 @@ class Wprost(BasicNewsRecipe):
     EDITION = 0
     FIND_LAST_FULL_ISSUE = True
     EXCLUDE_LOCKED = True
-    ICO_BLOCKED = 'http://www.wprost.pl/G/icons/ico_blocked.gif'
+    ICO_BLOCKED = 'http://www.wprost.pl/G/layout2/ico_blocked.png'
 
     title = u'Wprost'
     __author__ = 'matek09'
@@ -20,6 +22,7 @@ class Wprost(BasicNewsRecipe):
     no_stylesheets = True
     language = 'pl'
     remove_javascript = True
+    recursions = 0
 
     remove_tags_before = dict(dict(name = 'div', attrs = {'id' : 'print-layer'}))
     remove_tags_after = dict(dict(name = 'div', attrs = {'id' : 'print-layer'}))
@@ -35,13 +38,15 @@ class Wprost(BasicNewsRecipe):
         (re.compile(r'\<td\>\<tr\>\<\/table\>'), lambda match: ''),
         (re.compile(r'\<table .*?\>'), lambda match: ''),
         (re.compile(r'\<tr>'), lambda match: ''),
-        (re.compile(r'\<td .*?\>'), lambda match: '')]
+        (re.compile(r'\<td .*?\>'), lambda match: ''),
+        (re.compile(r'\<div id="footer"\>.*?\</footer\>'), lambda match: '')]
 
     remove_tags =[]
     remove_tags.append(dict(name = 'div', attrs = {'class' : 'def element-date'}))
     remove_tags.append(dict(name = 'div', attrs = {'class' : 'def silver'}))
     remove_tags.append(dict(name = 'div', attrs = {'id' : 'content-main-column-right'}))
 
     extra_css = '''
         .div-header {font-size: x-small; font-weight: bold}
     '''
@@ -59,27 +64,26 @@ class Wprost(BasicNewsRecipe):
         a = 0
         if self.FIND_LAST_FULL_ISSUE:
             ico_blocked = soup.findAll('img', attrs={'src' : self.ICO_BLOCKED})
-            a = ico_blocked[-1].findNext('a', attrs={'title' : re.compile('Zobacz spis tre.ci')})
+            a = ico_blocked[-1].findNext('a', attrs={'title' : re.compile(r'Spis *', re.IGNORECASE | re.DOTALL)})
         else:
-            a = soup.find('a', attrs={'title' : re.compile('Zobacz spis tre.ci')})
+            a = soup.find('a', attrs={'title' : re.compile(r'Spis *', re.IGNORECASE | re.DOTALL)})
         self.EDITION = a['href'].replace('/tygodnik/?I=', '')
+        self.EDITION_SHORT = a['href'].replace('/tygodnik/?I=15', '')
         self.cover_url = a.img['src']
 
     def parse_index(self):
         self.find_last_issue()
         soup = self.index_to_soup('http://www.wprost.pl/tygodnik/?I=' + self.EDITION)
         feeds = []
-        for main_block in soup.findAll(attrs={'class':'main-block-s3 s3-head head-red3'}):
+        for main_block in soup.findAll(attrs={'id': 'content-main-column-element-content'}):
             articles = list(self.find_articles(main_block))
             if len(articles) > 0:
-                section = self.tag_to_string(main_block)
+                section = self.tag_to_string(main_block.find('h3'))
                 feeds.append((section, articles))
         return feeds
 
     def find_articles(self, main_block):
-        for a in main_block.findAllNext( attrs={'style':['','padding-top: 15px;']}):
+        for a in main_block.findAll('a'):
             if a.name in "td":
                 break
             if self.EXCLUDE_LOCKED & self.is_blocked(a):
@@ -91,3 +95,4 @@ class Wprost(BasicNewsRecipe):
             'description' : ''
         }

View File

@@ -901,8 +901,11 @@ class Device(DeviceConfig, DevicePlugin):
             for d in drives:
                 try:
                     winutil.eject_drive(bytes(d)[0])
-                except:
-                    pass
+                except Exception as e:
+                    try:
+                        prints(as_unicode(e))
+                    except:
+                        pass
 
         t = Thread(target=do_it, args=[drives])
         t.daemon = True

View File

@@ -150,8 +150,15 @@ class EPUBInput(InputFormatPlugin):
         from calibre import walk
         from calibre.ebooks import DRMError
         from calibre.ebooks.metadata.opf2 import OPF
-        zf = ZipFile(stream)
-        zf.extractall(os.getcwdu())
+        try:
+            zf = ZipFile(stream)
+            zf.extractall(os.getcwdu())
+        except:
+            log.exception('EPUB appears to be invalid ZIP file, trying a'
+                    ' more forgiving ZIP parser')
+            from calibre.utils.localunzip import extractall
+            stream.seek(0)
+            extractall(stream)
         encfile = os.path.abspath(os.path.join('META-INF', 'encryption.xml'))
         opf = self.find_opf()
         if opf is None:

View File

@@ -10,6 +10,7 @@ from cStringIO import StringIO
 from contextlib import closing
 
 from calibre.utils.zipfile import ZipFile, BadZipfile, safe_replace
+from calibre.utils.localunzip import LocalZipFile
 from calibre.ebooks.BeautifulSoup import BeautifulStoneSoup
 from calibre.ebooks.metadata import MetaInformation
 from calibre.ebooks.metadata.opf2 import OPF
@@ -105,10 +106,13 @@ class OCFReader(OCF):
 class OCFZipReader(OCFReader):
     def __init__(self, stream, mode='r', root=None):
-        try:
-            self.archive = ZipFile(stream, mode=mode)
-        except BadZipfile:
-            raise EPubException("not a ZIP .epub OCF container")
+        if isinstance(stream, (LocalZipFile, ZipFile)):
+            self.archive = stream
+        else:
+            try:
+                self.archive = ZipFile(stream, mode=mode)
+            except BadZipfile:
+                raise EPubException("not a ZIP .epub OCF container")
         self.root = root
         if self.root is None:
             name = getattr(stream, 'name', False)
@@ -119,8 +123,18 @@ class OCFZipReader(OCFReader):
         super(OCFZipReader, self).__init__()
 
     def open(self, name, mode='r'):
+        if isinstance(self.archive, LocalZipFile):
+            return self.archive.open(name)
         return StringIO(self.archive.read(name))
 
+def get_zip_reader(stream, root=None):
+    try:
+        zf = ZipFile(stream, mode='r')
+    except:
+        stream.seek(0)
+        zf = LocalZipFile(stream)
+    return OCFZipReader(zf, root=root)
+
 class OCFDirReader(OCFReader):
     def __init__(self, path):
         self.root = path
@@ -184,7 +198,12 @@ def render_cover(opf, opf_path, zf, reader=None):
 def get_cover(opf, opf_path, stream, reader=None):
     raster_cover = opf.raster_cover
     stream.seek(0)
-    zf = ZipFile(stream)
+    try:
+        zf = ZipFile(stream)
+    except:
+        stream.seek(0)
+        zf = LocalZipFile(stream)
     if raster_cover:
         base = posixpath.dirname(opf_path)
         cpath = posixpath.normpath(posixpath.join(base, raster_cover))
@@ -207,7 +226,7 @@ def get_cover(opf, opf_path, stream, reader=None):
 def get_metadata(stream, extract_cover=True):
     """ Return metadata as a :class:`Metadata` object """
     stream.seek(0)
-    reader = OCFZipReader(stream)
+    reader = get_zip_reader(stream)
     mi = reader.opf.to_book_metadata()
     if extract_cover:
         try:
@@ -232,7 +251,7 @@ def _write_new_cover(new_cdata, cpath):
 def set_metadata(stream, mi, apply_null=False, update_timestamp=False):
     stream.seek(0)
-    reader = OCFZipReader(stream, root=os.getcwdu())
+    reader = get_zip_reader(stream, root=os.getcwdu())
     raster_cover = reader.opf.raster_cover
     mi = MetaInformation(mi)
     new_cdata = None
@@ -283,7 +302,11 @@ def set_metadata(stream, mi, apply_null=False, update_timestamp=False):
         reader.opf.timestamp = mi.timestamp
     newopf = StringIO(reader.opf.render())
-    safe_replace(stream, reader.container[OPF.MIMETYPE], newopf,
-            extra_replacements=replacements)
+    if isinstance(reader.archive, LocalZipFile):
+        reader.archive.safe_replace(reader.container[OPF.MIMETYPE], newopf,
+                extra_replacements=replacements)
+    else:
+        safe_replace(stream, reader.container[OPF.MIMETYPE], newopf,
+                extra_replacements=replacements)
     try:
         if cpath is not None:

View File

@@ -239,10 +239,11 @@ class PluginWidget(QWidget,Ui_Form):
     def initialize(self, name, db):
         '''
         CheckBoxControls (c_type: check_box):
-            ['generate_titles','generate_series','generate_genres',
-             'generate_recently_added','generate_descriptions','include_hr']
+            ['cross_reference_authors',
+             'generate_titles','generate_series','generate_genres',
+             'generate_recently_added','generate_descriptions',
+             'include_hr']
         ComboBoxControls (c_type: combo_box):
             ['exclude_source_field','header_note_source_field',
              'merge_source_field']

View File

@@ -305,7 +305,7 @@ The default pattern \[.+\]|\+ excludes tags of the form [tag], e.g., [Test book]
        <string>Other options</string>
       </property>
       <layout class="QGridLayout" name="gridLayout_3">
-       <item row="2" column="1">
+       <item row="3" column="1">
        <layout class="QHBoxLayout" name="merge_with_comments_hl">
         <item>
          <widget class="QComboBox" name="merge_source_field">
@@ -372,7 +372,7 @@ The default pattern \[.+\]|\+ excludes tags of the form [tag], e.g., [Test book]
         </item>
        </layout>
       </item>
-      <item row="2" column="0">
+      <item row="3" column="0">
       <widget class="QLabel" name="label_9">
        <property name="minimumSize">
         <size>
@@ -397,7 +397,7 @@ The default pattern \[.+\]|\+ excludes tags of the form [tag], e.g., [Test book]
        </property>
       </widget>
      </item>
-     <item row="0" column="0">
+     <item row="1" column="0">
      <widget class="QLabel" name="label_4">
       <property name="minimumSize">
        <size>
@@ -413,7 +413,7 @@ The default pattern \[.+\]|\+ excludes tags of the form [tag], e.g., [Test book]
        </property>
       </widget>
      </item>
-     <item row="0" column="1">
+     <item row="1" column="1">
      <layout class="QHBoxLayout" name="replace_cover_hl">
       <item>
        <widget class="QRadioButton" name="generate_new_cover">
@@ -447,7 +447,7 @@ The default pattern \[.+\]|\+ excludes tags of the form [tag], e.g., [Test book]
       </item>
      </layout>
     </item>
-    <item row="1" column="0">
+    <item row="2" column="0">
     <widget class="QLabel" name="label_3">
      <property name="text">
       <string>E&amp;xtra Description note:</string>
@@ -460,7 +460,7 @@ The default pattern \[.+\]|\+ excludes tags of the form [tag], e.g., [Test book]
      </property>
     </widget>
    </item>
-   <item row="1" column="1">
+   <item row="2" column="1">
    <layout class="QHBoxLayout" name="horizontalLayout">
     <item>
      <widget class="QComboBox" name="header_note_source_field">
@@ -561,6 +561,27 @@ The default pattern \[.+\]|\+ excludes tags of the form [tag], e.g., [Test book]
      </item>
     </layout>
    </item>
+   <item row="0" column="0">
+    <widget class="QLabel" name="label_2">
+     <property name="text">
+      <string>Author cross-references:</string>
+     </property>
+     <property name="alignment">
+      <set>Qt::AlignRight|Qt::AlignTrailing|Qt::AlignVCenter</set>
+     </property>
+    </widget>
+   </item>
+   <item row="0" column="1">
+    <layout class="QHBoxLayout" name="cross_references_hl">
+     <item>
+      <widget class="QCheckBox" name="cross_reference_authors">
+       <property name="text">
+        <string>For books with multiple authors, list each author separately</string>
+       </property>
+      </widget>
+     </item>
+    </layout>
+   </item>
   </layout>
  </widget>
 </item>

View File

@@ -6,102 +6,19 @@ __license__ = 'GPL 3'
 __copyright__ = '2011, John Schember <john@nachtimwald.com>'
 __docformat__ = 'restructuredtext en'
 
-from contextlib import closing
-
-from lxml import html
-from PyQt4.Qt import QUrl
-
-from calibre import browser
-from calibre.gui2 import open_url
-from calibre.gui2.store import StorePlugin
-from calibre.gui2.store.search_result import SearchResult
+from calibre.gui2.store.stores.amazon_uk_plugin import AmazonUKKindleStore
 
-class AmazonDEKindleStore(StorePlugin):
+class AmazonDEKindleStore(AmazonUKKindleStore):
     '''
     For comments on the implementation, please see amazon_plugin.py
     '''
-
-    def open(self, parent=None, detail_item=None, external=False):
-        aff_id = {'tag': 'charhale0a-21'}
-        store_link = ('http://www.amazon.de/gp/redirect.html?ie=UTF8&site-redirect=de'
-                      '&tag=%(tag)s&linkCode=ur2&camp=1638&creative=19454'
-                      '&location=http://www.amazon.de/ebooks-kindle/b?node=530886031') % aff_id
-        if detail_item:
-            aff_id['asin'] = detail_item
-            store_link = ('http://www.amazon.de/gp/redirect.html?ie=UTF8'
-                          '&location=http://www.amazon.de/dp/%(asin)s&site-redirect=de'
-                          '&tag=%(tag)s&linkCode=ur2&camp=1638&creative=6742') % aff_id
-        open_url(QUrl(store_link))
-
-    def search(self, query, max_results=10, timeout=60):
-        search_url = 'http://www.amazon.de/s/?url=search-alias%3Ddigital-text&field-keywords='
-        url = search_url + query.encode('ascii', 'backslashreplace').replace('%', '%25').replace('\\x', '%').replace(' ', '+')
-        br = browser()
-
-        counter = max_results
-        with closing(br.open(url, timeout=timeout)) as f:
-            # doc = html.fromstring(f.read().decode('latin-1', 'replace'))
-            # Apparently amazon Europe is responding in UTF-8 now
-            doc = html.fromstring(f.read())
-
-            data_xpath = '//div[contains(@class, "result") and contains(@class, "product")]'
-            format_xpath = './/span[@class="format"]/text()'
-            cover_xpath = './/img[@class="productImage"]/@src'
-
-            for data in doc.xpath(data_xpath):
-                if counter <= 0:
-                    break
-
-                # Even though we are searching digital-text only Amazon will still
-                # put in results for non Kindle books (author pages). So we need
-                # to explicitly check if the item is a Kindle book and ignore it
-                # if it isn't.
-                format = ''.join(data.xpath(format_xpath))
-                if 'kindle' not in format.lower():
-                    continue
-
-                # We must have an asin otherwise we can't easily reference the
-                # book later.
-                asin = ''.join(data.xpath("@name"))
-
-                cover_url = ''.join(data.xpath(cover_xpath))
-
-                title = ''.join(data.xpath('.//a[@class="title"]/text()'))
-                price = ''.join(data.xpath('.//div[@class="newPrice"]/span[contains(@class, "price")]/text()'))
-
-                author = ''.join(data.xpath('.//h3[@class="title"]/span[@class="ptBrand"]/text()'))
-                if author.startswith('von '):
-                    author = author[4:]
-
-                counter -= 1
-
-                s = SearchResult()
-                s.cover_url = cover_url.strip()
-                s.title = title.strip()
-                s.author = author.strip()
-                s.price = price.strip()
-                s.detail_item = asin.strip()
-                s.formats = 'Kindle'
-
-                yield s
-
-    def get_details(self, search_result, timeout):
-        drm_search_text = u'Gleichzeitige Verwendung von Geräten'
-        drm_free_text = u'Keine Einschränkung'
-        url = 'http://amazon.de/dp/'
-        br = browser()
-        with closing(br.open(url + search_result.detail_item, timeout=timeout)) as nf:
-            idata = html.fromstring(nf.read())
-            if idata.xpath('boolean(//div[@class="content"]//li/b[contains(text(), "' +
-                           drm_search_text + '")])'):
-                if idata.xpath('boolean(//div[@class="content"]//li[contains(., "' +
-                               drm_free_text + '") and contains(b, "' +
-                               drm_search_text + '")])'):
-                    search_result.drm = SearchResult.DRM_UNLOCKED
-                else:
-                    search_result.drm = SearchResult.DRM_UNKNOWN
-            else:
-                search_result.drm = SearchResult.DRM_LOCKED
-        return True
+    aff_id = {'tag': 'charhale0a-21'}
+    store_link = ('http://www.amazon.de/gp/redirect.html?ie=UTF8&site-redirect=de'
+                  '&tag=%(tag)s&linkCode=ur2&camp=1638&creative=19454'
+                  '&location=http://www.amazon.de/ebooks-kindle/b?node=530886031')
+    store_link_details = ('http://www.amazon.de/gp/redirect.html?ie=UTF8'
+                  '&location=http://www.amazon.de/dp/%(asin)s&site-redirect=de'
+                  '&tag=%(tag)s&linkCode=ur2&camp=1638&creative=6742')
+    search_url = 'http://www.amazon.de/s/?url=search-alias%3Ddigital-text&field-keywords='

View File

@@ -6,78 +6,17 @@ __license__ = 'GPL 3'
 __copyright__ = '2011, John Schember <john@nachtimwald.com>'
 __docformat__ = 'restructuredtext en'
 
-from contextlib import closing
-
-from lxml import html
-from PyQt4.Qt import QUrl
-
-from calibre import browser
-from calibre.gui2 import open_url
-from calibre.gui2.store import StorePlugin
-from calibre.gui2.store.search_result import SearchResult
+from calibre.gui2.store.stores.amazon_uk_plugin import AmazonUKKindleStore
 
-class AmazonESKindleStore(StorePlugin):
+class AmazonESKindleStore(AmazonUKKindleStore):
     '''
     For comments on the implementation, please see amazon_plugin.py
     '''
-
-    def open(self, parent=None, detail_item=None, external=False):
-        aff_id = {'tag': 'charhale09-21'}
-        store_link = 'http://www.amazon.es/ebooks-kindle/b?_encoding=UTF8&node=827231031&tag=%(tag)s&ie=UTF8&linkCode=ur2&camp=3626&creative=24790' % aff_id
-        if detail_item:
-            aff_id['asin'] = detail_item
-            store_link = 'http://www.amazon.es/gp/redirect.html?ie=UTF8&location=http://www.amazon.es/dp/%(asin)s&tag=%(tag)s&linkCode=ur2&camp=3626&creative=24790' % aff_id
-        open_url(QUrl(store_link))
-
-    def search(self, query, max_results=10, timeout=60):
-        search_url = 'http://www.amazon.es/s/?url=search-alias%3Ddigital-text&field-keywords='
-        url = search_url + query.encode('ascii', 'backslashreplace').replace('%', '%25').replace('\\x', '%').replace(' ', '+')
-        br = browser()
-
-        counter = max_results
-        with closing(br.open(url, timeout=timeout)) as f:
-            # doc = html.fromstring(f.read().decode('latin-1', 'replace'))
-            # Apparently amazon Europe is responding in UTF-8 now
-            doc = html.fromstring(f.read())
-
-            data_xpath = '//div[contains(@class, "result") and contains(@class, "product")]'
-            format_xpath = './/span[@class="format"]/text()'
-            cover_xpath = './/img[@class="productImage"]/@src'
-
-            for data in doc.xpath(data_xpath):
-                if counter <= 0:
-                    break
-
-                # Even though we are searching digital-text only Amazon will still
-                # put in results for non Kindle books (author pages). So we need
-                # to explicitly check if the item is a Kindle book and ignore it
-                # if it isn't.
-                format = ''.join(data.xpath(format_xpath))
-                if 'kindle' not in format.lower():
-                    continue
-
-                # We must have an asin otherwise we can't easily reference the
-                # book later.
-                asin = ''.join(data.xpath("@name"))
-
-                cover_url = ''.join(data.xpath(cover_xpath))
-
-                title = ''.join(data.xpath('.//a[@class="title"]/text()'))
-                price = ''.join(data.xpath('.//div[@class="newPrice"]/span[contains(@class, "price")]/text()'))
-                author = unicode(''.join(data.xpath('.//h3[@class="title"]/span[@class="ptBrand"]/text()')))
-                if author.startswith('de '):
-                    author = author[3:]
-
-                counter -= 1
-
-                s = SearchResult()
-                s.cover_url = cover_url.strip()
-                s.title = title.strip()
-                s.author = author.strip()
-                s.price = price.strip()
-                s.detail_item = asin.strip()
-                s.formats = 'Kindle'
-                s.drm = SearchResult.DRM_UNKNOWN
-
-                yield s
+    aff_id = {'tag': 'charhale09-21'}
+    store_link = ('http://www.amazon.es/ebooks-kindle/b?_encoding=UTF8&'
+                  'node=827231031&tag=%(tag)s&ie=UTF8&linkCode=ur2&camp=3626&creative=24790')
+    store_link_details = ('http://www.amazon.es/gp/redirect.html?ie=UTF8&'
+                  'location=http://www.amazon.es/dp/%(asin)s&tag=%(tag)s'
+                  '&linkCode=ur2&camp=3626&creative=24790')
+    search_url = 'http://www.amazon.es/s/?url=search-alias%3Ddigital-text&field-keywords='

View File

@@ -6,79 +6,16 @@ __license__ = 'GPL 3'
 __copyright__ = '2011, John Schember <john@nachtimwald.com>'
 __docformat__ = 'restructuredtext en'
 
-from contextlib import closing
-
-from lxml import html
-from PyQt4.Qt import QUrl
-
-from calibre import browser
-from calibre.gui2 import open_url
-from calibre.gui2.store import StorePlugin
-from calibre.gui2.store.search_result import SearchResult
+from calibre.gui2.store.stores.amazon_uk_plugin import AmazonUKKindleStore
 
-class AmazonFRKindleStore(StorePlugin):
+class AmazonFRKindleStore(AmazonUKKindleStore):
     '''
     For comments on the implementation, please see amazon_plugin.py
     '''
-
-    def open(self, parent=None, detail_item=None, external=False):
-        aff_id = {'tag': 'charhale-21'}
-        store_link = 'http://www.amazon.fr/livres-kindle/b?ie=UTF8&node=695398031&ref_=sa_menu_kbo1&_encoding=UTF8&tag=%(tag)s&linkCode=ur2&camp=1642&creative=19458' % aff_id
-
-        if detail_item:
-            aff_id['asin'] = detail_item
-            store_link = 'http://www.amazon.fr/gp/redirect.html?ie=UTF8&location=http://www.amazon.fr/dp/%(asin)s&tag=%(tag)s&linkCode=ur2&camp=1634&creative=6738' % aff_id
-        open_url(QUrl(store_link))
-
-    def search(self, query, max_results=10, timeout=60):
-        search_url = 'http://www.amazon.fr/s/?url=search-alias%3Ddigital-text&field-keywords='
-        url = search_url + query.encode('ascii', 'backslashreplace').replace('%', '%25').replace('\\x', '%').replace(' ', '+')
-        br = browser()
-
-        counter = max_results
-        with closing(br.open(url, timeout=timeout)) as f:
-            # doc = html.fromstring(f.read().decode('latin-1', 'replace'))
-            # Apparently amazon Europe is responding in UTF-8 now
-            doc = html.fromstring(f.read())
-
-            data_xpath = '//div[contains(@class, "result") and contains(@class, "product")]'
-            format_xpath = './/span[@class="format"]/text()'
-            cover_xpath = './/img[@class="productImage"]/@src'
-
-            for data in doc.xpath(data_xpath):
-                if counter <= 0:
-                    break
-
-                # Even though we are searching digital-text only Amazon will still
-                # put in results for non Kindle books (author pages). So we need
-                # to explicitly check if the item is a Kindle book and ignore it
-                # if it isn't.
-                format = ''.join(data.xpath(format_xpath))
-                if 'kindle' not in format.lower():
-                    continue
-
-                # We must have an asin otherwise we can't easily reference the
-                # book later.
-                asin = ''.join(data.xpath("@name"))
-
-                cover_url = ''.join(data.xpath(cover_xpath))
-
-                title = ''.join(data.xpath('.//a[@class="title"]/text()'))
-                price = ''.join(data.xpath('.//div[@class="newPrice"]/span[contains(@class, "price")]/text()'))
-                author = unicode(''.join(data.xpath('.//h3[@class="title"]/span[@class="ptBrand"]/text()')))
-                if author.startswith('de '):
-                    author = author[3:]
-
-                counter -= 1
-
-                s = SearchResult()
-                s.cover_url = cover_url.strip()
-                s.title = title.strip()
-                s.author = author.strip()
-                s.price = price.strip()
-                s.detail_item = asin.strip()
-                s.formats = 'Kindle'
-                s.drm = SearchResult.DRM_UNKNOWN
-
-                yield s
+    aff_id = {'tag': 'charhale-21'}
+    store_link = 'http://www.amazon.fr/livres-kindle/b?ie=UTF8&node=695398031&ref_=sa_menu_kbo1&_encoding=UTF8&tag=%(tag)s&linkCode=ur2&camp=1642&creative=19458' % aff_id
+    store_link_details = 'http://www.amazon.fr/gp/redirect.html?ie=UTF8&location=http://www.amazon.fr/dp/%(asin)s&tag=%(tag)s&linkCode=ur2&camp=1634&creative=6738'
+    search_url = 'http://www.amazon.fr/s/?url=search-alias%3Ddigital-text&field-keywords='

View File

@@ -6,78 +6,17 @@ __license__ = 'GPL 3'
 __copyright__ = '2011, John Schember <john@nachtimwald.com>'
 __docformat__ = 'restructuredtext en'
 
-from contextlib import closing
-
-from lxml import html
-from PyQt4.Qt import QUrl
-
-from calibre import browser
-from calibre.gui2 import open_url
-from calibre.gui2.store import StorePlugin
-from calibre.gui2.store.search_result import SearchResult
+from calibre.gui2.store.stores.amazon_uk_plugin import AmazonUKKindleStore
 
-class AmazonITKindleStore(StorePlugin):
+class AmazonITKindleStore(AmazonUKKindleStore):
     '''
     For comments on the implementation, please see amazon_plugin.py
    '''
-
-    def open(self, parent=None, detail_item=None, external=False):
-        aff_id = {'tag': 'httpcharles07-21'}
-        store_link = 'http://www.amazon.it/ebooks-kindle/b?_encoding=UTF8&node=827182031&tag=%(tag)s&ie=UTF8&linkCode=ur2&camp=3370&creative=23322' % aff_id
-        if detail_item:
-            aff_id['asin'] = detail_item
-            store_link = 'http://www.amazon.it/gp/redirect.html?ie=UTF8&location=http://www.amazon.it/dp/%(asin)s&tag=%(tag)s&linkCode=ur2&camp=3370&creative=23322' % aff_id
-        open_url(QUrl(store_link))
-
-    def search(self, query, max_results=10, timeout=60):
-        search_url = 'http://www.amazon.it/s/?url=search-alias%3Ddigital-text&field-keywords='
-        url = search_url + query.encode('ascii', 'backslashreplace').replace('%', '%25').replace('\\x', '%').replace(' ', '+')
-        br = browser()
-
-        counter = max_results
-        with closing(br.open(url, timeout=timeout)) as f:
-            # doc = html.fromstring(f.read().decode('latin-1', 'replace'))
-            # Apparently amazon Europe is responding in UTF-8 now
-            doc = html.fromstring(f.read())
-
-            data_xpath = '//div[contains(@class, "result") and contains(@class, "product")]'
-            format_xpath = './/span[@class="format"]/text()'
-            cover_xpath = './/img[@class="productImage"]/@src'
-
-            for data in doc.xpath(data_xpath):
-                if counter <= 0:
-                    break
-
-                # Even though we are searching digital-text only Amazon will still
-                # put in results for non Kindle books (author pages). So we need
-                # to explicitly check if the item is a Kindle book and ignore it
-                # if it isn't.
-                format = ''.join(data.xpath(format_xpath))
-                if 'kindle' not in format.lower():
-                    continue
-
-                # We must have an asin otherwise we can't easily reference the
-                # book later.
-                asin = ''.join(data.xpath("@name"))
-
-                cover_url = ''.join(data.xpath(cover_xpath))
-
-                title = ''.join(data.xpath('.//a[@class="title"]/text()'))
-                price = ''.join(data.xpath('.//div[@class="newPrice"]/span[contains(@class, "price")]/text()'))
-                author = unicode(''.join(data.xpath('.//h3[@class="title"]/span[@class="ptBrand"]/text()')))
-                if author.startswith('di '):
-                    author = author[3:]
-
-                counter -= 1
-
-                s = SearchResult()
-                s.cover_url = cover_url.strip()
-                s.title = title.strip()
-                s.author = author.strip()
-                s.price = price.strip()
-                s.detail_item = asin.strip()
-                s.formats = 'Kindle'
-                s.drm = SearchResult.DRM_UNKNOWN
-
-                yield s
+    aff_id = {'tag': 'httpcharles07-21'}
+    store_link = ('http://www.amazon.it/ebooks-kindle/b?_encoding=UTF8&'
+                  'node=827182031&tag=%(tag)s&ie=UTF8&linkCode=ur2&camp=3370&creative=23322')
+    store_link_details = ('http://www.amazon.it/gp/redirect.html?ie=UTF8&'
+                  'location=http://www.amazon.it/dp/%(asin)s&tag=%(tag)s&'
+                  'linkCode=ur2&camp=3370&creative=23322')
+    search_url = 'http://www.amazon.it/s/?url=search-alias%3Ddigital-text&field-keywords='

View File

@ -6,8 +6,9 @@ __license__ = 'GPL 3'
__copyright__ = '2011, John Schember <john@nachtimwald.com>' __copyright__ = '2011, John Schember <john@nachtimwald.com>'
__docformat__ = 'restructuredtext en' __docformat__ = 'restructuredtext en'
from contextlib import closing import re
from contextlib import closing
from lxml import html from lxml import html
from PyQt4.Qt import QUrl from PyQt4.Qt import QUrl
@ -18,57 +19,80 @@ from calibre.gui2.store import StorePlugin
from calibre.gui2.store.search_result import SearchResult from calibre.gui2.store.search_result import SearchResult
class AmazonUKKindleStore(StorePlugin): class AmazonUKKindleStore(StorePlugin):
aff_id = {'tag': 'calcharles-21'}
store_link = ('http://www.amazon.co.uk/gp/redirect.html?ie=UTF8&'
'location=http://www.amazon.co.uk/Kindle-eBooks/b?'
'ie=UTF8&node=341689031&ref_=sa_menu_kbo2&tag=%(tag)s&'
'linkCode=ur2&camp=1634&creative=19450')
store_link_details = ('http://www.amazon.co.uk/gp/redirect.html?ie=UTF8&'
'location=http://www.amazon.co.uk/dp/%(asin)s&tag=%(tag)s&'
'linkCode=ur2&camp=1634&creative=6738')
search_url = 'http://www.amazon.co.uk/s/?url=search-alias%3Ddigital-text&field-keywords='
''' '''
For comments on the implementation, please see amazon_plugin.py For comments on the implementation, please see amazon_plugin.py
''' '''
def open(self, parent=None, detail_item=None, external=False): def open(self, parent=None, detail_item=None, external=False):
aff_id = {'tag': 'calcharles-21'}
store_link = 'http://www.amazon.co.uk/gp/redirect.html?ie=UTF8&location=http://www.amazon.co.uk/Kindle-eBooks/b?ie=UTF8&node=341689031&ref_=sa_menu_kbo2&tag=%(tag)s&linkCode=ur2&camp=1634&creative=19450' % aff_id
store_link = self.store_link % self.aff_id
if detail_item: if detail_item:
aff_id['asin'] = detail_item self.aff_id['asin'] = detail_item
-            store_link = 'http://www.amazon.co.uk/gp/redirect.html?ie=UTF8&location=http://www.amazon.co.uk/dp/%(asin)s&tag=%(tag)s&linkCode=ur2&camp=1634&creative=6738' % aff_id
+            store_link = self.store_link_details % self.aff_id
             open_url(QUrl(store_link))

     def search(self, query, max_results=10, timeout=60):
-        search_url = 'http://www.amazon.co.uk/s/?url=search-alias%3Ddigital-text&field-keywords='
-        url = search_url + query.encode('ascii', 'backslashreplace').replace('%', '%25').replace('\\x', '%').replace(' ', '+')
+        url = self.search_url + query.encode('ascii', 'backslashreplace').replace('%', '%25').replace('\\x', '%').replace(' ', '+')
         br = browser()

         counter = max_results
         with closing(br.open(url, timeout=timeout)) as f:
-            # Apparently amazon Europe is responding in UTF-8 now
-            doc = html.fromstring(f.read())
+            doc = html.fromstring(f.read())#.decode('latin-1', 'replace'))

-            data_xpath = '//div[contains(@class, "result") and contains(@class, "product")]'
-            format_xpath = './/span[@class="format"]/text()'
+            data_xpath = '//div[contains(@class, "prod")]'
+            format_xpath = './/ul[contains(@class, "rsltL")]//span[contains(@class, "lrg") and not(contains(@class, "bld"))]/text()'
+            asin_xpath = './/div[@class="image"]/a[1]'
             cover_xpath = './/img[@class="productImage"]/@src'
+            title_xpath = './/h3[@class="newaps"]/a//text()'
+            author_xpath = './/h3[@class="newaps"]//span[contains(@class, "reg")]/text()'
+            price_xpath = './/ul[contains(@class, "rsltL")]//span[contains(@class, "lrg") and contains(@class, "bld")]/text()'

             for data in doc.xpath(data_xpath):
                 if counter <= 0:
                     break

                 # Even though we are searching digital-text only Amazon will still
                 # put in results for non Kindle books (author pages). So we need
                 # to explicitly check if the item is a Kindle book and ignore it
                 # if it isn't.
-                format = ''.join(data.xpath(format_xpath))
-                if 'kindle' not in format.lower():
+                format_ = ''.join(data.xpath(format_xpath))
+                if 'kindle' not in format_.lower():
                     continue

                 # We must have an asin otherwise we can't easily reference the
                 # book later.
-                asin = ''.join(data.xpath("@name"))
+                asin_href = None
+                asin_a = data.xpath(asin_xpath)
+                if asin_a:
+                    asin_href = asin_a[0].get('href', '')
+                    m = re.search(r'/dp/(?P<asin>.+?)(/|$)', asin_href)
+                    if m:
+                        asin = m.group('asin')
+                    else:
+                        continue
+                else:
+                    continue

                 cover_url = ''.join(data.xpath(cover_xpath))
-                title = ''.join(data.xpath('.//a[@class="title"]/text()'))
-                price = ''.join(data.xpath('.//div[@class="newPrice"]/span[contains(@class, "price")]/text()'))
-                author = ''.join(data.xpath('.//h3[@class="title"]/span[@class="ptBrand"]/text()'))
-                if author.startswith('by '):
-                    author = author[3:]
+                title = ''.join(data.xpath(title_xpath))
+                author = ''.join(data.xpath(author_xpath))
+                try:
+                    author = author.split('by ', 1)[1].split(" (")[0]
+                except:
+                    pass
+                price = ''.join(data.xpath(price_xpath))

                 counter -= 1

@@ -78,37 +102,10 @@ class AmazonUKKindleStore(StorePlugin):
                 s.author = author.strip()
                 s.price = price.strip()
                 s.detail_item = asin.strip()
+                s.drm = SearchResult.DRM_UNKNOWN
                 s.formats = 'Kindle'

                 yield s
     def get_details(self, search_result, timeout):
-        # We might already have been called.
-        if search_result.drm:
-            return
-
-        url = 'http://amazon.co.uk/dp/'
-        drm_search_text = u'Simultaneous Device Usage'
-        drm_free_text = u'Unlimited'
-
-        br = browser()
-        with closing(br.open(url + search_result.detail_item, timeout=timeout)) as nf:
-            idata = html.fromstring(nf.read())
-            if not search_result.author:
-                search_result.author = ''.join(idata.xpath('//div[@class="buying" and contains(., "Author")]/a/text()'))
-            is_kindle = idata.xpath('boolean(//div[@class="buying"]/h1/span/span[contains(text(), "Kindle Edition")])')
-            if is_kindle:
-                search_result.formats = 'Kindle'
-            if idata.xpath('boolean(//div[@class="content"]//li/b[contains(text(), "' +
-                           drm_search_text + '")])'):
-                if idata.xpath('boolean(//div[@class="content"]//li[contains(., "' +
-                               drm_free_text + '") and contains(b, "' +
-                               drm_search_text + '")])'):
-                    search_result.drm = SearchResult.DRM_UNLOCKED
-                else:
-                    search_result.drm = SearchResult.DRM_UNKNOWN
-            else:
-                search_result.drm = SearchResult.DRM_LOCKED
-        return True
+        pass
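The reworked search() above pulls the ASIN out of the product link rather than a @name attribute. A minimal standalone sketch of that extraction, reusing the regex from the diff (the href value is invented):

import re

# Hypothetical product link of the kind found under .//div[@class="image"]/a[1]
asin_href = 'http://www.amazon.co.uk/Some-Title/dp/B0012345AB/ref=sr_1_1'

m = re.search(r'/dp/(?P<asin>.+?)(/|$)', asin_href)
if m is not None:
    print(m.group('asin'))  # -> B0012345AB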
View File
@@ -25,7 +25,7 @@ class LibreDEStore(BasicStoreConfig, StorePlugin):

     def open(self, parent=None, detail_item=None, external=False):
         url = 'http://ad.zanox.com/ppc/?18817073C15644254T'
         url_details = ('http://ad.zanox.com/ppc/?18817073C15644254T&ULP=[['
-                       'http://www.libri.de/shop/action/productDetails?artiId={0}]]')
+                       'http://www.ebook.de/shop/action/productDetails?artiId={0}]]')

         if external or self.config.get('open_external', False):
             if detail_item:
@@ -41,33 +41,38 @@ class LibreDEStore(BasicStoreConfig, StorePlugin):
             d.exec_()

     def search(self, query, max_results=10, timeout=60):
-        url = ('http://www.libri.de/shop/action/quickSearch?facetNodeId=6'
-               '&mainsearchSubmit=Los!&searchString=' + urllib2.quote(query))
+        url = ('http://www.ebook.de/de/pathSearch?nav=52122&searchString='
+               + urllib2.quote(query))
         br = browser()

         counter = max_results
         with closing(br.open(url, timeout=timeout)) as f:
             doc = html.fromstring(f.read())

-            for data in doc.xpath('//div[contains(@class, "item")]'):
+            for data in doc.xpath('//div[contains(@class, "articlecontainer")]'):
                 if counter <= 0:
                     break

-                details = data.xpath('./div[@class="beschreibungContainer"]')
+                details = data.xpath('./div[@class="articleinfobox"]')
                 if not details:
                     continue
                 details = details[0]
-                id = ''.join(details.xpath('./div[@class="text"]/a/@name')).strip()
-                if not id:
+                id_ = ''.join(details.xpath('./a/@name')).strip()
+                if not id_:
                     continue
-                cover_url = ''.join(details.xpath('.//div[@class="coverImg"]/a/img/@src'))
-                title = ''.join(details.xpath('./div[@class="text"]/span[@class="titel"]/a/text()')).strip()
-                author = ''.join(details.xpath('./div[@class="text"]/span[@class="author"]/text()')).strip()
+                title = ''.join(details.xpath('.//a[@class="su1_c_l_titel"]/text()')).strip()
+                author = ''.join(details.xpath('.//div[@class="author"]/text()')).strip()
+                if author.startswith('von'):
+                    author = author[4:]

                 pdf = details.xpath(
-                    'boolean(.//span[@class="format" and contains(text(), "pdf")]/text())')
+                    'boolean(.//span[@class="bindername" and contains(text(), "pdf")]/text())')
                 epub = details.xpath(
-                    'boolean(.//span[@class="format" and contains(text(), "epub")]/text())')
+                    'boolean(.//span[@class="bindername" and contains(text(), "epub")]/text())')
                 mobi = details.xpath(
-                    'boolean(.//span[@class="format" and contains(text(), "mobipocket")]/text())')
+                    'boolean(.//span[@class="bindername" and contains(text(), "mobipocket")]/text())')
+                cover_url = ''.join(data.xpath('.//div[@class="coverImg"]/a/img/@src'))
                 price = ''.join(data.xpath('.//span[@class="preis"]/text()')).replace('*', '').strip()

                 counter -= 1

@@ -78,7 +83,7 @@ class LibreDEStore(BasicStoreConfig, StorePlugin):
                     s.author = author.strip()
                     s.price = price
                     s.drm = SearchResult.DRM_UNKNOWN
-                    s.detail_item = id
+                    s.detail_item = id_

                     formats = []
                     if epub:
                         formats.append('ePub')
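The new ebook.de endpoint is simply the quoted query appended to a fixed path. A quick sketch, under Python 2 as in the plugin (the query string is invented):

import urllib2

query = 'heinrich heine'
url = ('http://www.ebook.de/de/pathSearch?nav=52122&searchString='
       + urllib2.quote(query))
print(url)  # ...pathSearch?nav=52122&searchString=heinrich%20heine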
View File
@@ -41,6 +41,13 @@ class EPUB_MOBI(CatalogPlugin):
                    help = _('Title of generated catalog used as title in metadata.\n'
                    "Default: '%default'\n"
                    "Applies to: AZW3, ePub, MOBI output formats")),
+            Option('--cross-reference-authors',
+                   default=False,
+                   dest='cross_reference_authors',
+                   action = 'store_true',
+                   help=_("Create cross-references in Authors section for books with multiple authors.\n"
+                   "Default: '%default'\n"
+                   "Applies to: AZW3, ePub, MOBI output formats")),
             Option('--debug-pipeline',
                    default=None,
                    dest='debug_pipeline',
@@ -58,7 +65,6 @@ class EPUB_MOBI(CatalogPlugin):
                    help=_("Regex describing tags to exclude as genres.\n"
                    "Default: '%default' excludes bracketed tags, e.g. '[Project Gutenberg]', and '+', the default tag for read books.\n"
                    "Applies to: AZW3, ePub, MOBI output formats")),
-
             Option('--exclusion-rules',
                    default="(('Catalogs','Tags','Catalog'),)",
                    dest='exclusion_rules',
@@ -72,7 +78,6 @@ class EPUB_MOBI(CatalogPlugin):
                    "When multiple rules are defined, all rules will be applied.\n"
                    "Default: \n" + '"' + '%default' + '"' + "\n"
                    "Applies to AZW3, ePub, MOBI output formats")),
-
             Option('--generate-authors',
                    default=False,
                    dest='generate_authors',
@@ -318,8 +323,8 @@ class EPUB_MOBI(CatalogPlugin):
             build_log.append(" opts:")
             for key in keys:
                 if key in ['catalog_title','author_clip','connected_kindle','creator',
-                           'description_clip','exclude_book_marker','exclude_genre',
-                           'exclude_tags','exclusion_rules', 'fmt',
+                           'cross_reference_authors','description_clip','exclude_book_marker',
+                           'exclude_genre','exclude_tags','exclusion_rules', 'fmt',
                            'header_note_source_field','merge_comments_rule',
                            'output_profile','prefix_rules','read_book_marker',
                            'search_text','sort_by','sort_descriptions_by_author','sync',
View File
@@ -14,11 +14,12 @@ from calibre.customize.conversion import DummyReporter
 from calibre.customize.ui import output_profiles
 from calibre.ebooks.BeautifulSoup import BeautifulSoup, BeautifulStoneSoup, Tag, NavigableString
 from calibre.ebooks.chardet import substitute_entites
+from calibre.ebooks.metadata import author_to_author_sort
 from calibre.library.catalogs import AuthorSortMismatchException, EmptyCatalogException
 from calibre.ptempfile import PersistentTemporaryDirectory
 from calibre.utils.config import config_dir
 from calibre.utils.date import format_date, is_date_undefined, now as nowf
-from calibre.utils.filenames import ascii_text
+from calibre.utils.filenames import ascii_text, shorten_components_to
 from calibre.utils.icu import capitalize, collation_order, sort_key
 from calibre.utils.magick.draw import thumbnail
 from calibre.utils.zipfile import ZipFile
@@ -109,6 +110,7 @@ class CatalogBuilder(object):
         self.stylesheet = stylesheet
         self.cache_dir = os.path.join(config_dir, 'caches', 'catalog')
         self.catalog_path = PersistentTemporaryDirectory("_epub_mobi_catalog", prefix='')
+        self.content_dir = os.path.join(self.catalog_path, "content")
         self.excluded_tags = self.get_excluded_tags()
         self.generate_for_kindle_azw3 = True if (_opts.fmt == 'azw3' and
                                                  _opts.output_profile and
@@ -127,12 +129,13 @@ class CatalogBuilder(object):
         self.books_by_title = None
         self.books_by_title_no_series_prefix = None
         self.books_to_catalog = None
-        self.content_dir = os.path.join(self.catalog_path, "content")
         self.current_step = 0.0
         self.error = []
         self.generate_recently_read = False
         self.genres = []
-        self.genre_tags_dict = None
+        self.genre_tags_dict = \
+            self.filter_db_tags(max_len = 245 - len("%s/Genre_.html" % self.content_dir)) \
+            if self.opts.generate_genres else None
         self.html_filelist_1 = []
         self.html_filelist_2 = []
         self.merge_comments_rule = dict(zip(['field','position','hr'],
@@ -505,7 +508,7 @@ class CatalogBuilder(object):
         if not os.path.isdir(images_path):
             os.makedirs(images_path)

-    def detect_author_sort_mismatches(self):
+    def detect_author_sort_mismatches(self, books_to_test):
         """ Detect author_sort mismatches.

         Sort by author, look for inconsistencies in author_sort among
@@ -513,17 +516,18 @@ class CatalogBuilder(object):
         annoyance for EPUB.

         Inputs:
-         self.books_to_catalog (list): list of books to catalog
+         books_by_author (list): list of books to test, possibly unsorted

         Output:
-         self.books_by_author (list): sorted by author
+         (none)

         Exceptions:
          AuthorSortMismatchException: author_sort mismatch detected
        """
-        self.books_by_author = sorted(list(self.books_to_catalog), key=self._kf_books_by_author_sorter_author)
-        authors = [(record['author'], record['author_sort']) for record in self.books_by_author]
+        books_by_author = sorted(list(books_to_test), key=self._kf_books_by_author_sorter_author)
+
+        authors = [(record['author'], record['author_sort']) for record in books_by_author]
         current_author = authors[0]
         for (i,author) in enumerate(authors):
             if author != current_author and i:
@@ -701,6 +705,7 @@ class CatalogBuilder(object):
     def fetch_books_by_author(self):
         """ Generate a list of books sorted by author.

+        For books with multiple authors, relist book with additional authors.
         Sort the database by author. Report author_sort inconsistencies as warning when
         building EPUB or MOBI, error when building MOBI. Collect a list of unique authors
         to self.authors.
@@ -720,25 +725,30 @@ class CatalogBuilder(object):
         self.update_progress_full_step(_("Sorting database"))

-        self.detect_author_sort_mismatches()
+        books_by_author = list(self.books_to_catalog)
+        self.detect_author_sort_mismatches(books_by_author)
+
+        if self.opts.cross_reference_authors:
+            books_by_author = self.relist_multiple_authors(books_by_author)
+
+        #books_by_author = sorted(list(books_by_author), key=self._kf_books_by_author_sorter_author)
+
+        # Sort authors using sort_key to normalize accented letters
         # Determine the longest author_sort length before sorting
-        asl = [i['author_sort'] for i in self.books_by_author]
+        asl = [i['author_sort'] for i in books_by_author]
         las = max(asl, key=len)
-        self.books_by_author = sorted(self.books_to_catalog,
+
+        books_by_author = sorted(books_by_author,
             key=lambda x: sort_key(self._kf_books_by_author_sorter_author_sort(x, len(las))))

         if self.DEBUG and self.opts.verbose:
-            tl = [i['title'] for i in self.books_by_author]
+            tl = [i['title'] for i in books_by_author]
             lt = max(tl, key=len)
             fs = '{:<6}{:<%d} {:<%d} {!s}' % (len(lt),len(las))
             print(fs.format('','Title','Author','Series'))
-            for i in self.books_by_author:
+            for i in books_by_author:
                 print(fs.format('', i['title'],i['author_sort'],i['series']))

         # Build the unique_authors set from existing data
-        authors = [(record['author'], capitalize(record['author_sort'])) for record in self.books_by_author]
+        authors = [(record['author'], capitalize(record['author_sort'])) for record in books_by_author]

         # authors[] contains a list of all book authors, with multiple entries for multiple books by author
         # authors[]: (([0]:friendly [1]:sort))
@@ -776,6 +786,7 @@ class CatalogBuilder(object):
                                       author[2])).encode('utf-8'))

         self.authors = unique_authors
+        self.books_by_author = books_by_author
         return True

     def fetch_books_by_title(self):
@@ -863,15 +874,15 @@ class CatalogBuilder(object):
             this_title['series_index'] = 0.0

         this_title['title_sort'] = self.generate_sort_title(this_title['title'])
-        if 'authors' in record:
-            # from calibre.ebooks.metadata import authors_to_string
-            # return authors_to_string(self.authors)
+        # from calibre.ebooks.metadata import authors_to_string
+        # return authors_to_string(self.authors)
+        if 'authors' in record:
             this_title['authors'] = record['authors']
+            # Synthesize author attribution from authors list
             if record['authors']:
                 this_title['author'] = " &amp; ".join(record['authors'])
             else:
-                this_title['author'] = 'Unknown'
+                this_title['author'] = _('Unknown')
+                this_title['authors'] = [this_title['author']]

         if 'author_sort' in record and record['author_sort'].strip():
             this_title['author_sort'] = record['author_sort']
@@ -1093,7 +1104,7 @@ class CatalogBuilder(object):
         self.bookmarked_books = bookmarks

-    def filter_db_tags(self):
+    def filter_db_tags(self, max_len):
         """ Remove excluded tags from data set, return normalized genre list.

         Filter all db tags, removing excluded tags supplied in opts.
@@ -1101,13 +1112,13 @@ class CatalogBuilder(object):
         tags are flattened to alphanumeric ascii_text.

         Args:
-         (none)
+         max_len: maximum length of normalized tag to fit within OS constraints

         Return:
          genre_tags_dict (dict): dict of filtered, normalized tags in data set
        """

-        def _format_tag_list(tags, indent=2, line_break=70, header='Tag list'):
+        def _format_tag_list(tags, indent=1, line_break=70, header='Tag list'):
             def _next_tag(sorted_tags):
                 for (i, tag) in enumerate(sorted_tags):
                     if i < len(tags) - 1:
@@ -1126,6 +1137,31 @@ class CatalogBuilder(object):
                     out_str = ' ' * (indent + 1)
             return ans + out_str

+        def _normalize_tag(tag, max_len):
+            """ Generate an XHTML-legal anchor string from tag.
+
+            Parse tag for non-ascii, convert to unicode name.
+
+            Args:
+             tag (str): tag name, possibly containing symbols
+             max_len (int): maximum length of tag
+
+            Return:
+             normalized (str): unicode names substituted for non-ascii chars,
+              clipped to max_len
+            """
+            normalized = massaged = re.sub('\s','',ascii_text(tag).lower())
+            if re.search('\W',normalized):
+                normalized = ''
+                for c in massaged:
+                    if re.search('\W',c):
+                        normalized += self.generate_unicode_name(c)
+                    else:
+                        normalized += c
+            shortened = shorten_components_to(max_len, [normalized])[0]
+            return shortened
+
         # Entry point
         normalized_tags = []
         friendly_tags = []
@@ -1144,7 +1180,7 @@ class CatalogBuilder(object):
             if tag == ' ':
                 continue

-            normalized_tags.append(self.normalize_tag(tag))
+            normalized_tags.append(_normalize_tag(tag, max_len))
             friendly_tags.append(tag)

         genre_tags_dict = dict(zip(friendly_tags,normalized_tags))
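A rough illustration of what _normalize_tag() produces. ascii_text and generate_unicode_name are stand-ins here (assumed behavior: strip accents, and map a symbol to a u-prefixed codepoint token); only the shape of the output matters:

import re
import unicodedata

def ascii_text(s):
    # Stand-in for calibre.utils.filenames.ascii_text: drop accents.
    return unicodedata.normalize('NFKD', s).encode('ascii', 'ignore').decode('ascii')

def generate_unicode_name(c):
    # Stand-in for the builder's helper: one legal ASCII token per symbol.
    return 'u%04x' % ord(c)

def normalize_tag(tag, max_len):
    normalized = massaged = re.sub(r'\s', '', ascii_text(tag).lower())
    if re.search(r'\W', normalized):
        normalized = ''.join(generate_unicode_name(c) if re.search(r'\W', c)
                             else c for c in massaged)
    return normalized[:max_len]

print(normalize_tag('Science Fiction', 245))   # sciencefiction
print(normalize_tag('Sci-Fi & Fantasy', 245))  # sciu002dfiu0026fantasy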
@@ -1941,8 +1977,6 @@ class CatalogBuilder(object):
         self.update_progress_full_step(_("Genres HTML"))

-        self.genre_tags_dict = self.filter_db_tags()
-
         # Extract books matching filtered_tags
         genre_list = []
         for friendly_tag in sorted(self.genre_tags_dict, key=sort_key):
@@ -2024,10 +2058,11 @@ class CatalogBuilder(object):
                         books_by_current_author += 1

             # Write the genre book list as an article
-            titles_spanned = self.generate_html_by_genre(genre, True if index==0 else False,
-                                        genre_tag_set[genre],
-                                        "%s/Genre_%s.html" % (self.content_dir,
-                                        genre))
+            outfile = "%s/Genre_%s.html" % (self.content_dir, genre)
+            titles_spanned = self.generate_html_by_genre(genre,
+                                        True if index==0 else False,
+                                        genre_tag_set[genre],
+                                        outfile)

             tag_file = "content/Genre_%s.html" % genre
             master_genre_list.append({'tag':genre,
@@ -2549,7 +2584,7 @@ class CatalogBuilder(object):
             for (i, tag) in enumerate(sorted(book.get('tags', []))):
                 aTag = Tag(_soup,'a')
                 if self.opts.generate_genres:
-                    aTag['href'] = "Genre_%s.html" % self.normalize_tag(tag)
+                    aTag['href'] = "Genre_%s.html" % self.genre_tags_dict[tag]
                 aTag.insert(0,escape(NavigableString(tag)))
                 genresTag.insert(gtc, aTag)
                 gtc += 1
@@ -4603,28 +4638,6 @@ class CatalogBuilder(object):
         return merged

-    def normalize_tag(self, tag):
-        """ Generate an XHTML-legal anchor string from tag.
-
-        Parse tag for non-ascii, convert to unicode name.
-
-        Args:
-         tags (str): tag name possible containing symbols
-
-        Return:
-         normalized (str): unicode names substituted for non-ascii chars
-        """
-
-        normalized = massaged = re.sub('\s','',ascii_text(tag).lower())
-        if re.search('\W',normalized):
-            normalized = ''
-            for c in massaged:
-                if re.search('\W',c):
-                    normalized += self.generate_unicode_name(c)
-                else:
-                    normalized += c
-        return normalized
-
     def process_exclusions(self, data_set):
         """ Filter data_set based on exclusion_rules.
@@ -4697,6 +4710,43 @@ class CatalogBuilder(object):
         else:
             return data_set

+    def relist_multiple_authors(self, books_by_author):
+        """ Create multiple entries for books with multiple authors
+
+        Given a list of books by author, scan list for books with multiple
+        authors. Add a cloned copy of the book per additional author.
+
+        Args:
+         books_by_author (list): book list possibly containing books
+         with multiple authors
+
+        Return:
+         (list): books_by_author with additional cloned entries for books with
+         multiple authors
+        """
+
+        multiple_author_books = []
+
+        # Find the multiple author books
+        for book in books_by_author:
+            if len(book['authors']) > 1:
+                multiple_author_books.append(book)
+
+        for book in multiple_author_books:
+            cloned_authors = list(book['authors'])
+            for x, author in enumerate(book['authors']):
+                if x:
+                    first_author = cloned_authors.pop(0)
+                    cloned_authors.append(first_author)
+                    new_book = deepcopy(book)
+                    new_book['author'] = ' & '.join(cloned_authors)
+                    new_book['authors'] = list(cloned_authors)
+                    asl = [author_to_author_sort(auth) for auth in cloned_authors]
+                    new_book['author_sort'] = ' & '.join(asl)
+                    books_by_author.append(new_book)
+
+        return books_by_author
+
     def update_progress_full_step(self, description):
         """ Update calibre's job status UI.
View File
@@ -0,0 +1,153 @@
#!/usr/bin/env python
# vim:fileencoding=UTF-8:ts=4:sw=4:sta:et:sts=4:fdm=marker:ai
from __future__ import (unicode_literals, division, absolute_import,
print_function)
__license__ = 'GPL v3'
__copyright__ = '2012, Kovid Goyal <kovid at kovidgoyal.net>'
__docformat__ = 'restructuredtext en'
from struct import unpack_from, unpack
from calibre.utils.fonts.sfnt import UnknownTable
from calibre.utils.fonts.sfnt.errors import UnsupportedFont
# Useful links
# http://www.adobe.com/content/dam/Adobe/en/devnet/font/pdfs/5176.CFF.pdf
# http://www.adobe.com/content/dam/Adobe/en/devnet/font/pdfs/5177.Type2.pdf
class CFF(object):
def __init__(self, raw):
(self.major_version, self.minor_version, self.header_size,
self.offset_size) = unpack_from(b'>4B', raw)
if (self.major_version, self.minor_version) != (1, 0):
raise UnsupportedFont('The CFF table has unknown version: '
'(%d, %d)'%(self.major_version, self.minor_version))
offset = self.header_size
# Read Names Index
self.font_names = Index(raw, offset)
offset = self.font_names.pos
if len(self.font_names) > 1:
raise UnsupportedFont('CFF table has more than one font.')
# Read Top Dict
self.top_index = Index(raw, offset)
offset = self.top_index.pos
# Read strings
self.strings = Strings(raw, offset)
offset = self.strings.pos
print (self.strings[len(cff_standard_strings):])
class Index(list):
def __init__(self, raw, offset):
list.__init__(self)
count = unpack_from(b'>H', raw, offset)[0]
offset += 2
self.pos = offset
if count > 0:
self.offset_size = unpack_from(b'>B', raw, offset)[0]
offset += 1
if self.offset_size == 3:
offsets = [unpack(b'>L', b'\0' + raw[i:i+3])[0]
for i in xrange(offset, 3*(count+2), 3)]
else:
fmt = {1:'B', 2:'H', 4:'L'}.get(self.offset_size)
fmt = ('>%d%s'%(count+1, fmt)).encode('ascii')
offsets = unpack_from(fmt, raw, offset)
offset += self.offset_size * (count+1) - 1
for i in xrange(len(offsets)-1):
off, noff = offsets[i:i+2]
obj = raw[offset+off:offset+noff]
self.append(obj)
self.pos = offset + offsets[-1]
class Strings(Index):
def __init__(self, raw, offset):
super(Strings, self).__init__(raw, offset)
for x in reversed(cff_standard_strings):
self.insert(0, x)
class CFFTable(UnknownTable):
def decompile(self):
self.cff = CFF(self.raw)
# cff_standard_strings {{{
# The 391 Standard Strings as used in the CFF format.
# from Adobe Technical Note #5176, version 1.0, 18 March 1998
cff_standard_strings = [
'.notdef', 'space', 'exclam', 'quotedbl', 'numbersign', 'dollar', 'percent',
'ampersand', 'quoteright', 'parenleft', 'parenright', 'asterisk', 'plus',
'comma', 'hyphen', 'period', 'slash', 'zero', 'one', 'two', 'three', 'four',
'five', 'six', 'seven', 'eight', 'nine', 'colon', 'semicolon', 'less', 'equal',
'greater', 'question', 'at', 'A', 'B', 'C', 'D', 'E', 'F', 'G', 'H', 'I', 'J',
'K', 'L', 'M', 'N', 'O', 'P', 'Q', 'R', 'S', 'T', 'U', 'V', 'W', 'X', 'Y', 'Z',
'bracketleft', 'backslash', 'bracketright', 'asciicircum', 'underscore',
'quoteleft', 'a', 'b', 'c', 'd', 'e', 'f', 'g', 'h', 'i', 'j', 'k', 'l', 'm',
'n', 'o', 'p', 'q', 'r', 's', 't', 'u', 'v', 'w', 'x', 'y', 'z', 'braceleft',
'bar', 'braceright', 'asciitilde', 'exclamdown', 'cent', 'sterling',
'fraction', 'yen', 'florin', 'section', 'currency', 'quotesingle',
'quotedblleft', 'guillemotleft', 'guilsinglleft', 'guilsinglright', 'fi', 'fl',
'endash', 'dagger', 'daggerdbl', 'periodcentered', 'paragraph', 'bullet',
'quotesinglbase', 'quotedblbase', 'quotedblright', 'guillemotright',
'ellipsis', 'perthousand', 'questiondown', 'grave', 'acute', 'circumflex',
'tilde', 'macron', 'breve', 'dotaccent', 'dieresis', 'ring', 'cedilla',
'hungarumlaut', 'ogonek', 'caron', 'emdash', 'AE', 'ordfeminine', 'Lslash',
'Oslash', 'OE', 'ordmasculine', 'ae', 'dotlessi', 'lslash', 'oslash', 'oe',
'germandbls', 'onesuperior', 'logicalnot', 'mu', 'trademark', 'Eth', 'onehalf',
'plusminus', 'Thorn', 'onequarter', 'divide', 'brokenbar', 'degree', 'thorn',
'threequarters', 'twosuperior', 'registered', 'minus', 'eth', 'multiply',
'threesuperior', 'copyright', 'Aacute', 'Acircumflex', 'Adieresis', 'Agrave',
'Aring', 'Atilde', 'Ccedilla', 'Eacute', 'Ecircumflex', 'Edieresis', 'Egrave',
'Iacute', 'Icircumflex', 'Idieresis', 'Igrave', 'Ntilde', 'Oacute',
'Ocircumflex', 'Odieresis', 'Ograve', 'Otilde', 'Scaron', 'Uacute',
'Ucircumflex', 'Udieresis', 'Ugrave', 'Yacute', 'Ydieresis', 'Zcaron',
'aacute', 'acircumflex', 'adieresis', 'agrave', 'aring', 'atilde', 'ccedilla',
'eacute', 'ecircumflex', 'edieresis', 'egrave', 'iacute', 'icircumflex',
'idieresis', 'igrave', 'ntilde', 'oacute', 'ocircumflex', 'odieresis',
'ograve', 'otilde', 'scaron', 'uacute', 'ucircumflex', 'udieresis', 'ugrave',
'yacute', 'ydieresis', 'zcaron', 'exclamsmall', 'Hungarumlautsmall',
'dollaroldstyle', 'dollarsuperior', 'ampersandsmall', 'Acutesmall',
'parenleftsuperior', 'parenrightsuperior', 'twodotenleader', 'onedotenleader',
'zerooldstyle', 'oneoldstyle', 'twooldstyle', 'threeoldstyle', 'fouroldstyle',
'fiveoldstyle', 'sixoldstyle', 'sevenoldstyle', 'eightoldstyle',
'nineoldstyle', 'commasuperior', 'threequartersemdash', 'periodsuperior',
'questionsmall', 'asuperior', 'bsuperior', 'centsuperior', 'dsuperior',
'esuperior', 'isuperior', 'lsuperior', 'msuperior', 'nsuperior', 'osuperior',
'rsuperior', 'ssuperior', 'tsuperior', 'ff', 'ffi', 'ffl', 'parenleftinferior',
'parenrightinferior', 'Circumflexsmall', 'hyphensuperior', 'Gravesmall',
'Asmall', 'Bsmall', 'Csmall', 'Dsmall', 'Esmall', 'Fsmall', 'Gsmall', 'Hsmall',
'Ismall', 'Jsmall', 'Ksmall', 'Lsmall', 'Msmall', 'Nsmall', 'Osmall', 'Psmall',
'Qsmall', 'Rsmall', 'Ssmall', 'Tsmall', 'Usmall', 'Vsmall', 'Wsmall', 'Xsmall',
'Ysmall', 'Zsmall', 'colonmonetary', 'onefitted', 'rupiah', 'Tildesmall',
'exclamdownsmall', 'centoldstyle', 'Lslashsmall', 'Scaronsmall', 'Zcaronsmall',
'Dieresissmall', 'Brevesmall', 'Caronsmall', 'Dotaccentsmall', 'Macronsmall',
'figuredash', 'hypheninferior', 'Ogoneksmall', 'Ringsmall', 'Cedillasmall',
'questiondownsmall', 'oneeighth', 'threeeighths', 'fiveeighths',
'seveneighths', 'onethird', 'twothirds', 'zerosuperior', 'foursuperior',
'fivesuperior', 'sixsuperior', 'sevensuperior', 'eightsuperior',
'ninesuperior', 'zeroinferior', 'oneinferior', 'twoinferior', 'threeinferior',
'fourinferior', 'fiveinferior', 'sixinferior', 'seveninferior',
'eightinferior', 'nineinferior', 'centinferior', 'dollarinferior',
'periodinferior', 'commainferior', 'Agravesmall', 'Aacutesmall',
'Acircumflexsmall', 'Atildesmall', 'Adieresissmall', 'Aringsmall', 'AEsmall',
'Ccedillasmall', 'Egravesmall', 'Eacutesmall', 'Ecircumflexsmall',
'Edieresissmall', 'Igravesmall', 'Iacutesmall', 'Icircumflexsmall',
'Idieresissmall', 'Ethsmall', 'Ntildesmall', 'Ogravesmall', 'Oacutesmall',
'Ocircumflexsmall', 'Otildesmall', 'Odieresissmall', 'OEsmall', 'Oslashsmall',
'Ugravesmall', 'Uacutesmall', 'Ucircumflexsmall', 'Udieresissmall',
'Yacutesmall', 'Thornsmall', 'Ydieresissmall', '001.000', '001.001', '001.002',
'001.003', 'Black', 'Bold', 'Book', 'Light', 'Medium', 'Regular', 'Roman',
'Semibold'
]
# }}}
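For readers following Index.__init__ above: a CFF INDEX is count (2 bytes), offSize (1 byte), count+1 one-based offsets, then the object data. A hand-built example (data invented) that the parser decodes to [b'foo', b'quux']:

from struct import pack

raw = (pack(b'>H', 2)           # count = 2 objects
       + pack(b'>B', 1)         # offSize = 1 byte per offset
       + pack(b'>3B', 1, 4, 8)  # count+1 offsets, 1-based into the data
       + b'fooquux')            # the object data itself

# The data begins at byte 6, so object 0 spans raw[6:9] == b'foo' and
# object 1 spans raw[9:13] == b'quux'; self.pos ends at len(raw).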
View File
@@ -0,0 +1,11 @@
#!/usr/bin/env python
# vim:fileencoding=UTF-8:ts=4:sw=4:sta:et:sts=4:fdm=marker:ai
from __future__ import (unicode_literals, division, absolute_import,
print_function)
__license__ = 'GPL v3'
__copyright__ = '2012, Kovid Goyal <kovid at kovidgoyal.net>'
__docformat__ = 'restructuredtext en'
View File
@@ -0,0 +1,201 @@
#!/usr/bin/env python
# vim:fileencoding=UTF-8:ts=4:sw=4:sta:et:sts=4:fdm=marker:ai
from __future__ import (unicode_literals, division, absolute_import,
print_function)
__license__ = 'GPL v3'
__copyright__ = '2012, Kovid Goyal <kovid at kovidgoyal.net>'
__docformat__ = 'restructuredtext en'
from struct import unpack
t1_operand_encoding = [None] * 256
t1_operand_encoding[0:32] = (32) * ["do_operator"]
t1_operand_encoding[32:247] = (247 - 32) * ["read_byte"]
t1_operand_encoding[247:251] = (251 - 247) * ["read_small_int1"]
t1_operand_encoding[251:255] = (255 - 251) * ["read_small_int2"]
t1_operand_encoding[255] = "read_long_int"
t2_operand_encoding = t1_operand_encoding[:]
t2_operand_encoding[28] = "read_short_int"
t2_operand_encoding[255] = "read_fixed_1616"
cff_dict_operand_encoding = t2_operand_encoding[:]
cff_dict_operand_encoding[29] = "read_long_int"
cff_dict_operand_encoding[30] = "read_real_number"
cff_dict_operand_encoding[255] = "reserved"
real_nibbles = ['0', '1', '2', '3', '4', '5', '6', '7', '8', '9',
'.', 'E', 'E-', None, '-']
class SimpleConverter(object):
def read(self, parent, value):
return value
def write(self, parent, value):
return value
class TODO(SimpleConverter):
pass
class Reader(dict):
def read_byte(self, b0, data, index):
return b0 - 139, index
def read_small_int1(self, b0, data, index):
b1 = ord(data[index])
return (b0-247)*256 + b1 + 108, index+1
def read_small_int2(self, b0, data, index):
b1 = ord(data[index])
return -(b0-251)*256 - b1 - 108, index+1
def read_short_int(self, b0, data, index):
bin = data[index] + data[index+1]
value, = unpack(b">h", bin)
return value, index+2
def read_long_int(self, b0, data, index):
bin = data[index] + data[index+1] + data[index+2] + data[index+3]
value, = unpack(b">l", bin)
return value, index+4
def read_fixed_1616(self, b0, data, index):
bin = data[index] + data[index+1] + data[index+2] + data[index+3]
value, = unpack(b">l", bin)
return value / 65536.0, index+4
def read_real_number(self, b0, data, index):
number = ''
while True:
b = ord(data[index])
index = index + 1
nibble0 = (b & 0xf0) >> 4
nibble1 = b & 0x0f
if nibble0 == 0xf:
break
number = number + real_nibbles[nibble0]
if nibble1 == 0xf:
break
number = number + real_nibbles[nibble1]
return float(number), index
class Dict(Reader):
operand_encoding = cff_dict_operand_encoding
TABLE = []
def __init__(self):
Reader.__init__(self)
table = self.TABLE[:]
for i in xrange(len(table)):
op, name, arg, default, conv = table[i]
if conv is not None:
continue
if arg in ("delta", "array", 'number', 'SID'):
conv = SimpleConverter()
else:
raise Exception('Should not happen')
table[i] = op, name, arg, default, conv
self.operators = {op:(name, arg) for op, name, arg, default, conv in
table}
def decompile(self, strings, global_subrs, data):
self.strings = strings
self.global_subrs = global_subrs
self.stack = []
index = 0
while index < len(data):
b0 = ord(data[index])
index += 1
handler = getattr(self, self.operand_encoding[b0])
value, index = handler(b0, data, index)
if value is not None:
self.stack.append(value)
def do_operator(self, b0, data, index):
if b0 == 12:
op = (b0, ord(data[index]))
index += 1
else:
op = b0
operator, arg_type = self.operators[op]
self.handle_operator(operator, arg_type)
return None, index
def handle_operator(self, operator, arg_type):
if isinstance(arg_type, tuple):
value = ()
for i in xrange(len(arg_type)-1, -1, -1):
arg = arg_type[i]
arghandler = getattr(self, 'arg_' + arg)
value = (arghandler(operator),) + value
else:
arghandler = getattr(self, 'arg_' + arg_type)
value = arghandler(operator)
self[operator] = value
def arg_number(self, name):
return self.stack.pop()
def arg_SID(self, name):
return self.strings[self.stack.pop()]
def arg_array(self, name):
ans = self.stack[:]
del self.stack[:]
return ans
def arg_delta(self, name):
out = []
current = 0
for v in self.stack:
current = current + v
out.append(current)
del self.stack[:]
return out
class TopDict(Dict):
TABLE = [
#opcode name argument type default converter
((12, 30), 'ROS', ('SID','SID','number'), None, SimpleConverter()),
((12, 20), 'SyntheticBase', 'number', None, None),
(0, 'version', 'SID', None, None),
(1, 'Notice', 'SID', None, None),
((12, 0), 'Copyright', 'SID', None, None),
(2, 'FullName', 'SID', None, None),
((12, 38), 'FontName', 'SID', None, None),
(3, 'FamilyName', 'SID', None, None),
(4, 'Weight', 'SID', None, None),
((12, 1), 'isFixedPitch', 'number', 0, None),
((12, 2), 'ItalicAngle', 'number', 0, None),
((12, 3), 'UnderlinePosition', 'number', None, None),
((12, 4), 'UnderlineThickness', 'number', 50, None),
((12, 5), 'PaintType', 'number', 0, None),
((12, 6), 'CharstringType', 'number', 2, None),
((12, 7), 'FontMatrix', 'array', [0.001,0,0,0.001,0,0], None),
(13, 'UniqueID', 'number', None, None),
(5, 'FontBBox', 'array', [0,0,0,0], None),
((12, 8), 'StrokeWidth', 'number', 0, None),
(14, 'XUID', 'array', None, None),
((12, 21), 'PostScript', 'SID', None, None),
((12, 22), 'BaseFontName', 'SID', None, None),
((12, 23), 'BaseFontBlend', 'delta', None, None),
((12, 31), 'CIDFontVersion', 'number', 0, None),
((12, 32), 'CIDFontRevision', 'number', 0, None),
((12, 33), 'CIDFontType', 'number', 0, None),
((12, 34), 'CIDCount', 'number', 8720, None),
(15, 'charset', 'number', 0, TODO()),
((12, 35), 'UIDBase', 'number', None, None),
(16, 'Encoding', 'number', 0, TODO()),
(18, 'Private', ('number','number'), None, TODO()),
((12, 37), 'FDSelect', 'number', None, TODO()),
((12, 36), 'FDArray', 'number', None, TODO()),
(17, 'CharStrings', 'number', None, TODO()),
]
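To make the operand encodings concrete: a byte in 32..246 encodes b0 - 139 directly, and 247..250 takes one following byte. A tiny sketch of those two rules from the Reader above:

def read_byte(b0):
    # 32 <= b0 <= 246: the operand is b0 - 139
    return b0 - 139

def read_small_int1(b0, b1):
    # 247 <= b0 <= 250: positive two-byte operand
    return (b0 - 247) * 256 + b1 + 108

assert read_byte(0x8b) == 0           # 139 -> 0
assert read_small_int1(0xf7, 0) == 108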
View File
@@ -0,0 +1,166 @@
#!/usr/bin/env python
# vim:fileencoding=UTF-8:ts=4:sw=4:sta:et:sts=4:fdm=marker:ai
from __future__ import (unicode_literals, division, absolute_import,
print_function)
__license__ = 'GPL v3'
__copyright__ = '2012, Kovid Goyal <kovid at kovidgoyal.net>'
__docformat__ = 'restructuredtext en'
from struct import unpack_from, unpack
from calibre.utils.fonts.sfnt import UnknownTable
from calibre.utils.fonts.sfnt.errors import UnsupportedFont
from calibre.utils.fonts.sfnt.cff.dict_data import TopDict
# Useful links
# http://www.adobe.com/content/dam/Adobe/en/devnet/font/pdfs/5176.CFF.pdf
# http://www.adobe.com/content/dam/Adobe/en/devnet/font/pdfs/5177.Type2.pdf
class CFF(object):
def __init__(self, raw):
(self.major_version, self.minor_version, self.header_size,
self.offset_size) = unpack_from(b'>4B', raw)
if (self.major_version, self.minor_version) != (1, 0):
raise UnsupportedFont('The CFF table has unknown version: '
'(%d, %d)'%(self.major_version, self.minor_version))
offset = self.header_size
# Read Names Index
self.font_names = Index(raw, offset)
offset = self.font_names.pos
if len(self.font_names) > 1:
raise UnsupportedFont('CFF table has more than one font.')
# Read Top Dict
self.top_index = Index(raw, offset)
self.top_dict = TopDict()
offset = self.top_index.pos
# Read strings
self.strings = Strings(raw, offset)
offset = self.strings.pos
# Read global subroutines
self.global_subrs = GlobalSubrs(raw, offset)
offset = self.global_subrs.pos
# Decompile Top Dict
self.top_dict.decompile(self.strings, self.global_subrs, self.top_index[0])
import pprint
pprint.pprint(self.top_dict)
class Index(list):
def __init__(self, raw, offset, prepend=()):
list.__init__(self)
self.extend(prepend)
count = unpack_from(b'>H', raw, offset)[0]
offset += 2
self.pos = offset
if count > 0:
self.offset_size = unpack_from(b'>B', raw, offset)[0]
offset += 1
if self.offset_size == 3:
offsets = [unpack(b'>L', b'\0' + raw[i:i+3])[0]
for i in xrange(offset, 3*(count+2), 3)]
else:
fmt = {1:'B', 2:'H', 4:'L'}.get(self.offset_size)
fmt = ('>%d%s'%(count+1, fmt)).encode('ascii')
offsets = unpack_from(fmt, raw, offset)
offset += self.offset_size * (count+1) - 1
for i in xrange(len(offsets)-1):
off, noff = offsets[i:i+2]
obj = raw[offset+off:offset+noff]
self.append(obj)
self.pos = offset + offsets[-1]
class Strings(Index):
def __init__(self, raw, offset):
super(Strings, self).__init__(raw, offset, prepend=cff_standard_strings)
class GlobalSubrs(Index):
pass
class CFFTable(UnknownTable):
def decompile(self):
self.cff = CFF(self.raw)
# cff_standard_strings {{{
# The 391 Standard Strings as used in the CFF format.
# from Adobe Technical Note #5176, version 1.0, 18 March 1998
cff_standard_strings = [
'.notdef', 'space', 'exclam', 'quotedbl', 'numbersign', 'dollar', 'percent',
'ampersand', 'quoteright', 'parenleft', 'parenright', 'asterisk', 'plus',
'comma', 'hyphen', 'period', 'slash', 'zero', 'one', 'two', 'three', 'four',
'five', 'six', 'seven', 'eight', 'nine', 'colon', 'semicolon', 'less', 'equal',
'greater', 'question', 'at', 'A', 'B', 'C', 'D', 'E', 'F', 'G', 'H', 'I', 'J',
'K', 'L', 'M', 'N', 'O', 'P', 'Q', 'R', 'S', 'T', 'U', 'V', 'W', 'X', 'Y', 'Z',
'bracketleft', 'backslash', 'bracketright', 'asciicircum', 'underscore',
'quoteleft', 'a', 'b', 'c', 'd', 'e', 'f', 'g', 'h', 'i', 'j', 'k', 'l', 'm',
'n', 'o', 'p', 'q', 'r', 's', 't', 'u', 'v', 'w', 'x', 'y', 'z', 'braceleft',
'bar', 'braceright', 'asciitilde', 'exclamdown', 'cent', 'sterling',
'fraction', 'yen', 'florin', 'section', 'currency', 'quotesingle',
'quotedblleft', 'guillemotleft', 'guilsinglleft', 'guilsinglright', 'fi', 'fl',
'endash', 'dagger', 'daggerdbl', 'periodcentered', 'paragraph', 'bullet',
'quotesinglbase', 'quotedblbase', 'quotedblright', 'guillemotright',
'ellipsis', 'perthousand', 'questiondown', 'grave', 'acute', 'circumflex',
'tilde', 'macron', 'breve', 'dotaccent', 'dieresis', 'ring', 'cedilla',
'hungarumlaut', 'ogonek', 'caron', 'emdash', 'AE', 'ordfeminine', 'Lslash',
'Oslash', 'OE', 'ordmasculine', 'ae', 'dotlessi', 'lslash', 'oslash', 'oe',
'germandbls', 'onesuperior', 'logicalnot', 'mu', 'trademark', 'Eth', 'onehalf',
'plusminus', 'Thorn', 'onequarter', 'divide', 'brokenbar', 'degree', 'thorn',
'threequarters', 'twosuperior', 'registered', 'minus', 'eth', 'multiply',
'threesuperior', 'copyright', 'Aacute', 'Acircumflex', 'Adieresis', 'Agrave',
'Aring', 'Atilde', 'Ccedilla', 'Eacute', 'Ecircumflex', 'Edieresis', 'Egrave',
'Iacute', 'Icircumflex', 'Idieresis', 'Igrave', 'Ntilde', 'Oacute',
'Ocircumflex', 'Odieresis', 'Ograve', 'Otilde', 'Scaron', 'Uacute',
'Ucircumflex', 'Udieresis', 'Ugrave', 'Yacute', 'Ydieresis', 'Zcaron',
'aacute', 'acircumflex', 'adieresis', 'agrave', 'aring', 'atilde', 'ccedilla',
'eacute', 'ecircumflex', 'edieresis', 'egrave', 'iacute', 'icircumflex',
'idieresis', 'igrave', 'ntilde', 'oacute', 'ocircumflex', 'odieresis',
'ograve', 'otilde', 'scaron', 'uacute', 'ucircumflex', 'udieresis', 'ugrave',
'yacute', 'ydieresis', 'zcaron', 'exclamsmall', 'Hungarumlautsmall',
'dollaroldstyle', 'dollarsuperior', 'ampersandsmall', 'Acutesmall',
'parenleftsuperior', 'parenrightsuperior', 'twodotenleader', 'onedotenleader',
'zerooldstyle', 'oneoldstyle', 'twooldstyle', 'threeoldstyle', 'fouroldstyle',
'fiveoldstyle', 'sixoldstyle', 'sevenoldstyle', 'eightoldstyle',
'nineoldstyle', 'commasuperior', 'threequartersemdash', 'periodsuperior',
'questionsmall', 'asuperior', 'bsuperior', 'centsuperior', 'dsuperior',
'esuperior', 'isuperior', 'lsuperior', 'msuperior', 'nsuperior', 'osuperior',
'rsuperior', 'ssuperior', 'tsuperior', 'ff', 'ffi', 'ffl', 'parenleftinferior',
'parenrightinferior', 'Circumflexsmall', 'hyphensuperior', 'Gravesmall',
'Asmall', 'Bsmall', 'Csmall', 'Dsmall', 'Esmall', 'Fsmall', 'Gsmall', 'Hsmall',
'Ismall', 'Jsmall', 'Ksmall', 'Lsmall', 'Msmall', 'Nsmall', 'Osmall', 'Psmall',
'Qsmall', 'Rsmall', 'Ssmall', 'Tsmall', 'Usmall', 'Vsmall', 'Wsmall', 'Xsmall',
'Ysmall', 'Zsmall', 'colonmonetary', 'onefitted', 'rupiah', 'Tildesmall',
'exclamdownsmall', 'centoldstyle', 'Lslashsmall', 'Scaronsmall', 'Zcaronsmall',
'Dieresissmall', 'Brevesmall', 'Caronsmall', 'Dotaccentsmall', 'Macronsmall',
'figuredash', 'hypheninferior', 'Ogoneksmall', 'Ringsmall', 'Cedillasmall',
'questiondownsmall', 'oneeighth', 'threeeighths', 'fiveeighths',
'seveneighths', 'onethird', 'twothirds', 'zerosuperior', 'foursuperior',
'fivesuperior', 'sixsuperior', 'sevensuperior', 'eightsuperior',
'ninesuperior', 'zeroinferior', 'oneinferior', 'twoinferior', 'threeinferior',
'fourinferior', 'fiveinferior', 'sixinferior', 'seveninferior',
'eightinferior', 'nineinferior', 'centinferior', 'dollarinferior',
'periodinferior', 'commainferior', 'Agravesmall', 'Aacutesmall',
'Acircumflexsmall', 'Atildesmall', 'Adieresissmall', 'Aringsmall', 'AEsmall',
'Ccedillasmall', 'Egravesmall', 'Eacutesmall', 'Ecircumflexsmall',
'Edieresissmall', 'Igravesmall', 'Iacutesmall', 'Icircumflexsmall',
'Idieresissmall', 'Ethsmall', 'Ntildesmall', 'Ogravesmall', 'Oacutesmall',
'Ocircumflexsmall', 'Otildesmall', 'Odieresissmall', 'OEsmall', 'Oslashsmall',
'Ugravesmall', 'Uacutesmall', 'Ucircumflexsmall', 'Udieresissmall',
'Yacutesmall', 'Thornsmall', 'Ydieresissmall', '001.000', '001.001', '001.002',
'001.003', 'Black', 'Bold', 'Book', 'Light', 'Medium', 'Regular', 'Roman',
'Semibold'
]
# }}}
View File
@@ -21,6 +21,7 @@ from calibre.utils.fonts.sfnt.maxp import MaxpTable
 from calibre.utils.fonts.sfnt.loca import LocaTable
 from calibre.utils.fonts.sfnt.glyf import GlyfTable
 from calibre.utils.fonts.sfnt.cmap import CmapTable
+from calibre.utils.fonts.sfnt.cff.table import CFFTable

 # OpenType spec: http://www.microsoft.com/typography/otspec/otff.htm
@@ -42,6 +43,7 @@ class Sfnt(object):
                 b'loca' : LocaTable,
                 b'glyf' : GlyfTable,
                 b'cmap' : CmapTable,
+                b'CFF ' : CFFTable,
             }.get(table_tag, UnknownTable)(table)

     def __getitem__(self, key):
@@ -53,12 +55,24 @@ class Sfnt(object):
     def __delitem__(self, key):
         del self.tables[key]

+    def __iter__(self):
+        '''Iterate over the table tags in optimal order as per
+        http://partners.adobe.com/public/developer/opentype/index_recs.html'''
+        keys = list(self.tables.keys())
+        order = {x:i for i, x in enumerate((b'head', b'hhea', b'maxp', b'OS/2',
+            b'hmtx', b'LTSH', b'VDMX', b'hdmx', b'cmap', b'fpgm', b'prep',
+            b'cvt ', b'loca', b'glyf', b'CFF ', b'kern', b'name', b'post',
+            b'gasp', b'PCLT', b'DSIG'))}
+        keys.sort(key=lambda x:order.get(x, 1000))
+        for x in keys:
+            yield x
+
     def pop(self, key, default=None):
         return self.tables.pop(key, default)

     def sizes(self):
         ans = OrderedDict()
-        for tag in sorted(self.tables):
+        for tag in self:
             ans[tag] = len(self[tag])
         return ans

@@ -82,7 +96,7 @@ class Sfnt(object):
         table_data = []
         offset = stream.tell() + ( calcsize(b'>4s3L') * num_tables )
         sizes = OrderedDict()
-        for tag in sorted(self.tables):
+        for tag in self:
             table = self.tables[tag]
             raw = table()
             table_len = len(raw)
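The effect of the new __iter__ is easiest to see in isolation; known tags keep the recommended order and unknown tags sort after every known one (the tag list below is invented):

order = {x: i for i, x in enumerate((b'head', b'hhea', b'maxp', b'OS/2',
    b'hmtx', b'LTSH', b'VDMX', b'hdmx', b'cmap', b'fpgm', b'prep',
    b'cvt ', b'loca', b'glyf', b'CFF ', b'kern', b'name', b'post',
    b'gasp', b'PCLT', b'DSIG'))}
tags = [b'name', b'glyf', b'head', b'XXXX']
tags.sort(key=lambda x: order.get(x, 1000))
print(tags)  # [b'head', b'glyf', b'name', b'XXXX']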
View File
@@ -66,6 +66,11 @@ def subset_truetype(sfnt, character_map):
 # }}}

+def subset_postscript(sfnt, character_map):
+    cff = sfnt[b'CFF ']
+    cff.decompile()
+    raise Exception('TODO: Implement CFF subsetting')
+
 def subset(raw, individual_chars, ranges=()):
     chars = list(map(ord, individual_chars))
     for r in ranges:
@@ -91,7 +96,11 @@ def subset(raw, individual_chars, ranges=()):
         subset_truetype(sfnt, character_map)
     elif b'CFF ' in sfnt:
         # PostScript Outlines
-        raise UnsupportedFont('This font contains PostScript outlines, '
-                'subsetting not supported')
+        from calibre.utils.config_base import tweaks
+        if tweaks['subset_cff_table']:
+            subset_postscript(sfnt, character_map)
+        else:
+            raise UnsupportedFont('This font contains PostScript outlines, '
+                'subsetting not supported')
     else:
         raise UnsupportedFont('This font does not contain TrueType '
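Since the CFF path is gated behind a tweak, trying it out presumably means setting the flag in calibre's tweaks.py (the tweak name comes from the diff; the plain-assignment form is an assumption about how tweaks are declared):

# In tweaks.py (Preferences->Advanced->Tweaks):
subset_cff_table = True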
View File
@@ -0,0 +1,267 @@
#!/usr/bin/env python
# vim:fileencoding=UTF-8:ts=4:sw=4:sta:et:sts=4:fdm=marker:ai
from __future__ import (unicode_literals, division, absolute_import,
print_function)
__license__ = 'GPL v3'
__copyright__ = '2012, Kovid Goyal <kovid at kovidgoyal.net>'
__docformat__ = 'restructuredtext en'
'''
Try to read invalid zip files with missing or damaged central directories.
These are apparently produced in large numbers by the fruitcakes over at B&N.
Tries to only use the local headers to extract data from the damaged zip file.
'''
import os, sys, zlib, shutil
from struct import calcsize, unpack, pack
from collections import namedtuple, OrderedDict
from tempfile import SpooledTemporaryFile
HEADER_SIG = 0x04034b50
HEADER_BYTE_SIG = pack(b'<L', HEADER_SIG)
local_header_fmt = b'<L5HL2L2H'
local_header_sz = calcsize(local_header_fmt)
ZIP_STORED, ZIP_DEFLATED = 0, 8
LocalHeader = namedtuple('LocalHeader',
'signature min_version flags compression_method mod_time mod_date '
'crc32 compressed_size uncompressed_size filename_length extra_length '
'filename extra')
def decode_arcname(name):
if isinstance(name, bytes):
from calibre.ebooks.chardet import detect
try:
name = name.decode('utf-8')
except:
res = detect(name)
encoding = res['encoding']
try:
name = name.decode(encoding)
except:
name = name.decode('utf-8', 'replace')
return name
def find_local_header(f):
pos = f.tell()
raw = f.read(50*1024)
try:
f.seek(pos + raw.index(HEADER_BYTE_SIG))
except ValueError:
f.seek(pos)
return
raw = f.read(local_header_sz)
if len(raw) != local_header_sz:
f.seek(pos)
return
header = LocalHeader(*(unpack(local_header_fmt, raw) + (None, None)))
if header.signature == HEADER_SIG:
return header
f.seek(pos)
def read_local_file_header(f):
pos = f.tell()
raw = f.read(local_header_sz)
if len(raw) != local_header_sz:
f.seek(pos)
return
header = LocalHeader(*(unpack(local_header_fmt, raw) + (None, None)))
if header.signature != HEADER_SIG:
f.seek(pos)
header = find_local_header(f)
if header is None:
return
if header.min_version > 20:
raise ValueError('This ZIP file uses unsupported features')
if header.flags & 0b1:
raise ValueError('This ZIP file is encrypted')
if header.flags & (1 << 3):
raise ValueError('This ZIP file uses data descriptors. This is unsupported')
if header.flags & (1 << 13):
raise ValueError('This ZIP file uses masking, unsupported.')
if header.compression_method not in {ZIP_STORED, ZIP_DEFLATED}:
raise ValueError('This ZIP file uses an unsupported compression method')
fname = extra = None
if header.filename_length > 0:
fname = f.read(header.filename_length)
if len(fname) != header.filename_length:
return
try:
fname = fname.decode('ascii')
except UnicodeDecodeError:
if header.flags & (1 << 11):
try:
fname = fname.decode('utf-8')
except UnicodeDecodeError:
pass
fname = decode_arcname(fname).replace('\\', '/')
if header.extra_length > 0:
extra = f.read(header.extra_length)
if len(extra) != header.extra_length:
return
return LocalHeader(*(
header[:-2] + (fname, extra)
))
def read_compressed_data(f, header):
cdata = f.read(header.compressed_size)
return cdata
def copy_stored_file(src, size, dest):
read = 0
amt = min(size, 20*1024)
while read < size:
raw = src.read(min(size-read, amt))
if not raw:
raise ValueError('Premature end of file')
dest.write(raw)
read += len(raw)
def copy_compressed_file(src, size, dest):
d = zlib.decompressobj(-15)
read = 0
amt = min(size, 20*1024)
while read < size:
raw = src.read(min(size-read, amt))
read += len(raw)
dest.write(d.decompress(raw, 200*1024))
count = 0
while d.unconsumed_tail:
count += 1
dest.write(d.decompress(d.unconsumed_tail, 200*1024))
if count > 100:
raise ValueError('This ZIP file contains a ZIP bomb in %s'%
os.path.basename(dest.name))
def _extractall(f, path=None, file_info=None):
found = False
while True:
header = read_local_file_header(f)
if not header:
break
found = True
parts = header.filename.split('/')
if header.uncompressed_size == 0:
# Directory
f.seek(f.tell() + header.compressed_size)
if path is not None:
bdir = os.path.join(path, *parts)
if not os.path.exists(bdir):
os.makedirs(bdir)
continue
# File
if file_info is not None:
file_info[header.filename] = (f.tell(), header)
if path is not None:
bdir = os.path.join(path, *(parts[:-1]))
if not os.path.exists(bdir):
os.makedirs(bdir)
dest = os.path.join(path, *parts)
with open(dest, 'wb') as o:
if header.compression_method == ZIP_STORED:
copy_stored_file(f, header.compressed_size, o)
else:
copy_compressed_file(f, header.compressed_size, o)
else:
f.seek(f.tell() + header.compressed_size)
if not found:
raise ValueError('Not a ZIP file')
def extractall(path_or_stream, path=None):
f = path_or_stream
close_at_end = False
if not hasattr(f, 'read'):
f = open(f, 'rb')
close_at_end = True
if path is None:
path = os.getcwdu()
pos = f.tell()
try:
_extractall(f, path)
finally:
f.seek(pos)
if close_at_end:
f.close()
class LocalZipFile(object):
def __init__(self, stream):
self.file_info = OrderedDict()
_extractall(stream, file_info=self.file_info)
self.stream = stream
def open(self, name, spool_size=5*1024*1024):
if isinstance(name, LocalHeader):
name = name.filename
try:
offset, header = self.file_info[name]
except KeyError:
raise ValueError('This ZIP container has no file named: %s'%name)
self.stream.seek(offset)
dest = SpooledTemporaryFile(max_size=spool_size)
if header.compression_method == ZIP_STORED:
copy_stored_file(self.stream, header.compressed_size, dest)
else:
copy_compressed_file(self.stream, header.compressed_size, dest)
dest.seek(0)
return dest
def getinfo(self, name):
try:
offset, header = self.file_info[name]
except KeyError:
raise ValueError('This ZIP container has no file named: %s'%name)
return header
def read(self, name, spool_size=5*1024*1024):
with self.open(name, spool_size=spool_size) as f:
return f.read()
def extractall(self, path=None):
self.stream.seek(0)
_extractall(self.stream, path=(path or os.getcwdu()))
def close(self):
pass
def safe_replace(self, name, datastream, extra_replacements={},
add_missing=False):
from calibre.utils.zipfile import ZipFile, ZipInfo
replacements = {name:datastream}
replacements.update(extra_replacements)
names = frozenset(replacements.keys())
found = set([])
with SpooledTemporaryFile(max_size=100*1024*1024) as temp:
ztemp = ZipFile(temp, 'w')
for offset, header in self.file_info.itervalues():
if header.filename in names:
zi = ZipInfo(header.filename)
zi.compress_type = header.compression_method
ztemp.writestr(zi, replacements[header.filename].read())
found.add(header.filename)
else:
ztemp.writestr(header.filename, self.read(header.filename,
spool_size=0))
if add_missing:
for name in names - found:
ztemp.writestr(name, replacements[name].read())
ztemp.close()
zipstream = self.stream
temp.seek(0)
zipstream.seek(0)
zipstream.truncate()
shutil.copyfileobj(temp, zipstream)
zipstream.flush()
if __name__ == '__main__':
extractall(sys.argv[-1])
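A quick sketch of how this module is meant to be driven, using only names defined above (the paths are invented):

# Extract a damaged ZIP, ignoring its central directory:
extractall('/tmp/broken.epub', path='/tmp/out')

# Or pull a single member through the local-header index:
with open('/tmp/broken.epub', 'rb') as stream:
    zf = LocalZipFile(stream)
    raw = zf.read('META-INF/container.xml')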
View File
@@ -467,11 +467,11 @@ eject_drive_letter(WCHAR DriveLetter) {
     DeviceNumber = -1;

-    hVolume = CreateFile(szVolumeAccessPath, 0,
+    hVolume = CreateFileW(szVolumeAccessPath, 0,
                 FILE_SHARE_READ | FILE_SHARE_WRITE,
                 NULL, OPEN_EXISTING, 0, NULL);
     if (hVolume == INVALID_HANDLE_VALUE) {
-        PyErr_SetString(PyExc_ValueError, "Invalid handle value for drive letter");
+        PyErr_SetFromWindowsErr(0);
         return FALSE;
     }

@@ -529,11 +529,17 @@ eject_drive_letter(WCHAR DriveLetter) {
 static PyObject *
 winutil_eject_drive(PyObject *self, PyObject *args) {
-    char DriveLetter;
+    char letter = '0';
+    WCHAR DriveLetter = L'0';

-    if (!PyArg_ParseTuple(args, "c", &DriveLetter)) return NULL;
+    if (!PyArg_ParseTuple(args, "c", &letter)) return NULL;

-    if (!eject_drive_letter((WCHAR)DriveLetter)) return NULL;
+    if (MultiByteToWideChar(CP_UTF8, MB_ERR_INVALID_CHARS, &letter, 1, &DriveLetter, 1) == 0) {
+        PyErr_SetFromWindowsErr(0);
+        return NULL;
+    }
+    if (!eject_drive_letter(DriveLetter)) return NULL;
     Py_RETURN_NONE;
 }
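From Python the calling convention is unchanged: the "c" format still expects a single-character string for the drive letter. A guess at a typical call (the plugins import path and the exported name eject_drive are assumptions; the drive letter is invented):

from calibre.constants import plugins

winutil, err = plugins['winutil']  # Windows only
if err:
    raise RuntimeError(err)
winutil.eject_drive(b'E')  # hypothetical drive letter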