sync to trunk.

John Schember 2011-10-12 17:54:56 -04:00
commit fd288645d0
47 changed files with 2246 additions and 690 deletions

View File

@ -0,0 +1,62 @@
__license__ = 'GPL v3'
__copyright__ = '2011, Darko Miletic <darko.miletic at gmail.com>'
'''
www.defensenews.com
'''
from calibre.web.feeds.news import BasicNewsRecipe
class DefenseNews(BasicNewsRecipe):
title = 'Defense News'
__author__ = 'Darko Miletic'
description = 'Find late-breaking defense news from the leading defense news weekly'
publisher = 'Gannett Government Media Corporation'
category = 'defense news, defence news, defense, defence, defence budget, defence policy'
oldest_article = 31
max_articles_per_feed = 200
no_stylesheets = True
encoding = 'utf8'
use_embedded_content = False
language = 'en'
remove_empty_feeds = True
publication_type = 'newspaper'
masthead_url = 'http://www.defensenews.com/images/logo_defensenews2.jpg'
extra_css = """
body{font-family: Arial,Helvetica,sans-serif }
img{margin-bottom: 0.4em; display:block}
.info{font-size: small; color: gray}
"""
conversion_options = {
'comment' : description
, 'tags' : category
, 'publisher' : publisher
, 'language' : language
}
remove_tags = [
dict(name=['meta','link'])
,dict(attrs={'class':['toolbar','related','left','right']})
]
remove_tags_before = dict(attrs={'class':'storyWrp'})
remove_tags_after = dict(attrs={'class':'middle'})
remove_attributes=['lang']
feeds = [
(u'Europe' , u'http://www.defensenews.com/rss/eur/' )
,(u'Americas', u'http://www.defensenews.com/rss/ame/' )
,(u'Asia & Pacific rim', u'http://www.defensenews.com/rss/asi/' )
,(u'Middle east & Africa', u'http://www.defensenews.com/rss/mid/')
,(u'Air', u'http://www.defensenews.com/rss/air/' )
,(u'Land', u'http://www.defensenews.com/rss/lan/' )
,(u'Naval', u'http://www.defensenews.com/rss/sea/' )
]
def preprocess_html(self, soup):
for item in soup.findAll(style=True):
del item['style']
for item in soup.findAll('img'):
if not item.has_key('alt'):
item['alt'] = 'image'
return soup

View File

@ -2,6 +2,7 @@ __license__ = 'GPL v3'
__copyright__ = '2009, Darko Miletic <darko.miletic at gmail.com>'
'''
http://www.dilbert.com
DrMerry added cover Image 2011-11-12
'''
from calibre.web.feeds.recipes import BasicNewsRecipe
@ -9,7 +10,7 @@ import re
class DilbertBig(BasicNewsRecipe):
title = 'Dilbert'
__author__ = 'Darko Miletic and Starson17'
__author__ = 'Darko Miletic and Starson17 contribution of DrMerry'
description = 'Dilbert'
reverse_article_order = True
oldest_article = 15
@ -20,6 +21,7 @@ class DilbertBig(BasicNewsRecipe):
publisher = 'UNITED FEATURE SYNDICATE, INC.'
category = 'comic'
language = 'en'
cover_url = 'http://dilbert.com/mobile/mobile/dilbert.app.icon.png'
conversion_options = {
'comments' : description

View File

@ -22,8 +22,6 @@ class Economist(BasicNewsRecipe):
' perspective. Best downloaded on Friday mornings (GMT)')
extra_css = '.headline {font-size: x-large;} \n h2 { font-size: small; } \n h1 { font-size: medium; }'
oldest_article = 7.0
cover_url = 'http://media.economist.com/sites/default/files/imagecache/print-cover-thumbnail/print-covers/currentcoverus_large.jpg'
#cover_url = 'http://www.economist.com/images/covers/currentcoverus_large.jpg'
remove_tags = [
dict(name=['script', 'noscript', 'title', 'iframe', 'cf_floatingcontent']),
dict(attrs={'class':['dblClkTrk', 'ec-article-info',
@ -56,6 +54,14 @@ class Economist(BasicNewsRecipe):
return br
'''
def get_cover_url(self):
br = self.browser
br.open(self.INDEX)
issue = br.geturl().split('/')[4]
self.log('Fetching cover for issue: %s'%issue)
cover_url = "http://media.economist.com/sites/default/files/imagecache/print-cover-full/print-covers/%s_CNA400.jpg" %(issue.translate(None,'-'))
return cover_url
def parse_index(self):
return self.economist_parse_index()
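The cover URL is built from the issue date embedded in the redirected print-edition URL; a minimal sketch of the transformation, with an illustrative URL (Python 2, matching the recipe):
# Illustrative: opening INDEX redirects to a dated print-edition URL.
url = 'http://www.economist.com/printedition/2011-10-08'
issue = url.split('/')[4]                        # '2011-10-08'
cover_url = ('http://media.economist.com/sites/default/files/imagecache/'
             'print-cover-full/print-covers/%s_CNA400.jpg'
             % issue.translate(None, '-'))       # str.translate drops the '-'
print cover_url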

View File

@ -22,8 +22,6 @@ class Economist(BasicNewsRecipe):
' perspective. Best downloaded on Friday mornings (GMT)')
extra_css = '.headline {font-size: x-large;} \n h2 { font-size: small; } \n h1 { font-size: medium; }'
oldest_article = 7.0
cover_url = 'http://media.economist.com/sites/default/files/imagecache/print-cover-thumbnail/print-covers/currentcoverus_large.jpg'
#cover_url = 'http://www.economist.com/images/covers/currentcoverus_large.jpg'
remove_tags = [
dict(name=['script', 'noscript', 'title', 'iframe', 'cf_floatingcontent']),
dict(attrs={'class':['dblClkTrk', 'ec-article-info',
@ -40,6 +38,14 @@ class Economist(BasicNewsRecipe):
# downloaded with connection reset by peer (104) errors.
delay = 1
def get_cover_url(self):
br = self.browser
br.open(self.INDEX)
issue = br.geturl().split('/')[4]
self.log('Fetching cover for issue: %s'%issue)
cover_url = "http://media.economist.com/sites/default/files/imagecache/print-cover-full/print-covers/%s_CNA400.jpg" %(issue.translate(None,'-'))
return cover_url
def parse_index(self):
try:

View File

@ -19,45 +19,20 @@ class FazNet(BasicNewsRecipe):
no_stylesheets = True
encoding = 'utf-8'
remove_javascript = True
html2lrf_options = [
'--comment', description
, '--category', category
, '--publisher', publisher
]
html2epub_options = 'publisher="' + publisher + '"\ncomments="' + description + '"\ntags="' + category + '"'
keep_only_tags = [dict(name='div', attrs={'class':'Article'})]
remove_tags = [
dict(name=['object','link','embed','base'])
,dict(name='div',
attrs={'class':['LinkBoxModulSmall','ModulVerlagsInfo',
'ArtikelServices', 'ModulLesermeinungenFooter',
'ModulArtikelServices', 'BoxTool Aufklappen_Grau',
'SocialMediaUnten', ]}),
dict(id=['KurzLinkMenu', 'ArtikelServicesMenu']),
]
keep_only_tags = [{'class':'FAZArtikelEinleitung'},
{'id':'ArtikelTabContent_0'}]
feeds = [
('FAZ.NET Aktuell', 'http://www.faz.net/s/RubF3CE08B362D244869BE7984590CB6AC1/Tpl~Epartner~SRss_.xml'),
('Politik', 'http://www.faz.net/s/RubA24ECD630CAE40E483841DB7D16F4211/Tpl~Epartner~SRss_.xml'),
('Wirtschaft', 'http://www.faz.net/s/RubC9401175958F4DE28E143E68888825F6/Tpl~Epartner~SRss_.xml'),
('Feuilleton', 'http://www.faz.net/s/RubCC21B04EE95145B3AC877C874FB1B611/Tpl~Epartner~SRss_.xml'),
('Sport', 'http://www.faz.net/s/Rub9F27A221597D4C39A82856B0FE79F051/Tpl~Epartner~SRss_.xml'),
('Gesellschaft', 'http://www.faz.net/s/Rub02DBAA63F9EB43CEB421272A670A685C/Tpl~Epartner~SRss_.xml'),
('Finanzen', 'http://www.faz.net/s/Rub4B891837ECD14082816D9E088A2D7CB4/Tpl~Epartner~SRss_.xml'),
('Wissen', 'http://www.faz.net/s/Rub7F4BEE0E0C39429A8565089709B70C44/Tpl~Epartner~SRss_.xml'),
('Reise', 'http://www.faz.net/s/RubE2FB5CA667054BDEA70FB3BC45F8D91C/Tpl~Epartner~SRss_.xml'),
('Technik & Motor', 'http://www.faz.net/s/Rub01E4D53776494844A85FDF23F5707AD8/Tpl~Epartner~SRss_.xml'),
('Beruf & Chance', 'http://www.faz.net/s/RubB1E10A8367E8446897468EDAA6EA0504/Tpl~Epartner~SRss_.xml')
('FAZ.NET Aktuell', 'http://www.faz.net/aktuell/?rssview=1'),
('Politik', 'http://www.faz.net/aktuell/politik/?rssview=1'),
('Wirtschaft', 'http://www.faz.net/aktuell/wirtschaft/?rssview=1'),
('Feuilleton', 'http://www.faz.net/aktuell/feuilleton/?rssview=1'),
('Sport', 'http://www.faz.net/aktuell/sport/?rssview=1'),
('Gesellschaft', 'http://www.faz.net/aktuell/gesellschaft/?rssview=1'),
('Finanzen', 'http://www.faz.net/aktuell/finanzen/?rssview=1'),
('Technik & Motor', 'http://www.faz.net/aktuell/technik-motor/?rssview=1'),
('Wissen', 'http://www.faz.net/aktuell/wissen/?rssview=1'),
('Reise', 'http://www.faz.net/aktuell/reise/?rssview=1'),
('Beruf & Chance', 'http://www.faz.net/aktuell/beruf-chance/?rssview=1')
]
def preprocess_html(self, soup):
mtag = '<meta http-equiv="Content-Type" content="text/html; charset=utf-8"/>'
soup.head.insert(0,mtag)
del soup.body['onload']
for item in soup.findAll(style=True):
del item['style']
return soup

View File

@ -1,35 +1,71 @@
#!/usr/bin/python
from calibre.web.feeds.news import BasicNewsRecipe
import re
from calibre.utils.magick import Image
class AdvancedUserRecipe1307556816(BasicNewsRecipe):
title = u'Geek and Poke'
__author__ = u'DrMerry'
description = u'Geek and Poke Cartoons'
publisher = u'Oliver Widder'
author = u'Oliver Widder, DrMerry (calibre-code), calibre'
oldest_article = 31
max_articles_per_feed = 100
language = u'en'
simultaneous_downloads = 5
#delay = 1
timefmt = ' [%A, %d %B, %Y]'
timefmt = ' [%a, %d %B, %Y]'
summary_length = -1
no_stylesheets = True
category = 'News.IT, Cartoon, Humor, Geek'
use_embedded_content = False
cover_url = 'http://geekandpoke.typepad.com/aboutcoders.jpeg'
remove_javascript = True
remove_empty_feeds = True
publication_type = 'blog'
conversion_options = {
'comments' : ''
,'tags' : category
,'language' : language
,'publisher' : publisher
,'author' : author
}
preprocess_regexps = [ (re.compile(r'(<p>&nbsp;</p>|<iframe.*</iframe>|<a[^>]*>Tweet</a>|<a[^>]*>|</a>)', re.DOTALL|re.IGNORECASE),lambda match: ''),
(re.compile(r'(&nbsp;| )', re.DOTALL|re.IGNORECASE),lambda match: ' '),
(re.compile(r'<br( /)?>(<br( /)?>)+', re.DOTALL|re.IGNORECASE),lambda match: '<br>')
]
remove_tags_before = dict(name='p', attrs={'class':'content-nav'})
remove_tags_after = dict(name='div', attrs={'class':'entry-content'})
remove_tags = [dict(name='div', attrs={'class':'entry-footer'}),
dict(name='div', attrs={'id':'alpha'}),
dict(name='div', attrs={'id':'gamma'}),
dict(name='iframe'),
dict(name='p', attrs={'class':'content-nav'})]
extra_css = 'body, h3, p, h2, h1, div, span{margin:0px} h2.date-header {font-size: 0.7em; color:#eee;} h3.entry-header{font-size: 1.0em} div.entry-body{font-size: 0.9em}'
filter_regexps = [(r'feedburner\.com'),
(r'pixel.quantserve\.com'),
(r'googlesyndication\.com'),
(r'yimg\.com'),
(r'scorecardresearch\.com')]
preprocess_regexps = [(re.compile(r'(<p>(&nbsp;|\s)*</p>|<a[^>]*>Tweet</a>|<a[^>]*>|</a>)', re.DOTALL|re.IGNORECASE),lambda match: ''),
(re.compile(r'(&nbsp;|\s\s)+\s*', re.DOTALL|re.IGNORECASE),lambda match: ' '),
(re.compile(r'<h2[^>]*>([^<]*)</h2>[^>]*(<div[^>]*>)', re.DOTALL|re.IGNORECASE), lambda match: match.group(2) + '<div id="MERRYdate">' + match.group(1) + '</div>'),
(re.compile(r'(<h3[^>]*>)<a[^>]>((?!</a)*)</a></h3>', re.DOTALL|re.IGNORECASE),lambda match: match.group(1) + match.group(2) + '</h3>'),
(re.compile(r'(<img[^>]*alt="([^"]*)"[^>]*>)', re.DOTALL|re.IGNORECASE),lambda match: match.group(1) + '<br><cite>' + match.group(2) + '</cite>'),
(re.compile(r'<br( /)?>(<br( /)?>)+', re.DOTALL|re.IGNORECASE),lambda match: '<br>'),
(re.compile(r'<!--.*?-->', re.DOTALL), lambda m: '')
]
remove_tags_before = dict(name='h2', attrs={'class':'date-header'})
remove_tags_after = dict(name='div', attrs={'class':'entry-body'})
extra_css = 'body, h3, p, #MERRYdate, h1, div, span{margin:0px; padding:0px} h3.entry-header{font-size: 0.8em} div.entry-body{font-size: 0.7em} #MERRYdate {font-size: 0.5em}'
def postprocess_html(self, soup, first):
for tag in soup.findAll(lambda tag: tag.name.lower()=='img' and tag.has_key('src')):
iurl = tag['src']
img = Image()
img.open(iurl)
width, height = img.size
#print 'img is: ', iurl, 'width is: ', width, 'height is: ', height
img.trim(0)
img.save(iurl)
width, height = img.size
#print 'img is: ', iurl, 'width is: ', width, 'height is: ', height
return soup
feeds = [(u'Geek and Poke', u'http://feeds.feedburner.com/GeekAndPoke?format=xml')]
feeds = ['http://feeds.feedburner.com/GeekAndPoke?format=xml']

View File

@ -119,10 +119,8 @@ class Guardian(BasicNewsRecipe):
}
def parse_index(self):
try:
feeds = []
for title, href in self.find_sections():
feeds.append((title, list(self.find_articles(href))))
return feeds
except:
raise NotImplementedError
feeds = []
for title, href in self.find_sections():
feeds.append((title, list(self.find_articles(href))))
return feeds

View File

@ -1,7 +1,9 @@
from calibre.web.feeds.news import BasicNewsRecipe
import re
class AdvancedUserRecipe(BasicNewsRecipe):
title = 'Heise-online'
title = 'heise online'
description = 'News vom Heise-Verlag'
__author__ = 'schuster'
use_embedded_content = False
@ -12,10 +14,11 @@ class AdvancedUserRecipe(BasicNewsRecipe):
remove_empty_feeds = True
timeout = 5
no_stylesheets = True
encoding = 'utf-8'
remove_tags_after = dict(name ='p', attrs={'class':'editor'})
remove_tags = [dict(id='navi_top_container'),
remove_tags = [{'class':'navi_top_container'},
dict(id='navi_bottom'),
dict(id='mitte_rechts'),
dict(id='navigation'),
@ -25,28 +28,28 @@ class AdvancedUserRecipe(BasicNewsRecipe):
dict(id='content_foren'),
dict(id='seiten_navi'),
dict(id='adbottom'),
dict(id='sitemap')]
dict(id='sitemap'),
dict(name='a', href=re.compile(r'^/([a-zA-Z]+/)?')),
]
feeds = [
('Newsticker', 'http://www.heise.de/newsticker/heise.rdf'),
('Auto', 'http://www.heise.de/autos/rss/news.rdf'),
('Foto ', 'http://www.heise.de/foto/rss/news-atom.xml'),
('Mac&i', 'http://www.heise.de/mac-and-i/news.rdf'),
('Mobile ', 'http://www.heise.de/mobil/newsticker/heise-atom.xml'),
('Netz ', 'http://www.heise.de/netze/rss/netze-atom.xml'),
('Open ', 'http://www.heise.de/open/news/news-atom.xml'),
('Resale ', 'http://www.heise.de/resale/rss/resale.rdf'),
('Security ', 'http://www.heise.de/security/news/news-atom.xml'),
('C`t', 'http://www.heise.de/ct/rss/artikel-atom.xml'),
('iX', 'http://www.heise.de/ix/news/news.rdf'),
('Mach-flott', 'http://www.heise.de/mach-flott/rss/mach-flott-atom.xml'),
('Technology Review', 'http://www.heise.de/tr/news-atom.xml'),
('mobil', 'http://www.heise.de/mobil/newsticker/heise-atom.xml'),
('Security', 'http://www.heise.de/security/news/news-atom.xml'),
('Netze', 'http://www.heise.de/netze/rss/netze-atom.xml'),
('Open Source', 'http://www.heise.de/open/news/news-atom.xml'),
('Resale ', 'http://www.heise.de/resale/rss/resale.rdf'),
('Foto ', 'http://www.heise.de/foto/rss/news-atom.xml'),
('Autos', 'http://www.heise.de/autos/rss/news.rdf'),
('Mac & i', 'http://www.heise.de/mac-and-i/news.rdf'),
('Blog: Babel-Bulletin', 'http://www.heise.de/developer/rss/babel-bulletin/blog.rdf'),
('Blog: Der Dotnet-Doktor', 'http://www.heise.de/developer/rss/dotnet-doktor/blog.rdf'),
('Blog: Bernds Management-Welt', 'http://www.heise.de/developer/rss/bernds-management-welt/blog.rdf'),
('Blog: IT conversation', 'http://www.heise.de/developer/rss/world-of-it/blog.rdf'),
('Blog: The World of IT', 'http://www.heise.de/developer/rss/world-of-it/blog.rdf'),
('Blog: Kais bewegtes Web', 'http://www.heise.de/developer/rss/kais-bewegtes-web/blog.rdf')
]
]
def print_version(self, url):
return url + '?view=print'

View File

@ -18,6 +18,7 @@ class HoustonChronicle(BasicNewsRecipe):
keep_only_tags = {'class':lambda x: x and ('hst-articletitle' in x or
'hst-articletext' in x or 'hst-galleryitem' in x)}
remove_attributes = ['xmlns']
feeds = [
('News', "http://www.chron.com/rss/feed/News-270.php"),

View File

@ -0,0 +1,27 @@
from calibre.web.feeds.news import BasicNewsRecipe
class MercoPress(BasicNewsRecipe):
title = u'Merco Press'
description = u"Read News, Stories and Insight Analysis from Latin America and Mercosur. Politics, Economy, Business and Investments in South America."
cover_url = 'http://en.mercopress.com/web/img/en/mercopress-logo.gif'
__author__ = 'Russell Phillips'
language = 'en'
oldest_article = 7
max_articles_per_feed = 100
auto_cleanup = True
extra_css = 'img{padding-bottom:1ex; display:block; text-align: center;}'
remove_tags = [dict(name='a')]
feeds = [('Antarctica', 'http://en.mercopress.com/rss/antarctica'),
('Argentina', 'http://en.mercopress.com/rss/argentina'),
('Brazil', 'http://en.mercopress.com/rss/brazil'),
('Falkland Islands', 'http://en.mercopress.com/rss/falkland-islands'),
('International News', 'http://en.mercopress.com/rss/international'),
('Latin America', 'http://en.mercopress.com/rss/latin-america'),
('Mercosur', 'http://en.mercopress.com/rss/mercosur'),
('Paraguay', 'http://en.mercopress.com/rss/paraguay'),
('United States', 'http://en.mercopress.com/rss/united-states'),
('Uruguay', 'http://en.mercopress.com/rss/uruguay')]

View File

@ -5,30 +5,46 @@ class AdvancedUserRecipe1306097511(BasicNewsRecipe):
description = 'News as provided by The Metro - UK'
__author__ = 'Dave Asbury'
cover_url = 'http://profile.ak.fbcdn.net/hprofile-ak-snc4/276636_117118184990145_2132092232_n.jpg'
no_stylesheets = True
oldest_article = 1
max_articles_per_feed = 25
max_articles_per_feed = 20
remove_empty_feeds = True
remove_javascript = True
preprocess_regexps = [(re.compile(r'Tweet'), lambda a : '')]
#preprocess_regexps = [(re.compile(r'Tweet'), lambda a : '')]
preprocess_regexps = [
(re.compile(r'<span class="img-cap legend">', re.IGNORECASE | re.DOTALL), lambda match: '<p></p><span class="img-cap legend"> '),
(re.compile(r'tweet', re.IGNORECASE | re.DOTALL), lambda match: '')]
language = 'en_GB'
masthead_url = 'http://e-edition.metro.co.uk/images/metro_logo.gif'
extra_css = 'h2 {font: sans-serif medium;}'
keep_only_tags = [
dict(name='h1'),dict(name='h2', attrs={'class':'h2'}),
dict(attrs={'class':['img-cnt figure']}),
dict(attrs={'class':['art-img']}),
dict(name='div', attrs={'class':'art-lft'})
dict(name='div', attrs={'class':'art-lft'}),
dict(name='p')
]
remove_tags = [dict(name='div', attrs={'class':[ 'news m12 clrd clr-b p5t shareBtm', 'commentForm', 'metroCommentInnerWrap',
'art-rgt','pluck-app pluck-comm','news m12 clrd clr-l p5t', 'flt-r' ]}),
dict(attrs={'class':[ 'metroCommentFormWrap','commentText','commentsNav','avatar','submDateAndTime']})
]
dict(attrs={'class':[ 'metroCommentFormWrap','commentText','commentsNav','avatar','submDateAndTime']})
,dict(name='div', attrs={'class' : 'clrd art-fd fd-gr1-b'})
]
feeds = [
(u'News', u'http://www.metro.co.uk/rss/news/'),
(u'Money', u'http://www.metro.co.uk/rss/money/'),
(u'Sport', u'http://www.metro.co.uk/rss/sport/'),
(u'Film', u'http://www.metro.co.uk/rss/metrolife/film/'),
(u'Music', u'http://www.metro.co.uk/rss/metrolife/music/'),
(u'TV', u'http://www.metro.co.uk/rss/tv/'),
(u'Showbiz', u'http://www.metro.co.uk/rss/showbiz/'),
(u'Weird News', u'http://www.metro.co.uk/rss/weird/'),
(u'Travel', u'http://www.metro.co.uk/rss/travel/'),
(u'Lifestyle', u'http://www.metro.co.uk/rss/lifestyle/'),
(u'Books', u'http://www.metro.co.uk/rss/lifestyle/books/'),
(u'Food', u'http://www.metro.co.uk/rss/lifestyle/restaurants/')]
extra_css = '''
body {font: sans-serif medium;}
h1 {text-align : center; font-family:Arial,Helvetica,sans-serif; font-size:20px; font-size-adjust:none; font-stretch:normal; font-style:normal; font-variant:normal; font-weight:bold;}
h2 {text-align : center;color:#4D4D4D;font-family:Arial,Helvetica,sans-serif; font-size:15px; font-size-adjust:none; font-stretch:normal; font-style:normal; font-variant:normal; font-weight:bold; }
span{ font-size:9.5px; font-weight:bold;font-style:italic}
p { text-align: justify; font-family:Arial,Helvetica,sans-serif; font-size:11px; font-size-adjust:none; font-stretch:normal; font-style:normal; font-variant:normal; font-weight:normal;}
'''

View File

@ -6,19 +6,24 @@ __Region__ = 'Hong Kong'
# Users of Kindle 3 with limited system-level CJK support
# please replace the following "True" with "False".
__MakePeriodical__ = True
# Turn below to true if your device supports display of CJK titles
# Turn below to True if your device supports display of CJK titles
__UseChineseTitle__ = False
# Set it to False if you want to skip images
__KeepImages__ = True
# (HK only) Turn below to true if you wish to use life.mingpao.com as the main article source
# (HK only) Turn below to True if you wish to use life.mingpao.com as the main article source
__UseLife__ = True
# (HK only) if __UseLife__ is true, turn this on if you want to include the column section
# (HK only) The column section is now premium content and is disabled by default
__InclCols__ = False
# (HK only) Turn below to True if you wish to parse articles in news.mingpao.com with their printer-friendly formats
__ParsePFF__ = False
# (HK only) Turn below to True if you wish hi-res images
__HiResImg__ = False
'''
Change Log:
2011/09/21: fetching "column" section is made optional. Default is False
2011/10/04: option to get hi-res photos for the articles
2011/09/21: fetching "column" section is made optional.
2011/09/18: parse "column" section stuff from source text file directly.
2011/09/07: disable "column" section as it is no longer offered free.
2011/06/26: add fetching Vancouver and Toronto versions of the paper, also provide captions for images using life.mingpao fetch source
@ -42,7 +47,7 @@ Change Log:
2010/10/31: skip repeated articles in section pages
'''
import os, datetime, re
import os, datetime, re, mechanize
from calibre.web.feeds.recipes import BasicNewsRecipe
from contextlib import nested
from calibre.ebooks.BeautifulSoup import BeautifulSoup
@ -56,7 +61,7 @@ class MPRecipe(BasicNewsRecipe):
title = 'Ming Pao - Hong Kong'
description = 'Hong Kong Chinese Newspaper (http://news.mingpao.com)'
category = 'Chinese, News, Hong Kong'
extra_css = 'img {display: block; margin-left: auto; margin-right: auto; margin-top: 10px; margin-bottom: 10px;} font>b {font-size:200%; font-weight:bold;} div[class=heading] {font-size:200%; font-weight:bold;} div[class=images] {font-size:50%;}'
extra_css = 'img {display: block; margin-left: auto; margin-right: auto; margin-top: 10px; margin-bottom: 10px; max-height:90%;} font>b {font-size:200%; font-weight:bold;} div[class=heading] {font-size:200%; font-weight:bold;} div[class=images] {font-size:50%;}'
masthead_url = 'http://news.mingpao.com/image/portals_top_logo_news.gif'
keep_only_tags = [dict(name='h1'),
dict(name='font', attrs={'style':['font-size:14pt; line-height:160%;']}), # for entertainment page title
@ -147,43 +152,6 @@ class MPRecipe(BasicNewsRecipe):
conversion_options = {'linearize_tables':True}
timefmt = ''
def image_url_processor(cls, baseurl, url):
# trick: break the url at the first occurrence of a digit, add an additional
# '_' at the front
# not working, may need to move this to the preprocess_html() method
# minIdx = 10000
# i0 = url.find('0')
# if i0 >= 0 and i0 < minIdx:
# minIdx = i0
# i1 = url.find('1')
# if i1 >= 0 and i1 < minIdx:
# minIdx = i1
# i2 = url.find('2')
# if i2 >= 0 and i2 < minIdx:
# minIdx = i2
# i3 = url.find('3')
# if i3 >= 0 and i0 < minIdx:
# minIdx = i3
# i4 = url.find('4')
# if i4 >= 0 and i4 < minIdx:
# minIdx = i4
# i5 = url.find('5')
# if i5 >= 0 and i5 < minIdx:
# minIdx = i5
# i6 = url.find('6')
# if i6 >= 0 and i6 < minIdx:
# minIdx = i6
# i7 = url.find('7')
# if i7 >= 0 and i7 < minIdx:
# minIdx = i7
# i8 = url.find('8')
# if i8 >= 0 and i8 < minIdx:
# minIdx = i8
# i9 = url.find('9')
# if i9 >= 0 and i9 < minIdx:
# minIdx = i9
return url
def get_dtlocal(self):
dt_utc = datetime.datetime.utcnow()
if __Region__ == 'Hong Kong':
@ -260,15 +228,16 @@ class MPRecipe(BasicNewsRecipe):
else:
for title, url in [(u'\u8981\u805e Headline', 'http://news.mingpao.com/' + dateStr + '/gaindex.htm'),
(u'\u6e2f\u805e Local', 'http://news.mingpao.com/' + dateStr + '/gbindex.htm'),
(u'\u6559\u80b2 Education', 'http://news.mingpao.com/' + dateStr + '/gfindex.htm')]:
(u'\u6559\u80b2 Education', 'http://news.mingpao.com/' + dateStr + '/gfindex.htm'),
(u'\u793e\u8a55/\u7b46\u9663 Editorial', 'http://news.mingpao.com/' + dateStr + '/mrindex.htm')]:
articles = self.parse_section(url)
if articles:
feeds.append((title, articles))
# special- editorial
ed_articles = self.parse_ed_section('http://life.mingpao.com/cfm/dailynews2.cfm?Issue=' + dateStr +'&Category=nalmr')
if ed_articles:
feeds.append((u'\u793e\u8a55/\u7b46\u9663 Editorial', ed_articles))
#ed_articles = self.parse_ed_section('http://life.mingpao.com/cfm/dailynews2.cfm?Issue=' + dateStr +'&Category=nalmr')
#if ed_articles:
# feeds.append((u'\u793e\u8a55/\u7b46\u9663 Editorial', ed_articles))
for title, url in [(u'\u8ad6\u58c7 Forum', 'http://news.mingpao.com/' + dateStr + '/faindex.htm'),
(u'\u4e2d\u570b China', 'http://news.mingpao.com/' + dateStr + '/caindex.htm'),
@ -279,20 +248,39 @@ class MPRecipe(BasicNewsRecipe):
# special - finance
#fin_articles = self.parse_fin_section('http://www.mpfinance.com/htm/Finance/' + dateStr + '/News/ea,eb,ecindex.htm')
fin_articles = self.parse_fin_section('http://life.mingpao.com/cfm/dailynews2.cfm?Issue=' + dateStr + '&Category=nalea')
if fin_articles:
feeds.append((u'\u7d93\u6fdf Finance', fin_articles))
#fin_articles = self.parse_fin_section('http://life.mingpao.com/cfm/dailynews2.cfm?Issue=' + dateStr + '&Category=nalea')
#if fin_articles:
# feeds.append((u'\u7d93\u6fdf Finance', fin_articles))
for title, url in [('Tech News', 'http://news.mingpao.com/' + dateStr + '/naindex.htm'),
(u'\u9ad4\u80b2 Sport', 'http://news.mingpao.com/' + dateStr + '/spindex.htm')]:
articles = self.parse_section(url)
for title, url, keystr in [(u'\u7d93\u6fdf Finance', 'http://life.mingpao.com/cfm/dailynews2.cfm?Issue=' + dateStr + '&Category=nalea', 'nal')]:
articles = self.parse_section2(url, keystr)
if articles:
feeds.append((title, articles))
#for title, url in [('Tech News', 'http://news.mingpao.com/' + dateStr + '/naindex.htm'),
# (u'\u9ad4\u80b2 Sport', 'http://news.mingpao.com/' + dateStr + '/spindex.htm')]:
# articles = self.parse_section(url)
# if articles:
# feeds.append((title, articles))
# special - entertainment
ent_articles = self.parse_ent_section('http://ol.mingpao.com/cfm/star1.cfm')
if ent_articles:
feeds.append((u'\u5f71\u8996 Film/TV', ent_articles))
#ent_articles = self.parse_ent_section('http://ol.mingpao.com/cfm/star1.cfm')
#if ent_articles:
# feeds.append((u'\u5f71\u8996 Film/TV', ent_articles))
for title, url, keystr in [(u'\u5f71\u8996 Film/TV', 'http://life.mingpao.com/cfm/dailynews2.cfm?Issue=' + dateStr + '&Category=nalma', 'nal')
]:
articles = self.parse_section2(url, keystr)
if articles:
feeds.append((title, articles))
if __InclCols__ == True:
# parse column section articles directly from .txt files
for title, url, keystr in [(u'\u5c08\u6b04 Columns', 'http://life.mingpao.com/cfm/dailynews2.cfm?Issue=' + dateStr +'&Category=ncolumn', 'ncl')
]:
articles = self.parse_section2_txt(url, keystr)
if articles:
feeds.append((title, articles))
for title, url in [(u'\u526f\u520a Supplement', 'http://news.mingpao.com/' + dateStr + '/jaindex.htm'),
(u'\u82f1\u6587 English', 'http://news.mingpao.com/' + dateStr + '/emindex.htm')]:
@ -300,11 +288,6 @@ class MPRecipe(BasicNewsRecipe):
if articles:
feeds.append((title, articles))
# special- columns
col_articles = self.parse_col_section('http://life.mingpao.com/cfm/dailynews2.cfm?Issue=' + dateStr +'&Category=ncolumn')
if col_articles:
feeds.append((u'\u5c08\u6b04 Columns', col_articles))
elif __Region__ == 'Vancouver':
for title, url in [(u'\u8981\u805e Headline', 'http://www.mingpaovan.com/htm/News/' + dateStr + '/VAindex.htm'),
(u'\u52a0\u570b Canada', 'http://www.mingpaovan.com/htm/News/' + dateStr + '/VBindex.htm'),
@ -348,6 +331,16 @@ class MPRecipe(BasicNewsRecipe):
title = self.tag_to_string(a)
url = a.get('href', False)
url = 'http://news.mingpao.com/' + dateStr + '/' +url
# replace the url to the print-friendly version
if __ParsePFF__ == True:
if url.rfind('Redirect') <> -1:
url = re.sub(dateStr + '.*' + dateStr, dateStr, url)
url = re.sub('%2F.*%2F', '/', url)
title = title.replace(u'\u6536\u8cbb\u5167\u5bb9', '')
url = url.replace('%2Etxt', '_print.htm')
url = url.replace('%5F', '_')
else:
url = url.replace('.htm', '_print.htm')
if url not in included_urls and url.rfind('Redirect') == -1:
current_articles.append({'title': title, 'url': url, 'description':'', 'date':''})
included_urls.append(url)
@ -472,38 +465,119 @@ class MPRecipe(BasicNewsRecipe):
current_articles.reverse()
return current_articles
# preprocess those .txt based files
# preprocess those .txt and javascript based files
def preprocess_raw_html(self, raw_html, url):
if url.rfind('ftp') == -1:
#raw_html = raw_html.replace(u'<p>\u3010', u'\u3010')
if __HiResImg__ == True:
# TODO: add a _ in front of an image url
if url.rfind('news.mingpao.com') > -1:
imglist = re.findall('src="?.*?jpg"', raw_html)
br = mechanize.Browser()
br.set_handle_redirect(False)
for img in imglist:
gifimg = img.replace('jpg"', 'gif"')
try:
br.open_novisit(url + "/../" + gifimg[5:len(gifimg)-1])
raw_html = raw_html.replace(img, gifimg)
except:
# find the location of the first _
pos = img.find('_')
if pos > -1:
# if found, insert _ after the first _
newimg = img[0:pos] + '_' + img[pos:]
raw_html = raw_html.replace(img, newimg)
else:
# if not found, insert _ after "
raw_html = raw_html.replace(img[1:], '"_' + img[1:])
elif url.rfind('life.mingpao.com') > -1:
imglist = re.findall('src=\'?.*?jpg\'', raw_html)
br = mechanize.Browser()
br.set_handle_redirect(False)
#print 'Img list: ', imglist, '\n'
for img in imglist:
gifimg = img.replace('jpg\'', 'gif\'')
try:
#print 'Original: ', url
#print 'To append: ', "/../" + gifimg[5:len(gifimg)-1]
gifurl = re.sub(r'dailynews.*txt', '', url)
#print 'newurl: ', gifurl + gifimg[5:len(gifimg)-1]
br.open_novisit(gifurl + gifimg[5:len(gifimg)-1])
#print 'URL: ', url + "/../" + gifimg[5:len(gifimg)-1]
#br.open_novisit(url + "/../" + gifimg[5:len(gifimg)-1])
raw_html = raw_html.replace(img, gifimg)
except:
#print 'GIF not found'
pos = img.rfind('/')
newimg = img[0:pos+1] + '_' + img[pos+1:]
#print 'newimg: ', newimg
raw_html = raw_html.replace(img, newimg)
if url.rfind('ftp') == -1 and url.rfind('_print.htm') == -1:
return raw_html
else:
splitter = re.compile(r'\n') # split on newlines
new_raw_html = '<html><head><title>Untitled</title></head><body><div class="images">'
next_is_img_txt = False
title_started = False
met_article_start_char = False
for item in splitter.split(raw_html):
if item.startswith(u'\u3010'):
met_article_start_char = True
new_raw_html = new_raw_html + '</div><div class="content"><p>' + item + '<p>\n'
else:
if next_is_img_txt == False:
if item.startswith('='):
next_is_img_txt = True
new_raw_html += '<img src="' + str(item)[1:].strip() + '.jpg" /><p>\n'
if url.rfind('_print.htm') <> -1:
# javascript based file
splitter = re.compile(r'\n')
new_raw_html = '<html><head><title>Untitled</title></head>'
new_raw_html = new_raw_html + '<body>'
for item in splitter.split(raw_html):
if item.startswith('var heading1 ='):
heading = item.replace('var heading1 = \'', '')
heading = heading.replace('\'', '')
heading = heading.replace(';', '')
new_raw_html = new_raw_html + '<div class="heading">' + heading
if item.startswith('var heading2 ='):
heading = item.replace('var heading2 = \'', '')
heading = heading.replace('\'', '')
heading = heading.replace(';', '')
if heading <> '':
new_raw_html = new_raw_html + '<br>' + heading + '</div>'
else:
if met_article_start_char == False:
if title_started == False:
new_raw_html = new_raw_html + '</div><div class="heading">' + item + '\n'
title_started = True
else:
new_raw_html = new_raw_html + item + '\n'
else:
new_raw_html = new_raw_html + item + '<p>\n'
new_raw_html = new_raw_html + '</div>'
if item.startswith('var content ='):
content = item.replace("var content = ", '')
content = content.replace('\'', '')
content = content.replace(';', '')
new_raw_html = new_raw_html + '<div class="content">' + content + '</div>'
if item.startswith('var photocontent ='):
photo = item.replace('var photocontent = \'', '')
photo = photo.replace('\'', '')
photo = photo.replace(';', '')
photo = photo.replace('<tr>', '')
photo = photo.replace('<td>', '')
photo = photo.replace('</tr>', '')
photo = photo.replace('</td>', '<br>')
photo = photo.replace('class="photo"', '')
new_raw_html = new_raw_html + '<div class="images">' + photo + '</div>'
return new_raw_html + '</body></html>'
else:
# .txt based file
splitter = re.compile(r'\n') # split on newlines
new_raw_html = '<html><head><title>Untitled</title></head><body><div class="images">'
next_is_img_txt = False
title_started = False
met_article_start_char = False
for item in splitter.split(raw_html):
if item.startswith(u'\u3010'):
met_article_start_char = True
new_raw_html = new_raw_html + '</div><div class="content"><p>' + item + '<p>\n'
else:
next_is_img_txt = False
new_raw_html = new_raw_html + item + '\n'
return new_raw_html + '</div></body></html>'
if next_is_img_txt == False:
if item.startswith('='):
next_is_img_txt = True
new_raw_html += '<img src="' + str(item)[1:].strip() + '.jpg" /><p>\n'
else:
if met_article_start_char == False:
if title_started == False:
new_raw_html = new_raw_html + '</div><div class="heading">' + item + '\n'
title_started = True
else:
new_raw_html = new_raw_html + item + '\n'
else:
new_raw_html = new_raw_html + item + '<p>\n'
else:
next_is_img_txt = False
new_raw_html = new_raw_html + item + '\n'
return new_raw_html + '</div></body></html>'
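For reference, the _print.htm pages handled above deliver each field as a JavaScript assignment; a minimal sketch of the stripping applied to one such line (the value is invented):
# Illustrative _print.htm line and the same stripping used above:
item = "var heading1 = 'Main headline';"
heading = item.replace("var heading1 = '", '').replace("'", '').replace(';', '')
print heading   # -> Main headline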
def preprocess_html(self, soup):
for item in soup.findAll(style=True):
@ -604,7 +678,7 @@ class MPRecipe(BasicNewsRecipe):
if po is None:
self.play_order_counter += 1
po = self.play_order_counter
parent.add_item('%sindex.html'%adir, None, a.title if a.title else _('Untitled Article'),
parent.add_item('%sindex.html'%adir, None, a.title if a.title else ('Untitled Article'),
play_order=po, author=auth, description=desc)
last = os.path.join(self.output_dir, ('%sindex.html'%adir).replace('/', os.sep))
for sp in a.sub_pages:

View File

@ -0,0 +1,17 @@
from calibre.web.feeds.news import BasicNewsRecipe
class MercoPress(BasicNewsRecipe):
title = u'Penguin News'
description = u"Penguin News: the Falkland Islands' only newspaper."
cover_url = 'http://www.penguin-news.com/templates/rt_syndicate_j15/images/logo/light/logo1.png'
language = 'en'
__author__ = 'Russell Phillips'
oldest_article = 7
max_articles_per_feed = 100
auto_cleanup = True
extra_css = 'img{padding-bottom:1ex; display:block; text-align: center;}'
feeds = [(u'Penguin News - Falkland Islands', u'http://www.penguin-news.com/index.php?format=feed&amp;type=rss')]

View File

@ -0,0 +1,29 @@
# -*- coding: utf-8 -*-
from calibre.web.feeds.news import BasicNewsRecipe
class RevistaPiaui(BasicNewsRecipe):
title = u'Revista piau\xed'
language = 'pt_BR'
__author__ = u'Eduardo Gustini Simões'
oldest_article = 31
max_articles_per_feed = 50
auto_cleanup = True
feeds = [(u'Edi\xe7\xe3o Atual', u'http://revistapiaui.estadao.com.br/feed/rss/edicao-atual.xml')]
def parse_feeds (self):
feeds = BasicNewsRecipe.parse_feeds(self)
for feed in feeds:
for article in feed.articles[:]:
soup = self.index_to_soup('http://revistapiaui.estadao.com.br/feed/rss/edicao-atual.xml')
itemTitle = article.title.partition('|')[0].rstrip()
item = soup.find(text=itemTitle)
articleDescription = item.parent.parent.description.string.partition('<br />')[2]
article.summary = articleDescription
return feeds
def populate_article_metadata(self, article, soup, first):
h2 = soup.find('h2')
h2.string.replaceWith(h2.string.partition('|')[0].rstrip())
h2.replaceWith(h2.prettify() + '<p><em>' + article.summary + '</em></p><p><em>' + ' posted at ' + article.localtime.strftime('%d-%m-%Y') + '</em></p>')

View File

@ -9,285 +9,79 @@ calibre recipe for slate.com
import re
from calibre.web.feeds.recipes import BasicNewsRecipe
from calibre.ebooks.BeautifulSoup import BeautifulSoup, NavigableString, CData, Comment, Tag
class Slate(BasicNewsRecipe):
# Method variables for customizing downloads
description = 'A general-interest publication offering analysis and commentary about politics, news and culture.'
__author__ = 'GRiker, Sujata Raman and Nick Redding'
max_articles_per_feed = 100
oldest_article = 14
recursions = 0
delay = 0
simultaneous_downloads = 5
timeout = 120.0
__author__ = 'Kovid Goyal'
timefmt = ''
feeds = None
no_stylesheets = True
encoding = None
language = 'en'
title = 'Slate'
INDEX = 'http://slate.com'
encoding = 'utf-8'
preprocess_regexps = [
(re.compile(r'<!--.*?-->', re.DOTALL), lambda x: ''),
(re.compile(r'^.*?<html', re.DOTALL), lambda x:'<html'),
(re.compile(r'<meta[^>]+?/>', re.DOTALL), lambda x:''),
]
remove_tags = [
{'name':['link', 'script']},
{'class':['share-box-flank', 'sl-crumbs', 'sl-tbar',
'sl-chunky-tbar']},
]
remove_tags_after = [{'class':'sl-art-creds-cntr'}]
keep_only_tags = {'class':'sl-body-wrapper'}
remove_attributes = ['style']
slate_complete = True
if slate_complete:
title = 'Slate (complete)'
else:
title = 'Slate (weekly)'
def print_version(self, url):
return url.replace('.html', '.single.html')
# Method variables for customizing feed parsing
summary_length = 250
use_embedded_content = None
# Method variables for pre/post processing of HTML
preprocess_regexps = [ (re.compile(r'<p><em>Disclosure: <strong>Slate</strong> is owned by the Washington Post.*</p>',
re.DOTALL|re.IGNORECASE),
lambda match: ''),
(re.compile(r'<p><strong><em>Join the discussion about this story on.*</p>',
re.DOTALL|re.IGNORECASE),
lambda match: '') ]
match_regexps = []
# The second entry is for 'Big Money', which comes from a different site, uses different markup
keep_only_tags = [dict(attrs={ 'id':['article_top', 'article_body']}),
dict(attrs={ 'id':['content']}) ]
# The second entry is for 'Big Money', which comes from a different site, uses different markup
remove_tags = [dict(attrs={ 'id':['toolbox','recommend_tab','insider_ad_wrapper',
'article_bottom_tools_cntr','fray_article_discussion','fray_article_links','bottom_sponsored_links','author_bio',
'bizbox_links_bottom','ris_links_wrapper','BOXXLE',
'comments_button','add_comments_button','comments-to-fray','marriott_ad',
'article_bottom_tools','recommend_tab2','fbog_article_bottom_cntr']}),
dict(attrs={ 'id':['content-top','service-links-bottom','hed']}) ]
excludedDescriptionKeywords = ['Slate V','Twitter feed','podcast']
excludedTitleKeywords = ['Gabfest','Slate V','on Twitter']
excludedAuthorKeywords = []
excludedContentKeywords = ['http://twitter.com/Slate']
extra_css = '''
.h1_subhead{font-family:Arial; font-size:small; }
h1{font-family:Verdana; font-size:large; }
.byline {font-family:Georgia; margin-bottom: 0px; }
.dateline {font-family:Arial; font-size: smaller; height: 0pt;}
.imagewrapper {font-family:Verdana;font-size:x-small; }
.source {font-family:Verdana; font-size:x-small;}
.credit {font-family:Verdana; font-size: smaller;}
#article_body {font-family:Verdana; }
#content {font-family:Arial; }
.caption{font-family:Verdana;font-style:italic; font-size:x-small;}
h3{font-family:Arial; font-size:small}
'''
# Local variables to extend class
baseURL = 'http://slate.com'
section_dates = []
# class extension methods
def tag_to_strings(self, tag):
if not tag:
return ''
if isinstance(tag, basestring):
return tag
strings = []
for item in tag.contents:
if isinstance(item, (NavigableString, CData)):
strings.append(item.string)
elif isinstance(item, Tag):
res = self.tag_to_string(item,use_alt=False)
if res:
strings.append(res)
return strings
def extract_named_sections(self):
soup = self.index_to_soup( self.baseURL )
soup_nav_bar = soup.find(True, attrs={'id':'nav'})
briefing_nav = soup.find('li')
briefing_url = briefing_nav.a['href']
for section_nav in soup_nav_bar.findAll('li'):
section_name = self.tag_to_string(section_nav,use_alt=False)
self.section_dates.append(section_name)
soup = self.index_to_soup(briefing_url)
self.log("Briefing url = %s " % briefing_url)
section_lists = soup.findAll('ul','view_links_list')
sections = []
for section in section_lists :
sections.append(section)
return sections
def extract_dated_sections(self):
soup = self.index_to_soup( self.baseURL )
soup_top_stories = soup.find(True, attrs={'id':'tap3_cntr'})
if soup_top_stories:
self.section_dates.append("Top Stories")
self.log("SELECTION TOP STORIES %s" % "Top Stories")
soup = soup.find(True, attrs={'id':'toc_links_container'})
todays_section = soup.find(True, attrs={'class':'todaydateline'})
self.section_dates.append(self.tag_to_string(todays_section,use_alt=False))
self.log("SELECTION DATE %s" % self.tag_to_string(todays_section,use_alt=False))
older_section_dates = soup.findAll(True, attrs={'class':'maindateline'})
for older_section in older_section_dates :
self.section_dates.append(self.tag_to_string(older_section,use_alt=False))
self.log("SELECTION DATE %s" % self.tag_to_string(older_section,use_alt=False))
if soup_top_stories:
headline_stories = soup_top_stories
self.log("HAVE top_stories")
else:
headline_stories = None
self.log("NO top_stories")
section_lists = soup.findAll('ul')
# Prepend the headlines to the first section
if headline_stories:
section_lists.insert(0,headline_stories)
sections = []
for section in section_lists :
sections.append(section)
return sections
def extract_section_articles(self, sections_html) :
# Find the containers with section content
sections = sections_html
articles = {}
key = None
def parse_index(self) :
ans = []
for (i,section) in enumerate(sections) :
# Get the section name
if section.has_key('id') :
self.log("PROCESSING SECTION id = %s" % section['id'])
key = self.section_dates[i]
if key.startswith("Pod"):
continue
if key.startswith("Blog"):
continue
articles[key] = []
ans.append(key)
elif self.slate_complete:
key = self.section_dates[i]
if key.startswith("Pod"):
continue
if key.startswith("Blog"):
continue
self.log("PROCESSING SECTION name = %s" % key)
articles[key] = []
ans.append(key)
else :
self.log("SECTION %d HAS NO id" % i);
continue
# Get the section article_list
article_list = section.findAll('li')
# Extract the article attributes
for article in article_list :
bylines = self.tag_to_strings(article)
url = article.a['href']
title = bylines[0]
full_title = self.tag_to_string(article,use_alt=False)
#self.log("ARTICLE TITLE%s" % title)
#self.log("ARTICLE FULL_TITLE%s" % full_title)
#self.log("URL %s" % url)
author = None
description = None
pubdate = None
if len(bylines) == 2 and self.tag_to_string(article).find("Today's Papers") > 0 :
description = "A summary of what's in the major U.S. newspapers."
if len(bylines) == 3 :
author = bylines[2].strip()
author = re.sub('[\r][\n][\t][\t\t]','', author)
author = re.sub(',','', author)
if bylines[1] is not None :
description = bylines[1]
full_byline = self.tag_to_string(article)
if full_byline.find('major U.S. newspapers') > 0 :
description = "A summary of what's in the major U.S. newspapers."
if len(bylines) > 3 and author is not None:
author += " | "
for (i,substring) in enumerate(bylines[3:]) :
#print "substring: %s" % substring.encode('cp1252')
author += substring.strip()
if i < len(bylines[3:]) :
author += " | "
# Skip articles whose descriptions contain excluded keywords
if description is not None and len(self.excludedDescriptionKeywords):
excluded = re.compile('|'.join(self.excludedDescriptionKeywords))
found_excluded = excluded.search(description)
if found_excluded :
self.log(" >>> skipping %s (description keyword exclusion: %s) <<<\n" % (title, found_excluded.group(0)))
continue
# Skip articles whose title contain excluded keywords
if full_title is not None and len(self.excludedTitleKeywords):
excluded = re.compile('|'.join(self.excludedTitleKeywords))
#self.log("evaluating full_title: %s" % full_title)
found_excluded = excluded.search(full_title)
if found_excluded :
self.log(" >>> skipping %s (title keyword exclusion: %s) <<<\n" % (title, found_excluded.group(0)))
continue
# Skip articles whose author contain excluded keywords
if author is not None and len(self.excludedAuthorKeywords):
excluded = re.compile('|'.join(self.excludedAuthorKeywords))
found_excluded = excluded.search(author)
if found_excluded :
self.log(" >>> skipping %s (author keyword exclusion: %s) <<<\n" % (title, found_excluded.group(0)))
continue
skip_this_article = False
# Check to make sure we're not adding a duplicate
for article in articles[key] :
if article['url'] == url :
skip_this_article = True
self.log("SKIPPING DUP %s" % url)
break
if skip_this_article :
continue
# Build the dictionary entry for this article
feed = key
if not articles.has_key(feed) :
articles[feed] = []
articles[feed].append(dict(title=title, url=url, date=pubdate, description=description,
author=author, content=''))
#self.log("KEY %s" % feed)
#self.log("APPENDED %s" % url)
# Promote 'newspapers' to top
for (i,article) in enumerate(articles[feed]) :
if article['description'] is not None :
if article['description'].find('newspapers') > 0 :
articles[feed].insert(0,articles[feed].pop(i))
ans = [(key, articles[key]) for key in ans if articles.has_key(key)]
for sectitle, url in (
('News & Politics', '/articles/news_and_politics.html'),
('Technology', '/articles/technology.html'),
('Business', '/articles/business.html'),
('Arts', '/articles/arts.html'),
('Life', '/articles/life.html'),
('Health & Science', '/articles/health_and_science.html'),
('Sports', '/articles/sports.html'),
('Double X', '/articles/double_x.html'),
):
url = self.INDEX + url
self.log('Found section:', sectitle)
articles = self.slate_section_articles(self.index_to_soup(url))
if articles:
ans.append((sectitle, articles))
return ans
def print_version(self, url) :
return url + 'pagenum/all/'
# Class methods
def parse_index(self) :
if self.slate_complete:
sections = self.extract_named_sections()
else:
sections = self.extract_dated_sections()
section_list = self.extract_section_articles(sections)
return section_list
def slate_section_articles(self, soup):
cont = soup.find('div', id='most_read')
seen = set()
ans = []
for h4 in cont.findAll('h4'):
a = h4.find('a', href=True)
if a is None: continue
url = a['href']
if url.startswith('/'):
url = self.INDEX + url
if url in seen: continue
seen.add(url)
title = self.tag_to_string(a)
parent = h4.parent
h3 = parent.find('h3')
desc = ''
if h3 is not None:
desc = self.tag_to_string(h3)
a = parent.find('a', rel='author')
if a is not None:
a = self.tag_to_string(a)
art = {'title':title, 'description':desc, 'date':'', 'url':url}
if a:
art['author'] = a
self.log('\tFound article:', title, ' by ', a)
ans.append(art)
return ans
def get_masthead_url(self):
masthead = 'http://img.slate.com/images/redesign2008/slate_logo.gif'
@ -299,153 +93,4 @@ class Slate(BasicNewsRecipe):
masthead = None
return masthead
def stripAnchors(self,soup):
body = soup.find('div',attrs={'id':['article_body','content']})
if body is not None:
paras = body.findAll('p')
if paras is not None:
for para in paras:
aTags = para.findAll('a')
if aTags is not None:
for a in aTags:
if a.img is None:
#print repr(a.renderContents())
a.replaceWith(a.renderContents().decode('utf-8','replace'))
return soup
def preprocess_html(self, soup) :
# Remove 'grayPlus4.png' images
imgs = soup.findAll('img')
if imgs is not None:
for img in imgs:
if re.search("grayPlus4.png",str(img)):
img.extract()
# Delete article based upon content keywords
if len(self.excludedDescriptionKeywords):
excluded = re.compile('|'.join(self.excludedContentKeywords))
found_excluded = excluded.search(str(soup))
if found_excluded :
print "No allowed content found, removing article"
raise Exception('Rejected article')
# Articles from www.thebigmoney.com use different tagging for byline, dateline and body
head = soup.find('head')
if head.link is not None and re.search('www\.thebigmoney\.com', str(head)):
byline = soup.find('div',attrs={'id':'byline'})
if byline is not None:
byline['class'] = byline['id']
dateline = soup.find('div',attrs={'id':'dateline'})
if dateline is not None:
dateline['class'] = dateline['id']
body = soup.find('div',attrs={'id':'content'})
if body is not None:
body['class'] = 'article_body'
# Synthesize a department kicker
h3Tag = Tag(soup,'h3')
emTag = Tag(soup,'em')
emTag.insert(0,NavigableString("the big money: Today's business press"))
h3Tag.insert(0,emTag)
soup.body.insert(0,h3Tag)
# Strip anchors from HTML
return self.stripAnchors(soup)
def postprocess_html(self, soup, first_fetch) :
# Fix up dept_kicker as <h3><em>
dept_kicker = soup.find('div', attrs={'class':'department_kicker'})
if dept_kicker is not None :
kicker_strings = self.tag_to_strings(dept_kicker)
kicker = ''.join(kicker_strings[2:])
kicker = re.sub('\.','',kicker)
h3Tag = Tag(soup, "h3")
emTag = Tag(soup, "em")
emTag.insert(0,NavigableString(kicker))
h3Tag.insert(0, emTag)
dept_kicker.replaceWith(h3Tag)
else:
self.log("No kicker--return null")
return None
# Fix up the concatenated byline and dateline
byline = soup.find(True,attrs={'class':'byline'})
if byline is not None :
bylineTag = Tag(soup,'div')
bylineTag['class'] = 'byline'
#bylineTag['height'] = '0em'
bylineTag.insert(0,self.tag_to_string(byline))
byline.replaceWith(bylineTag)
dateline = soup.find(True, attrs={'class':'dateline'})
if dateline is not None :
datelineTag = Tag(soup, 'div')
datelineTag['class'] = 'dateline'
#datelineTag['margin-top'] = '0em'
datelineTag.insert(0,self.tag_to_string(dateline))
dateline.replaceWith(datelineTag)
# Change captions to italic, add <hr>
for caption in soup.findAll(True, {'class':'caption'}) :
if caption is not None:
emTag = Tag(soup, "em")
emTag.insert(0, '<br />' + self.tag_to_string(caption))
hrTag = Tag(soup, 'hr')
emTag.insert(1, hrTag)
caption.replaceWith(emTag)
# Fix photos
for photo in soup.findAll('span',attrs={'class':'imagewrapper'}):
if photo.a is not None and photo.a.img is not None:
divTag = Tag(soup,'div')
divTag['class'] ='imagewrapper'
divTag.insert(0,photo.a.img)
photo.replaceWith(divTag)
return soup
def postprocess_book(self, oeb, opts, log) :
def extract_byline(href) :
soup = BeautifulSoup(str(oeb.manifest.hrefs[href]))
byline = soup.find(True,attrs={'class':'byline'})
if byline is not None:
return self.tag_to_string(byline,use_alt=False)
else :
return None
def extract_description(href) :
soup = BeautifulSoup(str(oeb.manifest.hrefs[href]))
paragraphs = soup.findAll('p')
for p in paragraphs :
if self.tag_to_string(p,use_alt=False).startswith('By ') or \
self.tag_to_string(p,use_alt=False).startswith('Posted '):
continue
comment = p.find(text=lambda text:isinstance(text, Comment))
if comment is not None:
continue
else:
return self.tag_to_string(p,use_alt=False)[:self.summary_length] + '...'
return None
# Method entry point here
# Single section toc looks different than multi-section tocs
if oeb.toc.depth() == 2 :
for article in oeb.toc :
if article.author is None :
article.author = extract_byline(article.href)
if article.description is None :
article.description = extract_description(article.href)
elif oeb.toc.depth() == 3 :
for section in oeb.toc :
for article in section :
if article.author is None :
article.author = extract_byline(article.href)
if article.description is None :
article.description = extract_description(article.href)

recipes/wow.recipe Normal file
View File

@ -0,0 +1,17 @@
from calibre.web.feeds.news import BasicNewsRecipe
class WoW(BasicNewsRecipe):
title = u'WoW Insider'
language = 'en'
__author__ = 'Krittika Goyal'
oldest_article = 1 #days
max_articles_per_feed = 25
use_embedded_content = False
no_stylesheets = True
auto_cleanup = True
feeds = [
('WoW',
'http://wow.joystiq.com/rss.xml')
]

View File

@ -225,7 +225,10 @@ except:
try:
HOST=get_ip_address('wlan0')
except:
HOST='192.168.1.2'
try:
HOST=get_ip_address('ppp0')
except:
HOST='192.168.1.2'
PROJECT=os.path.basename(os.path.abspath('.'))

View File

@ -336,7 +336,7 @@ class Build(Command):
oinc = ['/Fo'+obj] if iswindows else ['-o', obj]
cmd = [compiler] + cflags + ext.cflags + einc + sinc + oinc
self.info(' '.join(cmd))
subprocess.check_call(cmd)
self.check_call(cmd)
dest = self.dest(ext)
elib = self.lib_dirs_to_ldflags(ext.lib_dirs)
@ -350,18 +350,32 @@ class Build(Command):
else:
cmd += objects + ext.extra_objs + ['-o', dest] + ldflags + ext.ldflags + elib + xlib
self.info('\n\n', ' '.join(cmd), '\n\n')
subprocess.check_call(cmd)
self.check_call(cmd)
if iswindows:
#manifest = dest+'.manifest'
#cmd = [MT, '-manifest', manifest, '-outputresource:%s;2'%dest]
#self.info(*cmd)
#subprocess.check_call(cmd)
#self.check_call(cmd)
#os.remove(manifest)
for x in ('.exp', '.lib'):
x = os.path.splitext(dest)[0]+x
if os.path.exists(x):
os.remove(x)
def check_call(self, *args, **kwargs):
"""print cmdline if an error occured
If something is missing (qmake e.g.) you get a non-informative error
self.check_call(qmc + [ext.name+'.pro'])
so you would have to look a the source to see the actual command.
"""
try:
subprocess.check_call(*args, **kwargs)
except:
cmdline = ' '.join(['"%s"' % (arg) if ' ' in arg else arg for arg in args[0]])
print "Error while executing: %s\n" % (cmdline)
raise
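A quick illustration of the quoting rule in the error path above (the command is invented):
args = (['gcc', '-o', 'my prog', 'main.c'],)
cmdline = ' '.join(['"%s"' % (arg) if ' ' in arg else arg for arg in args[0]])
print cmdline   # -> gcc -o "my prog" main.c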
def build_qt_objects(self, ext):
obj_pat = 'release\\*.obj' if iswindows else '*.o'
objects = glob.glob(obj_pat)
@ -380,8 +394,8 @@ class Build(Command):
qmc = [QMAKE, '-o', 'Makefile']
if iswindows:
qmc += ['-spec', 'win32-msvc2008']
subprocess.check_call(qmc + [ext.name+'.pro'])
subprocess.check_call([make, '-f', 'Makefile'])
self.check_call(qmc + [ext.name+'.pro'])
self.check_call([make, '-f', 'Makefile'])
objects = glob.glob(obj_pat)
return list(map(self.a, objects))
@ -407,7 +421,7 @@ class Build(Command):
cmd = [pyqt.sip_bin+exe, '-w', '-c', src_dir, '-b', sbf, '-I'+\
pyqt.pyqt_sip_dir] + shlex.split(pyqt.pyqt_sip_flags) + [sipf]
self.info(' '.join(cmd))
subprocess.check_call(cmd)
self.check_call(cmd)
module = self.j(src_dir, self.b(dest))
if self.newer(dest, [sbf]+qt_objects):
mf = self.j(src_dir, 'Makefile')
@ -417,7 +431,7 @@ class Build(Command):
makefile.extra_include_dirs = ext.inc_dirs
makefile.generate()
subprocess.check_call([make, '-f', mf], cwd=src_dir)
self.check_call([make, '-f', mf], cwd=src_dir)
shutil.copy2(module, dest)
def clean(self):
@ -457,7 +471,7 @@ class BuildPDF2XML(Command):
cmd += ['-I'+x for x in poppler_inc_dirs+magick_inc_dirs]
cmd += ['/Fo'+obj, src]
self.info(*cmd)
subprocess.check_call(cmd)
self.check_call(cmd)
objects.append(obj)
if self.newer(dest, objects):
@ -470,7 +484,7 @@ class BuildPDF2XML(Command):
png_libs+magick_libs+poppler_libs+ft_libs+jpg_libs+pdfreflow_libs]
cmd += ['/OUT:'+dest] + objects
self.info(*cmd)
subprocess.check_call(cmd)
self.check_call(cmd)
self.info('Binary installed as', dest)

View File

@ -20,17 +20,23 @@ for x in [
EXCLUDES.extend(['--exclude', x])
SAFE_EXCLUDES = ['"%s"'%x if '*' in x else x for x in EXCLUDES]
def get_rsync_pw():
return open('/home/kovid/work/kde/conf/buildbot').read().partition(
':')[-1].strip()
class Rsync(Command):
description = 'Sync source tree from development machine'
SYNC_CMD = ' '.join(BASE_RSYNC+SAFE_EXCLUDES+
['rsync://{host}/work/{project}', '..'])
['rsync://buildbot@{host}/work/{project}', '..'])
def run(self, opts):
cmd = self.SYNC_CMD.format(host=HOST, project=PROJECT)
env = dict(os.environ)
env['RSYNC_PASSWORD'] = get_rsync_pw()
self.info(cmd)
subprocess.check_call(cmd, shell=True)
subprocess.check_call(cmd, shell=True, env=env)
class Push(Command):
@ -81,7 +87,8 @@ class VMInstaller(Command):
def get_build_script(self):
ans = '\n'.join(self.BUILD_PREFIX)+'\n\n'
rs = ['export RSYNC_PASSWORD=%s'%get_rsync_pw()]
ans = '\n'.join(self.BUILD_PREFIX + rs)+'\n\n'
ans += ' && \\\n'.join(self.BUILD_RSYNC) + ' && \\\n'
ans += ' && \\\n'.join(self.BUILD_CLEAN) + ' && \\\n'
ans += ' && \\\n'.join(self.BUILD_BUILD) + ' && \\\n'

View File

@ -278,6 +278,8 @@ def get_proxies(debug=True):
continue
if proxy.startswith(key+'://'):
proxy = proxy[len(key)+3:]
if key == 'https' and proxy.startswith('http://'):
proxy = proxy[7:]
if proxy.endswith('/'):
proxy = proxy[:-1]
if len(proxy) > 4:
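A sketch of the normalization this hunk completes, assuming an https_proxy value that carries an http:// scheme (the host is illustrative):
# Illustrative: https_proxy=http://proxy.example.com:3128/
proxy, key = 'http://proxy.example.com:3128/', 'https'
if proxy.startswith(key+'://'):
    proxy = proxy[len(key)+3:]
if key == 'https' and proxy.startswith('http://'):
    proxy = proxy[7:]           # the new check strips the mismatched scheme
if proxy.endswith('/'):
    proxy = proxy[:-1]
print proxy                     # -> proxy.example.com:3128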

View File

@ -502,6 +502,7 @@ class TXTZMetadataWriter(MetadataWriterPlugin):
# }}}
from calibre.ebooks.comic.input import ComicInput
from calibre.ebooks.djvu.input import DJVUInput
from calibre.ebooks.epub.input import EPUBInput
from calibre.ebooks.fb2.input import FB2Input
from calibre.ebooks.html.input import HTMLInput
@ -555,7 +556,8 @@ from calibre.devices.irexdr.driver import IREXDR1000, IREXDR800
from calibre.devices.jetbook.driver import JETBOOK, MIBUK, JETBOOK_MINI
from calibre.devices.kindle.driver import KINDLE, KINDLE2, KINDLE_DX
from calibre.devices.nook.driver import NOOK, NOOK_COLOR
from calibre.devices.prs505.driver import PRS505, PRST1
from calibre.devices.prs505.driver import PRS505
from calibre.devices.prst1.driver import PRST1
from calibre.devices.user_defined.driver import USER_DEFINED
from calibre.devices.android.driver import ANDROID, S60, WEBOS
from calibre.devices.nokia.driver import N770, N810, E71X, E52
@ -599,6 +601,7 @@ plugins += [GoogleBooks, Amazon, OpenLibrary, ISBNDB, OverDrive, Douban, Ozon]
plugins += [
ComicInput,
DJVUInput,
EPUBInput,
FB2Input,
HTMLInput,
@ -1143,6 +1146,16 @@ class StoreAmazonDEKindleStore(StoreBase):
formats = ['KINDLE']
affiliate = True
class StoreAmazonFRKindleStore(StoreBase):
name = 'Amazon FR Kindle'
author = 'Charles Haley'
description = u'Tous les ebooks Kindle'
actual_plugin = 'calibre.gui2.store.stores.amazon_fr_plugin:AmazonFRKindleStore'
headquarters = 'DE'
formats = ['KINDLE']
affiliate = True
class StoreAmazonUKKindleStore(StoreBase):
name = 'Amazon UK Kindle'
author = 'Charles Haley'
@ -1520,6 +1533,7 @@ plugins += [
StoreArchiveOrgStore,
StoreAmazonKindleStore,
StoreAmazonDEKindleStore,
StoreAmazonFRKindleStore,
StoreAmazonUKKindleStore,
StoreBaenWebScriptionStore,
StoreBNStore,

View File

@ -4,7 +4,6 @@ __license__ = 'GPL 3'
__copyright__ = '2009, Kovid Goyal <kovid@kovidgoyal.net>'
__docformat__ = 'restructuredtext en'
import sys
from itertools import izip
from xml.sax.saxutils import escape

View File

@@ -217,7 +217,7 @@ class DevicePlugin(Plugin):
'''
Unix version of :meth:`can_handle_windows`
:param device_info: Is a tupe of (vid, pid, bcd, manufacturer, product,
:param device_info: Is a tuple of (vid, pid, bcd, manufacturer, product,
serial number)
'''
@@ -414,7 +414,8 @@ class DevicePlugin(Plugin):
@classmethod
def config_widget(cls):
'''
Should return a QWidget. The QWidget contains the settings for the device interface
Should return a QWidget. The QWidget contains the settings for the
device interface
'''
raise NotImplementedError()
@@ -429,8 +430,9 @@ class DevicePlugin(Plugin):
@classmethod
def settings(cls):
'''
Should return an opts object. The opts object should have at least one attribute
`format_map` which is an ordered list of formats for the device.
Should return an opts object. The opts object should have at least one
attribute `format_map` which is an ordered list of formats for the
device.
'''
raise NotImplementedError()
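
A minimal sketch of the contract this docstring describes; illustrative only, since real drivers inherit a fuller implementation from DeviceConfig, and both class names here are made up:

class _Opts(object):
    pass

class ExampleDriver(object):
    @classmethod
    def settings(cls):
        opts = _Opts()
        # Ordered list of formats the device accepts, most preferred first
        opts.format_map = ['epub', 'pdf', 'txt']
        return opts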
@@ -516,3 +518,9 @@ class BookList(list):
'''
raise NotImplementedError()
def prepare_addable_books(self, paths):
'''
Given a list of paths, returns another list of paths. These paths
point to addable versions of the books.
'''
return paths
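
The base implementation is the identity function; a driver whose books must be unwrapped or copied off the device before they can be added would override it, roughly like this (hypothetical sketch, not a real driver):

import os, shutil
from tempfile import mkdtemp

class ExampleDriver(object):
    def prepare_addable_books(self, paths):
        # Copy each book to a temporary directory so it stays readable
        # even after the device is ejected (sketch only).
        tdir = mkdtemp()
        out = []
        for path in paths:
            dest = os.path.join(tdir, os.path.basename(path))
            shutil.copyfile(path, dest)
            out.append(dest)
        return out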

View File

@@ -299,34 +299,3 @@ class PRS505(USBMS):
f.write(metadata.thumbnail[-1])
debug_print('Cover uploaded to: %r'%cpath)
class PRST1(USBMS):
name = 'SONY PRST1 and newer Device Interface'
gui_name = 'SONY Reader'
description = _('Communicate with Sony PRST1 and newer eBook readers')
author = 'Kovid Goyal'
supported_platforms = ['windows', 'osx', 'linux']
FORMATS = ['epub', 'lrf', 'lrx', 'rtf', 'pdf', 'txt']
VENDOR_ID = [0x054c] #: SONY Vendor Id
PRODUCT_ID = [0x05c2]
BCD = [0x226]
VENDOR_NAME = 'SONY'
WINDOWS_MAIN_MEM = re.compile(
r'(PRS-T1&)'
)
THUMBNAIL_HEIGHT = 217
SCAN_FROM_ROOT = True
EBOOK_DIR_MAIN = __appname__
def windows_filter_pnp_id(self, pnp_id):
return '_LAUNCHER' in pnp_id or '_SETTING' in pnp_id
def get_carda_ebook_dir(self, for_upload=False):
if for_upload:
return __appname__
return self.EBOOK_DIR_CARD_A

View File

@@ -0,0 +1,7 @@
#!/usr/bin/env python
# vim:fileencoding=UTF-8:ts=4:sw=4:sta:et:sts=4:ai
__license__ = 'GPL v3'
__copyright__ = '2011, Kovid Goyal <kovid@kovidgoyal.net>'
__docformat__ = 'restructuredtext en'

View File

@@ -0,0 +1,481 @@
#!/usr/bin/env python
# vim:fileencoding=UTF-8:ts=4:sw=4:sta:et:sts=4:ai
from __future__ import (unicode_literals, division, absolute_import,
print_function)
__license__ = 'GPL v3'
__copyright__ = '2011, Kovid Goyal <kovid@kovidgoyal.net>'
__docformat__ = 'restructuredtext en'
'''
Device driver for the SONY T1 devices
'''
import os, time, re
import sqlite3 as sqlite
from contextlib import closing
from calibre.devices.usbms.driver import USBMS, debug_print
from calibre.devices.usbms.device import USBDevice
from calibre.devices.usbms.books import CollectionsBookList
from calibre.devices.usbms.books import BookList
from calibre.ebooks.metadata import authors_to_sort_string
from calibre.constants import islinux
DBPATH = 'Sony_Reader/database/books.db'
THUMBPATH = 'Sony_Reader/database/cache/books/%s/thumbnail/main_thumbnail.jpg'
class ImageWrapper(object):
def __init__(self, image_path):
self.image_path = image_path
class PRST1(USBMS):
name = 'SONY PRST1 and newer Device Interface'
gui_name = 'SONY Reader'
description = _('Communicate with the PRST1 and newer SONY eBook readers')
author = 'Kovid Goyal'
supported_platforms = ['windows', 'osx', 'linux']
path_sep = '/'
booklist_class = CollectionsBookList
FORMATS = ['epub', 'pdf', 'txt']
CAN_SET_METADATA = ['collections']
CAN_DO_DEVICE_DB_PLUGBOARD = True
VENDOR_ID = [0x054c] #: SONY Vendor Id
PRODUCT_ID = [0x05c2]
BCD = [0x226]
VENDOR_NAME = 'SONY'
WINDOWS_MAIN_MEM = re.compile(
r'(PRS-T1&)'
)
WINDOWS_CARD_A_MEM = re.compile(
r'(PRS-T1__SD&)'
)
MAIN_MEMORY_VOLUME_LABEL = 'SONY Reader Main Memory'
STORAGE_CARD_VOLUME_LABEL = 'SONY Reader Storage Card'
THUMBNAIL_HEIGHT = 144
SUPPORTS_SUB_DIRS = True
SUPPORTS_USE_AUTHOR_SORT = True
MUST_READ_METADATA = True
EBOOK_DIR_MAIN = 'Sony_Reader/media/books'
EXTRA_CUSTOMIZATION_MESSAGE = [
_('Comma separated list of metadata fields '
'to turn into collections on the device. Possibilities include: ')+\
'series, tags, authors',
_('Upload separate cover thumbnails for books') +
':::'+_('Normally, the SONY readers get the cover image from the'
' ebook file itself. With this option, calibre will send a '
'separate cover image to the reader, useful if you are '
'sending DRMed books in which you cannot change the cover.'),
_('Refresh separate covers when using automatic management') +
':::' +
_('Set this option to have separate book covers uploaded '
'every time you connect your device. Unset this option if '
'you have so many books on the reader that performance is '
'unacceptable.'),
_('Preserve cover aspect ratio when building thumbnails') +
':::' +
_('Set this option if you want the cover thumbnails to have '
'the same aspect ratio (width to height) as the cover. '
'Unset it if you want the thumbnail to be the maximum size, '
'ignoring aspect ratio.'),
]
EXTRA_CUSTOMIZATION_DEFAULT = [
', '.join(['series', 'tags']),
True,
False,
True,
]
OPT_COLLECTIONS = 0
OPT_UPLOAD_COVERS = 1
OPT_REFRESH_COVERS = 2
OPT_PRESERVE_ASPECT_RATIO = 3
plugboards = None
plugboard_func = None
def post_open_callback(self):
# Set the thumbnail width to the theoretical max if the user has asked
# that we do not preserve aspect ratio
if not self.settings().extra_customization[self.OPT_PRESERVE_ASPECT_RATIO]:
self.THUMBNAIL_WIDTH = 108
def windows_filter_pnp_id(self, pnp_id):
return '_LAUNCHER' in pnp_id or '_SETTING' in pnp_id
def get_carda_ebook_dir(self, for_upload=False):
if for_upload:
return self.EBOOK_DIR_MAIN
return self.EBOOK_DIR_CARD_A
def get_main_ebook_dir(self, for_upload=False):
if for_upload:
return self.EBOOK_DIR_MAIN
return ''
def can_handle(self, devinfo, debug=False):
if islinux:
dev = USBDevice(devinfo)
main, carda, cardb = self.find_device_nodes(detected_device=dev)
if main is None and carda is None and cardb is None:
if debug:
print ('\tPRS-T1: Appears to be in non-data mode'
' or was ejected, ignoring')
return False
return True
def books(self, oncard=None, end_session=True):
dummy_bl = BookList(None, None, None)
if (
(oncard == 'carda' and not self._card_a_prefix) or
(oncard and oncard != 'carda')
):
self.report_progress(1.0, _('Getting list of books on device...'))
return dummy_bl
prefix = self._card_a_prefix if oncard == 'carda' else self._main_prefix
# Let parent driver get the books
self.booklist_class.rebuild_collections = self.rebuild_collections
bl = USBMS.books(self, oncard=oncard, end_session=end_session)
dbpath = self.normalize_path(prefix + DBPATH)
debug_print("SQLite DB Path: " + dbpath)
with closing(sqlite.connect(dbpath)) as connection:
# Replace undecodable characters in the db instead of erroring out
connection.text_factory = lambda x: unicode(x, "utf-8", "replace")
cursor = connection.cursor()
# Query collections
query = '''
SELECT books._id, collection.title
FROM collections
LEFT OUTER JOIN books
LEFT OUTER JOIN collection
WHERE collections.content_id = books._id AND
collections.collection_id = collection._id
'''
cursor.execute(query)
bl_collections = {}
for i, row in enumerate(cursor):
bl_collections.setdefault(row[0], [])
bl_collections[row[0]].append(row[1])
for idx, book in enumerate(bl):
query = 'SELECT _id, thumbnail FROM books WHERE file_path = ?'
t = (book.lpath,)
cursor.execute(query, t)
for i, row in enumerate(cursor):
book.device_collections = bl_collections.get(row[0], None)
thumbnail = row[1]
if thumbnail is not None:
thumbnail = self.normalize_path(prefix + thumbnail)
book.thumbnail = ImageWrapper(thumbnail)
cursor.close()
return bl
def set_plugboards(self, plugboards, pb_func):
self.plugboards = plugboards
self.plugboard_func = pb_func
def sync_booklists(self, booklists, end_session=True):
debug_print('PRST1: starting sync_booklists')
opts = self.settings()
if opts.extra_customization:
collections = [x.strip() for x in
opts.extra_customization[self.OPT_COLLECTIONS].split(',')]
else:
collections = []
debug_print('PRST1: collection fields:', collections)
if booklists[0] is not None:
self.update_device_database(booklists[0], collections, None)
if booklists[1] is not None:
self.update_device_database(booklists[1], collections, 'carda')
USBMS.sync_booklists(self, booklists, end_session=end_session)
debug_print('PRST1: finished sync_booklists')
def update_device_database(self, booklist, collections_attributes, oncard):
debug_print('PRST1: starting update_device_database')
plugboard = None
if self.plugboard_func:
plugboard = self.plugboard_func(self.__class__.__name__,
'device_db', self.plugboards)
debug_print("PRST1: Using Plugboard", plugboard)
prefix = self._card_a_prefix if oncard == 'carda' else self._main_prefix
if prefix is None:
# Reader has no sd card inserted
return
source_id = 1 if oncard == 'carda' else 0
dbpath = self.normalize_path(prefix + DBPATH)
debug_print("SQLite DB Path: " + dbpath)
collections = booklist.get_collections(collections_attributes)
with closing(sqlite.connect(dbpath)) as connection:
self.update_device_books(connection, booklist, source_id, plugboard)
self.update_device_collections(connection, booklist, collections, source_id)
debug_print('PRST1: finished update_device_database')
def update_device_books(self, connection, booklist, source_id, plugboard):
opts = self.settings()
upload_covers = opts.extra_customization[self.OPT_UPLOAD_COVERS]
refresh_covers = opts.extra_customization[self.OPT_REFRESH_COVERS]
cursor = connection.cursor()
# Get existing books
query = 'SELECT file_path, _id FROM books'
cursor.execute(query)
db_books = {}
for i, row in enumerate(cursor):
lpath = row[0].replace('\\', '/')
db_books[lpath] = row[1]
for book in booklist:
# Run through plugboard if needed
if plugboard is not None:
newmi = book.deepcopy_metadata()
newmi.template_to_attribute(book, plugboard)
else:
newmi = book
# Get Metadata We Want
lpath = book.lpath
try:
if opts.use_author_sort:
if newmi.author_sort:
author = newmi.author_sort
else:
author = authors_to_sort_string(newmi.authors)
else:
author = newmi.authors[0]
except:
author = _('Unknown')
title = newmi.title or _('Unknown')
# Get modified date
modified_date = os.path.getmtime(book.path)
time_offset = time.altzone if time.daylight else time.timezone
modified_date = (modified_date - time_offset) * 1000
if lpath not in db_books:
query = '''
INSERT INTO books
(title, author, source_id, added_date, modified_date,
file_path, file_name, file_size, mime_type, corrupted,
prevent_delete)
values (?,?,?,?,?,?,?,?,?,0,0)
'''
t = (title, author, source_id, int(time.time() * 1000),
modified_date, lpath,
os.path.basename(lpath), book.size, book.mime)
cursor.execute(query, t)
book.bookId = cursor.lastrowid
if upload_covers:
self.upload_book_cover(connection, book, source_id)
debug_print('Inserted New Book: ' + book.title)
else:
query = '''
UPDATE books
SET title = ?, author = ?, modified_date = ?, file_size = ?
WHERE file_path = ?
'''
t = (title, author, modified_date, book.size, lpath)
cursor.execute(query, t)
book.bookId = db_books[lpath]
if refresh_covers:
self.upload_book_cover(connection, book, source_id)
db_books[lpath] = None
for book, bookId in db_books.items():
if bookId is not None:
# Remove From Collections
query = 'DELETE FROM collections WHERE content_id = ?'
t = (bookId,)
cursor.execute(query, t)
# Remove from Books
query = 'DELETE FROM books where _id = ?'
t = (bookId,)
cursor.execute(query, t)
debug_print('Deleted Book: ' + book)
connection.commit()
cursor.close()
def update_device_collections(self, connection, booklist, collections,
source_id):
cursor = connection.cursor()
if collections:
# Get existing collections
query = 'SELECT _id, title FROM collection'
cursor.execute(query)
db_collections = {}
for i, row in enumerate(cursor):
db_collections[row[1]] = row[0]
for collection, books in collections.items():
if collection not in db_collections:
query = 'INSERT INTO collection (title, source_id) VALUES (?,?)'
t = (collection, source_id)
cursor.execute(query, t)
db_collections[collection] = cursor.lastrowid
debug_print('Inserted New Collection: ' + collection)
# Get existing books in collection
query = '''
SELECT books.file_path, content_id
FROM collections
LEFT OUTER JOIN books
WHERE collection_id = ? AND books._id = collections.content_id
'''
t = (db_collections[collection],)
cursor.execute(query, t)
db_books = {}
for i, row in enumerate(cursor):
db_books[row[0]] = row[1]
for idx, book in enumerate(books):
if collection not in book.device_collections:
book.device_collections.append(collection)
if db_books.get(book.lpath, None) is None:
query = '''
INSERT INTO collections (collection_id, content_id,
added_order) values (?,?,?)
'''
t = (db_collections[collection], book.bookId, idx)
cursor.execute(query, t)
debug_print('Inserted Book Into Collection: ' +
book.title + ' -> ' + collection)
else:
query = '''
UPDATE collections
SET added_order = ?
WHERE content_id = ? AND collection_id = ?
'''
t = (idx, book.bookId, db_collections[collection])
cursor.execute(query, t)
db_books[book.lpath] = None
for bookPath, bookId in db_books.items():
if bookId is not None:
query = ('DELETE FROM collections '
'WHERE content_id = ? AND collection_id = ? ')
t = (bookId, db_collections[collection],)
cursor.execute(query, t)
debug_print('Deleted Book From Collection: ' + bookPath
+ ' -> ' + collection)
db_collections[collection] = None
for collection, collectionId in db_collections.items():
if collectionId is not None:
# Remove Books from Collection
query = ('DELETE FROM collections '
'WHERE collection_id = ?')
t = (collectionId,)
cursor.execute(query, t)
# Remove Collection
query = ('DELETE FROM collection '
'WHERE _id = ?')
t = (collectionId,)
cursor.execute(query, t)
debug_print('Deleted Collection: ' + collection)
connection.commit()
cursor.close()
def rebuild_collections(self, booklist, oncard):
debug_print('PRST1: starting rebuild_collections')
opts = self.settings()
if opts.extra_customization:
collections = [x.strip() for x in
opts.extra_customization[self.OPT_COLLECTIONS].split(',')]
else:
collections = []
debug_print('PRST1: collection fields:', collections)
self.update_device_database(booklist, collections, oncard)
debug_print('PRST1: finished rebuild_collections')
def upload_cover(self, path, filename, metadata, filepath):
debug_print('PRS-T1: uploading cover')
if filepath.startswith(self._main_prefix):
prefix = self._main_prefix
source_id = 0
else:
prefix = self._card_a_prefix
source_id = 1
metadata.lpath = filepath.partition(prefix)[2]
metadata.lpath = metadata.lpath.replace('\\', '/')
dbpath = self.normalize_path(prefix + DBPATH)
debug_print("SQLite DB Path: " + dbpath)
with closing(sqlite.connect(dbpath)) as connection:
cursor = connection.cursor()
query = 'SELECT _id FROM books WHERE file_path = ?'
t = (metadata.lpath,)
cursor.execute(query, t)
for i, row in enumerate(cursor):
metadata.bookId = row[0]
cursor.close()
if getattr(metadata, 'bookId', None) is not None:
debug_print('PRS-T1: refreshing cover for book being sent')
self.upload_book_cover(connection, metadata, source_id)
debug_print('PRS-T1: done uploading cover')
def upload_book_cover(self, connection, book, source_id):
debug_print('PRST1: Uploading/Refreshing Cover for ' + book.title)
if not book.thumbnail or not book.thumbnail[-1]:
return
cursor = connection.cursor()
thumbnail_path = THUMBPATH%book.bookId
prefix = self._main_prefix if source_id == 0 else self._card_a_prefix
thumbnail_file_path = os.path.join(prefix, *thumbnail_path.split('/'))
thumbnail_dir_path = os.path.dirname(thumbnail_file_path)
if not os.path.exists(thumbnail_dir_path):
os.makedirs(thumbnail_dir_path)
with open(thumbnail_file_path, 'wb') as f:
f.write(book.thumbnail[-1])
query = 'UPDATE books SET thumbnail = ? WHERE _id = ?'
t = (thumbnail_path, book.bookId,)
cursor.execute(query, t)
connection.commit()
cursor.close()
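
One detail of update_device_books() worth spelling out: the Reader's database appears to store timestamps as milliseconds since the epoch in local time, so the file mtime is shifted by the timezone offset before scaling. The same arithmetic standalone, with a made-up mtime:

import time

mtime = 1318000000  # seconds since the epoch (UTC), e.g. os.path.getmtime()
time_offset = time.altzone if time.daylight else time.timezone
reader_timestamp = (mtime - time_offset) * 1000  # local-time milliseconds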

View File

@@ -483,7 +483,7 @@ class Device(DeviceConfig, DevicePlugin):
self._card_a_prefix = get_card_prefix('carda')
self._card_b_prefix = get_card_prefix('cardb')
def find_device_nodes(self):
def find_device_nodes(self, detected_device=None):
def walk(base):
base = os.path.abspath(os.path.realpath(base))
@@ -507,8 +507,11 @@ class Device(DeviceConfig, DevicePlugin):
d, j = os.path.dirname, os.path.join
usb_dir = None
if detected_device is None:
detected_device = self.detected_device
def test(val, attr):
q = getattr(self.detected_device, attr)
q = getattr(detected_device, attr)
return q == val
for x, isfile in walk('/sys/devices'):
@@ -596,6 +599,8 @@ class Device(DeviceConfig, DevicePlugin):
label = self.STORAGE_CARD2_VOLUME_LABEL
if not label:
label = self.STORAGE_CARD_VOLUME_LABEL + ' 2'
if not label:
label = 'E-book Reader (%s)'%type
extra = 0
while True:
q = ' (%d)'%extra if extra else ''
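
The loop that follows (truncated here) appears to append ' (1)', ' (2)', and so on until the label is unique among the labels already assigned. A compact sketch of that uniquification, under that assumption:

def uniquify_label(label, in_use):
    extra = 0
    while True:
        q = ' (%d)' % extra if extra else ''
        if label + q not in in_use:
            return label + q
        extra += 1

assert uniquify_label('Reader', {'Reader', 'Reader (1)'}) == 'Reader (2)'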

View File

@@ -0,0 +1,12 @@
#!/usr/bin/env python
from __future__ import (unicode_literals, division, absolute_import,
print_function)
__license__ = 'GPL v3'
__copyright__ = '2011, Anthon van der Neut <anthon@mnt.org>'
__docformat__ = 'restructuredtext en'
'''
Used for DJVU input
'''

View File

@@ -0,0 +1,146 @@
#! /usr/bin/env python
# coding: utf-8
from __future__ import (unicode_literals, division, absolute_import,
print_function)
__license__ = 'GPL v3'
__copyright__ = '2011, Anthon van der Neut <A.van.der.Neut@ruamel.eu>'
# this code is based on:
# Lizardtech DjVu Reference
# DjVu v3
# November 2005
import sys
import struct
from cStringIO import StringIO
from .djvubzzdec import BZZDecoder
class DjvuChunk(object):
def __init__(self, buf, start, end, align=True, bigendian=True,
inclheader=False, verbose=0):
self.subtype = None
self._subchunks = []
self.buf = buf
pos = start + 4
self.type = buf[start:pos]
self.align = align # whether to align to word (2-byte) boundaries
self.headersize = 0 if inclheader else 8
if bigendian:
self.strflag = b'>'
else:
self.strflag = b'<'
oldpos, pos = pos, pos+4
self.size = struct.unpack(self.strflag+b'L', buf[oldpos:pos])[0]
self.dataend = pos + self.size - (8 if inclheader else 0)
if self.type == b'FORM':
oldpos, pos = pos, pos+4
#print oldpos, pos
self.subtype = buf[oldpos:pos]
#self.headersize += 4
self.datastart = pos
if verbose > 0:
print ('found', self.type, self.subtype, pos, self.size)
if self.type in b'FORM'.split():
if verbose > 0:
print ('processing substuff %d %d (%x)' % (pos, self.dataend,
self.dataend))
numchunks = 0
while pos < self.dataend:
x = DjvuChunk(buf, pos, start+self.size, verbose=verbose)
numchunks += 1
self._subchunks.append(x)
newpos = pos + x.size + x.headersize + (1 if (x.size % 2) else 0)
if verbose > 0:
print ('newpos %d %d (%x, %x) %d' % (newpos, self.dataend,
newpos, self.dataend, x.headersize))
pos = newpos
if verbose > 0:
print (' end of chunk %d (%x)' % (pos, pos))
def dump(self, verbose=0, indent=1, out=None, txtout=None, maxlevel=100):
if out:
out.write(b' ' * indent)
out.write(b'%s%s [%d]\n' % (self.type,
b':' + self.subtype if self.subtype else b'', self.size))
if txtout and self.type == b'TXTz':
inbuf = StringIO(self.buf[self.datastart: self.dataend])
outbuf = StringIO()
decoder = BZZDecoder(inbuf, outbuf)
while True:
xxres = decoder.convert(1024 * 1024)
if not xxres:
break
res = outbuf.getvalue()
l = 0
for x in res[:3]:
l <<= 8
l += ord(x)
if verbose > 0 and out:
print(l, file=out)
txtout.write(res[3:3+l])
txtout.write(b'\n\f')
if txtout and self.type == b'TXTa':
res = self.buf[self.datastart: self.dataend]
l = 0
for x in res[:3]:
l <<= 8
l += ord(x)
if verbose > 0 and out:
print(l, file=out)
txtout.write(res[3:3+l])
txtout.write(b'\n\f')
if indent >= maxlevel:
return
for schunk in self._subchunks:
schunk.dump(verbose=verbose, indent=indent+1, out=out, txtout=txtout)
class DJVUFile(object):
def __init__(self, instream, verbose=0):
self.instream = instream
buf = self.instream.read(4)
assert(buf == b'AT&T')
buf = self.instream.read()
self.dc = DjvuChunk(buf, 0, len(buf), verbose=verbose)
def get_text(self, outfile=None):
self.dc.dump(txtout=outfile)
def dump(self, outfile=None, maxlevel=0):
self.dc.dump(out=outfile, maxlevel=maxlevel)
def main():
from ruamel.util.program import Program
class DJVUDecoder(Program):
def __init__(self):
Program.__init__(self)
def parser_setup(self):
Program.parser_setup(self)
#self._argparser.add_argument('--combine', '-c', action=CountAction, const=1, nargs=0)
#self._argparser.add_argument('--combine', '-c', type=int, default=1)
#self._argparser.add_argument('--segments', '-s', action='append', nargs='+')
#self._argparser.add_argument('--force', '-f', action='store_true')
#self._argparser.add_argument('classname')
self._argparser.add_argument('--text', '-t', action='store_true')
self._argparser.add_argument('--dump', type=int, default=0)
self._argparser.add_argument('file', nargs='+')
def run(self):
if self._args.verbose > 1: # can be negative with --quiet
print (self._args.file)
x = DJVUFile(file(self._args.file[0], 'rb'), verbose=self._args.verbose)
if self._args.text:
x.get_text(sys.stdout) # get_text writes to the stream and returns None
if self._args.dump:
x.dump(sys.stdout, maxlevel=self._args.dump)
return 0
tt = DJVUDecoder()
res = tt.result
if res != 0:
print (res)
if __name__ == '__main__':
main()
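
The container format DjvuChunk walks is plain IFF: a 4-byte ASCII chunk type, a 4-byte big-endian length, then the payload, with FORM chunks carrying an extra 4-byte subtype and everything padded to 2-byte boundaries. Decoding one header by hand, on a made-up buffer:

import struct

buf = b'FORM\x00\x00\x00\x0aDJVU' + b'\x00' * 6
ctype = buf[0:4]                                   # b'FORM'
size = struct.unpack(b'>L', buf[4:8])[0]           # 10, big-endian
subtype = buf[8:12] if ctype == b'FORM' else None  # b'DJVU'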

View File

@@ -0,0 +1,746 @@
#! /usr/bin/env python
# coding: utf-8
from __future__ import (unicode_literals, division, absolute_import,
print_function)
__license__ = 'GPL v3'
__copyright__ = '2011, Anthon van der Neut <A.van.der.Neut@ruamel.eu>'
#__docformat__ = 'restructuredtext en'
# Copyright (C) 2011 Anthon van der Neut, Ruamel bvba
# Adapted from Leon Bottou's djvulibre C++ code
# (ZPCodec.{cpp,h} and BSByteStream.{cpp,h}).
# That code was first converted to C, removing any dependencies on the
# djvulibre ByteStream framework, and made into a ctypes-callable shared
# object; it was then ported to Python and remade into a class.
original_copyright_notice = '''
//C- -------------------------------------------------------------------
//C- DjVuLibre-3.5
//C- Copyright (c) 2002 Leon Bottou and Yann Le Cun.
//C- Copyright (c) 2001 AT&T
//C-
//C- This software is subject to, and may be distributed under, the
//C- GNU General Public License, either Version 2 of the license,
//C- or (at your option) any later version. The license should have
//C- accompanied the software or you may obtain a copy of the license
//C- from the Free Software Foundation at http://www.fsf.org .
//C-
//C- This program is distributed in the hope that it will be useful,
//C- but WITHOUT ANY WARRANTY; without even the implied warranty of
//C- MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
//C- GNU General Public License for more details.
//C-
//C- DjVuLibre-3.5 is derived from the DjVu(r) Reference Library from
//C- Lizardtech Software. Lizardtech Software has authorized us to
//C- replace the original DjVu(r) Reference Library notice by the following
//C- text (see doc/lizard2002.djvu and doc/lizardtech2007.djvu):
//C-
//C- ------------------------------------------------------------------
//C- | DjVu (r) Reference Library (v. 3.5)
//C- | Copyright (c) 1999-2001 LizardTech, Inc. All Rights Reserved.
//C- | The DjVu Reference Library is protected by U.S. Pat. No.
//C- | 6,058,214 and patents pending.
//C- |
//C- | This software is subject to, and may be distributed under, the
//C- | GNU General Public License, either Version 2 of the license,
//C- | or (at your option) any later version. The license should have
//C- | accompanied the software or you may obtain a copy of the license
//C- | from the Free Software Foundation at http://www.fsf.org .
//C- |
//C- | The computer code originally released by LizardTech under this
//C- | license and unmodified by other parties is deemed "the LIZARDTECH
//C- | ORIGINAL CODE." Subject to any third party intellectual property
//C- | claims, LizardTech grants recipient a worldwide, royalty-free,
//C- | non-exclusive license to make, use, sell, or otherwise dispose of
//C- | the LIZARDTECH ORIGINAL CODE or of programs derived from the
//C- | LIZARDTECH ORIGINAL CODE in compliance with the terms of the GNU
//C- | General Public License. This grant only confers the right to
//C- | infringe patent claims underlying the LIZARDTECH ORIGINAL CODE to
//C- | the extent such infringement is reasonably necessary to enable
//C- | recipient to make, have made, practice, sell, or otherwise dispose
//C- | of the LIZARDTECH ORIGINAL CODE (or portions thereof) and not to
//C- | any greater extent that may be necessary to utilize further
//C- | modifications or combinations.
//C- |
//C- | The LIZARDTECH ORIGINAL CODE is provided "AS IS" WITHOUT WARRANTY
//C- | OF ANY KIND, EITHER EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED
//C- | TO ANY WARRANTY OF NON-INFRINGEMENT, OR ANY IMPLIED WARRANTY OF
//C- | MERCHANTABILITY OR FITNESS FOR A PARTICULAR PURPOSE.
//C- +------------------------------------------------------------------
//
// $Id: BSByteStream.cpp,v 1.9 2007/03/25 20:48:29 leonb Exp $
// $Name: release_3_5_23 $
'''
MAXBLOCK = 4096
FREQMAX = 4
CTXIDS = 3
MAXLEN = 1024 ** 2
# Exception classes used by this module.
class BZZDecoderError(Exception):
"""This exception is raised when BZZDecode runs into trouble
"""
def __init__(self, msg):
self.msg = msg
def __str__(self):
return "BZZDecoderError: %s" % (self.msg)
# This table has been designed for the ZPCoder
# * by running the following command in file 'zptable.sn':
# * (fast-crude (steady-mat 0.0035 0.0002) 260)))
default_ztable = [ # {{{
(0x8000, 0x0000, 84, 145), # 000: p=0.500000 ( 0, 0)
(0x8000, 0x0000, 3, 4), # 001: p=0.500000 ( 0, 0)
(0x8000, 0x0000, 4, 3), # 002: p=0.500000 ( 0, 0)
(0x6bbd, 0x10a5, 5, 1), # 003: p=0.465226 ( 0, 0)
(0x6bbd, 0x10a5, 6, 2), # 004: p=0.465226 ( 0, 0)
(0x5d45, 0x1f28, 7, 3), # 005: p=0.430708 ( 0, 0)
(0x5d45, 0x1f28, 8, 4), # 006: p=0.430708 ( 0, 0)
(0x51b9, 0x2bd3, 9, 5), # 007: p=0.396718 ( 0, 0)
(0x51b9, 0x2bd3, 10, 6), # 008: p=0.396718 ( 0, 0)
(0x4813, 0x36e3, 11, 7), # 009: p=0.363535 ( 0, 0)
(0x4813, 0x36e3, 12, 8), # 010: p=0.363535 ( 0, 0)
(0x3fd5, 0x408c, 13, 9), # 011: p=0.331418 ( 0, 0)
(0x3fd5, 0x408c, 14, 10), # 012: p=0.331418 ( 0, 0)
(0x38b1, 0x48fd, 15, 11), # 013: p=0.300585 ( 0, 0)
(0x38b1, 0x48fd, 16, 12), # 014: p=0.300585 ( 0, 0)
(0x3275, 0x505d, 17, 13), # 015: p=0.271213 ( 0, 0)
(0x3275, 0x505d, 18, 14), # 016: p=0.271213 ( 0, 0)
(0x2cfd, 0x56d0, 19, 15), # 017: p=0.243438 ( 0, 0)
(0x2cfd, 0x56d0, 20, 16), # 018: p=0.243438 ( 0, 0)
(0x2825, 0x5c71, 21, 17), # 019: p=0.217391 ( 0, 0)
(0x2825, 0x5c71, 22, 18), # 020: p=0.217391 ( 0, 0)
(0x23ab, 0x615b, 23, 19), # 021: p=0.193150 ( 0, 0)
(0x23ab, 0x615b, 24, 20), # 022: p=0.193150 ( 0, 0)
(0x1f87, 0x65a5, 25, 21), # 023: p=0.170728 ( 0, 0)
(0x1f87, 0x65a5, 26, 22), # 024: p=0.170728 ( 0, 0)
(0x1bbb, 0x6962, 27, 23), # 025: p=0.150158 ( 0, 0)
(0x1bbb, 0x6962, 28, 24), # 026: p=0.150158 ( 0, 0)
(0x1845, 0x6ca2, 29, 25), # 027: p=0.131418 ( 0, 0)
(0x1845, 0x6ca2, 30, 26), # 028: p=0.131418 ( 0, 0)
(0x1523, 0x6f74, 31, 27), # 029: p=0.114460 ( 0, 0)
(0x1523, 0x6f74, 32, 28), # 030: p=0.114460 ( 0, 0)
(0x1253, 0x71e6, 33, 29), # 031: p=0.099230 ( 0, 0)
(0x1253, 0x71e6, 34, 30), # 032: p=0.099230 ( 0, 0)
(0x0fcf, 0x7404, 35, 31), # 033: p=0.085611 ( 0, 0)
(0x0fcf, 0x7404, 36, 32), # 034: p=0.085611 ( 0, 0)
(0x0d95, 0x75d6, 37, 33), # 035: p=0.073550 ( 0, 0)
(0x0d95, 0x75d6, 38, 34), # 036: p=0.073550 ( 0, 0)
(0x0b9d, 0x7768, 39, 35), # 037: p=0.062888 ( 0, 0)
(0x0b9d, 0x7768, 40, 36), # 038: p=0.062888 ( 0, 0)
(0x09e3, 0x78c2, 41, 37), # 039: p=0.053539 ( 0, 0)
(0x09e3, 0x78c2, 42, 38), # 040: p=0.053539 ( 0, 0)
(0x0861, 0x79ea, 43, 39), # 041: p=0.045365 ( 0, 0)
(0x0861, 0x79ea, 44, 40), # 042: p=0.045365 ( 0, 0)
(0x0711, 0x7ae7, 45, 41), # 043: p=0.038272 ( 0, 0)
(0x0711, 0x7ae7, 46, 42), # 044: p=0.038272 ( 0, 0)
(0x05f1, 0x7bbe, 47, 43), # 045: p=0.032174 ( 0, 0)
(0x05f1, 0x7bbe, 48, 44), # 046: p=0.032174 ( 0, 0)
(0x04f9, 0x7c75, 49, 45), # 047: p=0.026928 ( 0, 0)
(0x04f9, 0x7c75, 50, 46), # 048: p=0.026928 ( 0, 0)
(0x0425, 0x7d0f, 51, 47), # 049: p=0.022444 ( 0, 0)
(0x0425, 0x7d0f, 52, 48), # 050: p=0.022444 ( 0, 0)
(0x0371, 0x7d91, 53, 49), # 051: p=0.018636 ( 0, 0)
(0x0371, 0x7d91, 54, 50), # 052: p=0.018636 ( 0, 0)
(0x02d9, 0x7dfe, 55, 51), # 053: p=0.015421 ( 0, 0)
(0x02d9, 0x7dfe, 56, 52), # 054: p=0.015421 ( 0, 0)
(0x0259, 0x7e5a, 57, 53), # 055: p=0.012713 ( 0, 0)
(0x0259, 0x7e5a, 58, 54), # 056: p=0.012713 ( 0, 0)
(0x01ed, 0x7ea6, 59, 55), # 057: p=0.010419 ( 0, 0)
(0x01ed, 0x7ea6, 60, 56), # 058: p=0.010419 ( 0, 0)
(0x0193, 0x7ee6, 61, 57), # 059: p=0.008525 ( 0, 0)
(0x0193, 0x7ee6, 62, 58), # 060: p=0.008525 ( 0, 0)
(0x0149, 0x7f1a, 63, 59), # 061: p=0.006959 ( 0, 0)
(0x0149, 0x7f1a, 64, 60), # 062: p=0.006959 ( 0, 0)
(0x010b, 0x7f45, 65, 61), # 063: p=0.005648 ( 0, 0)
(0x010b, 0x7f45, 66, 62), # 064: p=0.005648 ( 0, 0)
(0x00d5, 0x7f6b, 67, 63), # 065: p=0.004506 ( 0, 0)
(0x00d5, 0x7f6b, 68, 64), # 066: p=0.004506 ( 0, 0)
(0x00a5, 0x7f8d, 69, 65), # 067: p=0.003480 ( 0, 0)
(0x00a5, 0x7f8d, 70, 66), # 068: p=0.003480 ( 0, 0)
(0x007b, 0x7faa, 71, 67), # 069: p=0.002602 ( 0, 0)
(0x007b, 0x7faa, 72, 68), # 070: p=0.002602 ( 0, 0)
(0x0057, 0x7fc3, 73, 69), # 071: p=0.001843 ( 0, 0)
(0x0057, 0x7fc3, 74, 70), # 072: p=0.001843 ( 0, 0)
(0x003b, 0x7fd7, 75, 71), # 073: p=0.001248 ( 0, 0)
(0x003b, 0x7fd7, 76, 72), # 074: p=0.001248 ( 0, 0)
(0x0023, 0x7fe7, 77, 73), # 075: p=0.000749 ( 0, 0)
(0x0023, 0x7fe7, 78, 74), # 076: p=0.000749 ( 0, 0)
(0x0013, 0x7ff2, 79, 75), # 077: p=0.000402 ( 0, 0)
(0x0013, 0x7ff2, 80, 76), # 078: p=0.000402 ( 0, 0)
(0x0007, 0x7ffa, 81, 77), # 079: p=0.000153 ( 0, 0)
(0x0007, 0x7ffa, 82, 78), # 080: p=0.000153 ( 0, 0)
(0x0001, 0x7fff, 81, 79), # 081: p=0.000027 ( 0, 0)
(0x0001, 0x7fff, 82, 80), # 082: p=0.000027 ( 0, 0)
(0x5695, 0x0000, 9, 85), # 083: p=0.411764 ( 2, 3)
(0x24ee, 0x0000, 86, 226), # 084: p=0.199988 ( 1, 0)
(0x8000, 0x0000, 5, 6), # 085: p=0.500000 ( 3, 3)
(0x0d30, 0x0000, 88, 176), # 086: p=0.071422 ( 4, 0)
(0x481a, 0x0000, 89, 143), # 087: p=0.363634 ( 1, 2)
(0x0481, 0x0000, 90, 138), # 088: p=0.024388 ( 13, 0)
(0x3579, 0x0000, 91, 141), # 089: p=0.285711 ( 1, 3)
(0x017a, 0x0000, 92, 112), # 090: p=0.007999 ( 41, 0)
(0x24ef, 0x0000, 93, 135), # 091: p=0.199997 ( 1, 5)
(0x007b, 0x0000, 94, 104), # 092: p=0.002611 ( 127, 0)
(0x1978, 0x0000, 95, 133), # 093: p=0.137929 ( 1, 8)
(0x0028, 0x0000, 96, 100), # 094: p=0.000849 ( 392, 0)
(0x10ca, 0x0000, 97, 129), # 095: p=0.090907 ( 1, 13)
(0x000d, 0x0000, 82, 98), # 096: p=0.000276 ( 1208, 0)
(0x0b5d, 0x0000, 99, 127), # 097: p=0.061537 ( 1, 20)
(0x0034, 0x0000, 76, 72), # 098: p=0.001102 ( 1208, 1)
(0x078a, 0x0000, 101, 125), # 099: p=0.040815 ( 1, 31)
(0x00a0, 0x0000, 70, 102), # 100: p=0.003387 ( 392, 1)
(0x050f, 0x0000, 103, 123), # 101: p=0.027397 ( 1, 47)
(0x0117, 0x0000, 66, 60), # 102: p=0.005912 ( 392, 2)
(0x0358, 0x0000, 105, 121), # 103: p=0.018099 ( 1, 72)
(0x01ea, 0x0000, 106, 110), # 104: p=0.010362 ( 127, 1)
(0x0234, 0x0000, 107, 119), # 105: p=0.011940 ( 1, 110)
(0x0144, 0x0000, 66, 108), # 106: p=0.006849 ( 193, 1)
(0x0173, 0x0000, 109, 117), # 107: p=0.007858 ( 1, 168)
(0x0234, 0x0000, 60, 54), # 108: p=0.011925 ( 193, 2)
(0x00f5, 0x0000, 111, 115), # 109: p=0.005175 ( 1, 256)
(0x0353, 0x0000, 56, 48), # 110: p=0.017995 ( 127, 2)
(0x00a1, 0x0000, 69, 113), # 111: p=0.003413 ( 1, 389)
(0x05c5, 0x0000, 114, 134), # 112: p=0.031249 ( 41, 1)
(0x011a, 0x0000, 65, 59), # 113: p=0.005957 ( 2, 389)
(0x03cf, 0x0000, 116, 132), # 114: p=0.020618 ( 63, 1)
(0x01aa, 0x0000, 61, 55), # 115: p=0.009020 ( 2, 256)
(0x0285, 0x0000, 118, 130), # 116: p=0.013652 ( 96, 1)
(0x0286, 0x0000, 57, 51), # 117: p=0.013672 ( 2, 168)
(0x01ab, 0x0000, 120, 128), # 118: p=0.009029 ( 146, 1)
(0x03d3, 0x0000, 53, 47), # 119: p=0.020710 ( 2, 110)
(0x011a, 0x0000, 122, 126), # 120: p=0.005961 ( 222, 1)
(0x05c5, 0x0000, 49, 41), # 121: p=0.031250 ( 2, 72)
(0x00ba, 0x0000, 124, 62), # 122: p=0.003925 ( 338, 1)
(0x08ad, 0x0000, 43, 37), # 123: p=0.046979 ( 2, 47)
(0x007a, 0x0000, 72, 66), # 124: p=0.002586 ( 514, 1)
(0x0ccc, 0x0000, 39, 31), # 125: p=0.069306 ( 2, 31)
(0x01eb, 0x0000, 60, 54), # 126: p=0.010386 ( 222, 2)
(0x1302, 0x0000, 33, 25), # 127: p=0.102940 ( 2, 20)
(0x02e6, 0x0000, 56, 50), # 128: p=0.015695 ( 146, 2)
(0x1b81, 0x0000, 29, 131), # 129: p=0.148935 ( 2, 13)
(0x045e, 0x0000, 52, 46), # 130: p=0.023648 ( 96, 2)
(0x24ef, 0x0000, 23, 17), # 131: p=0.199999 ( 3, 13)
(0x0690, 0x0000, 48, 40), # 132: p=0.035533 ( 63, 2)
(0x2865, 0x0000, 23, 15), # 133: p=0.218748 ( 2, 8)
(0x09de, 0x0000, 42, 136), # 134: p=0.053434 ( 41, 2)
(0x3987, 0x0000, 137, 7), # 135: p=0.304346 ( 2, 5)
(0x0dc8, 0x0000, 38, 32), # 136: p=0.074626 ( 41, 3)
(0x2c99, 0x0000, 21, 139), # 137: p=0.241378 ( 2, 7)
(0x10ca, 0x0000, 140, 172), # 138: p=0.090907 ( 13, 1)
(0x3b5f, 0x0000, 15, 9), # 139: p=0.312499 ( 3, 7)
(0x0b5d, 0x0000, 142, 170), # 140: p=0.061537 ( 20, 1)
(0x5695, 0x0000, 9, 85), # 141: p=0.411764 ( 2, 3)
(0x078a, 0x0000, 144, 168), # 142: p=0.040815 ( 31, 1)
(0x8000, 0x0000, 141, 248), # 143: p=0.500000 ( 2, 2)
(0x050f, 0x0000, 146, 166), # 144: p=0.027397 ( 47, 1)
(0x24ee, 0x0000, 147, 247), # 145: p=0.199988 ( 0, 1)
(0x0358, 0x0000, 148, 164), # 146: p=0.018099 ( 72, 1)
(0x0d30, 0x0000, 149, 197), # 147: p=0.071422 ( 0, 4)
(0x0234, 0x0000, 150, 162), # 148: p=0.011940 ( 110, 1)
(0x0481, 0x0000, 151, 95), # 149: p=0.024388 ( 0, 13)
(0x0173, 0x0000, 152, 160), # 150: p=0.007858 ( 168, 1)
(0x017a, 0x0000, 153, 173), # 151: p=0.007999 ( 0, 41)
(0x00f5, 0x0000, 154, 158), # 152: p=0.005175 ( 256, 1)
(0x007b, 0x0000, 155, 165), # 153: p=0.002611 ( 0, 127)
(0x00a1, 0x0000, 70, 156), # 154: p=0.003413 ( 389, 1)
(0x0028, 0x0000, 157, 161), # 155: p=0.000849 ( 0, 392)
(0x011a, 0x0000, 66, 60), # 156: p=0.005957 ( 389, 2)
(0x000d, 0x0000, 81, 159), # 157: p=0.000276 ( 0, 1208)
(0x01aa, 0x0000, 62, 56), # 158: p=0.009020 ( 256, 2)
(0x0034, 0x0000, 75, 71), # 159: p=0.001102 ( 1, 1208)
(0x0286, 0x0000, 58, 52), # 160: p=0.013672 ( 168, 2)
(0x00a0, 0x0000, 69, 163), # 161: p=0.003387 ( 1, 392)
(0x03d3, 0x0000, 54, 48), # 162: p=0.020710 ( 110, 2)
(0x0117, 0x0000, 65, 59), # 163: p=0.005912 ( 2, 392)
(0x05c5, 0x0000, 50, 42), # 164: p=0.031250 ( 72, 2)
(0x01ea, 0x0000, 167, 171), # 165: p=0.010362 ( 1, 127)
(0x08ad, 0x0000, 44, 38), # 166: p=0.046979 ( 47, 2)
(0x0144, 0x0000, 65, 169), # 167: p=0.006849 ( 1, 193)
(0x0ccc, 0x0000, 40, 32), # 168: p=0.069306 ( 31, 2)
(0x0234, 0x0000, 59, 53), # 169: p=0.011925 ( 2, 193)
(0x1302, 0x0000, 34, 26), # 170: p=0.102940 ( 20, 2)
(0x0353, 0x0000, 55, 47), # 171: p=0.017995 ( 2, 127)
(0x1b81, 0x0000, 30, 174), # 172: p=0.148935 ( 13, 2)
(0x05c5, 0x0000, 175, 193), # 173: p=0.031249 ( 1, 41)
(0x24ef, 0x0000, 24, 18), # 174: p=0.199999 ( 13, 3)
(0x03cf, 0x0000, 177, 191), # 175: p=0.020618 ( 1, 63)
(0x2b74, 0x0000, 178, 222), # 176: p=0.235291 ( 4, 1)
(0x0285, 0x0000, 179, 189), # 177: p=0.013652 ( 1, 96)
(0x201d, 0x0000, 180, 218), # 178: p=0.173910 ( 6, 1)
(0x01ab, 0x0000, 181, 187), # 179: p=0.009029 ( 1, 146)
(0x1715, 0x0000, 182, 216), # 180: p=0.124998 ( 9, 1)
(0x011a, 0x0000, 183, 185), # 181: p=0.005961 ( 1, 222)
(0x0fb7, 0x0000, 184, 214), # 182: p=0.085105 ( 14, 1)
(0x00ba, 0x0000, 69, 61), # 183: p=0.003925 ( 1, 338)
(0x0a67, 0x0000, 186, 212), # 184: p=0.056337 ( 22, 1)
(0x01eb, 0x0000, 59, 53), # 185: p=0.010386 ( 2, 222)
(0x06e7, 0x0000, 188, 210), # 186: p=0.037382 ( 34, 1)
(0x02e6, 0x0000, 55, 49), # 187: p=0.015695 ( 2, 146)
(0x0496, 0x0000, 190, 208), # 188: p=0.024844 ( 52, 1)
(0x045e, 0x0000, 51, 45), # 189: p=0.023648 ( 2, 96)
(0x030d, 0x0000, 192, 206), # 190: p=0.016529 ( 79, 1)
(0x0690, 0x0000, 47, 39), # 191: p=0.035533 ( 2, 63)
(0x0206, 0x0000, 194, 204), # 192: p=0.010959 ( 120, 1)
(0x09de, 0x0000, 41, 195), # 193: p=0.053434 ( 2, 41)
(0x0155, 0x0000, 196, 202), # 194: p=0.007220 ( 183, 1)
(0x0dc8, 0x0000, 37, 31), # 195: p=0.074626 ( 3, 41)
(0x00e1, 0x0000, 198, 200), # 196: p=0.004750 ( 279, 1)
(0x2b74, 0x0000, 199, 243), # 197: p=0.235291 ( 1, 4)
(0x0094, 0x0000, 72, 64), # 198: p=0.003132 ( 424, 1)
(0x201d, 0x0000, 201, 239), # 199: p=0.173910 ( 1, 6)
(0x0188, 0x0000, 62, 56), # 200: p=0.008284 ( 279, 2)
(0x1715, 0x0000, 203, 237), # 201: p=0.124998 ( 1, 9)
(0x0252, 0x0000, 58, 52), # 202: p=0.012567 ( 183, 2)
(0x0fb7, 0x0000, 205, 235), # 203: p=0.085105 ( 1, 14)
(0x0383, 0x0000, 54, 48), # 204: p=0.019021 ( 120, 2)
(0x0a67, 0x0000, 207, 233), # 205: p=0.056337 ( 1, 22)
(0x0547, 0x0000, 50, 44), # 206: p=0.028571 ( 79, 2)
(0x06e7, 0x0000, 209, 231), # 207: p=0.037382 ( 1, 34)
(0x07e2, 0x0000, 46, 38), # 208: p=0.042682 ( 52, 2)
(0x0496, 0x0000, 211, 229), # 209: p=0.024844 ( 1, 52)
(0x0bc0, 0x0000, 40, 34), # 210: p=0.063636 ( 34, 2)
(0x030d, 0x0000, 213, 227), # 211: p=0.016529 ( 1, 79)
(0x1178, 0x0000, 36, 28), # 212: p=0.094593 ( 22, 2)
(0x0206, 0x0000, 215, 225), # 213: p=0.010959 ( 1, 120)
(0x19da, 0x0000, 30, 22), # 214: p=0.139999 ( 14, 2)
(0x0155, 0x0000, 217, 223), # 215: p=0.007220 ( 1, 183)
(0x24ef, 0x0000, 26, 16), # 216: p=0.199998 ( 9, 2)
(0x00e1, 0x0000, 219, 221), # 217: p=0.004750 ( 1, 279)
(0x320e, 0x0000, 20, 220), # 218: p=0.269229 ( 6, 2)
(0x0094, 0x0000, 71, 63), # 219: p=0.003132 ( 1, 424)
(0x432a, 0x0000, 14, 8), # 220: p=0.344827 ( 6, 3)
(0x0188, 0x0000, 61, 55), # 221: p=0.008284 ( 2, 279)
(0x447d, 0x0000, 14, 224), # 222: p=0.349998 ( 4, 2)
(0x0252, 0x0000, 57, 51), # 223: p=0.012567 ( 2, 183)
(0x5ece, 0x0000, 8, 2), # 224: p=0.434782 ( 4, 3)
(0x0383, 0x0000, 53, 47), # 225: p=0.019021 ( 2, 120)
(0x8000, 0x0000, 228, 87), # 226: p=0.500000 ( 1, 1)
(0x0547, 0x0000, 49, 43), # 227: p=0.028571 ( 2, 79)
(0x481a, 0x0000, 230, 246), # 228: p=0.363634 ( 2, 1)
(0x07e2, 0x0000, 45, 37), # 229: p=0.042682 ( 2, 52)
(0x3579, 0x0000, 232, 244), # 230: p=0.285711 ( 3, 1)
(0x0bc0, 0x0000, 39, 33), # 231: p=0.063636 ( 2, 34)
(0x24ef, 0x0000, 234, 238), # 232: p=0.199997 ( 5, 1)
(0x1178, 0x0000, 35, 27), # 233: p=0.094593 ( 2, 22)
(0x1978, 0x0000, 138, 236), # 234: p=0.137929 ( 8, 1)
(0x19da, 0x0000, 29, 21), # 235: p=0.139999 ( 2, 14)
(0x2865, 0x0000, 24, 16), # 236: p=0.218748 ( 8, 2)
(0x24ef, 0x0000, 25, 15), # 237: p=0.199998 ( 2, 9)
(0x3987, 0x0000, 240, 8), # 238: p=0.304346 ( 5, 2)
(0x320e, 0x0000, 19, 241), # 239: p=0.269229 ( 2, 6)
(0x2c99, 0x0000, 22, 242), # 240: p=0.241378 ( 7, 2)
(0x432a, 0x0000, 13, 7), # 241: p=0.344827 ( 3, 6)
(0x3b5f, 0x0000, 16, 10), # 242: p=0.312499 ( 7, 3)
(0x447d, 0x0000, 13, 245), # 243: p=0.349998 ( 2, 4)
(0x5695, 0x0000, 10, 2), # 244: p=0.411764 ( 3, 2)
(0x5ece, 0x0000, 7, 1), # 245: p=0.434782 ( 3, 4)
(0x8000, 0x0000, 244, 83), # 246: p=0.500000 ( 2, 2)
(0x8000, 0x0000, 249, 250), # 247: p=0.500000 ( 1, 1)
(0x5695, 0x0000, 10, 2), # 248: p=0.411764 ( 3, 2)
(0x481a, 0x0000, 89, 143), # 249: p=0.363634 ( 1, 2)
(0x481a, 0x0000, 230, 246), # 250: p=0.363634 ( 2, 1)
(0, 0, 0, 0),
(0, 0, 0, 0),
(0, 0, 0, 0),
(0, 0, 0, 0),
(0, 0, 0, 0),
]
xmtf = (
0x00, 0x01, 0x02, 0x03, 0x04, 0x05, 0x06, 0x07,
0x08, 0x09, 0x0A, 0x0B, 0x0C, 0x0D, 0x0E, 0x0F,
0x10, 0x11, 0x12, 0x13, 0x14, 0x15, 0x16, 0x17,
0x18, 0x19, 0x1A, 0x1B, 0x1C, 0x1D, 0x1E, 0x1F,
0x20, 0x21, 0x22, 0x23, 0x24, 0x25, 0x26, 0x27,
0x28, 0x29, 0x2A, 0x2B, 0x2C, 0x2D, 0x2E, 0x2F,
0x30, 0x31, 0x32, 0x33, 0x34, 0x35, 0x36, 0x37,
0x38, 0x39, 0x3A, 0x3B, 0x3C, 0x3D, 0x3E, 0x3F,
0x40, 0x41, 0x42, 0x43, 0x44, 0x45, 0x46, 0x47,
0x48, 0x49, 0x4A, 0x4B, 0x4C, 0x4D, 0x4E, 0x4F,
0x50, 0x51, 0x52, 0x53, 0x54, 0x55, 0x56, 0x57,
0x58, 0x59, 0x5A, 0x5B, 0x5C, 0x5D, 0x5E, 0x5F,
0x60, 0x61, 0x62, 0x63, 0x64, 0x65, 0x66, 0x67,
0x68, 0x69, 0x6A, 0x6B, 0x6C, 0x6D, 0x6E, 0x6F,
0x70, 0x71, 0x72, 0x73, 0x74, 0x75, 0x76, 0x77,
0x78, 0x79, 0x7A, 0x7B, 0x7C, 0x7D, 0x7E, 0x7F,
0x80, 0x81, 0x82, 0x83, 0x84, 0x85, 0x86, 0x87,
0x88, 0x89, 0x8A, 0x8B, 0x8C, 0x8D, 0x8E, 0x8F,
0x90, 0x91, 0x92, 0x93, 0x94, 0x95, 0x96, 0x97,
0x98, 0x99, 0x9A, 0x9B, 0x9C, 0x9D, 0x9E, 0x9F,
0xA0, 0xA1, 0xA2, 0xA3, 0xA4, 0xA5, 0xA6, 0xA7,
0xA8, 0xA9, 0xAA, 0xAB, 0xAC, 0xAD, 0xAE, 0xAF,
0xB0, 0xB1, 0xB2, 0xB3, 0xB4, 0xB5, 0xB6, 0xB7,
0xB8, 0xB9, 0xBA, 0xBB, 0xBC, 0xBD, 0xBE, 0xBF,
0xC0, 0xC1, 0xC2, 0xC3, 0xC4, 0xC5, 0xC6, 0xC7,
0xC8, 0xC9, 0xCA, 0xCB, 0xCC, 0xCD, 0xCE, 0xCF,
0xD0, 0xD1, 0xD2, 0xD3, 0xD4, 0xD5, 0xD6, 0xD7,
0xD8, 0xD9, 0xDA, 0xDB, 0xDC, 0xDD, 0xDE, 0xDF,
0xE0, 0xE1, 0xE2, 0xE3, 0xE4, 0xE5, 0xE6, 0xE7,
0xE8, 0xE9, 0xEA, 0xEB, 0xEC, 0xED, 0xEE, 0xEF,
0xF0, 0xF1, 0xF2, 0xF3, 0xF4, 0xF5, 0xF6, 0xF7,
0xF8, 0xF9, 0xFA, 0xFB, 0xFC, 0xFD, 0xFE, 0xFF
)
# }}}
def chr3(l):
return bytes(bytearray(l))
class BZZDecoder():
def __init__(self, infile, outfile):
self.instream = infile
self.outf = outfile
self.ieof = False
self.bptr = None
self.xsize = None
self.outbuf = [0] * (MAXBLOCK * 1024)
self.byte = None
self.scount = 0
self.delay = 25
self.a = 0
self.code = 0
self.bufint = 0
self.ctx = [0] * 300
# table
self.p = [0] * 256
self.m = [0] * 256
self.up = [0] * 256
self.dn = [0] * 256
# machine independent ffz
self.ffzt = [0] * 256
# Create machine independent ffz table
for i in range(256):
j = i
while(j & 0x80):
self.ffzt[i] += 1
j <<= 1
# Initialize table
self.newtable(default_ztable)
# Codebit counter
# Read first 16 bits of code
if not self.read_byte():
self.byte = 0xff
self.code = (self.byte << 8)
if not self.read_byte():
self.byte = 0xff
self.code = self.code | self.byte
# Preload buffer
self.preload()
# Compute initial fence
self.fence = self.code
if self.code >= 0x8000:
self.fence = 0x7fff
def convert(self, sz):
if self.ieof:
return 0
copied = 0
while sz > 0 and not (self.ieof):
# Decode if needed
if not self.xsize:
self.bptr = 0
if not self.decode(): # input block size set in decode
self.xsize = 1
self.ieof = True
self.xsize -= 1
# Compute remaining
bytes = self.xsize
if bytes > sz:
bytes = sz
# Transfer
if bytes:
# chr3 expects an iterable of ints; write the whole block in one call
self.outf.write(chr3(self.outbuf[self.bptr:self.bptr + bytes]))
self.xsize -= bytes
self.bptr += bytes
sz -= bytes
copied += bytes
# offset += bytes; // for tell()
return copied
def preload(self):
while self.scount <= 24:
if self.read_byte() < 1:
self.byte = 0xff
# C's --delay idiom is a silent no-op in Python; decrement explicitly
self.delay -= 1
if self.delay < 1:
raise BZZDecoderError("ByteStream EOF")
self.bufint = (self.bufint << 8) | self.byte
self.scount += 8
def newtable(self, table):
for i in range(256):
self.p[i] = table[i][0]
self.m[i] = table[i][1]
self.up[i] = table[i][2]
self.dn[i] = table[i][3]
def decode(self):
outbuf = self.outbuf
# Decode block size
self.xsize = self.decode_raw(24)
if not self.xsize:
return 0
if self.xsize > MAXBLOCK * 1024: # 4MB (4096 * 1024) is max block
raise BZZDecoderError("BiteStream.corrupt")
# Decode Estimation Speed
fshift = 0
if self.zpcodec_decoder():
fshift += 1
if self.zpcodec_decoder():
fshift += 1
# Prepare Quasi MTF
mtf = list(xmtf) # unsigned chars
freq = [0] * FREQMAX
fadd = 4
# Decode
mtfno = 3
markerpos = -1
for i in range(self.xsize):
ctxid = CTXIDS - 1
if ctxid > mtfno:
ctxid = mtfno
cx = self.ctx
if self.zpcodec_decode(cx, ctxid):
mtfno = 0
outbuf[i] = mtf[mtfno]
elif self.zpcodec_decode(cx, ctxid + CTXIDS):
mtfno = 1
outbuf[i] = mtf[mtfno]
elif self.zpcodec_decode(cx, 2*CTXIDS):
mtfno = 2 + self.decode_binary(cx, 2*CTXIDS + 1, 1)
outbuf[i] = mtf[mtfno]
elif self.zpcodec_decode(cx, 2*CTXIDS+2):
mtfno = 4 + self.decode_binary(cx, 2*CTXIDS+2 + 1, 2)
outbuf[i] = mtf[mtfno]
elif self.zpcodec_decode(cx, 2*CTXIDS + 6):
mtfno = 8 + self.decode_binary(cx, 2*CTXIDS + 6 + 1, 3)
outbuf[i] = mtf[mtfno]
elif self.zpcodec_decode(cx, 2*CTXIDS + 14):
mtfno = 16 + self.decode_binary(cx, 2*CTXIDS + 14 + 1, 4)
outbuf[i] = mtf[mtfno]
elif self.zpcodec_decode(cx, 2*CTXIDS + 30 ):
mtfno = 32 + self.decode_binary(cx, 2*CTXIDS + 30 + 1, 5)
outbuf[i] = mtf[mtfno]
elif self.zpcodec_decode(cx, 2*CTXIDS + 62 ):
mtfno = 64 + self.decode_binary(cx, 2*CTXIDS + 62 + 1, 6)
outbuf[i] = mtf[mtfno]
elif self.zpcodec_decode(cx, 2*CTXIDS + 126):
mtfno = 128 + self.decode_binary(cx, 2*CTXIDS + 126 + 1, 7)
outbuf[i] = mtf[mtfno]
else:
mtfno = 256 # EOB
outbuf[i] = 0
markerpos = i
continue
# Rotate mtf according to empirical frequencies (new!)
# :rotate label
# Adjust frequencies for overflow
fadd = fadd + (fadd >> fshift)
if fadd > 0x10000000:
fadd >>= 24
freq[0] >>= 24
freq[1] >>= 24
freq[2] >>= 24
freq[3] >>= 24
for k in range(4, FREQMAX):
freq[k] = freq[k] >> 24
# Relocate new char according to new freq
fc = fadd
if mtfno < FREQMAX:
fc += freq[mtfno]
k = mtfno
while (k >= FREQMAX):
mtf[k] = mtf[k - 1]
k -= 1
while (k > 0 and fc >= freq[k - 1]):
mtf[k] = mtf[k - 1]
freq[k] = freq[k - 1]
k -= 1
mtf[k] = outbuf[i]
freq[k] = fc
#///////////////////////////////
#//////// Reconstruct the string
if markerpos < 1 or markerpos >= self.xsize:
raise BZZDecoderError("BiteStream.corrupt")
# Allocate pointers
posn = [0] * self.xsize
# Prepare count buffer
count = [0] * 256
# Fill count buffer
for i in range(markerpos):
c = outbuf[i]
posn[i] = (c << 24) | (count[c] & 0xffffff)
count[c] += 1
for i in range(markerpos + 1, self.xsize):
c = outbuf[i]
posn[i] = (c << 24) | (count[c] & 0xffffff)
count[c] += 1
# Compute sorted char positions
last = 1
for i in range(256):
tmp = count[i]
count[i] = last
last += tmp
# Undo the sort transform
i = 0
last = self.xsize - 1
while last > 0:
n = posn[i]
c = (posn[i] >> 24)
last -= 1
outbuf[last] = c
i = count[c] + (n & 0xffffff)
# Free and check
if i != markerpos:
raise BZZDecoderError("BiteStream.corrupt")
return self.xsize
def decode_raw(self, bits):
n = 1
m = (1 << bits)
while n < m:
b = self.zpcodec_decoder()
n = (n << 1) | b
return n - m
def decode_binary(self, ctx, index, bits):
n = 1
m = (1 << bits)
while n < m:
b = self.zpcodec_decode(ctx, index + n - 1)
n = (n << 1) | b
return n - m
def zpcodec_decoder(self):
return self.decode_sub_simple(0, 0x8000 + (self.a >> 1))
def decode_sub_simple(self, mps, z):
# Test MPS/LPS
if z > self.code:
# LPS branch
z = 0x10000 - z
self.a += z
self.code = self.code + z
# LPS renormalization
shift = self.ffz()
self.scount -= shift
self.a = self.a << shift
self.a &= 0xffff
self.code = (self.code << shift) | ((self.bufint >> self.scount) & ((1 << shift) - 1))
self.code &= 0xffff
if self.scount < 16:
self.preload()
# Adjust fence
self.fence = self.code
if self.code >= 0x8000:
self.fence = 0x7fff
result = mps ^ 1
else:
# MPS renormalization
self.scount -= 1
self.a = (z << 1) & 0xffff
self.code = ((self.code << 1) | ((self.bufint >> self.scount) & 1))
self.code &= 0xffff
if self.scount < 16:
self.preload()
# Adjust fence
self.fence = self.code
if self.code >= 0x8000:
self.fence = 0x7fff
result = mps
return result
def decode_sub(self, ctx, index, z):
# Save bit
bit = (ctx[index] & 1)
# Avoid interval reversion
d = 0x6000 + ((z + self.a) >> 2)
if z > d:
z = d
# Test MPS/LPS
if z > self.code:
# LPS branch
z = 0x10000 - z
self.a += z
self.code = self.code + z
# LPS adaptation
ctx[index] = self.dn[ctx[index]]
# LPS renormalization
shift = self.ffz()
self.scount -= shift
self.a = (self.a << shift) & 0xffff
self.code = ((self.code << shift) | ((self.bufint >> self.scount) & ((1 << shift) - 1))) & 0xffff
if self.scount < 16:
self.preload()
# Adjust fence
self.fence = self.code
if self.code >= 0x8000:
self.fence = 0x7fff
return bit ^ 1
else:
# MPS adaptation
if self.a >= self.m[ctx[index]]:
ctx[index] = self.up[ctx[index]]
# MPS renormalization
self.scount -= 1
self.a = z << 1 & 0xffff
self.code = ((self.code << 1) | ((self.bufint >> self.scount) & 1)) & 0xffff
if self.scount < 16:
self.preload()
# Adjust fence
self.fence = self.code
if self.code >= 0x8000:
self.fence = 0x7fff
return bit
def zpcodec_decode(self, ctx, index):
z = self.a + self.p[ctx[index]]
if z <= self.fence:
self.a = z
res = (ctx[index] & 1)
else:
res = self.decode_sub(ctx, index, z)
return res
def read_byte(self):
res = 0
if self.instream:
ires = self.instream.read(1)
res = len(ires)
if res:
self.byte = ord(ires[0])
else:
raise NotImplementedError
return res
def ffz(self):
x = self.a
if (x >= 0xff00):
return (self.ffzt[x & 0xff] + 8)
else:
return (self.ffzt[(x >> 8) & 0xff])
### for testing
def main():
import sys
infile = file(sys.argv[1], "rb")
outfile = file(sys.argv[2], "wb")
dec = BZZDecoder(infile, outfile)
while True:
res = dec.convert(1024 * 1024)
if not res:
break
if __name__ == "__main__":
main()
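
The "machine independent ffz" table built in __init__ just counts leading one-bits of a byte; ffz() extends that to the decoder's 16-bit register to choose the renormalization shift. The same table built and checked standalone:

ffzt = [0] * 256
for i in range(256):
    j = i
    while j & 0x80:
        ffzt[i] += 1
        j <<= 1

assert ffzt[0b11100000] == 3 and ffzt[0xff] == 8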

View File

@@ -0,0 +1,87 @@
# -*- coding: utf-8 -*-
from __future__ import (unicode_literals, division, absolute_import,
print_function)
__license__ = 'GPL 3'
__copyright__ = '2011, Anthon van der Neut <anthon@mnt.org>'
__docformat__ = 'restructuredtext en'
import os
from subprocess import Popen, PIPE
from cStringIO import StringIO
from calibre.customize.conversion import InputFormatPlugin, OptionRecommendation
from calibre.ebooks.txt.processor import convert_basic
class DJVUInput(InputFormatPlugin):
name = 'DJVU Input'
author = 'Anthon van der Neut'
description = 'Convert OCR-ed DJVU files (.djvu) to HTML'
file_types = set(['djvu', 'djv'])
options = set([
OptionRecommendation(name='use_djvutxt', recommended_value=True,
help=_('Try to use the djvutxt program and fall back to the pure '
'python implementation if it fails or is not available')),
])
def convert(self, stream, options, file_ext, log, accelerators):
stdout = StringIO()
ppdjvu = True
# using djvutxt is MUCH faster; it is exposed as the use_djvutxt option
if options.use_djvutxt and os.path.exists('/usr/bin/djvutxt'):
from calibre.ptempfile import PersistentTemporaryFile
try:
fp = PersistentTemporaryFile(suffix='.djvu', prefix='djv_input')
filename = fp._name
fp.write(stream.read())
fp.close()
cmd = ['djvutxt', filename]
stdout.write(Popen(cmd, stdout=PIPE, close_fds=True).communicate()[0])
os.remove(filename)
ppdjvu = False
except:
stream.seek(0) # retry with the pure python converter
if ppdjvu:
from .djvu import DJVUFile
x = DJVUFile(stream)
x.get_text(stdout)
html = convert_basic(stdout.getvalue().replace(b"\n", b' ').replace(
b'\037', b'\n\n'))
# Run the HTMLized text through the html processing plugin.
from calibre.customize.ui import plugin_for_input_format
html_input = plugin_for_input_format('html')
for opt in html_input.options:
setattr(options, opt.option.name, opt.recommended_value)
options.input_encoding = 'utf-8'
base = os.getcwdu()
if file_ext != 'txtz' and hasattr(stream, 'name'):
base = os.path.dirname(stream.name)
fname = os.path.join(base, 'index.html')
c = 0
while os.path.exists(fname):
c += 1
fname = os.path.join(base, 'index%d.html'%c)
htmlfile = open(fname, 'wb')
with htmlfile:
htmlfile.write(html.encode('utf-8'))
odi = options.debug_pipeline
options.debug_pipeline = None
# Generate oeb from html conversion.
with open(htmlfile.name, 'rb') as f:
oeb = html_input.convert(f, options, 'html', log,
{})
options.debug_pipeline = odi
os.remove(htmlfile.name)
# Set metadata from file.
from calibre.customize.ui import get_file_type_metadata
from calibre.ebooks.oeb.transforms.metadata import meta_info_to_oeb_metadata
mi = get_file_type_metadata(stream, file_ext)
meta_info_to_oeb_metadata(mi, oeb.metadata, log)
return oeb
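
The fast path simply shells out to djvutxt and captures its stdout. A minimal standalone version of that call, assuming djvulibre is installed at the same fixed path the plugin checks (the input filename is made up):

import os
from subprocess import Popen, PIPE

if os.path.exists('/usr/bin/djvutxt'):
    raw = Popen(['djvutxt', 'scan.djvu'], stdout=PIPE,
                close_fds=True).communicate()[0]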

View File

@@ -246,6 +246,7 @@ class CSSFlattener(object):
cssdict['font-size'] = '%.1fpt'%font_size
del node.attrib['size']
if 'face' in node.attrib:
cssdict['font-family'] = node.attrib['face']
del node.attrib['face']
if 'color' in node.attrib:
cssdict['color'] = node.attrib['color']
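
With this hunk all three legacy &lt;font&gt; attributes survive flattening. A toy trace of the attribute-to-CSS mapping, detached from the lxml node it normally operates on:

attrib = {'face': 'serif', 'color': '#900'}
cssdict = {}
if 'face' in attrib:
    cssdict['font-family'] = attrib.pop('face')
if 'color' in attrib:
    cssdict['color'] = attrib.pop('color')
assert cssdict == {'font-family': 'serif', 'color': '#900'}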

View File

@@ -397,6 +397,7 @@ class AddAction(InterfaceAction):
d = error_dialog(self.gui, _('Add to library'), _('No book files found'))
d.exec_()
return
paths = self.gui.device_manager.device.prepare_addable_books(paths)
from calibre.gui2.add import Adder
self.__adder_func = partial(self._add_from_device_adder, on_card=None,
model=view.model())

View File

@@ -0,0 +1,24 @@
# coding: utf-8
from __future__ import (unicode_literals, division, absolute_import,
print_function)
__license__ = 'GPL v3'
__copyright__ = '2011, Anthon van der Neut <A.van.der.Neut@ruamel.eu>'
from calibre.gui2.convert.djvu_input_ui import Ui_Form
from calibre.gui2.convert import Widget
class PluginWidget(Widget, Ui_Form):
TITLE = _('DJVU Input')
HELP = _('Options specific to')+' DJVU '+_('input')
COMMIT_NAME = 'djvu_input'
ICON = I('mimetypes/djvu.png')
def __init__(self, parent, get_option, get_help, db=None, book_id=None):
Widget.__init__(self, parent,
['use_djvutxt', ])
self.db, self.book_id = db, book_id
self.initialize_options(get_option, get_help, db, book_id)

View File

@@ -0,0 +1,28 @@
<?xml version="1.0" encoding="UTF-8"?>
<ui version="4.0">
<class>Form</class>
<widget class="QWidget" name="Form">
<property name="geometry">
<rect>
<x>0</x>
<y>0</y>
<width>400</width>
<height>300</height>
</rect>
</property>
<property name="windowTitle">
<string>Form</string>
</property>
<layout class="QVBoxLayout" name="verticalLayout">
<item>
<widget class="QCheckBox" name="opt_use_djvutxt">
<property name="text">
<string>Use &amp;djvutxt, if available, for faster processing</string>
</property>
</widget>
</item>
</layout>
</widget>
<resources/>
<connections/>
</ui>

View File

@@ -538,14 +538,20 @@ class CoversModel(QAbstractListModel): # {{{
current_cover = QPixmap(I('default_cover.png'))
self.blank = QPixmap(I('blank.png')).scaled(150, 200)
self.cc = current_cover
self.reset_covers(do_reset=False)
self.covers = [self.get_item(_('Current cover'), current_cover)]
def reset_covers(self, do_reset=True):
self.covers = [self.get_item(_('Current cover'), self.cc)]
self.plugin_map = {}
for i, plugin in enumerate(metadata_plugins(['cover'])):
self.covers.append((plugin.name+'\n'+_('Searching...'),
QVariant(self.blank), None, True))
self.plugin_map[plugin] = i+1
if do_reset:
self.reset()
def get_item(self, src, pmap, waiting=False):
sz = '%dx%d'%(pmap.width(), pmap.height())
text = QVariant(src + '\n' + sz)
@@ -654,6 +660,9 @@ class CoversView(QListView): # {{{
self.select(0)
self.delegate.start_animation()
def reset_covers(self):
self.m.reset_covers()
def clear_failed(self):
plugin = self.m.plugin_for_index(self.currentIndex())
self.m.clear_failed()
@@ -683,12 +692,18 @@ class CoversWidget(QWidget): # {{{
l.addWidget(self.covers_view, 1, 0)
self.continue_processing = True
def reset_covers(self):
self.covers_view.reset_covers()
def start(self, book, current_cover, title, authors):
self.continue_processing = True
self.abort.clear()
self.book, self.current_cover = book, current_cover
self.title, self.authors = title, authors
self.log('Starting cover download for:', book.title)
self.log('Query:', title, authors, self.book.identifiers)
self.msg.setText('<p>'+_('Downloading covers for <b>%s</b>, please wait...')%book.title)
self.msg.setText('<p>'+
_('Downloading covers for <b>%s</b>, please wait...')%book.title)
self.covers_view.start()
self.worker = CoverWorker(self.log, self.abort, self.title,
@@ -726,8 +741,9 @@ class CoversWidget(QWidget): # {{{
if num < 2:
txt = _('Could not find any covers for <b>%s</b>')%self.book.title
else:
txt = _('Found <b>%(num)d</b> covers of %(title)s. Pick the one you like'
' best.')%dict(num=num-1, title=self.title)
txt = _('Found <b>%(num)d</b> covers of %(title)s. '
'Pick the one you like best.')%dict(num=num-1,
title=self.title)
self.msg.setText(txt)
self.finished.emit()
@@ -832,10 +848,14 @@ class FullFetch(QDialog): # {{{
self.next_button.clicked.connect(self.next_clicked)
self.ok_button = self.bb.button(self.bb.Ok)
self.ok_button.clicked.connect(self.ok_clicked)
self.prev_button = self.bb.addButton(_('Back'), self.bb.ActionRole)
self.prev_button.setIcon(QIcon(I('back.png')))
self.prev_button.clicked.connect(self.back_clicked)
self.log_button = self.bb.addButton(_('View log'), self.bb.ActionRole)
self.log_button.clicked.connect(self.view_log)
self.log_button.setIcon(QIcon(I('debug.png')))
self.ok_button.setVisible(False)
self.prev_button.setVisible(False)
self.identify_widget = IdentifyWidget(self.log, self)
self.identify_widget.rejected.connect(self.reject)
@@ -857,12 +877,21 @@ class FullFetch(QDialog): # {{{
def book_selected(self, book):
self.next_button.setVisible(False)
self.ok_button.setVisible(True)
self.prev_button.setVisible(True)
self.book = book
self.stack.setCurrentIndex(1)
self.log('\n\n')
self.covers_widget.start(book, self.current_cover,
self.title, self.authors)
def back_clicked(self):
self.next_button.setVisible(True)
self.ok_button.setVisible(False)
self.prev_button.setVisible(False)
self.stack.setCurrentIndex(0)
self.covers_widget.cancel()
self.covers_widget.reset_covers()
def accept(self):
# Prevent the usual dialog accept mechanisms from working
pass

View File

@@ -58,7 +58,7 @@ class ConfigWidget(ConfigWidgetBase, Ui_Form):
self.device_to_formats_map = {}
for device in device_plugins():
n = device_name_for_plugboards(device)
self.device_to_formats_map[n] = set(device.FORMATS)
self.device_to_formats_map[n] = set(device.settings().format_map)
if getattr(device, 'CAN_DO_DEVICE_DB_PLUGBOARD', False):
self.device_to_formats_map[n].add('device_db')
if n not in self.devices:

View File

@@ -206,7 +206,7 @@
<item>
<widget class="QCheckBox" name="opt_autolaunch_server">
<property name="text">
<string>Run server &amp;automatically on startup</string>
<string>Run server &amp;automatically when calibre starts</string>
</property>
</widget>
</item>

View File

@ -37,6 +37,7 @@ class SearchRestrictionMixin(object):
search = unicode(search)
if not search:
self.search_restriction.setCurrentIndex(0)
self._apply_search_restriction('')
else:
s = '*' + search
if self.search_restriction.count() > 1:

View File

@ -6,7 +6,6 @@ __license__ = 'GPL 3'
__copyright__ = '2011, John Schember <john@nachtimwald.com>'
__docformat__ = 'restructuredtext en'
import urllib
from contextlib import closing
from lxml import html
@ -37,27 +36,16 @@ class AmazonDEKindleStore(StorePlugin):
def search(self, query, max_results=10, timeout=60):
search_url = 'http://www.amazon.de/s/?url=search-alias%3Ddigital-text&field-keywords='
url = search_url + urllib.quote_plus(query)
url = search_url + query.encode('ascii', 'backslashreplace').replace('%', '%25').replace('\\x', '%').replace(' ', '+')
br = browser()
counter = max_results
with closing(br.open(url, timeout=timeout)) as f:
doc = html.fromstring(f.read())
doc = html.fromstring(f.read().decode('latin-1', 'replace'))
# Amazon has two results pages.
# 20110725: seems that is_shot is gone.
# is_shot = doc.xpath('boolean(//div[@id="shotgunMainResults"])')
# # Horizontal grid of books.
# if is_shot:
# data_xpath = '//div[contains(@class, "result")]'
# format_xpath = './/div[@class="productTitle"]/text()'
# cover_xpath = './/div[@class="productTitle"]//img/@src'
# # Vertical list of books.
# else:
data_xpath = '//div[contains(@class, "result") and contains(@class, "product")]'
format_xpath = './/span[@class="format"]/text()'
cover_xpath = './/img[@class="productImage"]/@src'
# end is_shot else
for data in doc.xpath(data_xpath):
if counter <= 0:
@ -80,11 +68,9 @@ class AmazonDEKindleStore(StorePlugin):
title = ''.join(data.xpath('.//div[@class="title"]/a/text()'))
price = ''.join(data.xpath('.//div[@class="newPrice"]/span/text()'))
# if is_shot:
# author = format.split(' von ')[-1]
# else:
author = ''.join(data.xpath('.//div[@class="title"]/span[@class="ptBrand"]/text()'))
author = author.split('von ')[-1]
if author.startswith('von '):
author = author[4:]
counter -= 1
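
The recurring change in these store plugins replaces urllib.quote_plus, which raises on non-ASCII unicode queries in Python 2, with a manual transform: non-ASCII characters are backslash-escaped to '\xNN' and then rewritten as latin-1 percent-escapes, matching the latin-1 decode applied to the response. A small sketch of the transform, assuming Python 2 as in the codebase (the helper name is made up):

def encode_amazon_query(query):
    # u'M\xfcller' -> 'M%fcller'; literal '%' is escaped first so it is
    # not confused with the escapes introduced by the next step
    return (query.encode('ascii', 'backslashreplace')
                 .replace('%', '%25')
                 .replace('\\x', '%')
                 .replace(' ', '+'))

print encode_amazon_query(u'M\xfcller B\xfccher')  # -> M%fcller+B%fccher

Note that characters outside latin-1 would come through as '\u' escapes, which this transform (like the plugin code) does not handle.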

View File

@ -0,0 +1,82 @@
# -*- coding: utf-8 -*-
from __future__ import (unicode_literals, division, absolute_import, print_function)
__license__ = 'GPL 3'
__copyright__ = '2011, John Schember <john@nachtimwald.com>'
__docformat__ = 'restructuredtext en'
from contextlib import closing
from lxml import html
from PyQt4.Qt import QUrl
from calibre import browser
from calibre.gui2 import open_url
from calibre.gui2.store import StorePlugin
from calibre.gui2.store.search_result import SearchResult
class AmazonFRKindleStore(StorePlugin):
'''
For comments on the implementation, please see amazon_plugin.py
'''
def open(self, parent=None, detail_item=None, external=False):
aff_id = {'tag': 'charhale-21'}
store_link = 'http://www.amazon.fr/livres-kindle/b?ie=UTF8&node=695398031&ref_=sa_menu_kbo1&_encoding=UTF8&tag=%(tag)s&linkCode=ur2&camp=1642&creative=19458' % aff_id
if detail_item:
aff_id['asin'] = detail_item
store_link = 'http://www.amazon.fr/gp/redirect.html?ie=UTF8&location=http://www.amazon.fr/dp/%(asin)s&tag=%(tag)s&linkCode=ur2&camp=1634&creative=6738' % aff_id
open_url(QUrl(store_link))
def search(self, query, max_results=10, timeout=60):
search_url = 'http://www.amazon.fr/s/?url=search-alias%3Ddigital-text&field-keywords='
url = search_url + query.encode('ascii', 'backslashreplace').replace('%', '%25').replace('\\x', '%').replace(' ', '+')
br = browser()
counter = max_results
with closing(br.open(url, timeout=timeout)) as f:
doc = html.fromstring(f.read().decode('latin-1', 'replace'))
data_xpath = '//div[contains(@class, "result") and contains(@class, "product")]'
format_xpath = './/span[@class="format"]/text()'
cover_xpath = './/img[@class="productImage"]/@src'
for data in doc.xpath(data_xpath):
if counter <= 0:
break
# Even though we are searching digital-text only, Amazon will still
# include results for non-Kindle books (author pages), so we need to
# explicitly check whether each item is a Kindle book and skip it
# if it isn't.
format = ''.join(data.xpath(format_xpath))
if 'kindle' not in format.lower():
continue
# We must have an ASIN, otherwise we can't easily reference the
# book later.
asin = ''.join(data.xpath("@name"))
cover_url = ''.join(data.xpath(cover_xpath))
title = ''.join(data.xpath('.//div[@class="title"]/a/text()'))
price = ''.join(data.xpath('.//div[@class="newPrice"]/span/text()'))
author = unicode(''.join(data.xpath('.//div[@class="title"]/span[@class="ptBrand"]/text()')))
if author.startswith('de '):
author = author[3:]
counter -= 1
s = SearchResult()
s.cover_url = cover_url.strip()
s.title = title.strip()
s.author = author.strip()
s.price = price.strip()
s.detail_item = asin.strip()
s.formats = 'Kindle'
s.drm = SearchResult.DRM_UNKNOWN
yield s
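
The new plugin follows the same scrape-and-yield pattern as the other Amazon stores, and its xpath logic can be exercised outside calibre. A self-contained sketch against a made-up fragment shaped like Amazon's result markup (the HTML, ASIN and book are invented):

from lxml import html

doc = html.fromstring('''
<div class="result product" name="B000FAKE42">
  <div class="title"><a>Vingt mille lieues sous les mers</a>
    <span class="ptBrand">de Jules Verne</span></div>
  <span class="format">Format Kindle</span>
</div>''')

for data in doc.xpath('//div[contains(@class, "result") and contains(@class, "product")]'):
    # Skip anything that is not an ebook result
    fmt = ''.join(data.xpath('.//span[@class="format"]/text()'))
    if 'kindle' not in fmt.lower():
        continue
    asin = ''.join(data.xpath('@name'))
    author = ''.join(data.xpath('.//div[@class="title"]/span[@class="ptBrand"]/text()'))
    if author.startswith('de '):   # strip the 'de ' byline prefix
        author = author[3:]
    print asin, author   # -> B000FAKE42 Jules Verne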

View File

@ -8,7 +8,6 @@ __docformat__ = 'restructuredtext en'
import random
import re
import urllib
from contextlib import closing
from lxml import html
@ -122,12 +121,12 @@ class AmazonKindleStore(StorePlugin):
open_url(QUrl(store_link))
def search(self, query, max_results=10, timeout=60):
url = self.search_url + urllib.quote_plus(query)
url = self.search_url + query.encode('ascii', 'backslashreplace').replace('%', '%25').replace('\\x', '%').replace(' ', '+')
br = browser()
counter = max_results
with closing(br.open(url, timeout=timeout)) as f:
doc = html.fromstring(f.read())
doc = html.fromstring(f.read().decode('latin-1', 'replace'))
# Amazon has two results pages.
is_shot = doc.xpath('boolean(//div[@id="shotgunMainResults"])')

View File

@ -6,7 +6,6 @@ __license__ = 'GPL 3'
__copyright__ = '2011, John Schember <john@nachtimwald.com>'
__docformat__ = 'restructuredtext en'
import urllib
from contextlib import closing
from lxml import html
@ -34,27 +33,16 @@ class AmazonUKKindleStore(StorePlugin):
def search(self, query, max_results=10, timeout=60):
search_url = 'http://www.amazon.co.uk/s/?url=search-alias%3Ddigital-text&field-keywords='
url = search_url + urllib.quote_plus(query)
url = search_url + query.encode('ascii', 'backslashreplace').replace('%', '%25').replace('\\x', '%').replace(' ', '+')
br = browser()
counter = max_results
with closing(br.open(url, timeout=timeout)) as f:
doc = html.fromstring(f.read())
doc = html.fromstring(f.read().decode('latin-1', 'replace'))
# Amazon has two results pages.
# 20110725: seems that is_shot is gone.
# is_shot = doc.xpath('boolean(//div[@id="shotgunMainResults"])')
# # Horizontal grid of books.
# if is_shot:
# data_xpath = '//div[contains(@class, "result")]'
# format_xpath = './/div[@class="productTitle"]/text()'
# cover_xpath = './/div[@class="productTitle"]//img/@src'
# # Vertical list of books.
# else:
data_xpath = '//div[contains(@class, "result") and contains(@class, "product")]'
format_xpath = './/span[@class="format"]/text()'
cover_xpath = './/img[@class="productImage"]/@src'
# end is_shot else
for data in doc.xpath(data_xpath):
if counter <= 0:
@ -77,11 +65,9 @@ class AmazonUKKindleStore(StorePlugin):
title = ''.join(data.xpath('.//div[@class="title"]/a/text()'))
price = ''.join(data.xpath('.//div[@class="newPrice"]/span/text()'))
# if is_shot:
# author = format.split(' von ')[-1]
# else:
author = ''.join(data.xpath('.//div[@class="title"]/span[@class="ptBrand"]/text()'))
author = author.split('by ')[-1]
if author.startswith('by '):
author = author[3:]
counter -= 1

View File

@ -47,6 +47,9 @@ def get_parser(usage):
def get_db(dbpath, options):
if options.library_path is not None:
dbpath = options.library_path
if dbpath is None:
raise ValueError('No saved library path, either run the GUI or use the'
' --with-library option')
dbpath = os.path.abspath(dbpath)
return LibraryDatabase2(dbpath)
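
This guard makes calibredb fail fast with an actionable message when no library has ever been configured and --with-library was not given, instead of crashing later on a None path. Restated as a tiny standalone function (names and paths invented):

import os

def resolve_library_path(saved_path, with_library):
    dbpath = with_library if with_library is not None else saved_path
    if dbpath is None:
        raise ValueError('No saved library path, either run the GUI or use the'
            ' --with-library option')
    return os.path.abspath(dbpath)

print resolve_library_path(None, '/srv/books')  # -> /srv/books
# resolve_library_path(None, None) raises ValueError with the hint above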

View File

@ -133,7 +133,7 @@ class Rule(object): # {{{
'lt': ('1', '', ''),
'gt': ('', '', '1')
}[action]
return "cmp(format_date(raw_field('%s'), 'yyyy-MM-dd'), %s, '%s', '%s', '%s')" % (col,
return "strcmp(format_date(raw_field('%s'), 'yyyy-MM-dd'), '%s', '%s', '%s', '%s')" % (col,
val, lt, eq, gt)
def multiple_condition(self, col, action, val, sep):
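
The fix matters because the template function cmp() treats its arguments as numbers, which mangles formatted dates, while strcmp() compares strings, and yyyy-MM-dd dates order correctly under a plain string compare. What a hypothetical "date earlier than 2011-01-01" rule generates after this change (column and date invented):

col, val = 'pubdate', '2011-01-01'
lt, eq, gt = '1', '', ''   # the 'lt' action: match only when earlier
print "strcmp(format_date(raw_field('%s'), 'yyyy-MM-dd'), '%s', '%s', '%s', '%s')" % (
    col, val, lt, eq, gt)
# -> strcmp(format_date(raw_field('pubdate'), 'yyyy-MM-dd'), '2011-01-01', '1', '', '')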

View File

@ -266,7 +266,7 @@ The following functions are available in addition to those described in single-f
* ``has_cover()`` -- return ``Yes`` if the book has a cover, otherwise return the empty string
* ``not(value)`` -- returns the string "1" if the value is empty, otherwise returns the empty string. This function works well with ``test`` or ``first_non_empty``. You can have as many values as you want.
* ``list_difference(list1, list2, separator)`` -- return a list made by removing from `list1` any item found in `list2`, using a case-insensitive compare. The items in `list1` and `list2` are separated by separator, as are the items in the returned list.
* ``list_equals(list1, sep1, list2, sep2, yes_val, no_val) -- return `yes_val` if list1 and list2 contain the same items, otherwise return `no_val`. The items are determined by splitting each list using the appropriate separator character (`sep1` or `sep2`). The order of items in the lists is not relevant. The compare is case insensitive.
* ``list_equals(list1, sep1, list2, sep2, yes_val, no_val)`` -- return `yes_val` if `list1` and `list2` contain the same items, otherwise return `no_val`. The items are determined by splitting each list using the appropriate separator character (`sep1` or `sep2`). The order of items in the lists is not relevant. The compare is case insensitive.
* ``list_intersection(list1, list2, separator)`` -- return a list made by removing from `list1` any item not found in `list2`, using a case-insensitive compare. The items in `list1` and `list2` are separated by separator, as are the items in the returned list.
* ``list_sort(list, direction, separator)`` -- return list sorted using a case-insensitive sort. If `direction` is zero, the list is sorted ascending, otherwise descending. The list items are separated by separator, as are the items in the returned list.
* ``list_union(list1, list2, separator)`` -- return a list made by merging the items in list1 and list2, removing duplicate items using a case-insensitive compare. If items differ in case, the one in list1 is used. The items in list1 and list2 are separated by separator, as are the items in the returned list.
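
Two hedged usage sketches in template program mode (the tags column and the values are illustrative):

program: list_union(field('tags'), 'Fiction, Classics', ',')
program: list_equals(field('tags'), ',', 'Fiction, Classics', ',', 'same', 'different')

The first merges the made-up tags into the book's tag list without duplicates; the second prints 'same' only when the book's tags are exactly that two-item set, in any order and case.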

View File

@ -18,11 +18,13 @@ def create_mail(from_, to, subject, text=None, attachment_data=None,
assert text or attachment_data
from email.mime.multipart import MIMEMultipart
from email.utils import formatdate
outer = MIMEMultipart()
outer['Subject'] = subject
outer['To'] = to
outer['From'] = from_
outer['Date'] = formatdate(localtime=True)
outer.preamble = 'You will not see this in a MIME-aware mail reader.\n'
if text is not None: