Merge from main branch

Tom Scholl 2011-03-31 23:00:15 +00:00
commit a396c9a90b
11 changed files with 416 additions and 392 deletions


@@ -1,134 +1,129 @@
from calibre.web.feeds.news import BasicNewsRecipe
from datetime import datetime, timedelta
from calibre.ebooks.BeautifulSoup import Tag,BeautifulSoup
from calibre.utils.magick import Image, PixelWand
from urllib2 import Request, urlopen, URLError

class Estadao(BasicNewsRecipe):
    THUMBALIZR_API = '' # ---->Get your at http://www.thumbalizr.com/ and put here
    LANGUAGE = 'pt_br'
    language = 'pt'
    LANGHTM = 'pt-br'
    ENCODING = 'utf'
    ENCHTM = 'utf-8'
    directionhtm = 'ltr'
    requires_version = (0,7,47)
    news = True

    title = u'Estad\xe3o'
    __author__ = 'Euler Alves'
    description = u'Brazilian news from Estad\xe3o'
    publisher = u'Estad\xe3o'
    category = 'news, rss'

    oldest_article = 4
    max_articles_per_feed = 100
    summary_length = 1000

    remove_javascript = True
    no_stylesheets = True
    use_embedded_content = False
    remove_empty_feeds = True
    timefmt = ' [%d %b %Y (%a)]'

    hoje = datetime.now()-timedelta(days=2)
    pubdate = hoje.strftime('%a, %d %b')
    if hoje.hour<10:
        hoje = hoje-timedelta(days=1)
    CAPA = 'http://www.estadao.com.br/estadaodehoje/'+hoje.strftime('%Y%m%d')+'/img/capadodia.jpg'
    SCREENSHOT = 'http://estadao.com.br/'
    cover_margins = (0,0,'white')
    masthead_url = 'http://www.estadao.com.br/estadao/novo/img/logo.png'

    keep_only_tags = [dict(name='div', attrs={'class':['bb-md-noticia','corpo']})]
    remove_tags = [
        dict(name='div',
             attrs={'id':[
                'bb-md-noticia-tabs'
             ]})
        ,dict(name='div',
             attrs={'class':[
                'tags'
                ,'discussion'
                ,'bb-gg adsense_container'
             ]})
        ,dict(name='a')
        ,dict(name='iframe')
        ,dict(name='link')
        ,dict(name='script')
    ]

    feeds = [
        (u'\xDAltimas Not\xEDcias', u'http://www.estadao.com.br/rss/ultimas.xml')
        ,(u'Manchetes', u'http://www.estadao.com.br/rss/manchetes.xml')
        ,(u'Brasil', u'http://www.estadao.com.br/rss/brasil.xml')
        ,(u'Internacional', u'http://www.estadao.com.br/rss/internacional.xml')
        ,(u'Cinema', u'http://blogs.estadao.com.br/cinema/feed/')
        ,(u'Planeta', u'http://www.estadao.com.br/rss/planeta.xml')
        ,(u'Ci\xEAncia', u'http://www.estadao.com.br/rss/ciencia.xml')
        ,(u'Sa\xFAde', u'http://www.estadao.com.br/rss/saude.xml')
        ,(u'Pol\xEDtica', u'http://www.estadao.com.br/rss/politica.xml')
    ]

    conversion_options = {
        'title'            : title
        ,'comments'        : description
        ,'publisher'       : publisher
        ,'tags'            : category
        ,'language'        : LANGUAGE
        ,'linearize_tables': True
    }

    def preprocess_html(self, soup):
        for item in soup.findAll(style=True):
            del item['style']
        if not soup.find(attrs={'http-equiv':'Content-Language'}):
            meta0 = Tag(soup,'meta',[("http-equiv","Content-Language"),("content",self.LANGHTM)])
            soup.head.insert(0,meta0)
        if not soup.find(attrs={'http-equiv':'Content-Type'}):
            meta1 = Tag(soup,'meta',[("http-equiv","Content-Type"),("content","text/html; charset="+self.ENCHTM)])
            soup.head.insert(0,meta1)
        return soup

    def postprocess_html(self, soup, first):
        # process all the images. assumes that the new html has the correct path
        for tag in soup.findAll(lambda tag: tag.name.lower()=='img' and tag.has_key('src')):
            iurl = tag['src']
            img = Image()
            img.open(iurl)
            width, height = img.size
            print 'img is: ', iurl, 'width is: ', width, 'height is: ', height
            if img < 0:
                raise RuntimeError('Out of memory')
            pw = PixelWand()
            if( width > height and width > 590) :
                print 'Rotate image'
                img.rotate(pw, -90)
                img.save(iurl)
        return soup

    def get_cover_url(self):
        if self.THUMBALIZR_API:
            cover_url = self.CAPA
            pedido = Request(self.CAPA)
            pedido.add_header('User-agent','Mozilla/5.0 (Windows; U; Windows NT 5.1; '+self.LANGHTM+'; userid='+self.THUMBALIZR_API+') Calibre/0.8.47 (like Gecko)')
            pedido.add_header('Accept-Charset',self.ENCHTM)
            pedido.add_header('Referer',self.SCREENSHOT)
            try:
                resposta = urlopen(pedido)
                soup = BeautifulSoup(resposta)
                cover_item = soup.find('body')
                if cover_item:
                    cover_url = 'http://api.thumbalizr.com/?api_key='+self.THUMBALIZR_API+'&url='+self.SCREENSHOT+'&width=600&quality=90'
                return cover_url
            except URLError:
                cover_url = 'http://api.thumbalizr.com/?api_key='+self.THUMBALIZR_API+'&url='+self.SCREENSHOT+'&width=600&quality=90'
                return cover_url
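The rewritten get_cover_url() only contacts Thumbalizr when THUMBALIZR_API is non-empty, so the key has to be supplied by whoever runs the recipe. A minimal sketch of doing that by subclassing; the class name MyEstadao and the key value are placeholders, not part of the commit:

# Sketch only: MyEstadao and the key value are invented placeholders.
class MyEstadao(Estadao):
    THUMBALIZR_API = 'your-key-here'   # obtained from http://www.thumbalizr.com/

# With the key left empty, get_cover_url() never builds the Request and simply
# returns None, so no call is made to api.thumbalizr.com.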


@@ -1,149 +1,151 @@
from calibre.web.feeds.news import BasicNewsRecipe
from datetime import datetime, timedelta
from calibre.ebooks.BeautifulSoup import Tag,BeautifulSoup
from calibre.utils.magick import Image, PixelWand
from urllib2 import Request, urlopen, URLError

class FolhaOnline(BasicNewsRecipe):
    THUMBALIZR_API = '' # ---->Get your at http://www.thumbalizr.com/ and put here
    LANGUAGE = 'pt_br'
    language = 'pt'
    LANGHTM = 'pt-br'
    ENCODING = 'cp1252'
    ENCHTM = 'iso-8859-1'
    directionhtm = 'ltr'
    requires_version = (0,7,47)
    news = True
    publication_type = 'newsportal'

    title = u'Folha de S\xE3o Paulo'
    __author__ = 'Euler Alves'
    description = u'Brazilian news from Folha de S\xE3o Paulo'
    publisher = u'Folha de S\xE3o Paulo'
    category = 'news, rss'

    oldest_article = 4
    max_articles_per_feed = 100
    summary_length = 1000

    remove_javascript = True
    no_stylesheets = True
    use_embedded_content = False
    remove_empty_feeds = True
    timefmt = ' [%d %b %Y (%a)]'

    html2lrf_options = [
        '--comment', description
        ,'--category', category
        ,'--publisher', publisher
    ]

    html2epub_options = 'publisher="' + publisher + '"\ncomments="' + description + '"\ntags="' + category + '"'

    hoje = datetime.now()
    pubdate = hoje.strftime('%a, %d %b')
    if hoje.hour<6:
        hoje = hoje-timedelta(days=1)
    CAPA = 'http://www1.folha.uol.com.br/fsp/images/cp'+hoje.strftime('%d%m%Y')+'.jpg'
    SCREENSHOT = 'http://www1.folha.uol.com.br/'
    cover_margins = (0,0,'white')
    masthead_url = 'http://f.i.uol.com.br/fsp/furniture/images/lgo-fsp-430x50-ffffff.gif'

    keep_only_tags = [dict(name='div', attrs={'id':'articleNew'})]
    remove_tags = [
        dict(name='div',
             attrs={'id':[
                'articleButton'
                ,'bookmarklets'
                ,'ad-180x150-1'
                ,'contextualAdsArticle'
                ,'articleEnd'
                ,'articleComments'
             ]})
        ,dict(name='div',
             attrs={'class':[
                'openBox adslibraryArticle'
             ]})
        ,dict(name='a')
        ,dict(name='iframe')
        ,dict(name='link')
        ,dict(name='script')
    ]

    feeds = [
        (u'Em cima da hora', u'http://feeds.folha.uol.com.br/emcimadahora/rss091.xml')
        ,(u'Ambiente', u'http://feeds.folha.uol.com.br/ambiente/rss091.xml')
        ,(u'Bichos', u'http://feeds.folha.uol.com.br/bichos/rss091.xml')
        ,(u'Ci\xEAncia', u'http://feeds.folha.uol.com.br/ciencia/rss091.xml')
        ,(u'Poder', u'http://feeds.folha.uol.com.br/poder/rss091.xml')
        ,(u'Equil\xEDbrio e Sa\xFAde', u'http://feeds.folha.uol.com.br/equilibrioesaude/rss091.xml')
        ,(u'Turismo', u'http://feeds.folha.uol.com.br/folha/turismo/rss091.xml')
        ,(u'Mundo', u'http://feeds.folha.uol.com.br/mundo/rss091.xml')
        ,(u'Pelo Mundo', u'http://feeds.folha.uol.com.br/pelomundo.folha.rssblog.uol.com.br/')
        ,(u'Circuito integrado', u'http://feeds.folha.uol.com.br/circuitointegrado.folha.rssblog.uol.com.br/')
        ,(u'Blog do Fred', u'http://feeds.folha.uol.com.br/blogdofred.folha.rssblog.uol.com.br/')
        ,(u'Maria In\xEAs Dolci', u'http://feeds.folha.uol.com.br/mariainesdolci.folha.blog.uol.com.br/')
        ,(u'Eduardo Ohata', u'http://feeds.folha.uol.com.br/folha/pensata/eduardoohata/rss091.xml')
        ,(u'Kennedy Alencar', u'http://feeds.folha.uol.com.br/folha/pensata/kennedyalencar/rss091.xml')
        ,(u'Eliane Catanh\xEAde', u'http://feeds.folha.uol.com.br/folha/pensata/elianecantanhede/rss091.xml')
        ,(u'Fernado Canzian', u'http://feeds.folha.uol.com.br/folha/pensata/fernandocanzian/rss091.xml')
        ,(u'Gilberto Dimenstein', u'http://feeds.folha.uol.com.br/folha/pensata/gilbertodimenstein/rss091.xml')
        ,(u'H\xE9lio Schwartsman', u'http://feeds.folha.uol.com.br/folha/pensata/helioschwartsman/rss091.xml')
        ,(u'Jo\xE3o Pereira Coutinho', u'http://http://feeds.folha.uol.com.br/folha/pensata/joaopereiracoutinho/rss091.xml')
        ,(u'Luiz Caversan', u'http://http://feeds.folha.uol.com.br/folha/pensata/luizcaversan/rss091.xml')
        ,(u'S\xE9rgio Malbergier', u'http://http://feeds.folha.uol.com.br/folha/pensata/sergiomalbergier/rss091.xml')
        ,(u'Valdo Cruz', u'http://http://feeds.folha.uol.com.br/folha/pensata/valdocruz/rss091.xml')
    ]

    conversion_options = {
        'title'            : title
        ,'comments'        : description
        ,'publisher'       : publisher
        ,'tags'            : category
        ,'language'        : LANGUAGE
        ,'linearize_tables': True
    }

    def preprocess_html(self, soup):
        for item in soup.findAll(style=True):
            del item['style']
        if not soup.find(attrs={'http-equiv':'Content-Language'}):
            meta0 = Tag(soup,'meta',[("http-equiv","Content-Language"),("content",self.LANGHTM)])
            soup.head.insert(0,meta0)
        if not soup.find(attrs={'http-equiv':'Content-Type'}):
            meta1 = Tag(soup,'meta',[("http-equiv","Content-Type"),("content","text/html; charset="+self.ENCHTM)])
            soup.head.insert(0,meta1)
        return soup

    def postprocess_html(self, soup, first):
        # process all the images. assumes that the new html has the correct path
        for tag in soup.findAll(lambda tag: tag.name.lower()=='img' and tag.has_key('src')):
            iurl = tag['src']
            img = Image()
            img.open(iurl)
            width, height = img.size
            print 'img is: ', iurl, 'width is: ', width, 'height is: ', height
            if img < 0:
                raise RuntimeError('Out of memory')
            pw = PixelWand()
            if( width > height and width > 590) :
                print 'Rotate image'
                img.rotate(pw, -90)
                img.save(iurl)
        return soup

    def get_cover_url(self):
        cover_url = self.CAPA
        pedido = Request(self.CAPA)
        pedido.add_header('User-agent','Mozilla/5.0 (Windows; U; Windows NT 5.1; '+self.LANGHTM+'; userid='+self.THUMBALIZR_API+') Calibre/0.8.47 (like Gecko)')
        pedido.add_header('Accept-Charset',self.ENCHTM)
        pedido.add_header('Referer',self.SCREENSHOT)
        try:
            resposta = urlopen(pedido)
            soup = BeautifulSoup(resposta)
            cover_item = soup.find('body')
            if cover_item:
                cover_url = 'http://api.thumbalizr.com/?api_key='+self.THUMBALIZR_API+'&url='+self.SCREENSHOT+'&width=600&quality=90'
            return cover_url
        except URLError:
            cover_url = 'http://api.thumbalizr.com/?api_key='+self.THUMBALIZR_API+'&url='+self.SCREENSHOT+'&width=600&quality=90'
            return cover_url
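Both recipes rotate oversized landscape images in postprocess_html(). The decision itself does not depend on calibre, so it can be sketched on its own; the 590 px threshold comes from the code above, while the helper name needs_rotation is invented here:

# Sketch of the rotation rule used above (needs_rotation is a hypothetical name).
def needs_rotation(width, height, max_width=590):
    # Only wide landscape images that would overflow the page width get rotated.
    return width > height and width > max_width

assert needs_rotation(800, 500)        # wide landscape: rotated by -90 degrees
assert not needs_rotation(500, 800)    # portrait: left alone
assert not needs_rotation(580, 300)    # narrow enough: left alone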

Binary file not shown (before: 1.7 KiB, after: 714 B).

Binary file not shown (after: 392 B).


@@ -1,37 +1,100 @@
from calibre.web.feeds.news import BasicNewsRecipe
from datetime import datetime
from calibre.ebooks.BeautifulSoup import Tag
from calibre.utils.magick import Image, PixelWand

class LifeHacker(BasicNewsRecipe):
    THUMBALIZR_API = '' # ---->Get your at http://www.thumbalizr.com/ and put here
    LANGUAGE = 'en'
    LANGHTM = 'en'
    language = 'en'
    ENCODING = 'utf'
    ENCHTM = 'utf-8'
    requires_version = (0,7,47)
    news = True

    title = u'LifeHacker'
    __author__ = 'Euler Alves'
    description = u'Tips, tricks, and downloads for getting things done.'
    publisher = u'lifehacker.com'
    author = u'Adam Pash & Kevin Purdy & Adam Dachis & Whitson Gordon & Gina Trapani'
    category = 'news, rss'

    oldest_article = 4
    max_articles_per_feed = 20
    summary_length = 1000

    remove_javascript = True
    no_stylesheets = True
    use_embedded_content = True
    remove_empty_feeds = True
    timefmt = ' [%d %b %Y (%a)]'
    hoje = datetime.now()
    pubdate = hoje.strftime('%a, %d %b')
    cover_url = 'http://api.thumbalizr.com/?api_key='+THUMBALIZR_API+'&url=http://lifehacker.com&width=600&quality=90'
    cover_margins = (0,0,'white')
    masthead_url = 'http://cache.gawkerassets.com/assets/lifehacker.com/img/logo.png'

    remove_tags = [
        {'class': 'feedflare'},
        dict(name='div',
             attrs={'class':[
                'ad_container'
                ,'ad_300x250'
                ,'ad_interstitial'
                ,'share-wrap'
                ,'ad_300x600'
                ,'ad_perma-footer-adsense'
                ,'ad_perma-panorama'
                ,'ad panorama'
                ,'ad_container'
             ]})
        ,dict(name='div',
             attrs={'id':[
                'agegate_container'
                ,'agegate_container_rejected'
                ,'sharemenu-wrap'
             ]})
    ]

    feeds = [(u'Articles', u'http://feeds.gawker.com/lifehacker/vip?format=xml')]

    conversion_options = {
        'title'            : title
        ,'comments'        : description
        ,'publisher'       : publisher
        ,'tags'            : category
        ,'language'        : LANGUAGE
        ,'linearize_tables': True
    }

    def preprocess_html(self, soup):
        for item in soup.findAll(style=True):
            del item['style']
        if not soup.find(attrs={'http-equiv':'Content-Language'}):
            meta0 = Tag(soup,'meta',[("http-equiv","Content-Language"),("content",self.LANGHTM)])
            soup.head.insert(0,meta0)
        if not soup.find(attrs={'http-equiv':'Content-Type'}):
            meta1 = Tag(soup,'meta',[("http-equiv","Content-Type"),("content","text/html; charset="+self.ENCHTM)])
            soup.head.insert(0,meta1)
        return soup

    def postprocess_html(self, soup, first):
        # process all the images. assumes that the new html has the correct path
        for tag in soup.findAll(lambda tag: tag.name.lower()=='img' and tag.has_key('src')):
            iurl = tag['src']
            img = Image()
            img.open(iurl)
            width, height = img.size
            print 'img is: ', iurl, 'width is: ', width, 'height is: ', height
            if img < 0:
                raise RuntimeError('Out of memory')
            pw = PixelWand()
            if( width > height and width > 590) :
                print 'Rotate image'
                img.rotate(pw, -90)
                img.save(iurl)
        return soup
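All three recipes share the same preprocess_html() pattern: when the article <head> lacks a Content-Language or Content-Type meta tag, one is inserted with the Tag() constructor from calibre's bundled BeautifulSoup. A standalone sketch of that pattern, using invented sample HTML:

# Sketch reusing the Tag(parser, name, attrs) call seen in the recipes above.
from calibre.ebooks.BeautifulSoup import BeautifulSoup, Tag

soup = BeautifulSoup('<html><head></head><body><p>hello</p></body></html>')
if not soup.find(attrs={'http-equiv': 'Content-Type'}):
    meta = Tag(soup, 'meta', [('http-equiv', 'Content-Type'),
                              ('content', 'text/html; charset=utf-8')])
    soup.head.insert(0, meta)
print soup.head   # now carries an explicit charset declaration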


@@ -5,7 +5,8 @@ __license__ = 'GPL v3'
 __copyright__ = '2009, Kovid Goyal <kovid@kovidgoyal.net>'
 __docformat__ = 'restructuredtext en'
 
-import os, re, cStringIO, base64, httplib, subprocess, hashlib, shutil, time, glob
+import os, re, cStringIO, base64, httplib, subprocess, hashlib, shutil, time, \
+        glob, stat
 from subprocess import check_call
 from tempfile import NamedTemporaryFile, mkdtemp
 from zipfile import ZipFile

@@ -344,6 +345,8 @@ class UploadUserManual(Command): # {{{
     def build_plugin_example(self, path):
         from calibre import CurrentDir
         with NamedTemporaryFile(suffix='.zip') as f:
+            os.fchmod(f.fileno(),
+                stat.S_IRUSR|stat.S_IRGRP|stat.S_IROTH|stat.S_IWRITE)
             with CurrentDir(self.d(path)):
                 with ZipFile(f, 'w') as zf:
                     for x in os.listdir('.'):

@@ -352,8 +355,8 @@ class UploadUserManual(Command): # {{{
                         for y in os.listdir(x):
                             zf.write(os.path.join(x, y))
             bname = self.b(path) + '_plugin.zip'
-            subprocess.check_call(['scp', f.name, 'divok:%s/%s'%(DOWNLOADS,
-                bname)])
+            dest = '%s/%s'%(DOWNLOADS, bname)
+            subprocess.check_call(['scp', f.name, dest])
 
     def run(self, opts):
         path = self.j(self.SRC, 'calibre', 'manual', 'plugin_examples')
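The added os.fchmod() widens the permissions of the temporary zip before f.name is handed to scp: NamedTemporaryFile creates files as 0600, which would typically leave the uploaded copy unreadable for anyone but the owner. A small sketch of the effect (Unix only; the file name is whatever tempfile picks):

# Sketch: the temp file starts at mode 0600; the fchmod below yields 0644.
import os, stat
from tempfile import NamedTemporaryFile

with NamedTemporaryFile(suffix='.zip') as f:
    os.fchmod(f.fileno(),
        stat.S_IRUSR|stat.S_IRGRP|stat.S_IROTH|stat.S_IWRITE)
    print oct(stat.S_IMODE(os.fstat(f.fileno()).st_mode))   # -> 0644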


@@ -110,4 +110,11 @@ def xml_to_unicode(raw, verbose=False, strip_encoding_pats=False,
     if resolve_entities:
         raw = substitute_entites(raw)
 
+    if encoding and encoding.lower().replace('_', '-').strip() in (
+            'gb2312', 'chinese', 'csiso58gb231280', 'euc-cn', 'euccn',
+            'eucgb2312-cn', 'gb2312-1980', 'gb2312-80', 'iso-ir-58'):
+        # Microsoft Word exports to HTML with encoding incorrectly set to
+        # gb2312 instead of gbk. gbk is a superset of gb2312, anyway.
+        encoding = 'gbk'
+
     return raw, encoding
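The new branch folds the whole gb2312 family of encoding labels into gbk before decoding. gbk is a strict superset of gb2312, so nothing is lost, and it repairs HTML that Microsoft Word labels gb2312 while actually using gbk-only characters. The same check as a standalone helper; the function name is invented here:

GB2312_ALIASES = ('gb2312', 'chinese', 'csiso58gb231280', 'euc-cn', 'euccn',
                  'eucgb2312-cn', 'gb2312-1980', 'gb2312-80', 'iso-ir-58')

def normalize_declared_encoding(encoding):
    # Decoding gb2312-labelled text as gbk is always safe: gbk is a superset.
    if encoding and encoding.lower().replace('_', '-').strip() in GB2312_ALIASES:
        return 'gbk'
    return encoding

print normalize_declared_encoding('EUC_CN')   # -> gbk
print normalize_declared_encoding('utf-8')    # -> utf-8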


@@ -198,8 +198,10 @@ class Metadata(object):
         return copy.deepcopy(ans)
 
     def _clean_identifier(self, typ, val):
-        typ = icu_lower(typ).strip().replace(':', '').replace(',', '')
-        val = val.strip().replace(',', '|').replace(':', '|')
+        if typ:
+            typ = icu_lower(typ).strip().replace(':', '').replace(',', '')
+        if val:
+            val = val.strip().replace(',', '|').replace(':', '|')
         return typ, val
 
     def set_identifiers(self, identifiers):
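_clean_identifier() now tolerates a missing type or value instead of raising AttributeError on None. A sketch of the guarded behaviour, with plain str.lower() standing in for calibre's locale-aware icu_lower():

def clean_identifier(typ, val):
    if typ:
        typ = typ.lower().strip().replace(':', '').replace(',', '')
    if val:
        val = val.strip().replace(',', '|').replace(':', '|')
    return typ, val

print clean_identifier('ISBN:', ' 0-000-00000-0 ')   # ('isbn', '0-000-00000-0')
print clean_identifier(None, None)                   # (None, None), no exception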


@@ -503,7 +503,7 @@ if __name__ == '__main__': # tests {{{
         ( # This isbn not on amazon
             {'identifiers':{'isbn': '8324616489'}, 'title':'Learning Python',
                 'authors':['Lutz']},
-            [title_test('Learning Python: Powerful Object-Oriented Programming',
+            [title_test('Learning Python, 3rd Edition',
                 exact=True), authors_test(['Mark Lutz'])
             ]


@@ -1132,7 +1132,7 @@ class DeviceBooksModel(BooksModel): # {{{
             self.sorted_map = list(self.map)
         else:
             self.sorted_map = list(range(len(self.db)))
-        self.sorted_map.sort(cmp=keygen, reverse=descending)
+        self.sorted_map.sort(key=keygen, reverse=descending)
         self.sorted_on = (self.column_map[col], order)
         self.sort_history.insert(0, self.sorted_on)
         if hasattr(keygen, 'db'):
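The sort now passes keygen as key= rather than cmp=: the generator is used as a key function here, so each row's key is computed once instead of on every comparison, and key= is also the only form that survives into Python 3, where cmp= is gone. A self-contained sketch of sorting an index map with a key function; the sample data is invented:

db = ['zebra', 'Apple', 'mango']
keygen = lambda idx: db[idx].lower()    # one sortable value per index

sorted_map = list(range(len(db)))
sorted_map.sort(key=keygen, reverse=False)
print sorted_map    # [1, 2, 0], i.e. Apple, mango, zebra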


@@ -1,8 +1,14 @@
 #!/usr/bin/env python
+# -*- coding: utf-8 -*-
 """html2text: Turn HTML into equivalent Markdown-structured text."""
-__version__ = "2.39"
-__author__ = "Aaron Swartz (me@aaronsw.com)"
-__copyright__ = "(C) 2004-2008 Aaron Swartz. GNU GPL 3."
+# Last upstream version before changes
+#__version__ = "2.39"
+__license__ = 'GPL 3'
+__copyright__ = '''
+Copyright (c) 2011, John Schember <john@nachtimwald.com>
+(C) 2004-2008 Aaron Swartz <me@aaronsw.com>
+'''
 __contributors__ = ["Martin 'Joey' Schulze", "Ricardo Reyes", "Kevin Jay North"]
 
 # TODO:

@@ -11,7 +17,6 @@ __contributors__ = ["Martin 'Joey' Schulze", "Ricardo Reyes", "Kevin Jay North"]
 if not hasattr(__builtins__, 'True'): True, False = 1, 0
 
 import re, sys, urllib, htmlentitydefs, codecs
 import sgmllib
-import urlparse
 sgmllib.charref = re.compile('&#([xX]?[0-9a-fA-F]+)[^0-9a-fA-F]')
 
 try: from textwrap import wrap

@@ -145,9 +150,7 @@ class _html2text(sgmllib.SGMLParser):
         self.outcount = 0
         self.start = 1
         self.space = 0
-        self.a = []
         self.astack = []
-        self.acount = 0
         self.list = []
         self.blockquote = 0
         self.pre = 0

@@ -181,29 +184,6 @@ class _html2text(sgmllib.SGMLParser):
     def unknown_endtag(self, tag):
         self.handle_tag(tag, None, 0)
 
-    def previousIndex(self, attrs):
-        """ returns the index of certain set of attributes (of a link) in the
-            self.a list
-
-            If the set of attributes is not found, returns None
-        """
-        if not attrs.has_key('href'): return None
-
-        i = -1
-        for a in self.a:
-            i += 1
-            match = 0
-
-            if a.has_key('href') and a['href'] == attrs['href']:
-                if a.has_key('title') or attrs.has_key('title'):
-                    if (a.has_key('title') and attrs.has_key('title') and
-                        a['title'] == attrs['title']):
-                        match = True
-                else:
-                    match = True
-
-            if match: return i
-
     def handle_tag(self, tag, attrs, start):
         attrs = fixattrs(attrs)

@@ -268,34 +248,23 @@ class _html2text(sgmllib.SGMLParser):
             if self.astack:
                 a = self.astack.pop()
                 if a:
-                    i = self.previousIndex(a)
-                    if i is not None:
-                        a = self.a[i]
-                    else:
-                        self.acount += 1
-                        a['count'] = self.acount
-                        a['outcount'] = self.outcount
-                        self.a.append(a)
-                    self.o("][" + `a['count']` + "]")
+                    title = ''
+                    if a.has_key('title'):
+                        title = ' "%s"' % a['title']
+                    self.o('](%s%s)' % (a['href'], title))
 
         if tag == "img" and start:
             attrsD = {}
             for (x, y) in attrs: attrsD[x] = y
             attrs = attrsD
             if attrs.has_key('src'):
-                attrs['href'] = attrs['src']
                 alt = attrs.get('alt', '')
-                i = self.previousIndex(attrs)
-                if i is not None:
-                    attrs = self.a[i]
-                else:
-                    self.acount += 1
-                    attrs['count'] = self.acount
-                    attrs['outcount'] = self.outcount
-                    self.a.append(attrs)
                 self.o("![")
                 self.o(alt)
-                self.o("]["+`attrs['count']`+"]")
+                title = ''
+                if attrs.has_key('title'):
+                    title = ' "%s"' % attrs['title']
+                self.o('](%s%s)' % (attrs['src'], title))
 
         if tag == 'dl' and start: self.p()
         if tag == 'dt' and not start: self.pbr()

@@ -373,7 +342,6 @@ class _html2text(sgmllib.SGMLParser):
                 self.out("\n")
                 self.space = 0
-
             if self.p_p:
                 self.out(('\n'+bq)*self.p_p)
                 self.space = 0

@@ -382,22 +350,6 @@ class _html2text(sgmllib.SGMLParser):
                 if not self.lastWasNL: self.out(' ')
                 self.space = 0
 
-            if self.a and ((self.p_p == 2 and LINKS_EACH_PARAGRAPH) or force == "end"):
-                if force == "end": self.out("\n")
-
-                newa = []
-                for link in self.a:
-                    if self.outcount > link['outcount']:
-                        self.out(" ["+`link['count']`+"]: " + urlparse.urljoin(self.baseurl, link['href']))
-                        if link.has_key('title'): self.out(" ("+link['title']+")")
-                        self.out("\n")
-                    else:
-                        newa.append(link)
-
-                if self.a != newa: self.out("\n") # Don't need an extra line when nothing was done.
-
-                self.a = newa
-
             if self.abbr_list and force == "end":
                 for abbr, definition in self.abbr_list.items():
                     self.out(" *[" + abbr + "]: " + definition + "\n")
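With previousIndex() and the self.a / acount bookkeeping removed, links and images are emitted inline at the point where the tag closes, instead of as numbered references flushed at paragraph ends. A sketch of the output format now produced; the helper names are invented, and the parser itself builds the same strings through self.o():

def inline_link(text, href, title=None):
    suffix = ' "%s"' % title if title else ''
    return '[%s](%s%s)' % (text, href, suffix)

def inline_image(alt, src, title=None):
    suffix = ' "%s"' % title if title else ''
    return '![%s](%s%s)' % (alt, src, suffix)

print inline_link('calibre', 'http://calibre-ebook.com', 'calibre home')
# [calibre](http://calibre-ebook.com "calibre home")
print inline_image('logo', 'logo.png')
# ![logo](logo.png)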