Merge from trunk

Charles Haley 2011-03-07 18:58:28 +00:00
commit 568ac2a2d1
15 changed files with 383 additions and 100 deletions

Binary file added (not shown), size 521 B

Binary file added (not shown), size 262 B

Binary file added (not shown), size 375 B

Binary file added (not shown), size 768 B
View File

@@ -0,0 +1,49 @@
from calibre.web.feeds.news import BasicNewsRecipe

class ElPaisBabelia(BasicNewsRecipe):
    title = 'El Pais Babelia'
    __author__ = 'oneillpt'
    description = 'El Pais Babelia'
    INDEX = 'http://www.elpais.com/suple/babelia/'
    language = 'es'

    remove_tags_before = dict(name='div', attrs={'class':'estructura_2col'})
    keep_tags = [dict(name='div', attrs={'class':'estructura_2col'})]
    remove_tags = [dict(name='div', attrs={'class':'votos estirar'}),
        dict(name='div', attrs={'id':'utilidades'}),
        dict(name='div', attrs={'class':'info_relacionada'}),
        dict(name='div', attrs={'class':'mod_apoyo'}),
        dict(name='div', attrs={'class':'contorno_f'}),
        dict(name='div', attrs={'class':'pestanias'}),
        dict(name='div', attrs={'class':'otros_webs'}),
        dict(name='div', attrs={'id':'pie'})
        ]
    #no_stylesheets = True
    remove_javascript = True

    def parse_index(self):
        articles = []
        soup = self.index_to_soup(self.INDEX)
        feeds = []
        for section in soup.findAll('div', attrs={'class':'contenedor_nuevo'}):
            section_title = self.tag_to_string(section.find('h1'))
            articles = []
            for post in section.findAll('a', href=True):
                url = post['href']
                if url.startswith('/'):
                    url = 'http://www.elpais.es'+url
                title = self.tag_to_string(post)
                if str(post).find('class=') > 0:
                    klass = post['class']
                    if klass != "":
                        self.log()
                        self.log('--> post: ', post)
                        self.log('--> url: ', url)
                        self.log('--> title: ', title)
                        self.log('--> class: ', klass)
                        articles.append({'title':title, 'url':url})
            if articles:
                feeds.append((section_title, articles))
        return feeds

View File

@@ -1,52 +1,54 @@
 # -*- coding: utf-8 -*-
 #!/usr/bin/env python
 __license__ = 'GPL v3'
-__copyright__ = '2010, Darko Miletic <darko.miletic at gmail.com>'
+__copyright__ = u'2011, Silviu Cotoar\u0103'
 '''
 evz.ro
 '''
-import re
 from calibre.web.feeds.news import BasicNewsRecipe
-class EVZ_Ro(BasicNewsRecipe):
-    title = 'evz.ro'
-    __author__ = 'Darko Miletic'
-    description = 'News from Romania'
-    publisher = 'evz.ro'
-    category = 'news, politics, Romania'
-    oldest_article = 2
-    max_articles_per_feed = 200
-    no_stylesheets = True
-    encoding = 'utf8'
-    use_embedded_content = False
+class EvenimentulZilei(BasicNewsRecipe):
+    title = u'Evenimentul Zilei'
+    __author__ = u'Silviu Cotoar\u0103'
+    description = ''
+    publisher = u'Evenimentul Zilei'
+    oldest_article = 5
     language = 'ro'
-    masthead_url = 'http://www.evz.ro/fileadmin/images/logo.gif'
-    extra_css = ' body{font-family: Georgia,Arial,Helvetica,sans-serif } .firstP{font-size: 1.125em} .author,.articleInfo{font-size: small} '
+    max_articles_per_feed = 100
+    no_stylesheets = True
+    use_embedded_content = False
+    category = 'Ziare,Stiri'
+    encoding = 'utf-8'
+    cover_url = 'http://www.evz.ro/fileadmin/images/evzLogo.png'
     conversion_options = {
-        'comment' : description
-        , 'tags' : category
-        , 'publisher' : publisher
-        , 'language' : language
-    }
+        'comments' : description
+        ,'tags' : category
+        ,'language' : language
+        ,'publisher' : publisher
+    }
-    preprocess_regexps = [
-        (re.compile(r'<head>.*?<title>', re.DOTALL|re.IGNORECASE),lambda match: '<head><title>')
-        ,(re.compile(r'</title>.*?</head>', re.DOTALL|re.IGNORECASE),lambda match: '</title></head>')
-    ]
+    keep_only_tags = [
+        dict(name='div', attrs={'class':'single'})
+        , dict(name='img', attrs={'id':'placeholder'})
+        , dict(name='a', attrs={'id':'holderlink'})
+    ]
-    remove_tags = [
-        dict(name=['form','embed','iframe','object','base','link','script','noscript'])
-        ,dict(attrs={'class':['section','statsInfo','email il']})
-        ,dict(attrs={'id' :'gallery'})
-    ]
+    remove_tags = [
+        dict(name='p', attrs={'class':['articleInfo']})
+        , dict(name='div', attrs={'id':['bannerAddoceansArticleJos']})
+        , dict(name='div', attrs={'id':['bannerAddoceansArticle']})
+    ]
-    remove_tags_after = dict(attrs={'class':'section'})
-    keep_only_tags = [dict(attrs={'class':'single'})]
-    remove_attributes = ['height','width']
+    remove_tags_after = [
+        dict(name='div', attrs={'id':['bannerAddoceansArticleJos']})
+    ]
-    feeds = [(u'Articles', u'http://www.evz.ro/rss.xml')]
+    feeds = [
+        (u'Feeds', u'http://www.evz.ro/rss.xml')
+    ]
     def preprocess_html(self, soup):
-        for item in soup.findAll(style=True):
-            del item['style']
-        return soup
+        return self.adeify_images(soup)

View File

@@ -0,0 +1,43 @@
# -*- coding: utf-8 -*-
#!/usr/bin/env python
__license__ = 'GPL v3'
__copyright__ = u'2011, Silviu Cotoar\u0103'
'''
hit.ro
'''
from calibre.web.feeds.news import BasicNewsRecipe

class Hit(BasicNewsRecipe):
    title = u'HIT'
    __author__ = u'Silviu Cotoar\u0103'
    description = 'IT'
    publisher = 'HIT'
    oldest_article = 5
    language = 'ro'
    max_articles_per_feed = 100
    no_stylesheets = True
    use_embedded_content = False
    category = 'Ziare,Reviste,IT'
    encoding = 'utf-8'
    cover_url = 'http://www.hit.ro/lib/images/frontend/hit_logo.png'

    conversion_options = {
        'comments' : description
        ,'tags' : category
        ,'language' : language
        ,'publisher' : publisher
    }

    keep_only_tags = [
        dict(name='h1', attrs={'class':'art_titl'})
        , dict(name='div', attrs={'id':'continut_articol'})
    ]

    feeds = [
        (u'Feeds', u'http://www.hit.ro/rss')
    ]

    def preprocess_html(self, soup):
        return self.adeify_images(soup)

View File

@@ -0,0 +1,53 @@
# -*- coding: utf-8 -*-
#!/usr/bin/env python
__license__ = 'GPL v3'
__copyright__ = u'2011, Silviu Cotoar\u0103'
'''
kamikazeonline.ro
'''
from calibre.web.feeds.news import BasicNewsRecipe

class Kamikaze(BasicNewsRecipe):
    title = u'Kamikaze'
    __author__ = u'Silviu Cotoar\u0103'
    description = u'S\u0103pt\u0103m\u00e2nal sc\u0103pat de sub control'
    publisher = 'Kamikaze'
    oldest_article = 5
    language = 'ro'
    max_articles_per_feed = 100
    no_stylesheets = True
    use_embedded_content = False
    category = 'Ziare,Reviste'
    encoding = 'utf-8'
    cover_url = 'http://www.kamikazeonline.ro/wp-content/themes/kamikaze/images/kamikazeonline_header.gif'

    conversion_options = {
        'comments' : description
        ,'tags' : category
        ,'language' : language
        ,'publisher' : publisher
    }

    keep_only_tags = [
        dict(name='div', attrs={'id':'content'})
    ]

    remove_tags = [
        dict(name='div', attrs={'class':['connect_confirmation_cell connect_confirmation_cell_no_like']})
        , dict(name='h3', attrs={'id':['comments']})
        , dict(name='ul', attrs={'class':['addtoany_list']})
        , dict(name='p', attrs={'class':['postmetadata']})
    ]

    remove_tags_after = [
        dict(name='p', attrs={'class':['postmetadata']})
    ]

    feeds = [
        (u'Feeds', u'http://www.kamikazeonline.ro/feed/')
    ]

    def preprocess_html(self, soup):
        return self.adeify_images(soup)

View File

@@ -1,36 +1,37 @@
-#!/usr/bin/python
-# -*- coding: utf-8 -*-
-__license__ = 'GPL v3'
-__copyright__ = '2010, Vadim Dyadkin, dyadkin@gmail.com'
-__author__ = 'Vadim Dyadkin'
-from calibre.web.feeds.news import BasicNewsRecipe
-class Computerra(BasicNewsRecipe):
-    title = u'\u041a\u043e\u043c\u043f\u044c\u044e\u0442\u0435\u0440\u0440\u0430'
-    recursion = 50
-    oldest_article = 100
-    __author__ = 'Vadim Dyadkin'
-    max_articles_per_feed = 100
-    use_embedded_content = False
-    simultaneous_downloads = 5
-    language = 'ru'
-    description = u'\u041a\u043e\u043c\u043f\u044c\u044e\u0442\u0435\u0440\u044b, \u043e\u043a\u043e\u043b\u043e\u043d\u0430\u0443\u0447\u043d\u044b\u0435 \u0438 \u043e\u043a\u043e\u043b\u043e\u0444\u0438\u043b\u043e\u0441\u043e\u0444\u0441\u043a\u0438\u0435 \u0441\u0442\u0430\u0442\u044c\u0438, \u0433\u0430\u0434\u0436\u0435\u0442\u044b.'
-    keep_only_tags = [dict(name='div', attrs={'id': 'content'}),]
-    feeds = [(u'\u041a\u043e\u043c\u043f\u044c\u044e\u0442\u0435\u0440\u0440\u0430', 'http://feeds.feedburner.com/ct_news/'),]
-    remove_tags = [dict(name='div', attrs={'id': ['fin', 'idc-container', 'idc-noscript',]}),
-        dict(name='ul', attrs={'class': "related_post"}),
-        dict(name='p', attrs={'class': 'info'}),
-        dict(name='a', attrs={'rel': 'tag', 'class': 'twitter-share-button', 'type': 'button_count'}),
-        dict(name='h2', attrs={}),]
-    extra_css = 'body { text-align: justify; }'
-    def get_article_url(self, article):
-        return article.get('feedburner:origLink', article.get('guid'))
+#!/usr/bin/python
+# -*- coding: utf-8 -*-
+__license__ = 'GPL v3'
+__copyright__ = '2010, Vadim Dyadkin, dyadkin@gmail.com'
+__author__ = 'Vadim Dyadkin'
+from calibre.web.feeds.news import BasicNewsRecipe
+class Computerra(BasicNewsRecipe):
+    title = u'\u041a\u043e\u043c\u043f\u044c\u044e\u0442\u0435\u0440\u0440\u0430'
+    oldest_article = 100
+    __author__ = 'Vadim Dyadkin (edited by A. Chewi)'
+    max_articles_per_feed = 50
+    use_embedded_content = False
+    remove_javascript = True
+    no_stylesheets = True
+    conversion_options = {'linearize_tables' : True}
+    simultaneous_downloads = 5
+    language = 'ru'
+    description = u'Компьютерра: все новости про компьютеры, железо, новые технологии, информационные технологии'
+    keep_only_tags = [dict(name='div', attrs={'id': 'content'}),]
+    feeds = [(u'Компьютерра-Онлайн', 'http://feeds.feedburner.com/ct_news/'),]
+    remove_tags = [
+        dict(name='div', attrs={'id': ['fin', 'idc-container', 'idc-noscript',]}),
+        dict(name='ul', attrs={'class': "related_post"}),
+        dict(name='p', attrs={'class': 'info'}),
+        dict(name='a', attrs={'class': 'twitter-share-button'}),
+        dict(name='a', attrs={'type': 'button_count'}),
+        dict(name='h2', attrs={})
+    ]
+    def print_version(self, url):
+        return url + '?print=true'

View File

@@ -14,7 +14,7 @@ class NationalGeoRo(BasicNewsRecipe):
     __author__ = u'Silviu Cotoar\u0103'
     description = u'S\u0103 avem grij\u0103 de planet\u0103'
    publisher = 'National Geographic'
-    oldest_article = 5
+    oldest_article = 35
     language = 'ro'
     max_articles_per_feed = 100
     no_stylesheets = True

View File

@@ -1,14 +1,14 @@
-#!/usr/bin/env python
+#!/usr/bin/env python2
 # -*- coding: utf-8 -*-
-#Based on Lars Jacob's Taz Digiabo recipe
+#Based on veezh's original recipe and Kovid Goyal's New York Times recipe
 __license__ = 'GPL v3'
-__copyright__ = '2010, veezh'
+__copyright__ = '2011, Snaab'
 '''
 www.nrc.nl
 '''
-import os, urllib2, zipfile
+import os, zipfile
 import time
 from calibre.web.feeds.news import BasicNewsRecipe
 from calibre.ptempfile import PersistentTemporaryFile
@@ -17,41 +17,59 @@ from calibre.ptempfile import PersistentTemporaryFile
 class NRCHandelsblad(BasicNewsRecipe):
     title = u'NRC Handelsblad'
-    description = u'De EPUB-versie van NRC'
+    description = u'De ePaper-versie van NRC'
     language = 'nl'
     lang = 'nl-NL'
     needs_subscription = True
-    __author__ = 'veezh'
+    __author__ = 'Snaab'
     conversion_options = {
         'no_default_epub_cover' : True
     }
+    def get_browser(self):
+        br = BasicNewsRecipe.get_browser()
+        if self.username is not None and self.password is not None:
+            br.open('http://login.nrc.nl/login')
+            br.select_form(nr=0)
+            br['username'] = self.username
+            br['password'] = self.password
+            br.submit()
+        return br
     def build_index(self):
         today = time.strftime("%Y%m%d")
         domain = "http://digitaleeditie.nrc.nl"
         url = domain + "/digitaleeditie/helekrant/epub/nrc_" + today + ".epub"
-        # print url
+        #print url
         try:
-            f = urllib2.urlopen(url)
-        except urllib2.HTTPError:
+            br = self.get_browser()
+            f = br.open(url)
+        except:
             self.report_progress(0,_('Kan niet inloggen om editie te downloaden'))
             raise ValueError('Krant van vandaag nog niet beschikbaar')
         tmp = PersistentTemporaryFile(suffix='.epub')
         self.report_progress(0,_('downloading epub'))
         tmp.write(f.read())
-        tmp.close()
-        zfile = zipfile.ZipFile(tmp.name, 'r')
-        self.report_progress(0,_('extracting epub'))
-        zfile.extractall(self.output_dir)
+        f.close()
+        br.close()
+        if zipfile.is_zipfile(tmp):
+            try:
+                zfile = zipfile.ZipFile(tmp.name, 'r')
+                zfile.extractall(self.output_dir)
+                self.report_progress(0,_('extracting epub'))
+            except zipfile.BadZipfile:
+                self.report_progress(0,_('BadZip error, continuing'))
+        tmp.close()
-        index = os.path.join(self.output_dir, 'content.opf')
+        index = os.path.join(self.output_dir, 'metadata.opf')
         self.report_progress(1,_('epub downloaded and extracted'))

View File

@@ -0,0 +1,51 @@
# -*- coding: utf-8 -*-
#!/usr/bin/env python
__license__ = 'GPL v3'
__copyright__ = u'2011, Silviu Cotoar\u0103'
'''
trombon.ro
'''
from calibre.web.feeds.news import BasicNewsRecipe

class Trombon(BasicNewsRecipe):
    title = u'Trombon'
    __author__ = u'Silviu Cotoar\u0103'
    description = u'Parodii si Pamflete'
    publisher = u'Trombon'
    oldest_article = 5
    language = 'ro'
    max_articles_per_feed = 100
    no_stylesheets = True
    use_embedded_content = False
    category = 'Ziare,Reviste,Fun'
    encoding = 'utf-8'
    cover_url = 'http://www.trombon.ro/i/trombon.gif'

    conversion_options = {
        'comments' : description
        ,'tags' : category
        ,'language' : language
        ,'publisher' : publisher
    }

    keep_only_tags = [
        dict(name='div', attrs={'class':'articol'})
    ]

    remove_tags = [
        dict(name='div', attrs={'class':['info_2']})
        , dict(name='iframe', attrs={'scrolling':['no']})
    ]

    remove_tags_after = [
        dict(name='div', attrs={'id':'article_vote'})
    ]

    feeds = [
        (u'Feeds', u'http://feeds.feedburner.com/trombon/ABWb?format=xml')
    ]

    def preprocess_html(self, soup):
        return self.adeify_images(soup)

View File

@@ -0,0 +1,54 @@
# -*- coding: utf-8 -*-
#!/usr/bin/env python
__license__ = 'GPL v3'
__copyright__ = u'2011, Silviu Cotoar\u0103'
'''
wall-street.ro
'''
from calibre.web.feeds.news import BasicNewsRecipe

class WallStreetRo(BasicNewsRecipe):
    title = u'Wall Street'
    __author__ = u'Silviu Cotoar\u0103'
    description = ''
    publisher = 'Wall Street'
    oldest_article = 5
    language = 'ro'
    max_articles_per_feed = 100
    no_stylesheets = True
    use_embedded_content = False
    category = 'Ziare'
    encoding = 'utf-8'
    cover_url = 'http://img.wall-street.ro/images/WS_new_logo.jpg'

    conversion_options = {
        'comments' : description
        ,'tags' : category
        ,'language' : language
        ,'publisher' : publisher
    }

    keep_only_tags = [
        dict(name='div', attrs={'class':'article_header'})
        , dict(name='div', attrs={'class':'article_text'})
    ]

    remove_tags = [
        dict(name='p', attrs={'class':['page_breadcrumbs']})
        , dict(name='div', attrs={'id':['article_user_toolbox']})
        , dict(name='p', attrs={'class':['comments_count_container']})
        , dict(name='div', attrs={'class':['article_left_column']})
    ]

    remove_tags_after = [
        dict(name='div', attrs={'class':'clearfloat'})
    ]

    feeds = [
        (u'Feeds', u'http://img.wall-street.ro/rssfeeds/wall-street.xml')
    ]

    def preprocess_html(self, soup):
        return self.adeify_images(soup)

View File

@@ -131,9 +131,12 @@ class PageProcessor(list): # {{{
                     newsizey = int(newsizex / aspect)
                     deltax = 0
                     deltay = (SCRHEIGHT - newsizey) / 2
-                wand.size = (newsizex, newsizey)
-                wand.set_border_color(pw)
-                wand.add_border(pw, deltax, deltay)
+                if newsizex < 20000 and newsizey < 20000:
+                    # Too large and resizing fails, so better
+                    # to leave it as original size
+                    wand.size = (newsizex, newsizey)
+                    wand.set_border_color(pw)
+                    wand.add_border(pw, deltax, deltay)
             elif self.opts.wide:
                 # Keep aspect and Use device height as scaled image width so landscape mode is clean
                 aspect = float(sizex) / float(sizey)
@@ -152,11 +155,15 @@ class PageProcessor(list): # {{{
                     newsizey = int(newsizex / aspect)
                     deltax = 0
                     deltay = (wscreeny - newsizey) / 2
-                wand.size = (newsizex, newsizey)
-                wand.set_border_color(pw)
-                wand.add_border(pw, deltax, deltay)
+                if newsizex < 20000 and newsizey < 20000:
+                    # Too large and resizing fails, so better
+                    # to leave it as original size
+                    wand.size = (newsizex, newsizey)
+                    wand.set_border_color(pw)
+                    wand.add_border(pw, deltax, deltay)
             else:
-                wand.size = (SCRWIDTH, SCRHEIGHT)
+                if SCRWIDTH < 20000 and SCRHEIGHT < 20000:
+                    wand.size = (SCRWIDTH, SCRHEIGHT)
             if not self.opts.dont_sharpen:
                 wand.sharpen(0.0, 1.0)

View File

@@ -75,15 +75,20 @@ class SNBFile:
                 for i in range(self.plainBlock):
                     bzdc = bz2.BZ2Decompressor()
                     if (i < self.plainBlock - 1):
-                        bSize = self.blocks[self.binBlock + i + 1].Offset - self.blocks[self.binBlock + i].Offset;
+                        bSize = self.blocks[self.binBlock + i + 1].Offset - self.blocks[self.binBlock + i].Offset
                     else:
-                        bSize = self.tailOffset - self.blocks[self.binBlock + i].Offset;
-                    snbFile.seek(self.blocks[self.binBlock + i].Offset);
+                        bSize = self.tailOffset - self.blocks[self.binBlock + i].Offset
+                    snbFile.seek(self.blocks[self.binBlock + i].Offset)
                     try:
                         data = snbFile.read(bSize)
-                        uncompressedData += bzdc.decompress(data)
+                        if len(data) < 32768:
+                            uncompressedData += bzdc.decompress(data)
+                        else:
+                            uncompressedData += data
                     except Exception, e:
                         print e
                 if len(uncompressedData) != self.plainStreamSizeUncompressed:
                     raise Exception()
                 f.fileBody = uncompressedData[plainPos:plainPos+f.fileSize]
                 plainPos += f.fileSize
             elif f.attr & 0x01000000 == 0x01000000: