Merge from trunk

This commit is contained in:
Charles Haley 2011-09-30 09:44:34 +02:00
commit 3700d81d4a
14 changed files with 322 additions and 66 deletions

View File

@ -18,6 +18,8 @@ class TheAmericanSpectator(BasicNewsRecipe):
use_embedded_content = False
language = 'en'
INDEX = 'http://spectator.org'
auto_cleanup = True
encoding = 'utf-8'
conversion_options = {
'comments' : description
@ -26,17 +28,6 @@ class TheAmericanSpectator(BasicNewsRecipe):
,'publisher' : publisher
}
keep_only_tags = [
dict(name='div', attrs={'class':'post inner'})
,dict(name='div', attrs={'class':'author-bio'})
]
remove_tags = [
dict(name='object')
,dict(name='div', attrs={'class':['col3','post-options','social']})
,dict(name='p' , attrs={'class':['letter-editor','meta']})
]
feeds = [ (u'Articles', u'http://feeds.feedburner.com/amspecarticles')]
def get_cover_url(self):

View File

@ -0,0 +1,11 @@
from calibre.web.feeds.news import BasicNewsRecipe
class AdvancedUserRecipe1317341449(BasicNewsRecipe):
title = u'Diario La Republica'
__author__ = 'CAVALENCIA'
oldest_article = 7
max_articles_per_feed = 100
auto_cleanup = True
language = 'es_CO'
feeds = [(u'Diario La Republica', u'http://www.larepublica.com.co/rss/larepublica.xml')]

View File

@ -2,12 +2,10 @@
from calibre.web.feeds.news import BasicNewsRecipe
class AdvancedUserRecipe1311790237(BasicNewsRecipe):
title = u'Periódico El Colombiano'
language = 'es_CO'
__author__ = 'BIGO-CAVA'
language = 'es_CO'
cover_url = 'http://www.elcolombiano.com/images/logoElColombiano348x46.gif'
remove_tags_before = dict(id='contenidoArt')
remove_tags_after = dict(id='enviaTips')

View File

@ -0,0 +1,54 @@
# coding=utf-8
from calibre.web.feeds.news import BasicNewsRecipe
class ColombiaElEspectador(BasicNewsRecipe):
title = u'Periódico el Espectador'
__author__ = 'BIGO-CAVA'
cover_url = 'http://www.elespectador.com/sites/elespectador.com/themes/elespectador/images/logo.gif'
#remove_tags_before = dict(id='fb-root')
remove_tags_before = dict(id='content')
remove_tags_after = [dict(name='div', attrs={'class':'paginacion'})]
language = 'es_CO'
#keep_only_tags = [dict(name='div', id='content')]
remove_tags = [dict(name='div', attrs={'class':'herramientas_nota'}),
dict(name='div', attrs={'class':'relpauta'}),
dict(name='div', attrs={'class':'recursosrelacionados'}),
dict(name='div', attrs={'class':'nav_negocios'})]
# dict(name='div', attrs={'class':'tags_playerrecurso'}),
# dict(name='div', attrs={'class':'ico-mail2'}),
# dict(name='div', attrs={'id':'caja-instapaper'}),
# dict(name='div', attrs={'class':'modulo herramientas'})]
oldest_article = 2
max_articles_per_feed = 100
remove_javascript = True
no_stylesheets = True
use_embedded_content = False
remove_empty_feeds = True
masthead_url = 'http://www.elespectador.com/sites/elespectador.com/themes/elespectador/images/logo.gif'
publication_type = 'newspaper'
extra_css = """
p{text-align: justify; font-size: 100%}
body{ text-align: left; font-size:100% }
h1{font-family: sans-serif; font-size:150%; font-weight:bold; text-align: justify; }
h3{font-family: sans-serif; font-size:100%; font-style: italic; text-align: justify; }
"""
feeds = [(u'Política ', u' http://www.elespectador.com/noticias/politica/feed'),
(u'Judicial', u'http://www.elespectador.com/noticias/judicial/feed'),
(u'Paz', u'http://www.elespectador.com/noticias/paz/feed'),
(u'Economía', u'http://www.elespectador.com/economia/feed'),
(u'Soy Periodista', u'http://www.elespectador.com/noticias/soyperiodista/feed'),
(u'Investigación', u'http://www.elespectador.com/noticias/investigacion/feed'),
(u'Educación', u'http://www.elespectador.com/noticias/educacion/feed'),
(u'Salud', u'http://www.elespectador.com/noticias/salud/feed'),
(u'El Mundo', u'http://www.elespectador.com/noticias/elmundo/feed'),
(u'Nacional', u'http://www.elespectador.com/noticias/nacional/feed'),
(u'Bogotá', u'http://www.elespectador.com/noticias/bogota/feed'),
(u'Deportes', u'http://www.elespectador.com/deportes/feed'),
(u'Tecnología', u'http://www.elespectador.com/tecnologia/feed'),
(u'Actualidad', u'http://www.elespectador.com/noticias/actualidad/feed'),
(u'Opinión', u'http://www.elespectador.com/opinion/feed'),
(u'Editorial', u'http://www.elespectador.com/opinion/editorial/feed')]

View File

@ -0,0 +1,50 @@
from calibre.web.feeds.news import BasicNewsRecipe
class ColombiaElMundo02(BasicNewsRecipe):
title = u'Periódico El Mundo'
__author__ = 'BIGO-CAVA'
language = 'es_CO'
cover_url = 'http://www.elmundo.com/portal/img/logo_mundo2.png'
remove_tags_before = dict(id='miga_pan')
#remove_tags_before = [dict(name='div', attrs={'class':'contenido'})]
remove_tags_after = [dict(name='div', attrs={'class':'cuadro_opciones_new1'})]
#keep_only_tags = [dict(name='div', id='miga_pan')]
remove_tags = [dict(name='div', attrs={'class':'ruta'}),
dict(name='div', attrs={'class':'buscador'}),
dict(name='div', attrs={'class':'iconos'}),
dict(name='div', attrs={'class':'otros_iconos'}),
dict(name='div', attrs={'class':'cuadro_opciones_new1'}),
dict(name='div', attrs={'class':'otras_noticias'}),
dict(name='div', attrs={'class':'notas_relacionadas'}),
dict(name='div', attrs={'id':'lateral_2'})]
oldest_article = 2
max_articles_per_feed = 100
remove_javascript = True
no_stylesheets = True
use_embedded_content = False
remove_empty_feeds = True
masthead_url = 'http://www.elmundo.com/portal/img/logo_mundo2.png'
publication_type = 'newspaper'
extra_css = """
p{text-align: justify; font-size: 100%}
body{ text-align: left; font-size:100% }
h1{font-family: sans-serif; font-size:150%; font-weight:bold; text-align: justify; }
h3{font-family: sans-serif; font-size:100%; font-style: italic; text-align: justify; }
"""
feeds = [(u'Opinión', u'http://www.elmundo.com/images/rss/opinion.xml'),
(u'Economía', u'http://www.elmundo.com/images/rss/noticias_economia.xml'),
(u'Deportes', u'http://www.elmundo.com/images/rss/deportes.xml'),
(u'Política ', u'http://www.elmundo.com/images/rss/noticias_politica.xml'),
(u'Antioquia', u'http://www.elmundo.com/images/rss/noticias_antioquia.xml'),
(u'Nacional ', u'http://www.elmundo.com/images/rss/noticias_nacional.xml'),
(u'Internacional', u'http://www.elmundo.com/images/rss/noticias_internacional.xml'),
(u'Servicios Públicos', u'http://www.elmundo.com/images/rss/noticias_servicios_publicos.xml'),
(u'Infraestructura', u'http://www.elmundo.com/images/rss/noticias_infraestructura.xml'),
(u'Mobilidad', u'http://www.elmundo.com/images/rss/noticias_movilidad.xml'),
(u'Derechos Humanos', u'http://www.elmundo.com/images/rss/noticias_derechos_humanos.xml'),
(u'Vida', u'http://www.elmundo.com/images/rss/vida.xml'),
(u'Cultura', u'http://www.elmundo.com/images/rss/cultura.xml')]

View File

@ -2,18 +2,17 @@
from calibre.web.feeds.news import BasicNewsRecipe
class ColombiaElTiempo02(BasicNewsRecipe):
title = u'Periódico el Tiempo'
language = 'es_CO'
__author__ = 'BIGO-CAVA'
language = 'es_CO'
cover_url = 'http://www.eltiempo.com/media/css/images/logo_footer.png'
remove_tags_before = dict(id='fb-root')
#remove_tags_before = dict(id='fb-root')
remove_tags_before = dict(id='contenidoArt')
remove_tags_after = [dict(name='div', attrs={'class':'modulo reporte'})]
keep_only_tags = [dict(name='div', id='contenidoArt')]
remove_tags = [dict(name='div', attrs={'class':'social-media'}),
dict(name='div', attrs={'class':'recomend-art'}),
dict(name='div', attrs={'class':'caja-facebook'}),
dict(name='div', attrs={'class':'caja-twitter'}),
dict(name='div', attrs={'class':'caja-buzz'}),

View File

@ -0,0 +1,112 @@
# -*- coding: utf-8 -*-
#!/usr/bin/env python
__license__ = 'GPL v3'
__copyright__ = '2011, Piotr Kontek, piotr.kontek@gmail.com'
from calibre.web.feeds.news import BasicNewsRecipe
from calibre.ptempfile import PersistentTemporaryFile
import re
class GN(BasicNewsRecipe):
EDITION = 0
__author__ = 'Piotr Kontek'
title = u'Gość niedzielny'
description = 'Weekly magazine'
encoding = 'utf-8'
no_stylesheets = True
language = 'pl'
remove_javascript = True
temp_files = []
articles_are_obfuscated = True
def get_obfuscated_article(self, url):
br = self.get_browser()
br.open(url)
source = br.response().read()
page = self.index_to_soup(source)
main_section = page.find('div',attrs={'class':'txt doc_prnt_prv'})
title = main_section.find('h2')
info = main_section.find('div', attrs={'class' : 'cf doc_info'})
authors = info.find(attrs={'class':'l'})
article = str(main_section.find('p', attrs={'class' : 'doc_lead'}))
first = True
for p in main_section.findAll('p', attrs={'class':None}, recursive=False):
if first and p.find('img') != None:
article = article + '<p>'
article = article + str(p.find('img')).replace('src="/files/','src="http://www.gosc.pl/files/')
article = article + '<font size="-2">'
for s in p.findAll('span'):
article = article + self.tag_to_string(s)
article = article + '</font></p>'
else:
article = article + str(p).replace('src="/files/','src="http://www.gosc.pl/files/')
first = False
html = unicode(title) + unicode(authors) + unicode(article)
self.temp_files.append(PersistentTemporaryFile('_temparse.html'))
self.temp_files[-1].write(html)
self.temp_files[-1].close()
return self.temp_files[-1].name
def find_last_issue(self):
soup = self.index_to_soup('http://gosc.pl/wyszukaj/wydania/3.Gosc-Niedzielny')
#szukam zdjęcia i linka do porzedniego pełnego numeru
first = True
for d in soup.findAll('div', attrs={'class':'l release_preview_l'}):
img = d.find('img')
if img != None:
a = img.parent
self.EDITION = a['href']
self.title = img['alt']
self.cover_url = 'http://www.gosc.pl' + img['src']
if not first:
break
first = False
def parse_index(self):
self.find_last_issue()
soup = self.index_to_soup('http://www.gosc.pl' + self.EDITION)
feeds = []
#wstepniak
a = soup.find('div',attrs={'class':'release-wp-b'}).find('a')
articles = [
{'title' : self.tag_to_string(a),
'url' : 'http://www.gosc.pl' + a['href'].replace('/doc/','/doc_pr/'),
'date' : '',
'description' : ''}
]
feeds.append((u'Wstępniak',articles))
#kategorie
for addr in soup.findAll('a',attrs={'href':re.compile('kategoria')}):
if addr.string != u'wszystkie artyku\u0142y z tej kategorii \xbb':
main_block = self.index_to_soup('http://www.gosc.pl' + addr['href'])
articles = list(self.find_articles(main_block))
if len(articles) > 0:
section = addr.string
feeds.append((section, articles))
return feeds
def find_articles(self, main_block):
for a in main_block.findAll('div', attrs={'class':'prev_doc2'}):
art = a.find('a')
yield {
'title' : self.tag_to_string(art),
'url' : 'http://www.gosc.pl' + art['href'].replace('/doc/','/doc_pr/'),
'date' : '',
'description' : ''
}
for a in main_block.findAll('div', attrs={'class':'sr-document'}):
art = a.find('a')
yield {
'title' : self.tag_to_string(art),
'url' : 'http://www.gosc.pl' + art['href'].replace('/doc/','/doc_pr/'),
'date' : '',
'description' : ''
}

View File

@ -4,13 +4,13 @@ from calibre.web.feeds.news import BasicNewsRecipe
class AdvancedUserRecipe1311799898(BasicNewsRecipe):
title = u'Periódico Portafolio Colombia'
language = 'es_CO'
__author__ = 'BIGO-CAVA'
language = 'es_CO'
cover_url = 'http://www.portafolio.co/sites/portafolio.co/themes/portafolio_2011/logo.png'
remove_tags_before = dict(id='contenidoArt')
remove_tags_after = [dict(name='div', attrs={'class':'articulo-mas'})]
keep_only_tags = [dict(name='div', id='contenidoArt')]
oldest_article = 1
oldest_article = 2
max_articles_per_feed = 100
remove_javascript = True
no_stylesheets = True

View File

@ -1,5 +1,8 @@
__license__ = 'GPL v3'
__copyright__ = '2010, Darko Miletic <darko.miletic at gmail.com>'
__copyright__ = '''
2010, Darko Miletic <darko.miletic at gmail.com>
2011, Przemyslaw Kryger <pkryger at gmail.com>
'''
'''
readitlaterlist.com
'''
@ -9,7 +12,7 @@ from calibre.web.feeds.news import BasicNewsRecipe
class Readitlater(BasicNewsRecipe):
title = 'Read It Later'
__author__ = 'Darko Miletic'
__author__ = 'Darko Miletic, Przemyslaw Kryger'
description = '''Personalized news feeds. Go to readitlaterlist.com to
setup up your news. Fill in your account
username, and optionally you can add password.'''
@ -23,9 +26,6 @@ class Readitlater(BasicNewsRecipe):
INDEX = u'http://readitlaterlist.com'
LOGIN = INDEX + u'/l'
feeds = [(u'Unread articles' , INDEX + u'/unread')]
def get_browser(self):
br = BasicNewsRecipe.get_browser()
if self.username is not None:
@ -37,12 +37,31 @@ class Readitlater(BasicNewsRecipe):
br.submit()
return br
def get_feeds(self):
self.report_progress(0, ('Fetching list of feeds...'))
lfeeds = []
i = 1
feedurl = self.INDEX + u'/unread/1'
while True:
title = u'Unread articles, page ' + str(i)
lfeeds.append((title, feedurl))
self.report_progress(0, ('Got ') + str(i) + (' feeds'))
i += 1
soup = self.index_to_soup(feedurl)
ritem = soup.find('a',attrs={'id':'next', 'class':'active'})
if ritem is None:
break
feedurl = self.INDEX + ritem['href']
if self.test:
return lfeeds[:2]
return lfeeds
def parse_index(self):
totalfeeds = []
lfeeds = self.get_feeds()
for feedobj in lfeeds:
feedtitle, feedurl = feedobj
self.report_progress(0, _('Fetching feed')+' %s...'%(feedtitle if feedtitle else feedurl))
self.report_progress(0, ('Fetching feed')+' %s...'%(feedtitle if feedtitle else feedurl))
articles = []
soup = self.index_to_soup(feedurl)
ritem = soup.find('ul',attrs={'id':'list'})

View File

@ -0,0 +1,11 @@
from calibre.web.feeds.news import BasicNewsRecipe
class AdvancedUserRecipe1317341570(BasicNewsRecipe):
title = u'Revista Semana'
__author__ = 'BIGO-CAVA'
language = 'es_CO'
oldest_article = 7
max_articles_per_feed = 100
feeds = [(u'Revista Semana', u'http://www.semana.com/rss/Semana_OnLine.xml')]

View File

@ -35,14 +35,14 @@ class Smh_au(BasicNewsRecipe):
, 'language' : language
}
remove_tags = [
dict(name='div', attrs={'id':['googleAds','moreGoogleAds','comments']})
,dict(name='div', attrs={'class':'cT-imageMultimedia'})
,dict(name=['object','embed','iframe'])
]
remove_tags_after = [dict(name='div',attrs={'class':'articleBody'})]
keep_only_tags = [dict(name='div',attrs={'id':'content'})]
remove_tags = [
dict(name='div',
attrs={'id':['googleAds','moreGoogleAds','comments',
'video-player-content']}),
dict(name='div', attrs={'class':'cT-imageMultimedia'}),
dict(name=['object','embed','iframe']),
dict(attrs={'class':'hidden'}),
dict(name=['link','meta','base','embed','object','iframe'])
]

View File

@ -77,8 +77,12 @@ class ANDROID(USBMS):
0xdeed : [0x0222],
},
# Viewsonic
0x0489 : { 0xc001 : [0x0226], 0xc004 : [0x0226], },
# Viewsonic/Vizio
0x0489 : {
0xc001 : [0x0226],
0xc004 : [0x0226],
0x8801 : [0x0226, 0x0227],
},
# Acer
0x502 : { 0x3203 : [0x0100, 0x224]},
@ -134,7 +138,7 @@ class ANDROID(USBMS):
VENDOR_NAME = ['HTC', 'MOTOROLA', 'GOOGLE_', 'ANDROID', 'ACER',
'GT-I5700', 'SAMSUNG', 'DELL', 'LINUX', 'GOOGLE', 'ARCHOS',
'TELECHIP', 'HUAWEI', 'T-MOBILE', 'SEMC', 'LGE', 'NVIDIA',
'GENERIC-', 'ZTE', 'MID', 'QUALCOMM', 'PANDIGIT']
'GENERIC-', 'ZTE', 'MID', 'QUALCOMM', 'PANDIGIT', 'HYSTON', 'VIZIO']
WINDOWS_MAIN_MEM = ['ANDROID_PHONE', 'A855', 'A853', 'INC.NEXUS_ONE',
'__UMS_COMPOSITE', '_MB200', 'MASS_STORAGE', '_-_CARD', 'SGH-I897',
'GT-I9000', 'FILE-STOR_GADGET', 'SGH-T959', 'SAMSUNG_ANDROID',
@ -144,11 +148,12 @@ class ANDROID(USBMS):
'7', 'A956', 'A955', 'A43', 'ANDROID_PLATFORM', 'TEGRA_2',
'MB860', 'MULTI-CARD', 'MID7015A', 'INCREDIBLE', 'A7EB', 'STREAK',
'MB525', 'ANDROID2.3', 'SGH-I997', 'GT-I5800_CARD', 'MB612',
'GT-S5830_CARD', 'GT-S5570_CARD', 'MB870', 'MID7015A', 'ALPANDIGITAL']
'GT-S5830_CARD', 'GT-S5570_CARD', 'MB870', 'MID7015A',
'ALPANDIGITAL', 'ANDROID_MID', 'VTAB1008']
WINDOWS_CARD_A_MEM = ['ANDROID_PHONE', 'GT-I9000_CARD', 'SGH-I897',
'FILE-STOR_GADGET', 'SGH-T959', 'SAMSUNG_ANDROID', 'GT-P1000_CARD',
'A70S', 'A101IT', '7', 'INCREDIBLE', 'A7EB', 'SGH-T849_CARD',
'__UMS_COMPOSITE', 'SGH-I997_CARD', 'MB870', 'ALPANDIGITAL']
'__UMS_COMPOSITE', 'SGH-I997_CARD', 'MB870', 'ALPANDIGITAL', 'ANDROID_MID']
OSX_MAIN_MEM = 'Android Device Main Memory'

View File

@ -452,24 +452,26 @@ class BlockAttr(StyleObject, LRFObject):
@classmethod
def to_css(cls, obj, inline=False):
ans = ''
def item(line):
ans += '' if inline else '\t'
ans = '' if inline else '\t'
ans += line
ans += ' ' if inline else '\n'
return ans
if hasattr(obj, 'sidemargin'):
margin = str(obj.sidemargin) + 'px'
item('margin-left: %(m)s; margin-right: %(m)s;'%dict(m=margin))
ans += item('margin-left: %(m)s; margin-right: %(m)s;'%dict(m=margin))
if hasattr(obj, 'topskip'):
item('margin-top: %dpx;'%obj.topskip)
ans += item('margin-top: %dpx;'%obj.topskip)
if hasattr(obj, 'footskip'):
item('margin-bottom: %dpx;'%obj.footskip)
ans += item('margin-bottom: %dpx;'%obj.footskip)
if hasattr(obj, 'framewidth'):
item('border: solid %dpx'%obj.framewidth)
ans += item('border: solid %dpx'%obj.framewidth)
if hasattr(obj, 'framecolor') and obj.framecolor.a < 255:
item('border-color: %s;'%obj.framecolor.to_html())
ans += item('border-color: %s;'%obj.framecolor.to_html())
if hasattr(obj, 'bgcolor') and obj.bgcolor.a < 255:
item('background-color: %s;'%obj.bgcolor.to_html())
ans += item('background-color: %s;'%obj.bgcolor.to_html())
return ans
@ -480,39 +482,41 @@ class TextCSS(object):
@classmethod
def to_css(cls, obj, inline=False):
ans = ''
def item(line):
ans += '' if inline else '\t'
ans = '' if inline else '\t'
ans += line
ans += ' ' if inline else '\n'
return ans
fs = getattr(obj, 'fontsize', None)
if fs is not None:
item('font-size: %fpt;'%(int(fs)/10.))
ans += item('font-size: %fpt;'%(int(fs)/10.))
fw = getattr(obj, 'fontweight', None)
if fw is not None:
item('font-weight: %s;'%('bold' if int(fw) >= 700 else 'normal'))
ans += item('font-weight: %s;'%('bold' if int(fw) >= 700 else 'normal'))
fn = getattr(obj, 'fontfacename', None)
if fn is not None:
fn = cls.FONT_MAP[fn]
item('font-family: %s;'%fn)
ans += item('font-family: %s;'%fn)
fg = getattr(obj, 'textcolor', None)
if fg is not None:
fg = fg.to_html()
item('color: %s;'%fg)
ans += item('color: %s;'%fg)
bg = getattr(obj, 'textbgcolor', None)
if bg is not None:
bg = bg.to_html()
item('background-color: %s;'%bg)
ans += item('background-color: %s;'%bg)
al = getattr(obj, 'align', None)
if al is not None:
al = dict(head='left', center='center', foot='right')
item('text-align: %s;'%al)
ans += item('text-align: %s;'%al)
lh = getattr(obj, 'linespace', None)
if lh is not None:
item('text-align: %fpt;'%(int(lh)/10.))
ans += item('text-align: %fpt;'%(int(lh)/10.))
pi = getattr(obj, 'parindent', None)
if pi is not None:
item('text-indent: %fpt;'%(int(pi)/10.))
ans += item('text-indent: %fpt;'%(int(pi)/10.))
return ans

View File

@ -85,6 +85,8 @@ def render_data(mi, use_roman_numbers=True, all_fields=False):
for field, display in get_field_list(fm):
metadata = fm.get(field, None)
if field == 'sort':
field = 'title_sort'
if all_fields:
display = True
if (not display or not metadata or mi.is_null(field) or