Merge from trunk

This commit is contained in:
Charles Haley 2013-02-02 08:52:27 +01:00
commit 8839bf5459
10 changed files with 186 additions and 96 deletions

View File

@ -663,7 +663,7 @@ Post any output you see in a help message on the `Forum <http://www.mobileread.c
|app| freezes/crashes occasionally?
~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
There are three possible things I know of, that can cause this:
There are five possible things that I know of that can cause this:
* You recently connected an external monitor or TV to your computer. In
this case, whenever |app| opens a new window like the edit metadata

View File

@ -21,6 +21,10 @@ class AdvancedUserRecipe1287083651(BasicNewsRecipe):
encoding = 'utf8'
publisher = 'Globe & Mail'
language = 'en_CA'
use_embedded_content = False
no_stylesheets = True
auto_cleanup = True
extra_css = 'p.meta {font-size:75%}\n .redtext {color: red;}\n .byline {font-size: 70%}'
feeds = [
@ -44,12 +48,12 @@ class AdvancedUserRecipe1287083651(BasicNewsRecipe):
(re.compile(r'<script.*?</script>', re.DOTALL), lambda m: ''),
]
remove_tags_before = dict(name='h1')
remove_tags = [
dict(name='div', attrs={'id':['ShareArticles', 'topStories']}),
dict(href=lambda x: x and 'tracking=' in x),
{'class':['articleTools', 'pagination', 'Ads', 'topad',
'breadcrumbs', 'footerNav', 'footerUtil', 'downloadlinks']}]
#remove_tags_before = dict(name='h1')
#remove_tags = [
#dict(name='div', attrs={'id':['ShareArticles', 'topStories']}),
#dict(href=lambda x: x and 'tracking=' in x),
#{'class':['articleTools', 'pagination', 'Ads', 'topad',
#'breadcrumbs', 'footerNav', 'footerUtil', 'downloadlinks']}]
def populate_article_metadata(self, article, soup, first):
if first and hasattr(self, 'add_toc_thumbnail'):

Binary file not shown.

After

Width:  |  Height:  |  Size: 1.1 KiB

View File

@ -1,5 +1,5 @@
__license__ = 'GPL v3'
__copyright__ = '2008-2011, Darko Miletic <darko.miletic at gmail.com>'
__copyright__ = '2008-2013, Darko Miletic <darko.miletic at gmail.com>'
'''
japantimes.co.jp
'''
@ -13,59 +13,41 @@ class JapanTimes(BasicNewsRecipe):
language = 'en_JP'
category = 'news, politics, japan'
publisher = 'The Japan Times'
oldest_article = 5
oldest_article = 2
max_articles_per_feed = 150
no_stylesheets = True
use_embedded_content = False
encoding = 'utf8'
publication_type = 'newspaper'
masthead_url = 'http://search.japantimes.co.jp/images/header_title.gif'
masthead_url = 'http://www.japantimes.co.jp/wp-content/themes/jt_theme/library/img/logo-japan-times.png'
extra_css = 'body{font-family: Geneva,Arial,Helvetica,sans-serif}'
conversion_options = {
'comment' : description
, 'tags' : category
, 'publisher' : publisher
, 'language' : language
, 'linearize_tables' : True
'comment' : description
, 'tags' : category
, 'publisher' : publisher
, 'language' : language
}
keep_only_tags = [dict(name='div', attrs={'id':'printresult'})]
remove_tags = [
dict(name=['iframe','meta','link','embed','object','base'])
,dict(attrs={'id':'searchfooter'})
]
feeds = [(u'The Japan Times', u'http://feeds.feedburner.com/japantimes')]
remove_attributes = ['border']
remove_tags_after = dict(name='div', attrs={'class':'entry'})
keep_only_tags = [dict(name='div', attrs={'class':'padding_block'})]
remove_tags = [
dict(name=['iframe','embed','object','base'])
,dict(attrs={'class':['meta_extras','related_articles']})
,dict(attrs={'id':'content_footer_menu'})
]
# One feed per site section, as (section title, feed URL) pairs.
# Every section feed follows the same '/<section>/feed/' URL pattern; the
# 'Life' entry previously pointed at the Opinion feed, so Opinion articles
# were downloaded twice and Life articles never were.
feeds = [
    (u'News', u'http://www.japantimes.co.jp/news/feed/'),
    (u'Opinion', u'http://www.japantimes.co.jp/opinion/feed/'),
    (u'Life', u'http://www.japantimes.co.jp/life/feed/'),
    (u'Community', u'http://www.japantimes.co.jp/community/feed/'),
    (u'Culture', u'http://www.japantimes.co.jp/culture/feed/'),
    (u'Sports', u'http://www.japantimes.co.jp/sports/feed/'),
]
def get_article_url(self, article):
rurl = BasicNewsRecipe.get_article_url(self, article)
return rurl.partition('?')[0]
def print_version(self, url):
if '/rss/' in url:
return url.replace('.jp/rss/','.jp/print/')
if '/text/' in url:
return url.replace('.jp/text/','.jp/print/')
return url
def preprocess_html(self, soup):
for item in soup.findAll(style=True):
del item['style']
for item in soup.findAll('img'):
if not item.has_key('alt'):
item['alt'] = 'image'
for item in soup.findAll('photo'):
item.name = 'div'
for item in soup.head.findAll('paragraph'):
item.extract()
for item in soup.findAll('wwfilename'):
item.extract()
for item in soup.findAll('jtcategory'):
item.extract()
for item in soup.findAll('nomooter'):
item.extract()
for item in soup.body.findAll('paragraph'):
item.name = 'p'
return soup
def preprocess_raw_html(self, raw, url):
    """Discard everything that precedes the first ``</head>`` in the page.

    The text before the first ``</head>`` is replaced with a bare
    ``<html><head>`` prefix, so the returned markup has an empty head
    followed by the original remainder of the document.

    :param raw: the downloaded page markup as a string.
    :param url: the URL the page came from (not used here).
    :return: the rewritten markup, or ``raw`` unchanged when it contains
        no ``</head>``.
    """
    head_end = raw.find('</head>')
    if head_end == -1:
        # find() signals "not found" with -1; slicing with that value would
        # keep only the last character of the page, so leave the markup
        # untouched instead.
        return raw
    return '<html><head>' + raw[head_end:]

View File

@ -1,15 +1,16 @@
#!/usr/bin/env python
__license__ = 'GPL v3'
__copyright__ = '2012, Rémi Vanicat <vanicat at debian.org>'
__copyright__ = '2012, 2013, Rémi Vanicat <vanicat at debian.org>'
'''
Lemonde.fr: Version abonnée
'''
import os, zipfile, re, time
from urllib2 import HTTPError
from calibre.constants import preferred_encoding
from calibre import strftime
from calibre.web.feeds.news import BasicNewsRecipe
from calibre.ebooks.BeautifulSoup import BeautifulSoup
from calibre.ptempfile import PersistentTemporaryFile
@ -20,28 +21,38 @@ class LeMondeAbonne(BasicNewsRecipe):
__author__ = u'Rémi Vanicat'
description = u'Actualités'
category = u'Actualités, France, Monde'
publisher = 'Le Monde'
language = 'fr'
needs_subscription = True
no_stylesheets = True
smarten_punctuation = True
remove_attributes = [ 'border', 'cellspacing', 'display', 'align', 'cellpadding', 'colspan', 'valign', 'vscape', 'hspace', 'alt', 'width', 'height']
extra_css = ''' li{margin:6pt 0}
ul{margin:0}
no_stylesheets = True
div.photo img{max-width:100%; border:0px transparent solid;}
div.photo{font-family:inherit; color:#333; text-align:center;}
div.photo p{text-align:justify;font-size:.9em; line-height:.9em;}
extra_css = u'''
h1{font-size:130%;}
.ariane{font-size:xx-small;}
.source{font-size:xx-small;}
.href{font-size:xx-small;}
.LM_caption{color:#666666; font-size:x-small;}
.main-article-info{font-family:Arial,Helvetica,sans-serif;}
#full-contents{font-size:small; font-family:Arial,Helvetica,sans-serif;font-weight:normal;}
#match-stats-summary{font-size:small; font-family:Arial,Helvetica,sans-serif;font-weight:normal;}
'''
@page{margin:10pt}
.ar-txt {color:#000; text-align:justify;}
h1{text-align:left; font-size:1.25em;}
.auteur{text-align:right; font-weight:bold}
.feed{text-align:right; font-weight:bold}
.po-ti2{font-weight:bold}
.fen-tt{font-weight:bold;font-size:1.1em}
'''
zipurl_format = 'http://medias.lemonde.fr/abonnes/editionelectronique/%Y%m%d/html/%y%m%d.zip'
coverurl_format = '/img/%y%m%d01.jpg'
path_format = "%y%m%d"
login_url = 'http://www.lemonde.fr/web/journal_electronique/identification/1,56-0,45-0,0.html'
keep_only_tags = [ dict(name="div", attrs={ 'class': 'po-prti' }), dict(name=['h1']), dict(name='div', attrs={ 'class': 'photo' }), dict(name='div', attrs={ 'class': 'po-ti2' }), dict(name='div', attrs={ 'class': 'ar-txt' }), dict(name='div', attrs={ 'class': 'po_rtcol' }) ]
keep_only_tags = [dict(name=['h1']), dict(name='div', attrs={ 'class': 'photo' }), dict(name='div', attrs={ 'class': 'po-ti2' }), dict(name='div', attrs={ 'class': 'ar-txt' }), dict(name='div', attrs={ 'class': 'po_rtcol' }) ]
remove_tags = [ dict(name='div', attrs={ 'class': 'po-ti' }),dict(name='div', attrs={ 'class': 'po-copy' })]
article_id_pattern = re.compile("[0-9]+\\.html")
article_url_format = 'http://www.lemonde.fr/journalelectronique/donnees/protege/%Y%m%d/html/'
@ -67,12 +78,16 @@ class LeMondeAbonne(BasicNewsRecipe):
second = time.time()
second += self.decalage
ltime = self.ltime = time.gmtime(second)
url = time.strftime(self.zipurl_format, ltime)
self.timefmt=strftime(" %A %d %B %Y", ltime)
response = browser.open(url)
# Look for the most recent downloadable edition: start from the current
# value of `second` and step back one day at a time, for at most 7 attempts.
for i in range(7):
    self.ltime = time.gmtime(second)
    self.timefmt=time.strftime(" %A %d %B %Y",self.ltime).decode(preferred_encoding)
    url = time.strftime(self.zipurl_format,self.ltime)
    try:
        response = browser.open(url)
        # Stop at the first edition that opens. A `continue` here would
        # leave `second` unchanged and request the very same zip again on
        # every remaining iteration.
        break
    except HTTPError:
        # No edition for that day: try the previous one.
        second -= 24*60*60
tmp = PersistentTemporaryFile(suffix='.zip')
self.report_progress(0.1,_('downloading zip file'))
@ -85,7 +100,7 @@ class LeMondeAbonne(BasicNewsRecipe):
zfile.extractall(self.output_dir)
zfile.close()
path = os.path.join(self.output_dir, time.strftime(self.path_format, ltime), "data")
path = os.path.join(self.output_dir, time.strftime(self.path_format, self.ltime), "data")
self.articles_path = path
@ -95,13 +110,33 @@ class LeMondeAbonne(BasicNewsRecipe):
flux = []
article_url = time.strftime(self.article_url_format, ltime)
article_url = time.strftime(self.article_url_format, self.ltime)
for i in range(nb_index_files):
filename = os.path.join(path, "selection_%d.html" % (i + 1))
tmp = open(filename,'r')
soup=BeautifulSoup(tmp)
soup=BeautifulSoup(tmp,convertEntities=BeautifulSoup.HTML_ENTITIES)
title=soup.find('span').contents[0]
if title=="Une":
title="À la une"
if title=="Evenement":
title="L'événement"
if title=="Planete":
title="Planète"
if title=="Economie - Entreprises":
title="Économie"
if title=="L'Oeil du Monde":
title="L'œil du Monde"
if title=="Enquete":
title="Enquête"
if title=="Editorial - Analyses":
title="Analyses"
if title=="Le Monde Economie":
title="Économie"
if title=="Le Monde Culture et idées":
title="Idées"
if title=="Le Monde Géo et politique":
title="Géopolitique"
tmp.close()
filename = os.path.join(path, "frame_gauche_%d.html" % (i + 1))
@ -114,7 +149,7 @@ class LeMondeAbonne(BasicNewsRecipe):
article = {
'title': link.contents[0],
'url': article_url + article_id,
'descripion': '',
'description': '',
'content': ''
}
articles.append(article)
@ -129,4 +164,3 @@ class LeMondeAbonne(BasicNewsRecipe):
# Local Variables:
# mode: python
# End:

View File

@ -0,0 +1,65 @@
__license__ = 'GPL v3'
__copyright__ = '2013, Darko Miletic <darko.miletic at gmail.com>'
'''
www.libertaddigital.com
'''
from calibre.web.feeds.news import BasicNewsRecipe
class LibertadDigital(BasicNewsRecipe):
    # Recipe for www.libertaddigital.com: articles are listed from the
    # site's FeedBurner feeds and fetched through the URL built by
    # print_version().
    title = 'Libertad Digital'
    __author__ = 'Darko Miletic'
    description = 'En Libertad Digital encontraras noticias y opinion sobre: España, el Mundo, Internet, sociedad, economia y deportes'
    publisher = 'Libertad Digital S.A.'
    category = 'noticias, ultima hora, españa, internet, mundo, economia, sociedad, Libertad Digital'
    oldest_article = 2
    max_articles_per_feed = 200
    no_stylesheets = True
    encoding = 'cp1252'
    use_embedded_content = False
    language = 'es'
    remove_empty_feeds = True
    publication_type = 'website'
    masthead_url = 'http://s.libertaddigital.com/images/logo.gif'
    extra_css = """
                    body{font-family: Verdana,sans-serif }
                    img{margin-bottom: 0.4em; display:block}
                """

    conversion_options = {
        'comment': description,
        'tags': category,
        'publisher': publisher,
        'language': language,
    }

    remove_tags = [
        dict(name=['meta', 'link', 'iframe', 'embed', 'object']),
        dict(name='p', attrs={'class': 'copyright'}),
    ]
    remove_attributes = ['lang']

    # (section title, feed URL) pairs. 'Portada' used to point at the
    # 'deportes' feed, which made it an exact duplicate of the 'Deportes'
    # section below.
    # NOTE(review): the 'portada' feed name follows the naming pattern of
    # the other feeds - confirm that it resolves before merging.
    feeds = [
        (u'Portada', u'http://feeds2.feedburner.com/libertaddigital/portada'),
        (u'Opinion', u'http://feeds2.feedburner.com/libertaddigital/opinion'),
        (u'España', u'http://feeds2.feedburner.com/libertaddigital/nacional'),
        (u'Internacional', u'http://feeds2.feedburner.com/libertaddigital/internacional'),
        (u'Libre Mercado', u'http://feeds2.feedburner.com/libertaddigital/economia'),
        (u'Chic', u'http://feeds2.feedburner.com/libertaddigital/el-candelabro'),
        (u'Internet', u'http://feeds2.feedburner.com/libertaddigital/internet'),
        (u'Deportes', u'http://feeds2.feedburner.com/libertaddigital/deportes'),
    ]

    def get_article_url(self, article):
        """Return the feed entry's ``guid`` (``None`` when it has none)."""
        return article.get('guid', None)

    def print_version(self, url):
        """Build the print-page URL for an article URL.

        The article id is taken to be the text after the last ``-`` in the
        part of ``url`` that precedes its last ``/``.
        """
        art, sep, rest = url.rpartition('/')
        aart, asep, artid = art.rpartition('-')
        return 'http://www.libertaddigital.com/c.php?op=imprimir&id=' + artid

    def preprocess_html(self, soup):
        """Strip every inline ``style`` attribute from the parsed page."""
        for item in soup.findAll(style=True):
            del item['style']
        return soup

View File

@ -1,7 +1,5 @@
#!/usr/bin/env python
__license__ = 'GPL v3'
__copyright__ = '2009, Darko Miletic <darko.miletic at gmail.com>'
__copyright__ = '2009-2013, Darko Miletic <darko.miletic at gmail.com>'
'''
www.thestar.com
'''
@ -11,18 +9,17 @@ from calibre.web.feeds.news import BasicNewsRecipe
class TheTorontoStar(BasicNewsRecipe):
title = 'The Toronto Star'
__author__ = 'Darko Miletic'
description = "Canada's largest daily newspaper"
description = "Thestar.com is Canada's largest online news site. Stay current with our sports, business entertainment news and more from the Toronto Star and thestar.com"
oldest_article = 2
language = 'en_CA'
max_articles_per_feed = 100
no_stylesheets = True
#auto_cleanup = True
#auto_cleanup_keep = '//div[@class="topsContent topsContentActive"]'
use_embedded_content = False
delay = 2
publisher = 'The Toronto Star'
category = "Toronto Star,Canada's largest daily newspaper,breaking news,classifieds,careers,GTA,Toronto Maple Leafs,sports,Toronto,news,editorial,The Star,Ontario,information,columnists,business,entertainment,births,deaths,automotive,rentals,weather,archives,Torstar,technology,Joseph Atkinson"
encoding = 'utf-8'
masthead_url = 'http://www.thestar.com/etc/designs/thestar/images/general/logoLrg.png'
conversion_options = {
'comments' : description
@ -30,23 +27,18 @@ class TheTorontoStar(BasicNewsRecipe):
,'publisher' : publisher
}
#keep_only_tags = [dict(name='div', attrs={'class':'ts-article'})]
#remove_tags_before = dict(name='div',attrs={'id':'ts-article_header'})
remove_tags_before = dict(name='div',attrs={'class':'article-headline'})
feeds = [
(u'News' , u'http://www.thestar.com/rss/?categories=293' )
,(u'Opinion' , u'http://www.thestar.com/rss/?categories=303' )
,(u'Business' , u'http://www.thestar.com/rss/?categories=294' )
,(u'Sports' , u'http://www.thestar.com/rss/?categories=295' )
,(u'Entertainment', u'http://www.toronto.com/rss?categories=6298' )
,(u'Living' , u'http://www.thestar.com/rss/?categories=297' )
,(u'Travel' , u'http://www.thestar.com/rss/list/1042246?' )
,(u'Science' , u'http://www.thestar.com/rss?categories=6481')
(u'News' , u'http://www.thestar.com/feeds.articles.news.rss' )
,(u'Opinion' , u'http://www.thestar.com/feeds.articles.opinion.rss' )
,(u'Business' , u'http://www.thestar.com/feeds.articles.business.rss' )
,(u'Sports' , u'http://www.thestar.com/feeds.articles.sports.rss' )
,(u'Entertainment', u'http://www.thestar.com/feeds.articles.entertainment.rss' )
,(u'Living' , u'http://www.thestar.com/feeds.articles.life.rss' )
,(u'Travel' , u'http://www.thestar.com/feeds.articles.life.travel.rss' )
,(u'Technology' , u'http://www.thestar.com/feeds.articles.life.technology.rss')
]
def print_version(self, url):
artl = url.rpartition('--')[0]
artid = artl.rpartition('/')[2]
return 'http://www.thestar.com/printarticle/' + artid
return url.replace('.html', '.print.html')

View File

@ -452,6 +452,13 @@ class SamsungGalaxy(TabletOutput):
'a resolution of 600x1280')
screen_size = comic_screen_size = (600, 1280)
class NookHD(TabletOutput):
    # Tablet output profile for the Nook HD+ (1080x1920 screen).

    name = 'Nook HD+'
    short_name = 'nook_hd_plus'
    description = _('Intended for the Nook HD+ and similar tablet devices with '
            'a resolution of 1080x1920')

    # Comics use the same target size as regular pages.
    comic_screen_size = (1080, 1920)
    screen_size = comic_screen_size
class SonyReaderOutput(OutputProfile):
name = 'Sony Reader'
@ -786,7 +793,7 @@ output_profiles = [OutputProfile, SonyReaderOutput, SonyReader300Output,
SonyReader900Output, MSReaderOutput, MobipocketOutput, HanlinV3Output,
HanlinV5Output, CybookG3Output, CybookOpusOutput, KindleOutput,
iPadOutput, iPad3Output, KoboReaderOutput, TabletOutput, SamsungGalaxy,
SonyReaderLandscapeOutput, KindleDXOutput, IlliadOutput,
SonyReaderLandscapeOutput, KindleDXOutput, IlliadOutput, NookHD,
IRexDR1000Output, IRexDR800Output, JetBook5Output, NookOutput,
BambookOutput, NookColorOutput, PocketBook900Output, PocketBookPro912Output,
GenericEink, GenericEinkLarge, KindleFireOutput, KindlePaperWhiteOutput]

View File

@ -241,6 +241,11 @@ class KF8Writer(object):
j = 0
for tag in root.iterdescendants(etree.Element):
id_ = tag.attrib.get('id', None)
if id_ is None:
# Can happen during tweaking
id_ = tag.attrib.get('name', None)
if id_ is not None:
tag.attrib['id'] = id_
if id_ is not None or barename(tag.tag).lower() in aid_able_tags:
aid = aidbase + j
tag.attrib['aid'] = to_base(aid, base=32)

View File

@ -198,6 +198,7 @@ class NookColor(Nook):
class NookTablet(NookColor):
# Device entry for the Nook Tablet/HD; everything not overridden below is
# inherited from NookColor.
id = 'nook_tablet'
name = 'Nook Tablet/HD'
# Same value as the `short_name` of the 'Nook HD+' output profile (NookHD).
output_profile = 'nook_hd_plus'
class CybookG3(Device):