diff --git a/manual/faq.rst b/manual/faq.rst index 572d18b770..1f64fb54c7 100644 --- a/manual/faq.rst +++ b/manual/faq.rst @@ -663,7 +663,7 @@ Post any output you see in a help message on the `Forum ', re.DOTALL), lambda m: ''), ] - remove_tags_before = dict(name='h1') - remove_tags = [ - dict(name='div', attrs={'id':['ShareArticles', 'topStories']}), - dict(href=lambda x: x and 'tracking=' in x), - {'class':['articleTools', 'pagination', 'Ads', 'topad', - 'breadcrumbs', 'footerNav', 'footerUtil', 'downloadlinks']}] + #remove_tags_before = dict(name='h1') + #remove_tags = [ + #dict(name='div', attrs={'id':['ShareArticles', 'topStories']}), + #dict(href=lambda x: x and 'tracking=' in x), + #{'class':['articleTools', 'pagination', 'Ads', 'topad', + #'breadcrumbs', 'footerNav', 'footerUtil', 'downloadlinks']}] def populate_article_metadata(self, article, soup, first): if first and hasattr(self, 'add_toc_thumbnail'): diff --git a/recipes/icons/libertad_digital.png b/recipes/icons/libertad_digital.png new file mode 100644 index 0000000000..83ed5a6dda Binary files /dev/null and b/recipes/icons/libertad_digital.png differ diff --git a/recipes/japan_times.recipe b/recipes/japan_times.recipe index f5b90f2c05..80a68c5216 100644 --- a/recipes/japan_times.recipe +++ b/recipes/japan_times.recipe @@ -1,5 +1,5 @@ __license__ = 'GPL v3' -__copyright__ = '2008-2011, Darko Miletic ' +__copyright__ = '2008-2013, Darko Miletic ' ''' japantimes.co.jp ''' @@ -13,59 +13,41 @@ class JapanTimes(BasicNewsRecipe): language = 'en_JP' category = 'news, politics, japan' publisher = 'The Japan Times' - oldest_article = 5 + oldest_article = 2 max_articles_per_feed = 150 no_stylesheets = True use_embedded_content = False encoding = 'utf8' publication_type = 'newspaper' - masthead_url = 'http://search.japantimes.co.jp/images/header_title.gif' + masthead_url = 'http://www.japantimes.co.jp/wp-content/themes/jt_theme/library/img/logo-japan-times.png' extra_css = 'body{font-family: 
Geneva,Arial,Helvetica,sans-serif}' conversion_options = { - 'comment' : description - , 'tags' : category - , 'publisher' : publisher - , 'language' : language - , 'linearize_tables' : True + 'comment' : description + , 'tags' : category + , 'publisher' : publisher + , 'language' : language } - - keep_only_tags = [dict(name='div', attrs={'id':'printresult'})] - remove_tags = [ - dict(name=['iframe','meta','link','embed','object','base']) - ,dict(attrs={'id':'searchfooter'}) - ] - feeds = [(u'The Japan Times', u'http://feeds.feedburner.com/japantimes')] - remove_attributes = ['border'] + remove_tags_after = dict(name='div', attrs={'class':'entry'}) + keep_only_tags = [dict(name='div', attrs={'class':'padding_block'})] + remove_tags = [ + dict(name=['iframe','embed','object','base']) + ,dict(attrs={'class':['meta_extras','related_articles']}) + ,dict(attrs={'id':'content_footer_menu'}) + ] + feeds = [ + (u'News' , u'http://www.japantimes.co.jp/news/feed/' ) + ,(u'Opinion' , u'http://www.japantimes.co.jp/opinion/feed/' ) + ,(u'Life' , u'http://www.japantimes.co.jp/life/feed/' ) + ,(u'Community', u'http://www.japantimes.co.jp/community/feed/') + ,(u'Culture' , u'http://www.japantimes.co.jp/culture/feed/' ) + ,(u'Sports' , u'http://www.japantimes.co.jp/sports/feed/' ) + ] def get_article_url(self, article): rurl = BasicNewsRecipe.get_article_url(self, article) return rurl.partition('?')[0] - - def print_version(self, url): - if '/rss/' in url: - return url.replace('.jp/rss/','.jp/print/') - if '/text/' in url: - return url.replace('.jp/text/','.jp/print/') - return url - - def preprocess_html(self, soup): - for item in soup.findAll(style=True): - del item['style'] - for item in soup.findAll('img'): - if not item.has_key('alt'): - item['alt'] = 'image' - for item in soup.findAll('photo'): - item.name = 'div' - for item in soup.head.findAll('paragraph'): - item.extract() - for item in soup.findAll('wwfilename'): - item.extract() - for item in 
soup.findAll('jtcategory'): - item.extract() - for item in soup.findAll('nomooter'): - item.extract() - for item in soup.body.findAll('paragraph'): - item.name = 'p' - return soup + + def preprocess_raw_html(self, raw, url): + return ''+raw[raw.find(''):] diff --git a/recipes/le_monde_sub.recipe b/recipes/le_monde_sub.recipe index 8da4a97627..56156166dc 100644 --- a/recipes/le_monde_sub.recipe +++ b/recipes/le_monde_sub.recipe @@ -1,15 +1,16 @@ #!/usr/bin/env python __license__ = 'GPL v3' -__copyright__ = '2012, Rémi Vanicat ' +__copyright__ = '2012, 2013, Rémi Vanicat ' ''' Lemonde.fr: Version abonnée ''' import os, zipfile, re, time +from urllib2 import HTTPError +from calibre.constants import preferred_encoding -from calibre import strftime from calibre.web.feeds.news import BasicNewsRecipe from calibre.ebooks.BeautifulSoup import BeautifulSoup from calibre.ptempfile import PersistentTemporaryFile @@ -20,28 +21,38 @@ class LeMondeAbonne(BasicNewsRecipe): __author__ = u'Rémi Vanicat' description = u'Actualités' category = u'Actualités, France, Monde' + publisher = 'Le Monde' language = 'fr' needs_subscription = True + no_stylesheets = True + smarten_punctuation = True + remove_attributes = [ 'border', 'cellspacing', 'display', 'align', 'cellpadding', 'colspan', 'valign', 'vspace', 'hspace', 'alt', 'width', 'height'] + extra_css = ''' li{margin:6pt 0} + ul{margin:0} - no_stylesheets = True + div.photo img{max-width:100%; border:0px transparent solid;} + div.photo{font-family:inherit; color:#333; text-align:center;} + div.photo p{text-align:justify;font-size:.9em; line-height:.9em;} - extra_css = u''' - h1{font-size:130%;} - .ariane{font-size:xx-small;} - .source{font-size:xx-small;} - .href{font-size:xx-small;} - .LM_caption{color:#666666; font-size:x-small;} - .main-article-info{font-family:Arial,Helvetica,sans-serif;} - #full-contents{font-size:small; font-family:Arial,Helvetica,sans-serif;font-weight:normal;} - #match-stats-summary{font-size:small; 
font-family:Arial,Helvetica,sans-serif;font-weight:normal;} - ''' + @page{margin:10pt} + .ar-txt {color:#000; text-align:justify;} + h1{text-align:left; font-size:1.25em;} + + .auteur{text-align:right; font-weight:bold} + .feed{text-align:right; font-weight:bold} + .po-ti2{font-weight:bold} + .fen-tt{font-weight:bold;font-size:1.1em} + ''' zipurl_format = 'http://medias.lemonde.fr/abonnes/editionelectronique/%Y%m%d/html/%y%m%d.zip' coverurl_format = '/img/%y%m%d01.jpg' path_format = "%y%m%d" login_url = 'http://www.lemonde.fr/web/journal_electronique/identification/1,56-0,45-0,0.html' - keep_only_tags = [ dict(name="div", attrs={ 'class': 'po-prti' }), dict(name=['h1']), dict(name='div', attrs={ 'class': 'photo' }), dict(name='div', attrs={ 'class': 'po-ti2' }), dict(name='div', attrs={ 'class': 'ar-txt' }), dict(name='div', attrs={ 'class': 'po_rtcol' }) ] + keep_only_tags = [dict(name=['h1']), dict(name='div', attrs={ 'class': 'photo' }), dict(name='div', attrs={ 'class': 'po-ti2' }), dict(name='div', attrs={ 'class': 'ar-txt' }), dict(name='div', attrs={ 'class': 'po_rtcol' }) ] + + + remove_tags = [ dict(name='div', attrs={ 'class': 'po-ti' }),dict(name='div', attrs={ 'class': 'po-copy' })] article_id_pattern = re.compile("[0-9]+\\.html") article_url_format = 'http://www.lemonde.fr/journalelectronique/donnees/protege/%Y%m%d/html/' @@ -67,12 +78,16 @@ class LeMondeAbonne(BasicNewsRecipe): second = time.time() second += self.decalage - ltime = self.ltime = time.gmtime(second) - url = time.strftime(self.zipurl_format, ltime) - self.timefmt=strftime(" %A %d %B %Y", ltime) - - response = browser.open(url) + for i in range(7): + self.ltime = time.gmtime(second) + self.timefmt=time.strftime(" %A %d %B %Y",self.ltime).decode(preferred_encoding) + url = time.strftime(self.zipurl_format,self.ltime) + try: + response = browser.open(url) + break + except HTTPError: + second -= 24*60*60 tmp = PersistentTemporaryFile(suffix='.zip') self.report_progress(0.1,_('downloading 
zip file')) @@ -85,7 +100,7 @@ class LeMondeAbonne(BasicNewsRecipe): zfile.extractall(self.output_dir) zfile.close() - path = os.path.join(self.output_dir, time.strftime(self.path_format, ltime), "data") + path = os.path.join(self.output_dir, time.strftime(self.path_format, self.ltime), "data") self.articles_path = path @@ -95,13 +110,33 @@ class LeMondeAbonne(BasicNewsRecipe): flux = [] - article_url = time.strftime(self.article_url_format, ltime) + article_url = time.strftime(self.article_url_format, self.ltime) for i in range(nb_index_files): filename = os.path.join(path, "selection_%d.html" % (i + 1)) tmp = open(filename,'r') - soup=BeautifulSoup(tmp) + soup=BeautifulSoup(tmp,convertEntities=BeautifulSoup.HTML_ENTITIES) title=soup.find('span').contents[0] + if title=="Une": + title="À la une" + if title=="Evenement": + title="L'événement" + if title=="Planete": + title="Planète" + if title=="Economie - Entreprises": + title="Économie" + if title=="L'Oeil du Monde": + title="L'œil du Monde" + if title=="Enquete": + title="Enquête" + if title=="Editorial - Analyses": + title="Analyses" + if title=="Le Monde Economie": + title="Économie" + if title=="Le Monde Culture et idées": + title="Idées" + if title=="Le Monde Géo et politique": + title="Géopolitique" tmp.close() filename = os.path.join(path, "frame_gauche_%d.html" % (i + 1)) @@ -114,7 +149,7 @@ class LeMondeAbonne(BasicNewsRecipe): article = { 'title': link.contents[0], 'url': article_url + article_id, - 'descripion': '', + 'description': '', 'content': '' } articles.append(article) @@ -129,4 +164,3 @@ class LeMondeAbonne(BasicNewsRecipe): # Local Variables: # mode: python # End: - diff --git a/recipes/libertad_digital.recipe b/recipes/libertad_digital.recipe new file mode 100644 index 0000000000..1a35e6995a --- /dev/null +++ b/recipes/libertad_digital.recipe @@ -0,0 +1,65 @@ +__license__ = 'GPL v3' +__copyright__ = '2013, Darko Miletic ' +''' +www.libertaddigital.com +''' + +from calibre.web.feeds.news 
import BasicNewsRecipe + +class LibertadDigital(BasicNewsRecipe): + title = 'Libertad Digital' + __author__ = 'Darko Miletic' + description = 'En Libertad Digital encontraras noticias y opinion sobre: España, el Mundo, Internet, sociedad, economia y deportes' + publisher = 'Libertad Digital S.A.' + category = 'noticias, ultima hora, españa, internet, mundo, economia, sociedad, Libertad Digital' + oldest_article = 2 + max_articles_per_feed = 200 + no_stylesheets = True + encoding = 'cp1252' + use_embedded_content = False + language = 'es' + remove_empty_feeds = True + publication_type = 'website' + masthead_url = 'http://s.libertaddigital.com/images/logo.gif' + extra_css = """ + body{font-family: Verdana,sans-serif } + img{margin-bottom: 0.4em; display:block} + """ + + conversion_options = { + 'comment' : description + , 'tags' : category + , 'publisher' : publisher + , 'language' : language + } + + remove_tags = [ + dict(name=['meta','link','iframe','embed','object']) + ,dict(name='p', attrs={'class':'copyright'}) + ] + remove_attributes=['lang'] + + + feeds = [ + (u'Portada' , u'http://feeds2.feedburner.com/libertaddigital/portada' ) + ,(u'Opinion' , u'http://feeds2.feedburner.com/libertaddigital/opinion' ) + ,(u'España' , u'http://feeds2.feedburner.com/libertaddigital/nacional' ) + ,(u'Internacional', u'http://feeds2.feedburner.com/libertaddigital/internacional') + ,(u'Libre Mercado', u'http://feeds2.feedburner.com/libertaddigital/economia' ) + ,(u'Chic' , u'http://feeds2.feedburner.com/libertaddigital/el-candelabro') + ,(u'Internet' , u'http://feeds2.feedburner.com/libertaddigital/internet' ) + ,(u'Deportes' , u'http://feeds2.feedburner.com/libertaddigital/deportes' ) + ] + + def get_article_url(self, article): + return article.get('guid', None) + + def print_version(self, url): + art, sep, rest = url.rpartition('/') + aart, asep, artid = art.rpartition('-') + return 'http://www.libertaddigital.com/c.php?op=imprimir&id=' + artid + + def preprocess_html(self, 
soup): + for item in soup.findAll(style=True): + del item['style'] + return soup diff --git a/recipes/thestar.recipe b/recipes/thestar.recipe index f667b86472..59c3b43c6b 100644 --- a/recipes/thestar.recipe +++ b/recipes/thestar.recipe @@ -1,7 +1,5 @@ -#!/usr/bin/env python - __license__ = 'GPL v3' -__copyright__ = '2009, Darko Miletic ' +__copyright__ = '2009-2013, Darko Miletic ' ''' www.thestar.com ''' @@ -11,18 +9,17 @@ from calibre.web.feeds.news import BasicNewsRecipe class TheTorontoStar(BasicNewsRecipe): title = 'The Toronto Star' __author__ = 'Darko Miletic' - description = "Canada's largest daily newspaper" + description = "Thestar.com is Canada's largest online news site. Stay current with our sports, business entertainment news and more from the Toronto Star and thestar.com" oldest_article = 2 language = 'en_CA' max_articles_per_feed = 100 no_stylesheets = True - #auto_cleanup = True - #auto_cleanup_keep = '//div[@class="topsContent topsContentActive"]' use_embedded_content = False delay = 2 publisher = 'The Toronto Star' category = "Toronto Star,Canada's largest daily newspaper,breaking news,classifieds,careers,GTA,Toronto Maple Leafs,sports,Toronto,news,editorial,The Star,Ontario,information,columnists,business,entertainment,births,deaths,automotive,rentals,weather,archives,Torstar,technology,Joseph Atkinson" encoding = 'utf-8' + masthead_url = 'http://www.thestar.com/etc/designs/thestar/images/general/logoLrg.png' conversion_options = { 'comments' : description @@ -30,23 +27,18 @@ class TheTorontoStar(BasicNewsRecipe): ,'publisher' : publisher } - #keep_only_tags = [dict(name='div', attrs={'class':'ts-article'})] - #remove_tags_before = dict(name='div',attrs={'id':'ts-article_header'}) + remove_tags_before = dict(name='div',attrs={'class':'article-headline'}) feeds = [ - (u'News' , u'http://www.thestar.com/rss/?categories=293' ) - ,(u'Opinion' , u'http://www.thestar.com/rss/?categories=303' ) - ,(u'Business' , 
u'http://www.thestar.com/rss/?categories=294' ) - ,(u'Sports' , u'http://www.thestar.com/rss/?categories=295' ) - ,(u'Entertainment', u'http://www.toronto.com/rss?categories=6298' ) - ,(u'Living' , u'http://www.thestar.com/rss/?categories=297' ) - ,(u'Travel' , u'http://www.thestar.com/rss/list/1042246?' ) - ,(u'Science' , u'http://www.thestar.com/rss?categories=6481') + (u'News' , u'http://www.thestar.com/feeds.articles.news.rss' ) + ,(u'Opinion' , u'http://www.thestar.com/feeds.articles.opinion.rss' ) + ,(u'Business' , u'http://www.thestar.com/feeds.articles.business.rss' ) + ,(u'Sports' , u'http://www.thestar.com/feeds.articles.sports.rss' ) + ,(u'Entertainment', u'http://www.thestar.com/feeds.articles.entertainment.rss' ) + ,(u'Living' , u'http://www.thestar.com/feeds.articles.life.rss' ) + ,(u'Travel' , u'http://www.thestar.com/feeds.articles.life.travel.rss' ) + ,(u'Technology' , u'http://www.thestar.com/feeds.articles.life.technology.rss') ] def print_version(self, url): - artl = url.rpartition('--')[0] - artid = artl.rpartition('/')[2] - return 'http://www.thestar.com/printarticle/' + artid - - + return url.replace('.html', '.print.html') diff --git a/src/calibre/customize/profiles.py b/src/calibre/customize/profiles.py index 7eba099bd2..d544496f19 100644 --- a/src/calibre/customize/profiles.py +++ b/src/calibre/customize/profiles.py @@ -452,6 +452,13 @@ class SamsungGalaxy(TabletOutput): 'a resolution of 600x1280') screen_size = comic_screen_size = (600, 1280) +class NookHD(TabletOutput): + name = 'Nook HD+' + short_name = 'nook_hd_plus' + description = _('Intended for the Nook HD+ and similar tablet devices with ' + 'a resolution of 1080x1920') + screen_size = comic_screen_size = (1080, 1920) + class SonyReaderOutput(OutputProfile): name = 'Sony Reader' @@ -786,7 +793,7 @@ output_profiles = [OutputProfile, SonyReaderOutput, SonyReader300Output, SonyReader900Output, MSReaderOutput, MobipocketOutput, HanlinV3Output, HanlinV5Output, CybookG3Output, 
CybookOpusOutput, KindleOutput, iPadOutput, iPad3Output, KoboReaderOutput, TabletOutput, SamsungGalaxy, - SonyReaderLandscapeOutput, KindleDXOutput, IlliadOutput, + SonyReaderLandscapeOutput, KindleDXOutput, IlliadOutput, NookHD, IRexDR1000Output, IRexDR800Output, JetBook5Output, NookOutput, BambookOutput, NookColorOutput, PocketBook900Output, PocketBookPro912Output, GenericEink, GenericEinkLarge, KindleFireOutput, KindlePaperWhiteOutput] diff --git a/src/calibre/ebooks/mobi/writer8/main.py b/src/calibre/ebooks/mobi/writer8/main.py index b1479ee7c3..47a27dd06c 100644 --- a/src/calibre/ebooks/mobi/writer8/main.py +++ b/src/calibre/ebooks/mobi/writer8/main.py @@ -241,6 +241,11 @@ class KF8Writer(object): j = 0 for tag in root.iterdescendants(etree.Element): id_ = tag.attrib.get('id', None) + if id_ is None: + # Can happen during tweaking + id_ = tag.attrib.get('name', None) + if id_ is not None: + tag.attrib['id'] = id_ if id_ is not None or barename(tag.tag).lower() in aid_able_tags: aid = aidbase + j tag.attrib['aid'] = to_base(aid, base=32) diff --git a/src/calibre/gui2/wizard/__init__.py b/src/calibre/gui2/wizard/__init__.py index 1cdcb85d4c..8016dd7dcd 100644 --- a/src/calibre/gui2/wizard/__init__.py +++ b/src/calibre/gui2/wizard/__init__.py @@ -198,6 +198,7 @@ class NookColor(Nook): class NookTablet(NookColor): id = 'nook_tablet' name = 'Nook Tablet/HD' + output_profile = 'nook_hd_plus' class CybookG3(Device):