diff --git a/Changelog.yaml b/Changelog.yaml
index b50ae0e53c..43eb775233 100644
--- a/Changelog.yaml
+++ b/Changelog.yaml
@@ -19,6 +19,52 @@
 # new recipes:
 #  - title:
 
+- version: 0.8.46
+  date: 2012-04-06
+
+  new features:
+    - title: "Auto adding: When automatically adding files from a folder, automatically convert the files to the current output format after adding. This can be turned off via Preferences->Adding Books->Automatic Adding."
+      tickets: [969053]
+
+    - title: "E-book viewer: When reading a MOBI file that is actually a KF8 book, show the format as being KF8"
+
+    - title: "Content server: Workaround for the Android stock browser not supporting HTTP AUTH."
+
+    - title: "Edit metadata dialog: Change the remove unused series button to a clear series button (as the remove unused series function is now automatic)"
+
+    - title: "Driver for PocketBook 622."
+      tickets: [969875]
+
+  bug fixes:
+    - title: "Run metadata downloads in a separate process to work around memory leaks in third party plugins. Also removes the need to break up bulk metadata downloads into 100 book batches."
+
+    - title: "Make tag browser filtering work when capital letters are entered."
+
+    - title: "EPUB metadata: Ignore the urn:isbn: prefix on the ISBN declaration when reading metadata"
+
+    - title: "Get books: Fix the feedbooks store not showing all available formats"
+
+    - title: "KF8 Input: When the KF8 book has no metadata ToC, try to extract the ToC from the HTML instead."
+      tickets: [969238]
+
+    - title: "Fix regression that broke access to Preferences via the Preferences item in the calibre menu on OS X"
+      tickets: [969418]
+
+    - title: "Fix bug that ignored metadata specified on the command line when using calibredb add"
+
+  improved recipes:
+    - OReilly Premium
+    - Real Clear
+    - Soldier's Magazine
+    - Rue89
+
+  new recipes:
+    - title: The Southern Star
+      author: watou
+
+    - title: Buenos Aires Herald
+      author: Darko Miletic
+
 - version: 0.8.45
   date: 2012-03-30
 
diff --git a/recipes/ba_herald.recipe b/recipes/ba_herald.recipe
index e966fd5676..939879ccaa 100644
--- a/recipes/ba_herald.recipe
+++ b/recipes/ba_herald.recipe
@@ -4,10 +4,8 @@
 __copyright__ = '2012, Darko Miletic '
 '''
 www.buenosairesherald.com
 '''
 
-import re
 from calibre import strftime
 from calibre.web.feeds.news import BasicNewsRecipe
-from calibre.ebooks.BeautifulSoup import BeautifulSoup
 
 class BuenosAiresHerald(BasicNewsRecipe):
     title = 'Buenos Aires Herald'
@@ -62,7 +60,7 @@ class BuenosAiresHerald(BasicNewsRecipe):
         lfeeds = self.get_feeds()
         for feedobj in lfeeds:
             feedtitle, feedurl = feedobj
-            self.report_progress(0, _('Fetching feed')+' %s...'%(feedtitle if feedtitle else feedurl))
+            self.report_progress(0, ('Fetching feed')+' %s...'%(feedtitle if feedtitle else feedurl))
             articles = []
             soup = self.index_to_soup(feedurl)
             for item in soup.findAll('div', attrs={'class':'nota_texto_seccion'}):
diff --git a/recipes/caros_amigos.recipe b/recipes/caros_amigos.recipe
index 48edceacba..7edcfb07c8 100644
--- a/recipes/caros_amigos.recipe
+++ b/recipes/caros_amigos.recipe
@@ -1,7 +1,5 @@
 __copyright__ = '2011, Pablo Aldama '
-
 from calibre.web.feeds.news import BasicNewsRecipe
-
 class AdvancedUserRecipe1311839910(BasicNewsRecipe):
     title = u'Caros Amigos'
     oldest_article = 20
@@ -9,9 +7,8 @@ class AdvancedUserRecipe1311839910(BasicNewsRecipe):
     language = 'pt_BR'
     __author__ = 'Pablo Aldama'
 
-    feeds = [(u'Caros Amigos', u'http://carosamigos.terra.com.br/index/index.php?format=feed&type=rss')]
+    feeds = [(u'Caros Amigos', u'http://carosamigos.terra.com.br/index2/index.php?format=feed&type=rss')]
     keep_only_tags = [dict(name='div', attrs={'class':['blog']})
                      ,dict(name='div', attrs={'class':['blogcontent']})
                      ]
     remove_tags = [dict(name='div', attrs={'class':'addtoany'})]
-
diff --git a/recipes/editoriali.recipe b/recipes/editoriali.recipe
new file mode 100644
index 0000000000..1b0c558df4
--- /dev/null
+++ b/recipes/editoriali.recipe
@@ -0,0 +1,16 @@
+__version__ = 'v1.0'
+__date__ = '7, April 2012'
+
+from calibre.web.feeds.news import BasicNewsRecipe
+
+class AdvancedUserRecipe1332847053(BasicNewsRecipe):
+    title = u'Editoriali'
+    __author__ = 'faber1971'
+    description = 'Leading articles on Italy by the best Italian editorialists'
+
+    oldest_article = 1
+    max_articles_per_feed = 100
+    auto_cleanup = True
+    conversion_options = {'linearize_tables': True}
+    masthead_url = 'http://folkbulletin.folkest.com/wp-content/uploads/editoriale1.jpg'
+    feeds = [(u'Micromega', u'http://temi.repubblica.it/micromega-online/feed/'), (u'Corriere della Sera', u'http://xml.corriereobjects.it/rss/editoriali.xml'), (u'La Stampa', u'http://www.lastampa.it/cmstp/rubriche/oggetti/rss.asp?ID_blog=25'), (u"Italia dall'estero", u'http://italiadallestero.info/feed')]
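A note on the caros_amigos change above: the only fix is the feed endpoint moving from /index/ to /index2/. When a recipe suddenly produces empty issues, checking its feed URLs by hand is usually the fastest diagnosis. A minimal standalone sketch (not part of this commit; Python 2 standard library only, feed list abridged):

# check_feeds.py -- hypothetical helper, not shipped with calibre.
# Fetches each feed URL and prints the HTTP status, catching moved
# endpoints (like index -> index2 above) before a recipe run fails.
import urllib2

FEEDS = [
    (u'Caros Amigos',
     u'http://carosamigos.terra.com.br/index2/index.php?format=feed&type=rss'),
    (u'Micromega', u'http://temi.repubblica.it/micromega-online/feed/'),
]

def check_feeds(feeds):
    for title, url in feeds:
        try:
            response = urllib2.urlopen(url, timeout=20)
            print('%s: HTTP %s' % (title, response.getcode()))
        except urllib2.URLError as e:
            print('%s: FAILED (%s)' % (title, e))

if __name__ == '__main__':
    check_feeds(FEEDS)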
diff --git a/recipes/melbourne_herald_sun.recipe b/recipes/melbourne_herald_sun.recipe
new file mode 100644
index 0000000000..c24a4563af
--- /dev/null
+++ b/recipes/melbourne_herald_sun.recipe
@@ -0,0 +1,84 @@
+#!/usr/bin/env python
+__license__   = 'GPL v3'
+__copyright__ = '2009, Matthew Briggs'
+__docformat__ = 'restructuredtext en'
+
+'''
+http://www.heraldsun.com.au/
+'''
+
+from calibre.web.feeds.news import BasicNewsRecipe
+
+class DailyTelegraph(BasicNewsRecipe):
+    title = u'Melbourne Herald Sun'
+    __author__ = u'Ray Hartley'
+    description = (u'Victorian and National News'
+                   '. You will need to have a subscription to '
+                   'http://www.heraldsun.com.au to get full articles.')
+    language = 'en_AU'
+
+    oldest_article = 2
+    needs_subscription = 'optional'
+    max_articles_per_feed = 30
+    remove_javascript = True
+    no_stylesheets = True
+    encoding = 'utf8'
+    use_embedded_content = False
+    remove_empty_feeds = True
+    publication_type = 'newspaper'
+    masthead_url = 'http://resources2.news.com.au/cs/heraldsun/images/header-and-footer/logo.gif'
+    extra_css = """
+        body{font-family: Arial,Helvetica,sans-serif}
+        img{margin-bottom: 0.4em; display:block}
+        .caption{display: inline; font-size: x-small}
+    """
+
+    conversion_options = {
+        'comment'    : description
+        , 'language' : language
+    }
+
+    keep_only_tags = [dict(attrs={'id':'story'})]
+    remove_tags_before = dict(attrs={'class':'story-header'})
+    remove_tags_after = dict(attrs={'class':'story-footer'})
+    remove_tags = [
+        dict(name=['meta','link','base','iframe','embed','object','media-metadata','media-reference','media-producer'])
+        ,dict(attrs={'class':['story-header-tools','story-sidebar','story-footer','story-summary-list']})
+    ]
+    remove_attributes = ['lang']
+
+
+    feeds = [(u'Breaking News'  , u'http://feeds.news.com.au/public/rss/2.0/heraldsun_breakingnews_206.xml')
+            ,(u'Business'       , u'http://feeds.news.com.au/public/rss/2.0/heraldsun_business_207.xml')
+            ,(u'Entertainment'  , u'http://feeds.news.com.au/public/rss/2.0/heraldsun_entertainment_208.xml')
+            ,(u'Health Science' , u'http://feeds.news.com.au/public/rss/2.0/heraldsun_health_212.xml')
+            ,(u'Music'          , u'http://feeds.news.com.au/public/rss/2.0/heraldsun_music_449.xml')
+            ,(u'National News'  , u'http://feeds.news.com.au/public/rss/2.0/heraldsun_national_209.xml')
+            ,(u'Sport News'     , u'http://feeds.news.com.au/public/rss/2.0/heraldsun_sport_213.xml')
+            ,(u'AFL News'       , u'http://feeds.news.com.au/public/rss/2.0/heraldsun_afl_205.xml')
+            ,(u'State News'     , u'http://feeds.news.com.au/public/rss/2.0/heraldsun_vic_214.xml')
+            ,(u'Technology'     , u'http://feeds.news.com.au/public/rss/2.0/heraldsun_tech_215.xml')
+            ,(u'World News'     , u'http://feeds.news.com.au/public/rss/2.0/heraldsun_world_216.xml')
+            ,(u'Opinion'        , u'http://feeds.news.com.au/public/rss/2.0/heraldsun_opinion_210.xml')
+            ,(u'Andrew Bolt'    , u'http://blogs.news.com.au/heraldsun/andrewbolt/index.php/xml/rss_2.0/heraldsun/hs_andrewbolt/')
+            ,(u'AFL - St Kilda' , u'http://feeds.news.com.au/public/rss/2.0/heraldsun_afl_stkilda_565.xml')
+            ,(u'Terry McCrann'  , u'http://feeds.news.com.au/public/rss/2.0/heraldsun_tmccrann_224.xml')
+            ,(u'The Other Side' , u'http://feeds.news.com.au/public/rss/2.0/heraldsun_otherside_211.xml')]
+
+    def get_browser(self):
+        br = BasicNewsRecipe.get_browser(self)
+        if self.username and self.password:
+            br.open('http://www.heraldsun.com.au')
+            br.select_form(nr=0)
+            br['username'] = self.username
+            br['password'] = self.password
+            raw = br.submit().read()
+            if '>log out' not in raw.lower():
+                raise ValueError('Failed to log in to www.heraldsun.com.au'
+                                 ' -- are your username and password correct?')
+        return br
+
+    def get_article_url(self, article):
+        return article.id
+
+
diff --git a/recipes/oreilly_premium.recipe b/recipes/oreilly_premium.recipe
index 4a9b9e54c3..17b8f241ff 100644
--- a/recipes/oreilly_premium.recipe
+++ b/recipes/oreilly_premium.recipe
@@ -1,4 +1,3 @@
-import string, re
 import time
 import traceback
 # above for debugging via stack
@@ -6,22 +5,19 @@
 from calibre.web.feeds.recipes import BasicNewsRecipe
 # Allows the Python soup converter, which makes parsing easier.
 from calibre.ebooks.BeautifulSoup import BeautifulSoup
-import os, time, traceback, re, urlparse, sys, cStringIO
-from collections import defaultdict
-from functools import partial
-from contextlib import nested, closing
+import os
 
-from calibre.web.feeds import feed_from_xml, templates, feeds_from_index, Feed
+from calibre.web.feeds import feeds_from_index
 from calibre.utils.threadpool import WorkRequest, ThreadPool, NoResultsPending
 
 # To Do: strip ads and graphics, Current Column lacks a title.
 # The newsletter archive https://www.billoreilly.com/newsletterarchive is covered by other entries.
-# Newsletters: Talking Points Memos covered by cat12
+# Newsletters: Talking Points Memos covered by cat12
 # ./ebook-convert --username xxx --password xxx
 
-# this is derived from BasicNewsRecipe, so it can only overload those.
+# this is derived from BasicNewsRecipe, so it can only overload those.
 # Some of what we need is otherwise in article, so we have more copy to do than otherwise.
 class OReillyPremium(BasicNewsRecipe):
     title = u'OReilly Premium'
@@ -42,9 +38,9 @@ class OReillyPremium(BasicNewsRecipe):
     # Don't go down
     recursions = 0
     max_articles_per_feed = 20
-
+
    debugMessages = True
-
+
    # Name, URL, Soup FindAll Attr if relevant (last two are special case), articleList
    catList = [ ["TV Archives", 'https://www.billoreilly.com/show?action=tvShowArchive', 'a', {'class':['showLinks','homeLinks']}, []],
        # ["No Spin Archives", 'https://www.billoreilly.com/blog?categoryID=7', True, {'class':['blogBody'], 'style':['padding-top:10px;']}, []],
@@ -53,19 +49,19 @@ class OReillyPremium(BasicNewsRecipe):
        # ["Talking Points Memo", 'https://www.billoreilly.com/blog?categoryID=12', 'td', {}, []],
        ["Current Column", 'https://www.billoreilly.com/currentcolumn', 'span', {'class':['defaultHeader']}, []]
    ]
-
+
    feeds = [
        (u'No Spin', u'http://www.billoreilly.com/blog?action=blogArchive&rss=true&categoryID=7'),
-       (u'Daily Briefing', u'http://www.billoreilly.com/blog?action=blogArchive&rss=true&categoryID=11'),
+       (u'Daily Briefing', u'http://www.billoreilly.com/blog?action=blogArchive&rss=true&categoryID=11'),
        (u'Talking Points', u'https://www.billoreilly.com/blog?action=blogArchive&rss=true&categoryID=12'),
        (u'Blog', u'http://www.billoreilly.com/blog?action=blogArchive&rss=true&categoryID=0'),
        (u'StratFor', u'http://www.billoreilly.com/blog?action=blogArchive&rss=true&categoryID=5')
    ]
-   # http://www.billoreilly.com/blog?action=blogArchive&rss=true&categoryID=8 is word for the day.
+   # http://www.billoreilly.com/blog?action=blogArchive&rss=true&categoryID=8 is word for the day.
-
+
    # Note: Talking Points is broken in the above model; the site changed to more Ajax-y.
    # Now using RSS
-
+
    def get_browser(self):
        print("In get_browser")
        br = BasicNewsRecipe.get_browser()
@@ -76,7 +72,7 @@ class OReillyPremium(BasicNewsRecipe):
        br['formPasswordField'] = self.password
        br.submit()
        return br
-
+
    # Returns the best-guess print url.
    # The second parameter (pageURL) is returned if nothing is found.
    def extractPrintURL(self, baseURL, pageURL, printString):
@@ -90,17 +86,17 @@ class OReillyPremium(BasicNewsRecipe):
            tag = printText.parent
            tagURL = baseURL+tag['href']
        return tagURL
-
+
    def stripBadChars(self, inString) :
        return inString.replace("\'", "")
-
-
+
+
    def parseGeneric(self, baseURL):
-       # Does a generic parsing of the articles.  There are six categories (0-5)
+       # Does a generic parsing of the articles.  There are six categories (0-5)
        # Name, URL, Soup FindAll Attr if relevant (last two are special case), articleList
        # NoSpin and TV are generic
        fullReturn = []
-       for i in range(len(self.catList)) :
+       for i in range(len(self.catList)) :
            articleList = []
            print("In "+self.catList[i][0]+", index: "+ str(i))
            soup = self.index_to_soup(self.catList[i][1])
@@ -110,7 +106,7 @@ class OReillyPremium(BasicNewsRecipe):
            # Problem: 0-2 create many in an array
            # 3-5 create one.
            # So no for-div for 3-5
-
+
            if i == 0 :
                print("Starting TV Archives")
                for div in soup.findAll(self.catList[i][2], self.catList[i][3]):
@@ -151,7 +147,7 @@ class OReillyPremium(BasicNewsRecipe):
        print("Returning")
        # print fullReturn
        return fullReturn
-
+
    # build_index() starts with:
    # try:
@@ -161,7 +157,7 @@ class OReillyPremium(BasicNewsRecipe):
    #     self.report_progress(0, _('Got feeds from index page'))
    # except NotImplementedError:
    #     feeds = self.parse_feeds()
-
+
    # which in turn is from __init__.py
    #def feeds_from_index(index, oldest_article=7, max_articles_per_feed=100,
    #        log=default_log):
@@ -177,10 +173,10 @@ class OReillyPremium(BasicNewsRecipe):
    #        max_articles_per_feed=max_articles_per_feed)
    #    feeds.append(pfeed)
    #    return feeds
-
+
    # use_embedded_content defaults to None, at which point if the content is > 2K, it is used as the article.
-
+
    # calibre.web.feeds.news.BasicNewsRecipe.parse_index() fetches the list of articles.
    # returns a list of tuple ('feed title', list of articles)
    # {
@@ -201,7 +197,7 @@ class OReillyPremium(BasicNewsRecipe):
        masterList = self.parseGeneric(baseURL)
        #print(masterList)
        return masterList
-
+
    def preprocess_html(self, soup):
        print("In preprocess_html")
        refresh = soup.find('meta', {'http-equiv':'refresh'})
@@ -210,22 +206,22 @@ class OReillyPremium(BasicNewsRecipe):
            content = refresh.get('content').partition('=')[2]
            raw = self.browser.open('https://www.billoreilly.com'+content).read()
            return BeautifulSoup(raw.decode('cp1252', 'replace'))
-
+
    def build_index(self):
        print("In OReilly build_index()\n\n")
        feedsRSS = []
-       self.report_progress(0, _('Fetching feeds...'))
+       self.report_progress(0, ('Fetching feeds...'))
        #try:
        feeds = feeds_from_index(self.parse_index(), oldest_article=self.oldest_article,
                    max_articles_per_feed=self.max_articles_per_feed,
                    log=self.log)
-       self.report_progress(0, _('Got feeds from index page'))
+       self.report_progress(0, ('Got feeds from index page'))
        #except NotImplementedError:
        #    feeds = self.parse_feeds()
-       # Now add regular feeds.
+       # Now add regular feeds.
        feedsRSS = self.parse_feeds()
        print("feedsRSS is type "+feedsRSS.__class__.__name__)
-
+
        for articles in feedsRSS:
            print("articles is type "+articles.__class__.__name__)
            print("Title:" + articles.title)
@@ -235,9 +231,9 @@ class OReillyPremium(BasicNewsRecipe):
 
        #feeds = FeedCollection(feeds)
-       self.report_progress(0, _('Trying to download cover...'))
+       self.report_progress(0, ('Trying to download cover...'))
        self.download_cover()
-       self.report_progress(0, _('Generating masthead...'))
+       self.report_progress(0, ('Generating masthead...'))
        self.masthead_path = None
 
        try:
@@ -317,7 +313,7 @@ class OReillyPremium(BasicNewsRecipe):
                tp.putRequest(req, block=True, timeout=0)
 
-       self.report_progress(0, _('Starting download [%d thread(s)]...')%self.simultaneous_downloads)
+       self.report_progress(0, ('Starting download [%d thread(s)]...')%self.simultaneous_downloads)
        while True:
            try:
                tp.poll()
@@ -331,8 +327,8 @@ class OReillyPremium(BasicNewsRecipe):
            with open(os.path.join(feed_dir, 'index.html'), 'wb') as fi:
                fi.write(html)
        self.create_opf(feeds)
-       self.report_progress(1, _('Feeds downloaded to %s')%index)
+       self.report_progress(1, ('Feeds downloaded to %s')%index)
 
        return index
-
+
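The build_index() comments above describe the parse_index() contract that feeds_from_index() consumes: a list of ('feed title', article list) tuples whose articles are dicts. As a reader's aid, a minimal sketch of that return shape; the keys follow calibre's BasicNewsRecipe documentation, and the values are invented placeholders:

# Sketch only: the structure build_index()/feeds_from_index() expect back
# from parse_index(). Values are placeholders, not real data.
def parse_index(self):
    articles = [{
        'title': 'Example article',                          # headline
        'url': 'https://www.billoreilly.com/currentcolumn',  # page to fetch
        'date': '2012-04-06',                                # display date
        'description': 'One-line summary shown in the TOC',
        'content': '',                                       # inline HTML, if any
    }]
    return [('Current Column', articles)]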
diff --git a/recipes/real_clear.recipe b/recipes/real_clear.recipe
index 2dfe56d207..cbf5a2f8e4 100644
--- a/recipes/real_clear.recipe
+++ b/recipes/real_clear.recipe
@@ -1,9 +1,9 @@
 # Test with "\Program Files\Calibre2\ebook-convert.exe" RealClear.recipe .epub --test -vv --debug-pipeline debug
-import string, re
+import re
 import time
 from urlparse import urlparse
 from calibre.web.feeds.recipes import BasicNewsRecipe
-from calibre.ebooks.BeautifulSoup import BeautifulSoup, NavigableString
+from calibre.ebooks.BeautifulSoup import NavigableString
 
 class RealClear(BasicNewsRecipe):
     title = u'Real Clear'
@@ -23,8 +23,8 @@ class RealClear(BasicNewsRecipe):
     recursions = 0
     max_articles_per_feed = 400
     debugMessages = True
-
-    # Numeric parameter is type, controls whether we look for
+
+    # Numeric parameter is type, controls whether we look for
     feedsets = [
         ["Politics", "http://www.realclearpolitics.com/index.xml", 0],
         ["Policy", "http://www.realclearpolicy.com/index.xml", 0],
@@ -41,17 +41,17 @@ class RealClear(BasicNewsRecipe):
     # Hints to extractPrintURL.
     # First column is the URL snippet. Then the string to search for as text, and the attributes to look for above it. Start with attributes and drill down.
     phUrlSnip, phLinkText, phMainSearch, phHrefSearch = range(4)
-
+
     printhints = [ ["realclear", "", '' , 'printpage'],
                    ["billoreilly.com", "Print this entry", 'a', ''],
                    ["billoreilly.com", "Print This Article", 'a', ''],
-                   ["politico.com", "Print", 'a', 'share-print'],
+                   ["politico.com", "Print", 'a', 'share-print'],
                    ["nationalreview.com", ">Print<", 'a', ''],
                    ["reason.com", "", 'a', 'printer']
                    # The following are not supported due to JavaScripting, and would require obfuscated_article to handle
-                   # forbes,
+                   # forbes,
                    # usatoday - just prints with all current crap anyhow
-
+
                  ]
     # RCP - look for a strange compound. See http://www.realclearpolitics.com/articles/2012/01/24/in_speech_obama_to_call_for_fairness_--_and_four_more_years_112879.html
     # The print link isn't obvious, and only the end is needed (the -full append.) So maybe try that first?
@@ -64,7 +64,7 @@ class RealClear(BasicNewsRecipe):
     # from http://www.nytimes.com/2012/01/22/business/apple-america-and-a-squeezed-middle-class.html?_r=1
     # to http://www.nytimes.com/2012/01/22/business/apple-america-and-a-squeezed-middle-class.html?_r=1&pagewanted=all
     # which is at link rel="canonical" and at ', re.IGNORECASE | re.DOTALL), lambda match: '')]
+    (re.compile(r'
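The fragments above — (re.compile(...), lambda match: '') pairs — have the signature shape of a BasicNewsRecipe.preprocess_regexps list, which this hunk appears to be editing. For reference, a minimal sketch of how that attribute is used; the comment-stripping pattern is illustrative, not RealClear's actual one:

# Illustrative only: preprocess_regexps entries are (compiled pattern,
# replacement callable) pairs that calibre runs over the raw HTML of each
# fetched page before it is parsed. This example strips HTML comments.
import re
from calibre.web.feeds.recipes import BasicNewsRecipe

class PreprocessExample(BasicNewsRecipe):
    title = u'Preprocess Example'
    preprocess_regexps = [
        (re.compile(r'<!--.*?-->', re.IGNORECASE | re.DOTALL),
         lambda match: ''),
    ]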