diff --git a/Changelog.yaml b/Changelog.yaml
index b50ae0e53c..43eb775233 100644
--- a/Changelog.yaml
+++ b/Changelog.yaml
@@ -19,6 +19,52 @@
 # new recipes:
 #   - title:
+- version: 0.8.46
+  date: 2012-04-06
+
+  new features:
+    - title: "Auto adding: When automatically adding files from a folder, convert the files to the current output format after adding. This can be turned off via Preferences->Adding Books->Automatic Adding."
+      tickets: [969053]
+
+    - title: "E-book viewer: When reading a MOBI file that is actually a KF8 book, show the format as being KF8"
+
+    - title: "Content server: Workaround for the Android stock browser not supporting HTTP AUTH."
+
+    - title: "Edit metadata dialog: Change the remove unused series button to a clear series button (as the remove unused series function is now automatic)"
+
+    - title: "Driver for PocketBook 622."
+      tickets: [969875]
+
+  bug fixes:
+    - title: "Run metadata downloads in a separate process to work around memory leaks in third-party plugins. Also removes the need to break up bulk metadata downloads into 100 book batches."
+
+    - title: "Make tag browser filtering work when capital letters are entered."
+
+    - title: "EPUB metadata: Ignore the urn:isbn: prefix in the ISBN declaration when reading metadata"
+
+    - title: "Get books: Fix feedbooks store not showing all available formats"
+
+    - title: "KF8 Input: When the KF8 book has no metadata ToC, try to extract the ToC from the HTML instead."
+      tickets: [969238]
+
+    - title: "Fix regression that broke access to Preferences via the Preferences item in the calibre menu on OS X"
+      tickets: [969418]
+
+    - title: "Fix bug that ignored metadata specified on the command line when using calibredb add"
+
+  improved recipes:
+    - OReilly Premium
+    - Real Clear
+    - Soldier's Magazine
+    - Rue89
+
+  new recipes:
+    - title: The Southern Star
+      author: watou
+
+    - title: Buenos Aires Herald
+      author: Darko Miletic
+
 - version: 0.8.45
   date: 2012-03-30
diff --git a/recipes/ba_herald.recipe b/recipes/ba_herald.recipe
new file mode 100644
index 0000000000..939879ccaa
--- /dev/null
+++ b/recipes/ba_herald.recipe
@@ -0,0 +1,80 @@
+__license__ = 'GPL v3'
+__copyright__ = '2012, Darko Miletic'
+'''
+www.buenosairesherald.com
+'''
+
+from calibre import strftime
+from calibre.web.feeds.news import BasicNewsRecipe
+
+class BuenosAiresHerald(BasicNewsRecipe):
+    title = 'Buenos Aires Herald'
+    __author__ = 'Darko Miletic'
+    description = 'A world of information in a few words'
+    publisher = 'Editorial Nefir S.A.'
+ category = 'news, politics, Argentina' + oldest_article = 2 + max_articles_per_feed = 200 + no_stylesheets = True + encoding = 'utf8' + use_embedded_content = False + language = 'en_AR' + remove_empty_feeds = True + publication_type = 'newspaper' + masthead_url = 'http://www.buenosairesherald.com/img/logo.jpg' + INDEX = 'http://www.buenosairesherald.com' + extra_css = """ + body{font-family: Arial,Helvetica,sans-serif } + img{margin-bottom: 0.4em; display:block} + h1{font-family: Georgia,serif} + #fecha{text-align: right; font-size: small} + """ + + conversion_options = { + 'comment' : description + , 'tags' : category + , 'publisher' : publisher + , 'language' : language + } + + remove_tags = [dict(name=['meta','link','iframe'])] + keep_only_tags = [dict(attrs={'class':'nota_texto p'})] + + + feeds = [ + (u'Argentina' , u'http://www.buenosairesherald.com/argentina' ) + ,(u'World' , u'http://www.buenosairesherald.com/world' ) + ,(u'Latin America' , u'http://www.buenosairesherald.com/latin-america' ) + ,(u'Entertainment' , u'http://www.buenosairesherald.com/entertainment' ) + ,(u'Sports' , u'http://www.buenosairesherald.com/sports' ) + ] + + def print_version(self, url): + artidraw = url.rpartition('/article/')[2] + artid = artidraw.partition('/')[0] + return 'http://www.buenosairesherald.com/articles/print.aspx?ix=' + artid + + + def parse_index(self): + totalfeeds = [] + lfeeds = self.get_feeds() + for feedobj in lfeeds: + feedtitle, feedurl = feedobj + self.report_progress(0, ('Fetching feed')+' %s...'%(feedtitle if feedtitle else feedurl)) + articles = [] + soup = self.index_to_soup(feedurl) + for item in soup.findAll('div', attrs={'class':'nota_texto_seccion'}): + description = self.tag_to_string(item.h2) + atag = item.h2.find('a') + if atag and atag.has_key('href'): + url = self.INDEX + atag['href'] + title = description + date = strftime(self.timefmt) + articles.append({ + 'title' :title + ,'date' :date + ,'url' :url + ,'description':description + }) + totalfeeds.append((feedtitle, articles)) + return totalfeeds diff --git a/recipes/caros_amigos.recipe b/recipes/caros_amigos.recipe index 48edceacba..7edcfb07c8 100644 --- a/recipes/caros_amigos.recipe +++ b/recipes/caros_amigos.recipe @@ -1,7 +1,5 @@ __copyright__ = '2011, Pablo Aldama ' - from calibre.web.feeds.news import BasicNewsRecipe - class AdvancedUserRecipe1311839910(BasicNewsRecipe): title = u'Caros Amigos' oldest_article = 20 @@ -9,9 +7,8 @@ class AdvancedUserRecipe1311839910(BasicNewsRecipe): language = 'pt_BR' __author__ = 'Pablo Aldama' - feeds = [(u'Caros Amigos', u'http://carosamigos.terra.com.br/index/index.php?format=feed&type=rss')] + feeds = [(u'Caros Amigos', u'http://carosamigos.terra.com.br/index2/index.php?format=feed&type=rss')] keep_only_tags = [dict(name='div', attrs={'class':['blog']}) ,dict(name='div', attrs={'class':['blogcontent']}) ] remove_tags = [dict(name='div', attrs={'class':'addtoany'})] - diff --git a/recipes/editoriali.recipe b/recipes/editoriali.recipe new file mode 100644 index 0000000000..1b0c558df4 --- /dev/null +++ b/recipes/editoriali.recipe @@ -0,0 +1,16 @@ +__version__ = 'v1.0' +__date__ = '7, April 2012' + +from calibre.web.feeds.news import BasicNewsRecipe + +class AdvancedUserRecipe1332847053(BasicNewsRecipe): + title = u'Editoriali' + __author__ = 'faber1971' + description = 'Leading articles on Italy by the best Italian editorials' + + oldest_article = 1 + max_articles_per_feed = 100 + auto_cleanup = True + conversion_options = {'linearize_tables': True} + masthead_url = 
'http://folkbulletin.folkest.com/wp-content/uploads/editoriale1.jpg' + feeds = [(u'Micromega', u'http://temi.repubblica.it/micromega-online/feed/'), (u'Corriere della Sera', u'http://xml.corriereobjects.it/rss/editoriali.xml'), (u'La Stampa', u'http://www.lastampa.it/cmstp/rubriche/oggetti/rss.asp?ID_blog=25'), (u"Italia dall'estero", u'http://italiadallestero.info/feed')] diff --git a/recipes/icons/ba_herald.png b/recipes/icons/ba_herald.png new file mode 100644 index 0000000000..2b02a4ae93 Binary files /dev/null and b/recipes/icons/ba_herald.png differ diff --git a/recipes/melbourne_herald_sun.recipe b/recipes/melbourne_herald_sun.recipe new file mode 100644 index 0000000000..c24a4563af --- /dev/null +++ b/recipes/melbourne_herald_sun.recipe @@ -0,0 +1,85 @@ +#!/usr/bin/env python +__license__ = 'GPL v3' +__copyright__ = '2009, Matthew Briggs' +__docformat__ = 'restructuredtext en' + +''' +http://www.herald sun.com.au/ +''' + +from calibre.web.feeds.news import BasicNewsRecipe + +class DailyTelegraph(BasicNewsRecipe): + title = u'Melbourne Herald Sun' + __author__ = u'Ray Hartley' + description = (u'Victorian and National News' + '. You will need to have a subscription to ' + 'http://www.heraldsun.com.au to get full articles.') + language = 'en_AU' + + oldest_article = 2 + needs_subscription = 'optional' + max_articles_per_feed = 30 + remove_javascript = True + no_stylesheets = True + encoding = 'utf8' + use_embedded_content = False + language = 'en_AU' + remove_empty_feeds = True + publication_type = 'newspaper' + masthead_url = 'http://resources2.news.com.au/cs/heraldsun/images/header-and-footer/logo.gif' + extra_css = """ + body{font-family: Arial,Helvetica,sans-serif } + img{margin-bottom: 0.4em; display:block} + .caption{display: inline; font-size: x-small} + """ + + conversion_options = { + 'comment' : description + , 'language' : language + } + + keep_only_tags = [dict(attrs={'id':'story'})] + remove_tags_before=dict(attrs={'class':'story-header'}) + remove_tags_after=dict(attrs={'class':'story-footer'}) + remove_tags = [ + dict(name=['meta','link','base','iframe','embed','object','media-metadata','media-reference','media-producer']) + ,dict(attrs={'class':['story-header-tools','story-sidebar','story-footer','story-summary-list']}) + ] + remove_attributes=['lang'] + + + feeds = [(u'Breaking News' , u'http://feeds.news.com.au/public/rss/2.0/heraldsun_breakingnews_206.xml' ) + ,(u'Business' , u'http://feeds.news.com.au/public/rss/2.0/heraldsun_business_207.xml' ) + ,(u'Entertainment' , u'http://feeds.news.com.au/public/rss/2.0/heraldsun_entertainment_208.xml' ) + ,(u'Health Science' , u'http://feeds.news.com.au/public/rss/2.0/heraldsun_health_212.xml' ) + ,(u'Music' , u'http://feeds.news.com.au/public/rss/2.0/heraldsun_music_449.xml' ) + ,(u'National News' , u'http://feeds.news.com.au/public/rss/2.0/heraldsun_national_209.xml' ) + ,(u'Sport News' , u'http://feeds.news.com.au/public/rss/2.0/heraldsun_sport_213.xml' ) + ,(u'AFL News' , u'http://feeds.news.com.au/public/rss/2.0/heraldsun_afl_205.xml' ) + ,(u'State News' , u'http://feeds.news.com.au/public/rss/2.0/heraldsun_vic_214.xml' ) + ,(u'Technology' , u'http://feeds.news.com.au/public/rss/2.0/heraldsun_tech_215.xml' ) + ,(u'World News' , u'http://feeds.news.com.au/public/rss/2.0/heraldsun_world_216.xml' ) + ,(u'Opinion', u'http://feeds.news.com.au/public/rss/2.0/heraldsun_opinion_210.xml' ) + ,(u'Andrew Bolt' , u'http://blogs.news.com.au/heraldsun/andrewbolt/index.php/xml/rss_2.0/heraldsun/hs_andrewbolt/') + ,(u'Afl - St Kilda' , 
u'http://feeds.news.com.au/public/rss/2.0/heraldsun_afl_stkilda_565.xml')
+        ,(u'Terry McCrann' ,u'http://feeds.news.com.au/public/rss/2.0/heraldsun_tmccrann_224.xml' )
+        ,(u'The Other side' ,u'http://feeds.news.com.au/public/rss/2.0/heraldsun_otherside_211.xml')]
+
+    def get_browser(self):
+        br = BasicNewsRecipe.get_browser(self)
+        if self.username and self.password:
+            br.open('http://www.heraldsun.com.au')
+            br.select_form(nr=0)
+            br['username'] = self.username
+            br['password'] = self.password
+            raw = br.submit().read()
+            if '>log out' not in raw.lower():
+                raise ValueError('Failed to log in to www.heraldsun.com.au: are your username and password correct?')
+        return br
+
+    def get_article_url(self, article):
+        return article.id
+
diff --git a/recipes/oreilly_premium.recipe b/recipes/oreilly_premium.recipe
index 9dc11059c4..17b8f241ff 100644
--- a/recipes/oreilly_premium.recipe
+++ b/recipes/oreilly_premium.recipe
@@ -1,45 +1,69 @@
-# Talking Points is not grabbing everything.
-# The look is right, but only the last one added?
-import re
 import time
+import traceback
+# above for debugging via stack traces
 from calibre.web.feeds.recipes import BasicNewsRecipe
 # Allows the Python soup converter, which makes parsing easier.
 from calibre.ebooks.BeautifulSoup import BeautifulSoup
-# strip ads and graphics
-# Current Column lacks a title.
-# Talking Points Memo - shorten title - Remove year and Bill's name
+
+import os
+
+
+from calibre.web.feeds import feeds_from_index
+from calibre.utils.threadpool import WorkRequest, ThreadPool, NoResultsPending
+
+
+# To Do: strip ads and graphics, Current Column lacks a title.
 # The News letter archive https://www.billoreilly.com/newsletterarchive is covered by other entries.
 # Newsletters: Talking Points Memos covered by cat12
+# ./ebook-convert --username xxx --password xxx
+# This is derived from BasicNewsRecipe, so it can only overload the methods defined there.
+# Some of what we need is otherwise in article, so we have more copying to do than otherwise.
 class OReillyPremium(BasicNewsRecipe):
     title = u'OReilly Premium'
     __author__ = 'TMcN'
-    language = 'en'
     description = 'Retrieves Premium and News Letter content from BillOReilly.com. Requires a Bill OReilly Premium Membership.'
cover_url = 'http://images.billoreilly.com/images/headers/billgray_header.png' + custom_title = 'Bill O\'Reilly Premium - '+ time.strftime('%d %b %Y') + title = 'Bill O\'Reilly Premium' auto_cleanup = True + conversion_options = {'linearize_tables': True} encoding = 'utf8' - needs_subscription = True + language = 'en' no_stylesheets = True - oldest_article = 20 + needs_subscription = True + oldest_article = 31 remove_javascript = True remove_tags = [dict(name='img', attrs={})] # Don't go down recursions = 0 - max_articles_per_feed = 2000 + max_articles_per_feed = 20 debugMessages = True # Name, URL, Soup FindAll Attr if relevant (last two are special case), articleList catList = [ ["TV Archives", 'https://www.billoreilly.com/show?action=tvShowArchive', 'a', {'class':['showLinks','homeLinks']}, []], - ["No Spin Archives", 'https://www.billoreilly.com/blog?categoryID=7', True, {'class':['blogBody'], 'style':['padding-top:10px;']}, []], - ["Daily Briefings", 'http://www.billoreilly.com/blog?categoryID=11', True, {'class':['defaultHeaderSmallLinks']}, []], - ["Stratfor", 'http://www.billoreilly.com/blog?categoryID=5', 'a', {'class':['blogLinks']}, []], - ["Talking Points Memo", 'https://www.billoreilly.com/blog?categoryID=12', 'td', {}, []], + # ["No Spin Archives", 'https://www.billoreilly.com/blog?categoryID=7', True, {'class':['blogBody'], 'style':['padding-top:10px;']}, []], + # ["Daily Briefings", 'http://www.billoreilly.com/blog?categoryID=11', True, {'class':['defaultHeaderSmallLinks']}, []], + # ["Stratfor", 'http://www.billoreilly.com/blog?categoryID=5', 'a', {'class':['blogLinks']}, []], + # ["Talking Points Memo", 'https://www.billoreilly.com/blog?categoryID=12', 'td', {}, []], ["Current Column", 'https://www.billoreilly.com/currentcolumn', 'span', {'class':['defaultHeader']}, []] ] + feeds = [ + (u'No Spin', u'http://www.billoreilly.com/blog?action=blogArchive&rss=true&categoryID=7'), + (u'Daily Briefing', u'http://www.billoreilly.com/blog?action=blogArchive&rss=true&categoryID=11'), + (u'Talking Points', u'https://www.billoreilly.com/blog?action=blogArchive&rss=true&categoryID=12'), + (u'Blog', u'http://www.billoreilly.com/blog?action=blogArchive&rss=true&categoryID=0'), + (u'StratFor', u'http://www.billoreilly.com/blog?action=blogArchive&rss=true&categoryID=5') + ] + # http://www.billoreilly.com/blog?action=blogArchive&rss=true&categoryID=8 is word for the day. + + # Note: Talking Points is broken in the above model; the site changed to more Ajax-y. + # Now using RSS + def get_browser(self): + print("In get_browser") br = BasicNewsRecipe.get_browser() if self.username is not None and self.password is not None: br.open('https://www.billoreilly.com/pg/jsp/member/membersignin.jsp') @@ -66,6 +90,7 @@ class OReillyPremium(BasicNewsRecipe): def stripBadChars(self, inString) : return inString.replace("\'", "") + def parseGeneric(self, baseURL): # Does a generic parsing of the articles. There are six categories (0-5) # Name, URL, Soup FindAll Attr if relevant (last two are special case), articleList @@ -73,6 +98,7 @@ class OReillyPremium(BasicNewsRecipe): fullReturn = [] for i in range(len(self.catList)) : articleList = [] + print("In "+self.catList[i][0]+", index: "+ str(i)) soup = self.index_to_soup(self.catList[i][1]) # Set defaults description = 'None' @@ -81,14 +107,12 @@ class OReillyPremium(BasicNewsRecipe): # 3-5 create one. 
# So no for-div for 3-5 - if i < 3 : + if i == 0 : + print("Starting TV Archives") for div in soup.findAll(self.catList[i][2], self.catList[i][3]): + print("Next DIV:") print(div) - if i == 1: - a = div.find('a', href=True) - else : - a = div - print(a) + a = div summary = div.find(True, attrs={'class':'summary'}) if summary: description = self.tag_to_string(summary, use_alt=False) @@ -96,82 +120,63 @@ class OReillyPremium(BasicNewsRecipe): continue # url = baseURL+re.sub(r'\?.*', '', a['href']) url = baseURL+a['href'] - if i < 2 : - url = self.extractPrintURL(baseURL, url, "Print this entry") - title = self.tag_to_string(a, use_alt=True).strip() - elif i == 2 : - # Daily Briefs - url = self.extractPrintURL(baseURL, url, "Print this entry") - title = div.contents[0] - if self.debugMessages : - print(title+" @ "+url) + url = self.extractPrintURL(baseURL, url, "Print this entry") + title = self.tag_to_string(a, use_alt=True).strip() articleList.append(dict(title=title, url=url, date=pubdate, description=description, content='')) - elif i == 3 : # Stratfor - a = soup.find('a', self.catList[i][3]) - if a is None : - continue - url = baseURL+a['href'] - title = self.tag_to_string(a, use_alt=True).strip() - # Get Stratfor contents so we can get the real title. - stratSoup = self.index_to_soup(url) - title = stratSoup.html.head.title.string - stratIndex = title.find('Stratfor.com:', 0) - if (stratIndex > -1) : - title = title[stratIndex+14:-1] - # Look for first blogBody 2K, it is used as the article. + + # calibre.web.feeds.news.BasicNewsRecipe.parse_index() fetches the list of articles. # returns a list of tuple ('feed title', list of articles) # { @@ -182,12 +187,19 @@ class OReillyPremium(BasicNewsRecipe): # 'content' : The full article (can be an empty string). This is used by FullContentProfile # } # this is used instead of BasicNewsRecipe.parse_feeds(). + # it is called by download def parse_index(self): # Parse the page into Python Soup + print("Entering recipe print_index from:") + traceback.print_stack() + print("web") baseURL = "https://www.billoreilly.com" - return self.parseGeneric(baseURL) + masterList = self.parseGeneric(baseURL) + #print(masterList) + return masterList def preprocess_html(self, soup): + print("In preprocess_html") refresh = soup.find('meta', {'http-equiv':'refresh'}) if refresh is None: return soup @@ -195,3 +207,128 @@ class OReillyPremium(BasicNewsRecipe): raw = self.browser.open('https://www.billoreilly.com'+content).read() return BeautifulSoup(raw.decode('cp1252', 'replace')) + def build_index(self): + print("In OReilly build_index()\n\n") + feedsRSS = [] + self.report_progress(0, ('Fetching feeds...')) + #try: + feeds = feeds_from_index(self.parse_index(), oldest_article=self.oldest_article, + max_articles_per_feed=self.max_articles_per_feed, + log=self.log) + self.report_progress(0, ('Got feeds from index page')) + #except NotImplementedError: + # feeds = self.parse_feeds() + # Now add regular feeds. 
+ feedsRSS = self.parse_feeds() + print ("feedsRSS is type "+feedsRSS.__class__.__name__) + + for articles in feedsRSS: + print("articles is type "+articles.__class__.__name__) + print("Title:" + articles.title) + feeds.append(articles) + if not feeds: + raise ValueError('No articles found, aborting') + + #feeds = FeedCollection(feeds) + + self.report_progress(0, ('Trying to download cover...')) + self.download_cover() + self.report_progress(0, ('Generating masthead...')) + self.masthead_path = None + + try: + murl = self.get_masthead_url() + except: + self.log.exception('Failed to get masthead url') + murl = None + + if murl is not None: + # Try downloading the user-supplied masthead_url + # Failure sets self.masthead_path to None + self.download_masthead(murl) + if self.masthead_path is None: + self.log.info("Synthesizing mastheadImage") + self.masthead_path = os.path.join(self.output_dir, 'mastheadImage.jpg') + try: + self.default_masthead_image(self.masthead_path) + except: + self.log.exception('Failed to generate default masthead image') + self.masthead_path = None + + if self.test: + feeds = feeds[:2] + self.has_single_feed = len(feeds) == 1 + + index = os.path.join(self.output_dir, 'index.html') + + html = self.feeds2index(feeds) + with open(index, 'wb') as fi: + fi.write(html) + + self.jobs = [] + + if self.reverse_article_order: + for feed in feeds: + if hasattr(feed, 'reverse'): + feed.reverse() + + self.feed_objects = feeds + for f, feed in enumerate(feeds): + feed_dir = os.path.join(self.output_dir, 'feed_%d'%f) + if not os.path.isdir(feed_dir): + os.makedirs(feed_dir) + + for a, article in enumerate(feed): + if a >= self.max_articles_per_feed: + break + art_dir = os.path.join(feed_dir, 'article_%d'%a) + if not os.path.isdir(art_dir): + os.makedirs(art_dir) + try: + url = self.print_version(article.url) + except NotImplementedError: + url = article.url + except: + self.log.exception('Failed to find print version for: '+article.url) + url = None + if not url: + continue + func, arg = (self.fetch_embedded_article, article) \ + if self.use_embedded_content or (self.use_embedded_content == None and feed.has_embedded_content()) \ + else \ + ((self.fetch_obfuscated_article if self.articles_are_obfuscated \ + else self.fetch_article), url) + req = WorkRequest(func, (arg, art_dir, f, a, len(feed)), + {}, (f, a), self.article_downloaded, + self.error_in_article_download) + req.feed = feed + req.article = article + req.feed_dir = feed_dir + self.jobs.append(req) + + + self.jobs_done = 0 + tp = ThreadPool(self.simultaneous_downloads) + for req in self.jobs: + tp.putRequest(req, block=True, timeout=0) + + + self.report_progress(0, ('Starting download [%d thread(s)]...')%self.simultaneous_downloads) + while True: + try: + tp.poll() + time.sleep(0.1) + except NoResultsPending: + break + for f, feed in enumerate(feeds): + print("Writing feeds for "+feed.title) + html = self.feed2index(f,feeds) + feed_dir = os.path.join(self.output_dir, 'feed_%d'%f) + with open(os.path.join(feed_dir, 'index.html'), 'wb') as fi: + fi.write(html) + self.create_opf(feeds) + self.report_progress(1, ('Feeds downloaded to %s')%index) + + return index + + diff --git a/recipes/real_clear.recipe b/recipes/real_clear.recipe index 19add74fcd..cbf5a2f8e4 100644 --- a/recipes/real_clear.recipe +++ b/recipes/real_clear.recipe @@ -1,5 +1,7 @@ # Test with "\Program Files\Calibre2\ebook-convert.exe" RealClear.recipe .epub --test -vv --debug-pipeline debug +import re import time +from urlparse import urlparse from 
calibre.web.feeds.recipes import BasicNewsRecipe from calibre.ebooks.BeautifulSoup import NavigableString @@ -20,12 +22,13 @@ class RealClear(BasicNewsRecipe): # Don't go down recursions = 0 max_articles_per_feed = 400 - debugMessages = False + debugMessages = True # Numeric parameter is type, controls whether we look for feedsets = [ - ["Politics", "http://www.realclearpolitics.com/index.xml", 0], - ["Science", "http://www.realclearscience.com/index.xml", 0], + ["Politics", "http://www.realclearpolitics.com/index.xml", 0], + ["Policy", "http://www.realclearpolicy.com/index.xml", 0], + ["Science", "http://www.realclearscience.com/index.xml", 0], ["Tech", "http://www.realcleartechnology.com/index.xml", 0], # The feedburner is essentially the same as the top feed, politics. # ["Politics Burner", "http://feeds.feedburner.com/realclearpolitics/qlMj", 1], @@ -37,7 +40,9 @@ class RealClear(BasicNewsRecipe): ] # Hints to extractPrintURL. # First column is the URL snippet. Then the string to search for as text, and the attributes to look for above it. Start with attributes and drill down. - printhints = [ + phUrlSnip, phLinkText, phMainSearch, phHrefSearch = range(4) + + printhints = [ ["realclear", "", '' , 'printpage'], ["billoreilly.com", "Print this entry", 'a', ''], ["billoreilly.com", "Print This Article", 'a', ''], ["politico.com", "Print", 'a', 'share-print'], @@ -48,11 +53,24 @@ class RealClear(BasicNewsRecipe): # usatoday - just prints with all current crap anyhow ] + # RCP - look for a strange compound. See http://www.realclearpolitics.com/articles/2012/01/24/in_speech_obama_to_call_for_fairness_--_and_four_more_years_112879.html + # The print link isn't obvious, and only the end is needed (the -full append.) SO maybe try that first?s + # http://www.realclearpolitics.com/printpage/?url=http://www.realclearpolitics.com/articles/2012/01/24/in_speech_obama_to_call_for_fairness_--_and_four_more_years_112879-full.html + # Single page articles don't have a _full; e.g. http://www.realclearpolitics.com/articles/2012/01/25/obamas_green_robber_barons_112897.html + # Use the FULL PRINTPAGE URL; it formats it better too! + # + # NYT - try single page... + # Need special code - is it one page or several? Which URL? + # from http://www.nytimes.com/2012/01/22/business/apple-america-and-a-squeezed-middle-class.html?_r=1 + # to http://www.nytimes.com/2012/01/22/business/apple-america-and-a-squeezed-middle-class.html?_r=1&pagewanted=all + # which is at link rel="canonical" and at 0 and len(self.printhints[x][1]) == 0: + if len(self.printhints[x][self.phHrefSearch])>0 and len(self.printhints[x][self.phLinkText]) == 0: + # e.g. 
RealClear if self.debugMessages == True : - print("search1") + print("Search by href: "+self.printhints[x][self.phHrefSearch]) + printFind = soup.find(href=re.compile(self.printhints[x][self.phHrefSearch])) + elif len(self.printhints[x][3])>0 and len(self.printhints[x][1]) == 0: + if self.debugMessages == True : + print("Search 1: "+self.printhints[x][2]+" Attributes: ") + print(self.printhints[x][3]) printFind = soup.find(self.printhints[x][2], attrs=self.printhints[x][3]) elif len(self.printhints[x][3])>0 : if self.debugMessages == True : print("search2") printFind = soup.find(self.printhints[x][2], attrs=self.printhints[x][3], text=self.printhints[x][1]) else : + if self.debugMessages == True: + print("Default Search: "+self.printhints[x][2]+" Text: "+self.printhints[x][1]) printFind = soup.find(self.printhints[x][2], text=self.printhints[x][1]) if printFind is None: if self.debugMessages == True : print("Not Found") + # print(soup) + print("end soup\n\n"); continue + print(printFind) if isinstance(printFind, NavigableString)==False: if printFind['href'] is not None: + print("Check "+printFind['href']+" for base of "+baseURL) + if printFind['href'].find("http")!=0 : + return baseURL+printFind['href'] return printFind['href'] tag = printFind.parent print(tag) @@ -158,6 +190,7 @@ class RealClear(BasicNewsRecipe): def parse_index(self): # Parse the page into Python Soup + #articleList = [] ans = [] feedsCount = len(self.feedsets) for x in range(0,feedsCount): # should be ,4 @@ -168,3 +201,4 @@ class RealClear(BasicNewsRecipe): print(ans) return ans + diff --git a/recipes/soldiers.recipe b/recipes/soldiers.recipe index fb96e5a2ed..a1e9e5ca23 100644 --- a/recipes/soldiers.recipe +++ b/recipes/soldiers.recipe @@ -15,6 +15,8 @@ class Soldiers(BasicNewsRecipe): max_articles_per_feed = 100 no_stylesheets = True use_embedded_content = False + auto_cleanup = True + auto_cleanup_keep = '//div[@id="mediaWrapper"]' simultaneous_downloads = 1 delay = 4 max_connections = 1 @@ -31,14 +33,14 @@ class Soldiers(BasicNewsRecipe): , 'language' : language } - keep_only_tags = [dict(name='div', attrs={'id':['storyHeader','textArea']})] + #keep_only_tags = [dict(name='div', attrs={'id':['storyHeader','textArea']})] - remove_tags = [ - dict(name='div', attrs={'id':['addThis','comment','articleFooter']}) - ,dict(name=['object','link']) - ] + #remove_tags = [ + #dict(name='div', attrs={'id':['addThis','comment','articleFooter']}) + #,dict(name=['object','link']) + #] - feeds = [(u'Frontpage', u'http://www.army.mil/rss/feeds/soldiersfrontpage.xml' )] + feeds = [(u'Frontpage', u'http://www.army.mil/rss/2/' )] def get_cover_url(self): diff --git a/recipes/southernstar.recipe b/recipes/southernstar.recipe new file mode 100644 index 0000000000..69a81e2fb6 --- /dev/null +++ b/recipes/southernstar.recipe @@ -0,0 +1,136 @@ +#!/usr/bin/env python + +__license__ = 'GPL v3' +__copyright__ = '2012, watou' +''' +southernstar.ie +''' +import re +import tempfile +import os +import codecs + +from calibre.web.feeds.news import BasicNewsRecipe +from calibre.ebooks.BeautifulSoup import Tag, NavigableString + +class TheSouthernStar(BasicNewsRecipe): + + title = 'The Southern Star' + __author__ = 'watou' + description = 'West Cork\'s leading news and information provider since 1889' + NEWS_INDEX = 'http://www.southernstar.ie/news.php' + LOCAL_NOTES = 'http://www.southernstar.ie/localnotes.php' + SPORT_INDEX = 'http://www.southernstar.ie/sport.php' + CLASSIFIEDS = 'http://www.southernstar.ie/classifieds.php' + language = 'en_IE' + 
encoding = 'cp1252'
+
+    publication_type = 'newspaper'
+    masthead_url = 'http://www.southernstar.ie/images/logo.gif'
+    remove_tags_before = dict(name='div', attrs={'class':'article'})
+    remove_tags_after = dict(name='div', attrs={'class':'article'})
+    remove_tags = [dict(name='div', attrs={'style':'width:300px; position:relative'}),
+                   dict(name='form'),
+                   dict(name='div', attrs={'class':'endpanel'})]
+    no_stylesheets = True
+    tempfiles = []
+    pubdate = ''
+
+    preprocess_regexps = [(re.compile(r'<!--.*?-->', re.DOTALL), lambda m: '')]
+
+    def parse_index(self):
+        feeds = []
+        seen_titles = set([])
+
+        articles = self.fetch_ss_articles(self.NEWS_INDEX, seen_titles)
+        if articles:
+            feeds.append(('News', articles))
+
+        articles = self.fetch_ss_notes(self.LOCAL_NOTES)
+        if articles:
+            feeds.append(('Local Notes', articles))
+
+        articles = self.fetch_ss_articles(self.SPORT_INDEX, seen_titles)
+        if articles:
+            feeds.append(('Sport', articles))
+
+        articles = self.fetch_ss_notes(self.CLASSIFIEDS)
+        if articles:
+            feeds.append(('Classifieds', articles))
+
+        return feeds
+
+    def fetch_ss_articles(self, index, seen_titles):
+        articles = []
+        soup = self.index_to_soup(index)
+        ts = soup.find('div', {'class':'article'})
+        ds = self.tag_to_string(ts.find('strong'))
+        self.pubdate = ' ['+ds+']'
+        self.timefmt = ' [%s]'%ds
+
+        for post in ts.findAll('h1'):
+            a = post.find('a', href=True)
+            title = self.tag_to_string(a)
+            if title in seen_titles:
+                continue
+            seen_titles.add(title)
+            url = a['href']
+            if url.startswith('article'):
+                url = 'http://www.southernstar.ie/'+url
+            self.log('\tFound article:', title, 'at', url)
+            p = post.findNextSibling('p')
+            desc = None
+            if p is not None:
+                desc = str(p)
+            articles.append({'title':title, 'url':url, 'description':desc,
+                'date':self.pubdate})
+
+        return articles
+
+    def fetch_ss_notes(self, page):
+        articles = []
+
+        soup = self.index_to_soup(page)
+        ts = soup.find('div', {'class':'content'})
+        for post in ts.findAll('h1'):
+            title = self.tag_to_string(post)
+            self.log('\tFound note:', title)
+            f = tempfile.NamedTemporaryFile(suffix='.html',delete=False)
+            f.close()
+            f = codecs.open(f.name, 'w+b', self.encoding, 'replace')
+            url = "file://" + f.name
+            f.write(u'<html><head><title>'+title+'</title></head><body>')
+            f.write(str(post.findNextSibling('p')))
+            f.write(u'</body></html>')
+            self.log('\tWrote note to', f.name)
+            f.close()
+            self.tempfiles.append(f)
+            articles.append({'title':title, 'url':url, 'date':self.pubdate})
+
+        return articles
+
+    def postprocess_html(self, soup, first):
+        for table in soup.findAll('table', align='right'):
+            img = table.find('img')
+            if img is not None:
+                img.extract()
+                caption = self.tag_to_string(table).strip()
+                div = Tag(soup, 'div')
+                div['style'] = 'text-align:center'
+                div.insert(0, img)
+                div.insert(1, Tag(soup, 'br'))
+                if caption:
+                    div.insert(2, NavigableString(caption))
+                table.replaceWith(div)
+
+        return soup
+
+    def image_url_processor(self, baseurl, url):
+        return url.replace(' ','%20')
+
+    def cleanup(self):
+        self.log('cleaning up')
+        for f in self.tempfiles:
+            os.unlink(f.name)
+        self.tempfiles = []
diff --git a/recipes/the_sun.recipe b/recipes/the_sun.recipe
index 80b37f329a..9285c0b2c2 100644
--- a/recipes/the_sun.recipe
+++ b/recipes/the_sun.recipe
@@ -1,57 +1,60 @@
 import re
 from calibre.web.feeds.recipes import BasicNewsRecipe
-
 class AdvancedUserRecipe1325006965(BasicNewsRecipe):
     title = u'The Sun UK'
     cover_url = 'http://www.thesun.co.uk/img/global/new-masthead-logo.png'
-    description = 'A Recipe for The Sun tabloid UK - uses feed43'
+    description = 'A Recipe for The Sun tabloid UK'
     __author__ = 'Dave Asbury'
-    # last updated 20/2/12
+    # last updated 7/4/12
     language = 'en_GB'
     oldest_article = 1
     max_articles_per_feed = 15
     remove_empty_feeds = True
     no_stylesheets = True
+    #auto_cleanup = True
+    #articles_are_obfuscated = True
     masthead_url = 'http://www.thesun.co.uk/sol/img/global/Sun-logo.gif'
-    encoding = 'cp1251'
+    encoding = 'UTF-8'
-    encoding = 'cp1252'
     remove_empty_feeds = True
     remove_javascript = True
     no_stylesheets = True
     extra_css = '''
-    body{ text-align: justify; font-family:Arial,Helvetica,sans-serif; font-size:11px; font-size-adjust:none; font-stretch:normal; font-style:normal; font-variant:normal; font-weight:normal;}
-    '''
+    body{ text-align: justify; font-family:Arial,Helvetica,sans-serif; font-size:11px; font-size-adjust:none; font-stretch:normal; font-style:normal; font-variant:normal; font-weight:normal;}
+    '''
     preprocess_regexps = [
-        (re.compile(r'