From 8934634e22aa3eb0d69ed73e488faff410a3fcea Mon Sep 17 00:00:00 2001 From: Kovid Goyal Date: Fri, 6 Jul 2012 00:34:39 +0530 Subject: [PATCH] Updated Richmod Times-Dispatch --- recipes/heritage_foundation.recipe | 3 +- recipes/richmond_times_dispatch.recipe | 149 ++++++++++++++++--------- 2 files changed, 96 insertions(+), 56 deletions(-) diff --git a/recipes/heritage_foundation.recipe b/recipes/heritage_foundation.recipe index 80589d3d14..17b70dff14 100644 --- a/recipes/heritage_foundation.recipe +++ b/recipes/heritage_foundation.recipe @@ -1,7 +1,6 @@ from calibre.web.feeds.news import BasicNewsRecipe class HeritageFoundation(BasicNewsRecipe): title = u'The Heritage Foundation' - custom_title = "The Heritage Foundation" description = 'Founded in 1973, The Heritage Foundation is a research and educational institution—a think tank—\ whose mission is to formulate and promote conservative public policies based on the principles of free enterprise, limited government, \ individual freedom, traditional American values, and a strong national defense.' @@ -25,7 +24,7 @@ individual freedom, traditional American values, and a strong national defense.' 
remove_empty_feeds = True auto_cleanup = True - conversion_options = { 'title' : custom_title, + conversion_options = { 'comments' : description, 'tags' : tags, 'language' : language, diff --git a/recipes/richmond_times_dispatch.recipe b/recipes/richmond_times_dispatch.recipe index 163a6317ff..741bacbda3 100644 --- a/recipes/richmond_times_dispatch.recipe +++ b/recipes/richmond_times_dispatch.recipe @@ -1,59 +1,100 @@ +import re from calibre.web.feeds.recipes import BasicNewsRecipe -class AdvancedUserRecipe1335532466(BasicNewsRecipe): - title = u'Richmond Times-Dispatch' - description = 'News from Richmond, Virginia, USA' - __author__ = 'jde' - cover_url = 'http://static2.dukecms.com/va_tn/timesdispatch_com/site-media/img/icons/logo252x97.png' - language = 'en' - encoding = 'utf8' - oldest_article = 1 #days - max_articles_per_feed = 25 - needs_subscription = False - remove_javascript = True - recursions = 0 - use_embedded_content = False - no_stylesheets = True - auto_cleanup = True +class RichmondTimesDispatch(BasicNewsRecipe): + title = u'Richmond Times-Dispatch' + description = "The Richmond Times-Dispatch is the primary daily newspaper in Richmond, \ + the capital of Virginia, United States, as well as the Virginia cities of Petersburg, \ + Chester. Hopewell, Colonial Heights, Charlottesville, Lynchburg, Waynesboro, \ + and is also a default paper for rural regions of the state. \ + The RTD has published in some form for more than 150 years." 
+ __author__ = '_reader' + __date__ = '05 July 2012' + __version__ = '1.4' + cover_url = 'http://static2.dukecms.com/va_tn/timesdispatch_com/site-media/img/icons/logo252x97.png' + masthead_url = 'http://static2.dukecms.com/va_tn/timesdispatch_com/site-media/img/icons/logo252x97.png' + language = 'en' + oldest_article = 1.5 #days + max_articles_per_feed = 100 + needs_subscription = False + publisher = 'timesdispatch.com' + category = 'news, commentary' + tags = 'news' + publication_type = 'newspaper' + no_stylesheets = True + use_embedded_content= False + encoding = None + simultaneous_downloads = 20 + recursions = 0 + remove_javascript = True + remove_empty_feeds = True + auto_cleanup = False + + conversion_options = { + 'comments' : description, + 'tags' : tags, + 'language' : language, + 'publisher' : publisher, + 'authors' : publisher, + 'smarten_punctuation' : True + } + + remove_tags_before = dict(id='hnews hentry item') + + remove_tags_after = dict(name='hr') + + remove_tags = [ + dict(name='div', attrs={'id':['mg_hd', 'mg_ft', 'sr_b', 'comments_left', 'comments_right']}) + ,dict(name='div', attrs={'class':['bottom_social','article_bottom']}) + ,dict(name='table', attrs={'class':['ap-mediabox-table', 'ap-htmltable-table', 'ap-photogallery-table', 'ap-htmlfragment-table']}) + ] + + + preprocess_regexps = [ + (re.compile(r'', re.DOTALL|re.IGNORECASE), lambda match: ''), + (re.compile(r'

\s*http://www2.timesdispatch.*?

', re.DOTALL|re.IGNORECASE), lambda match: ''), + (re.compile(r'

\s*', re.DOTALL|re.IGNORECASE), lambda match: ''), + (re.compile(r'', re.DOTALL|re.IGNORECASE), lambda match: ''), #strip


line break + (re.compile(r'.', re.DOTALL|re.IGNORECASE), lambda match: ''), #strip
line break + (re.compile(r'\s*Richmond Times-Dispatch.*?', re.DOTALL|re.IGNORECASE), lambda match: ''), #strip
line break + ] + feeds = [ + ('News', 'http://www2.timesdispatch.com/list/feed/rss/news-archive'), + ('Breaking News', 'http://www2.timesdispatch.com/list/feed/rss/breaking-news'), + ('National News', 'http://www2.timesdispatch.com/list/feed/rss/national-news'), + ('Local News', 'http://www2.timesdispatch.com/list/feed/rss/local-news'), + ('Business', 'http://www2.timesdispatch.com/list/feed/rss/business'), + ('Local Business', 'http://www2.timesdispatch.com/list/feed/rss/local-business'), + ('Politics', 'http://www2.timesdispatch.com/list/feed/rss/politics'), + ('Virginia Politics', 'http://www2.timesdispatch.com/list/feed/rss/virginia-politics'), + ('Sports', 'http://www2.timesdispatch.com/list/feed/rss/sports2'), + ('Health', 'http://www2.timesdispatch.com/feed/rss/lifestyles/health_med_fit/'), + ('Entertainment/Life', 'http://www2.timesdispatch.com/list/feed/rss/entertainment'), + ('Arts/Theatre', 'http://www2.timesdispatch.com/feed/rss/entertainment/arts_theatre/'), + ('Movies', 'http://www2.timesdispatch.com/list/feed/rss/movies'), + ('Music', 'http://www2.timesdispatch.com/list/feed/rss/music'), + ('Dining & Food', 'http://www2.timesdispatch.com/list/feed/rss/dining'), + ('Home & Garden', 'http://www2.timesdispatch.com/list/feed/rss/home-and-garden/'), + #inactive('Travel', 'http://www2.timesdispatch.com/feed/rss/travel/'), + ('Opinion', 'http://www2.timesdispatch.com/feed/rss/news/opinion/'), + ('Editorials', 'http://www2.timesdispatch.com/list/feed/rss/editorial-desk'), + ('Columnists and Blogs', 'http://www2.timesdispatch.com/list/feed/rss/news-columnists-blogs'), + ('Opinion Columnists', 'http://www2.timesdispatch.com/list/feed/rss/opinion-editorial-columnists'), + ('Letters to the Editor', 'http://www2.timesdispatch.com/list/feed/rss/opinion-letters'), + ('Traffic', 'http://www2.timesdispatch.com/list/feed/rss/traffic'), + ] -('News', -'http://www2.timesdispatch.com/list/feed/rss/news-archive'), -('Breaking News', 
-'http://www2.timesdispatch.com/list/feed/rss/breaking-news'), -('National News', -'http://www2.timesdispatch.com/list/feed/rss/national-news'), -('Local News', -'http://www2.timesdispatch.com/list/feed/rss/local-news'), -('Business', -'http://www2.timesdispatch.com/list/feed/rss/business'), -('Local Business', -'http://www2.timesdispatch.com/list/feed/rss/local-business'), -('Politics', -'http://www2.timesdispatch.com/list/feed/rss/politics'), -('Virginia Politics', -'http://www2.timesdispatch.com/list/feed/rss/virginia-politics'), -('Editorials', -'http://www2.timesdispatch.com/list/feed/rss/editorial-desk'), -('Columnists and Blogs', -'http://www2.timesdispatch.com/list/feed/rss/news-columnists-blogs'), -('Opinion Columnists', -'http://www2.timesdispatch.com/list/feed/rss/opinion-editorial-columnists'), -('Letters to the Editor', -'http://www2.timesdispatch.com/list/feed/rss/opinion-letters'), -('Traffic', -'http://www2.timesdispatch.com/list/feed/rss/traffic'), -('Sports', -'http://www2.timesdispatch.com/list/feed/rss/sports2'), -('Entertainment/Life', -'http://www2.timesdispatch.com/list/feed/rss/entertainment'), -('Movies', -'http://www2.timesdispatch.com/list/feed/rss/movies'), -('Music', -'http://www2.timesdispatch.com/list/feed/rss/music'), -('Dining & Food', -'http://www2.timesdispatch.com/list/feed/rss/dining'), - - ] -
+    def print_version(self, url):
+        """Return the print-page URL for ``url``, or ``url`` itself if it has no id."""
+        # A URL ending in "-<4 to 10 digits>/" carries the article number that the
+        # print endpoint expects; any other link (AP articles, per the original
+        # note) has no print URL. Branch on the match itself so that a link
+        # without an id is never pasted into the print endpoint as a bogus id.
+        found = re.match(r'.*-([0-9]{4,10})/$', url)
+        if found is None:
+            return url
+        return ('http://www2.timesdispatch.com/member-center/share-this/print/'
+                '?content=ar' + found.group(1))