From 767657c4602b9c7de29dbcbda966cb9cc457e29d Mon Sep 17 00:00:00 2001 From: Kovid Goyal Date: Sun, 18 Oct 2009 12:24:21 -0600 Subject: [PATCH] Updated recipe for the Time Magazine --- resources/recipes/time_magazine.recipe | 59 +++++++++++++------------- 1 file changed, 30 insertions(+), 29 deletions(-) diff --git a/resources/recipes/time_magazine.recipe b/resources/recipes/time_magazine.recipe index 9224d95962..c6aeb59a45 100644 --- a/resources/recipes/time_magazine.recipe +++ b/resources/recipes/time_magazine.recipe @@ -17,18 +17,36 @@ class Time(BasicNewsRecipe): no_stylesheets = True language = 'en' - extra_css = '''.headline {font-size: large;} - .fact { padding-top: 10pt } - h1 {font-family:Arial,Sans-serif} - .byline{font-family:Arial,Sans-serif; font-size:xx-small ;color:blue} - .timestamp{font-family:Arial,Sans-serif; font-size:x-small ;color:gray}''' - remove_tags_before = dict(id="artHd") - remove_tags_after = {'class':"ltCol"} - remove_tags = [ - {'class':['articleTools', 'enlarge', 'search','socialtools','blogtools','moretools','page','nextUp','next','subnav','RSS','line2','first','ybuzz','articlePagination','chiclets','imgcont','createListLink','rlinks','tabsWrap','pagination']}, - {'id':['quigoArticle', 'contentTools', 'articleSideBar', 'header', 'navTop','articleTools','feedmodule','feedmodule3','promos','footer','linksFooter','timeArchive','belt','relatedStories','packages','Features']}, - {'target':'_blank'}, - ] + extra_css = ''' h1 {font-family:Arial,Sans-serif;} + h2 {font-family:Arial,Sans-serif;} + .name{font-family:Arial,Sans-serif; font-size:x-small; } + .date{font-family:Arial,Sans-serif; font-size:x-small ;color:#999999;} + .byline{font-family:Arial,Sans-serif; font-size:x-small ;} + .photoBkt{ font-size:x-small ;} + .vertPhoto{font-size:x-small ;} + .credits{font-family:Arial,Sans-serif; font-size:x-small ;color:gray;} + .artTxt{font-family:georgia,serif;} + #article{font-family:georgia,serif;} + .caption{font-family:georgia,serif; font-size:x-small;color:#333333;} + .credit{font-family:georgia,serif; font-size:x-small;color:#999999;} + a:link{color:#CC0000;} + ''' + + # remove_tags_before = dict(id="artHd") + # remove_tags_after = {'class':"ltCol"} + # remove_tags = [ + # {'class':['articleTools', 'enlarge', 'search','socialtools','blogtools','moretools','page','nextUp','next','subnav','RSS','line2','first','ybuzz','articlePagination','chiclets','imgcont','createListLink','rlinks','tabsWrap','pagination']}, + # {'id':['quigoArticle', 'contentTools', 'articleSideBar', 'header', 'navTop','articleTools','feedmodule','feedmodule3','promos','footer','linksFooter','timeArchive','belt','relatedStories','packages','Features']}, + # {'target':'_blank'}, + # ] + + keep_only_tags = [ dict(name ="div",attrs = {"id" :["article",]}) , + dict(name ="div",attrs = {"class" :["artHd","artTxt","photoBkt","vertPhoto","image","copy"]}) ,] + remove_tags = [ dict(name ="div",attrs = {'class':['articlePagination','nextUp',"rtCol","pagination","enlarge",]}), + dict(name ="span",attrs = {'class':['see']}), + dict(name ="div",attrs = {'id':['articleSideBar',"articleTools","articleFooter","cmBotLt","quigoPackage"]}), + dict(name ="a",attrs = {'class':['listLink']}), + ] recursions = 1 match_regexps = [r'/[0-9,]+-(2|3|4|5|6|7|8|9)(,\d+){0,1}.html'] @@ -81,20 +99,3 @@ class Time(BasicNewsRecipe): else: ans.append(unicode(t)) return u' '.join(ans).replace(u'\xa0', u'').strip() - - def postprocess_html(self, soup, first_page): - div = soup.find(attrs={'class':'artPag'}) - if div is not None: - div.extract() - if not first_page: - for cls in ('photoBkt', 'artHd'): - div = soup.find(attrs={'class':cls}) - if div is not None: - div.extract() - div = soup.find(attrs={'class':'artTxt'}) - if div is not None: - p = div.find('p') - if p is not None: - p.extract() - - return soup