diff --git a/resources/recipes/guardian.recipe b/resources/recipes/guardian.recipe index 8055be0474..1aee9bdf07 100644 --- a/resources/recipes/guardian.recipe +++ b/resources/recipes/guardian.recipe @@ -16,7 +16,7 @@ class Guardian(BasicNewsRecipe): language = 'en_GB' oldest_article = 7 - max_articles_per_feed = 25 + max_articles_per_feed = 50 remove_javascript = True timefmt = ' [%a, %d %b %Y]' diff --git a/resources/recipes/time_magazine.recipe b/resources/recipes/time_magazine.recipe index c8b5596bbb..cfbdbce796 100644 --- a/resources/recipes/time_magazine.recipe +++ b/resources/recipes/time_magazine.recipe @@ -18,32 +18,37 @@ class Time(BasicNewsRecipe): language = 'en' remove_javascript = True - extra_css = ''' h1 {font-family:Arial,Sans-serif;} + extra_css = ''' h1 {font-family:georgia,serif;color:#000000;} + .mainHd{font-family:georgia,serif;color:#000000;} h2 {font-family:Arial,Sans-serif;} - .name{font-family:Arial,Sans-serif; font-size:x-small; } + .name{font-family:Arial,Sans-serif; font-size:x-small;font-weight:bold; } .date{font-family:Arial,Sans-serif; font-size:x-small ;color:#999999;} .byline{font-family:Arial,Sans-serif; font-size:x-small ;} .photoBkt{ font-size:x-small ;} .vertPhoto{font-size:x-small ;} .credits{font-family:Arial,Sans-serif; font-size:x-small ;color:gray;} + .credit{font-family:Arial,Sans-serif; font-size:x-small ;color:gray;} .artTxt{font-family:georgia,serif;} - #article{font-family:georgia,serif;} + #content{font-family:georgia,serif;} .caption{font-family:georgia,serif; font-size:x-small;color:#333333;} .credit{font-family:georgia,serif; font-size:x-small;color:#999999;} a:link{color:#CC0000;} + .breadcrumb{font-family:Arial,Sans-serif;font-size:x-small;} ''' - - keep_only_tags = [ dict(name ="div",attrs = {"id" :["article",]}) , + keep_only_tags = [ dict(name ="div",attrs = {"id" :["content"]}) , dict(name ="div",attrs = {"class" :["artHd","artTxt","photoBkt","vertPhoto","image","copy"]}) ,] - remove_tags = [ dict(name ="div",attrs = {'class':['articlePagination','nextUp',"rtCol","pagination","enlarge",]}), + remove_tags = [ dict(name ="div",attrs = {'class':['articleFooterNav','listsByTopic','articleTools2','relatedContent','sideContent','topBannerWrap','articlePagination','nextUp',"rtCol","pagination","enlarge","contentTools2",]}), dict(name ="span",attrs = {'class':['see']}), - dict(name ="div",attrs = {'id':['articleSideBar',"articleTools","articleFooter","cmBotLt","quigoPackage"]}), + dict(name ="div",attrs = {'id':['header','articleSideBar',"articleTools","articleFooter","cmBotLt","quigoPackage"]}), dict(name ="a",attrs = {'class':['listLink']}), + dict(name ="ul",attrs = {'id':['shareSocial','tabs']}), + dict(name ="li",attrs = {'class':['back']}), + dict(name ="ul",attrs = {'class':['navCount']}), ] - recursions = 1 - match_regexps = [r'/[0-9,]+-(2|3|4|5|6|7|8|9)(,\d+){0,1}.html'] + recursions = 10 + match_regexps = [r'/[0-9,]+-(2|3|4|5|6|7|8|9)(,\d+){0,1}.html',r'http://www.time.com/time/specials/packages/article/.*'] preprocess_regexps = [(re.compile( r''), lambda m:'')] @@ -101,3 +106,8 @@ class Time(BasicNewsRecipe): else: ans.append(unicode(t)) return u' '.join(ans).replace(u'\xa0', u'').strip() + + def postprocess_html(self,soup,first): + for tag in soup.findAll(attrs ={'class':['artPag','pagination']}): + tag.extract() + return soup