From 22003b492cdbadf5dd223c34c9a9e82d72e94abd Mon Sep 17 00:00:00 2001
From: Kovid Goyal <kovid@kovidgoyal.net>
Date: Wed, 5 Aug 2009 12:22:26 -0600
Subject: [PATCH] Improved recipes for The BBC, Slate, and NYT Headlines

---
 src/calibre/web/feeds/recipes/recipe_bbc.py   |  39 ++-
 .../web/feeds/recipes/recipe_nytimes.py       | 118 +++++---
 src/calibre/web/feeds/recipes/recipe_slate.py | 266 ++++++++++++------
 3 files changed, 286 insertions(+), 137 deletions(-)

diff --git a/src/calibre/web/feeds/recipes/recipe_bbc.py b/src/calibre/web/feeds/recipes/recipe_bbc.py
index 0c9c5f60c2..f82401f987 100644
--- a/src/calibre/web/feeds/recipes/recipe_bbc.py
+++ b/src/calibre/web/feeds/recipes/recipe_bbc.py
@@ -10,23 +10,34 @@ from calibre.web.feeds.news import BasicNewsRecipe
 class BBC(BasicNewsRecipe):
     title          = u'The BBC'
-    __author__     = 'Kovid Goyal ans Sujata Raman'
+    __author__     = 'Kovid Goyal and Sujata Raman'
     description    = 'Global news and current affairs from the British Broadcasting Corporation'
     language       = _('English')
+    no_stylesheets = True
+    remove_tags    = [dict(name='div', attrs={'class':'footer'}),
+                      {'id'    : ['popstory','blq-footer']},
+                      {'class' : ['arrup','links','relatedbbcsites','arr','promobottombg','bbccom_visibility_hidden', 'sharesb', 'sib606', 'mvtb', 'storyextra', 'sidebar1', 'bbccom_text','promotopbg', 'gppromo','promotopbg','bbccom_display_none']},
+                     ]
-    remove_tags    = [dict(name='div', attrs={'class':'footer'}),]
-
+    keep_only_tags = [dict(name='div', attrs={'class':'mainwrapper'})]
 
     extra_css = '''
-        body{font-family:Arial,Helvetica,sans-serif; font-size:small;}
+        body{font-family:Arial,Helvetica,sans-serif; font-size:small; text-align:left}
         h1{font-size:large;}
+        .sh{font-size:large; font-weight:bold}
+        .cap{font-size:xx-small; }
+        .lu{font-size:xx-small; }
+        .ds{font-size:xx-small; }
+        .mvb{font-size:xx-small;}
+        .by1{font-size:x-small; color:#666666}
+        .byd{font-size:x-small;}
     '''
 
     feeds = [
        ('News Front Page', 'http://newsrss.bbc.co.uk/rss/newsonline_world_edition/front_page/rss.xml'),
        ('Science/Nature', 'http://newsrss.bbc.co.uk/rss/newsonline_world_edition/science/nature/rss.xml'),
        ('Technology', 'http://newsrss.bbc.co.uk/rss/newsonline_world_edition/technology/rss.xml'),
-       ('Enterntainment', 'http://newsrss.bbc.co.uk/rss/newsonline_world_edition/entertainment/rss.xml'),
+       ('Entertainment', 'http://newsrss.bbc.co.uk/rss/newsonline_world_edition/entertainment/rss.xml'),
        ('Magazine', 'http://newsrss.bbc.co.uk/rss/newsonline_world_edition/uk_news/magazine/rss.xml'),
        ('Business', 'http://newsrss.bbc.co.uk/rss/newsonline_world_edition/business/rss.xml'),
        ('Health', 'http://newsrss.bbc.co.uk/rss/newsonline_world_edition/health/rss.xml'),
@@ -38,8 +49,22 @@ class BBC(BasicNewsRecipe):
        ('Africa', 'http://newsrss.bbc.co.uk/rss/newsonline_world_edition/africa/rss.xml'),
     ]
 
+    def postprocess_html(self, soup, first):
 
-    def print_version(self, url):
-        return url.replace('http://', 'http://newsvote.bbc.co.uk/mpapps/pagetools/print/')
+        for tag in soup.findAll(name= 'img', alt=""):
+            tag.extract()
+
+        for item in soup.findAll(align = "right"):
+            del item['align']
+
+        for tag in soup.findAll(name=['table', 'tr', 'td']):
+            tag.name = 'div'
+
+        return soup
+
+
+
+    # def print_version(self, url):
+    #     return url.replace('http://', 'http://newsvote.bbc.co.uk/mpapps/pagetools/print/')
 
diff --git a/src/calibre/web/feeds/recipes/recipe_nytimes.py b/src/calibre/web/feeds/recipes/recipe_nytimes.py
index c73468b51c..d57bd6594c 100644
--- a/src/calibre/web/feeds/recipes/recipe_nytimes.py
+++ b/src/calibre/web/feeds/recipes/recipe_nytimes.py
@@ -8,7 +8,7 @@ nytimes.com
 import re
 from calibre import entity_to_unicode
 from calibre.web.feeds.recipes import BasicNewsRecipe
-from calibre.ebooks.BeautifulSoup import BeautifulSoup, Tag
+from calibre.ebooks.BeautifulSoup import BeautifulSoup, Tag, NavigableString, Comment
 
 class NYTimes(BasicNewsRecipe):
@@ -42,36 +42,39 @@ class NYTimes(BasicNewsRecipe):
     # By default, no sections are skipped.
     excludeSectionKeywords = []
 
-    # To skip sections containing the word 'Sports' or 'Dining', use:
+    # Add section keywords from the right column above to skip that section
+    # For example, to skip sections containing the word 'Sports' or 'Dining', use:
     # excludeSectionKeywords = ['Sports', 'Dining']
 
     # Fetch only Business and Technology
-    #excludeSectionKeywords = ['Arts','Dining','Editorials','Health','Magazine','Media','Region','Op-Ed','Politics','Science','Sports','Top Stories','Travel','U.S.','World']
-
+    # excludeSectionKeywords = ['Arts','Dining','Editorials','Health','Magazine','Media','Region','Op-Ed','Politics','Science','Sports','Top Stories','Travel','U.S.','World']
 
     # Fetch only Top Stories
-    #excludeSectionKeywords = ['Arts','Business','Dining','Editorials','Health','Magazine','Media','Region','Op-Ed','Politics','Science','Sports','Technology','Travel','U.S.','World']
+    # excludeSectionKeywords = ['Arts','Business','Dining','Editorials','Health','Magazine','Media','Region','Op-Ed','Politics','Science','Sports','Technology','Travel','U.S.','World']
 
     # The maximum number of articles that will be downloaded
-    max_articles_per_feed = 50
+    max_articles_per_feed = 40
 
     timefmt = ''
     needs_subscription = True
-    remove_tags_after  = dict(attrs={'id':['comments']})
-    remove_tags = [dict(attrs={'class':['articleTools', 'post-tools', 'side_tool', 'nextArticleLink',
-                                        'clearfix', 'nextArticleLink clearfix','inlineSearchControl',
-                                        'columnGroup','entry-meta','entry-response module','jumpLink','nav',
-                                        'columnGroup advertisementColumnGroup', 'kicker entry-category']}),
-                   dict(id=['footer', 'toolsRight', 'articleInline', 'navigation', 'archive',
-                            'side_search', 'blog_sidebar', 'side_tool', 'side_index', 'login',
-                            'blog-header','searchForm','NYTLogo','insideNYTimes','adxToolSponsor',
-                            'adxLeaderboard']),
-                   dict(name=['script', 'noscript', 'style','hr'])]
+    keep_only_tags = [ dict(attrs={ 'id':['article']})]
+    remove_tags = [ dict(attrs={'class':['nextArticleLink clearfix', 'clearfix',
+                                         'inlineVideo left brightcove']}),
+                    dict(attrs={ 'id':['toolsRight','inlineBox','sidebarArticles',
+                                       'portfolioInline','articleInline','readerscomment']}) ]
+
     encoding = 'cp1252'
     no_stylesheets = True
-    extra_css = '.headline {text-align:left;}\n\
-                 .byline {font:monospace; margin-bottom:0px;}\n\
-                 .source {align:left;}\n\
-                 .credit {text-align:right;font-size:smaller;}\n'
+    extra_css = '.headline     {text-align: left;}\n \
+                 .byline       {font-family: monospace; \
+                                text-align: left; \
+                                margin-bottom: 0px;}\n \
+                 .timestamp    {font-size: smaller;}\n \
+                 .source       {text-align: left;}\n \
+                 .image        {text-align: center;}\n \
+                 .credit       {text-align: right; \
+                                font-size: smaller;}\n \
+                 .articleBody  {text-align: left;}\n \
+                 .authorId     {text-align: left; \
+                                font-style: italic;}\n '
 
     def get_browser(self):
         br = BasicNewsRecipe.get_browser()
@@ -113,6 +116,8 @@ class NYTimes(BasicNewsRecipe):
         if docEncoding == '' :
             docEncoding = self.encoding
 
+        if self.verbose > 2:
+            self.log( "  document encoding: '%s'" % docEncoding)
         if docEncoding != self.encoding :
             soup = get_the_soup(docEncoding, url_or_raw)
 
@@ -189,7 +194,6 @@ class NYTimes(BasicNewsRecipe):
                 key = self.sections[section]
                 excluded = re.compile('|'.join(self.excludeSectionKeywords))
                 if excluded.search(key) or articles.has_key(key):
-                    if self.verbose : self.log("Skipping section %s" % key)
                     skipThisSection = True
                     break
 
@@ -200,8 +204,7 @@ class NYTimes(BasicNewsRecipe):
                 # Extract the bylines and descriptions
                 if (i.string is not None) and \
                    (i.string.strip() > "") and \
-                   not ('Comment' in str(i.__class__)) :
-
+                   not isinstance(i,Comment):
                     contentString = i.strip().encode('utf-8')
                     if contentString[0:3] == 'By ' :
                         bylines.append(contentString)
@@ -212,8 +215,6 @@ class NYTimes(BasicNewsRecipe):
             articleCount = len(sectionblock.findAll('span'))
             for (i,span) in enumerate(sectionblock.findAll('span')) :
                 a = span.find('a', href=True)
-                #if not a:
-                    #continue
                 url = re.sub(r'\?.*', '', a['href'])
                 url += '?pagewanted=all'
@@ -234,15 +235,13 @@ class NYTimes(BasicNewsRecipe):
                 # Check for duplicates
                 duplicateFound = False
                 if len(articles[feed]) > 1:
-                    #print articles[feed]
                     for article in articles[feed] :
-                        #print "comparing %s\n %s\n" % (url, article['url'])
                         if url == article['url'] :
                             duplicateFound = True
                             break
-                    #print
 
                     if duplicateFound:
+                        # Continue fetching, don't add this article
                         continue
 
                 if not articles.has_key(feed):
@@ -252,33 +251,42 @@ class NYTimes(BasicNewsRecipe):
                                 description=description, author=author, content=''))
 
         ans = self.sort_index_by(ans, {'Top Stories':-1})
-        ans = [(key, articles[key]) for key in ans if articles.has_key(key)]
-
+        ans = [(key, articles[key]) for key in ans if articles.has_key(key)]
         return ans
 
+    def strip_anchors(self,soup):
+        paras = soup.findAll(True)
+        for para in paras:
+            aTags = para.findAll('a')
+            for a in aTags:
+                if a.img is None:
+                    a.replaceWith(a.renderContents().decode('cp1252','replace'))
+        return soup
+
     def preprocess_html(self, soup):
         refresh = soup.find('meta', {'http-equiv':'refresh'})
         if refresh is None:
-            return soup
+            return self.strip_anchors(soup)
+
         content = refresh.get('content').partition('=')[2]
         raw = self.browser.open('http://www.nytimes.com'+content).read()
-        return BeautifulSoup(raw.decode('cp1252', 'replace'))
+        soup = BeautifulSoup(raw.decode('cp1252', 'replace'))
+        return self.strip_anchors(soup)
 
     def postprocess_html(self,soup, True):
+        # Change class="kicker" to <h3>
         kicker = soup.find(True, {'class':'kicker'})
         if kicker is not None :
             h3Tag = Tag(soup, "h3")
-            h3Tag.insert(0, self.tag_to_string(kicker))
+            h3Tag.insert(0, kicker.contents[0])
             kicker.replaceWith(h3Tag)
 
         # Change captions to italic -1
         for caption in soup.findAll(True, {'class':'caption'}) :
             if caption is not None:
                 emTag = Tag(soup, "em")
-                #emTag['class'] = "caption"
-                #emTag['font-size-adjust'] = "-1"
-                emTag.insert(0, self.tag_to_string(caption))
+                emTag.insert(0, caption.contents[0])
                 hrTag = Tag(soup, 'hr')
                 emTag.insert(1, hrTag)
                 caption.replaceWith(emTag)
@@ -286,10 +294,10 @@ class NYTimes(BasicNewsRecipe):
         # Change <nyt_headline> to <h2 class="headline">
         headline = soup.find("nyt_headline")
         if headline is not None :
-            h2tag = Tag(soup, "h2")
-            h2tag['class'] = "headline"
-            h2tag.insert(0, self.tag_to_string(headline))
-            headline.replaceWith(h2tag)
+            tag = Tag(soup, "h2")
+            tag['class'] = "headline"
+            tag.insert(0, headline.contents[0])
+            soup.h1.replaceWith(tag)
 
         # Change <h1> to <h3> - used in editorial blogs
         masthead = soup.find("h1")
@@ -297,14 +305,34 @@ class NYTimes(BasicNewsRecipe):
             # Nuke the href
             if masthead.a is not None :
                 del(masthead.a['href'])
-            h3tag = Tag(soup, "h3")
-            h3tag.insert(0, self.tag_to_string(masthead))
-            masthead.replaceWith(h3tag)
+            tag = Tag(soup, "h3")
+            tag.insert(0, masthead.contents[0])
+            soup.h1.replaceWith(tag)
 
         # Change <span class="bold"> to <b>
         for subhead in soup.findAll(True, {'class':'bold'}) :
             bTag = Tag(soup, "b")
-            bTag.insert(0, self.tag_to_string(subhead))
+            bTag.insert(0, subhead.contents[0])
             subhead.replaceWith(bTag)
+
+        # Synthesize a section header
+        dsk = soup.find('meta', attrs={'name':'dsk'})
+        if dsk is not None and dsk.has_key('content'):
+            hTag = Tag(soup,'h3')
+            hTag['class'] = 'section'
+            hTag.insert(0,NavigableString(dsk['content']))
+            articleTag = soup.find(True, attrs={'id':'article'})
+            articleTag.insert(0,hTag)
+
+        # Add class="articleBody" to <div> so we can format with CSS
+        divTag = soup.find('div',attrs={'id':'articleBody'})
+        if divTag is not None :
+            divTag['class'] = divTag['id']
+
+        # Add class="authorId" to <div> so we can format with CSS
+        divTag = soup.find('div',attrs={'id':'authorId'})
+        if divTag is not None :
+            divTag['class'] = divTag['id']
 
         return soup
+
diff --git a/src/calibre/web/feeds/recipes/recipe_slate.py b/src/calibre/web/feeds/recipes/recipe_slate.py
index dae94573b0..93c37affd4 100644
--- a/src/calibre/web/feeds/recipes/recipe_slate.py
+++ b/src/calibre/web/feeds/recipes/recipe_slate.py
@@ -3,19 +3,19 @@
 __license__   = 'GPL v3'
 __copyright__ = '2008, Kovid Goyal <kovid@kovidgoyal.net>'
 '''
-Fetches the last 7 days of featured articles from slate.com
+calibre recipe for slate.com
 '''
-import re
+import string, re, sys
+from calibre import strftime
 from calibre.web.feeds.recipes import BasicNewsRecipe
-from calibre.ebooks.BeautifulSoup import BeautifulSoup, NavigableString, CData, Tag
+from calibre.ebooks.BeautifulSoup import BeautifulSoup, NavigableString, CData, Comment, Tag
 
 class Slate(BasicNewsRecipe):
     # Method variables for customizing downloads
     title = 'Slate'
-    description = 'A daily magazine on the Web, offering analysis and commentary about politics, news and culture.'
-    __author__ = 'GRiker@hotmail.com'
-    language = _('English')
+    description = 'A general-interest publication offering analysis and commentary about politics, news and culture.'
+    __author__ = 'GRiker'
     max_articles_per_feed = 40
     oldest_article = 7.0
     recursions = 0
@@ -26,33 +26,58 @@ class Slate(BasicNewsRecipe):
     feeds = None
     no_stylesheets = True
     encoding = None
+    language = _('English')
+
+    # Method variables for customizing feed parsing
     summary_length = 250
     use_embedded_content = None
 
     # Method variables for pre/post processing of HTML
-    remove_tags = [ dict(name=['link','style']),
-                    dict(id=['toolbox','site_navigation','article_bottom_tools_cntr',
-                             'article_bottom_tools','recommend_tab2','bottom_sponsored_links',
-                             'fray_article_discussion','bizbox_sponsored_links_bottom',
-                             'page_rightcol','top_banner','also_in_slate_bottom','articlefooter',
-                             'article_top_wedge','content-top','page-title',
-                             'block-today039s-business-press-archives','block-blog-roll',
-                             'block-also-in-tbm','block-most-popular-on-tbm','block-the-best-of-tbm',
-                             'service-links-bottom','comments','ft']),
-                    dict(attrs={'class':['fray_article_links','clearing','nav',
-                                         'service-links service-links-stack','yui-b last',
-                                         'read-more-comments']})]
-    extra_css = '.headline {text-align:left;}\n\
-                 .byline {font:monospace; text-align:left; margin-bottom:0pt;}\n\
-                 .dateline {text-align:left; height:0pt;}\n\
-                 .source {align:left;}\n\
-                 .credit {text-align:right;font-size:smaller;}\n'
+    preprocess_regexps = [ (re.compile(r'<p><em>Disclosure: Slate is owned by the Washington Post.*</em></p>',
+                             re.DOTALL|re.IGNORECASE),
+                            lambda match: ''),
+                           (re.compile(r'<p><em>Join the discussion about this story on.*</em></p>',
+                             re.DOTALL|re.IGNORECASE),
+                            lambda match: '') ]
+    match_regexps = []
+
+    # The second entry is for 'Big Money', which comes from a different site and uses different markup
+    keep_only_tags = [dict(attrs={ 'id':['article_top', 'article_body']}),
+                      dict(attrs={ 'id':['content']}) ]
+
+    # The second entry is for 'Big Money', which comes from a different site and uses different markup
+    remove_tags = [dict(attrs={ 'id':['toolbox','recommend_tab','insider_ad_wrapper',
+                                      'article_bottom_tools_cntr','fray_article_discussion',
+                                      'fray_article_links','bottom_sponsored_links','author_bio',
+                                      'bizbox_links_bottom','ris_links_wrapper','BOXXLE']}),
+                   dict(attrs={ 'id':['content-top','service-links-bottom','hed']}) ]
+
+    excludedDescriptionKeywords = ['Slate V','Twitter feed','podcast']
+    excludedTitleKeywords = ['Gabfest','Slate V','on Twitter']
+    excludedAuthorKeywords = []
+    excludedContentKeywords = ['http://twitter.com/Slate']
+
+    extra_css = '.headline      {text-align:left;}\n\
+                 .byline        {font-family: monospace; \
+                                 text-align: left;\
+                                 margin-bottom: 0px;}\n\
+                 .dateline      {text-align: left; \
+                                 font-size: smaller;\
+                                 height: 0pt;}\n\
+                 .imagewrapper  {text-align: center;}\n\
+                 .source        {text-align: left;}\n\
+                 .credit        {text-align: right;\
+                                 font-size: smaller;}\n\
+                 .article_body  {text-align: left;}\n'
+
+    # Local variables to extend class
     baseURL = 'http://slate.com'
     section_dates = []
-
+
+    # class extension methods
     def tag_to_strings(self, tag):
         if not tag:
             return ''
@@ -68,16 +93,16 @@ class Slate(BasicNewsRecipe):
                 strings.append(res)
         return strings
 
+
     def extract_sections(self):
         soup = self.index_to_soup( self.baseURL )
-        soup_top_stories = soup.find(True, attrs={'class':'tap2_topic entry-content'})
         soup = soup.find(True, attrs={'id':'toc_links_container'})
 
         todays_section = soup.find(True, attrs={'class':'todaydateline'})
-        self.section_dates.append(self.tag_to_string(todays_section,use_alt=False))
-        self.section_dates.append(self.tag_to_string(todays_section,use_alt=False))
-
+        self.section_dates.append(self.tag_to_string(todays_section,use_alt=False))
+        self.section_dates.append(self.tag_to_string(todays_section,use_alt=False))
+
         older_section_dates = soup.findAll(True, attrs={'class':'maindateline'})
         for older_section in older_section_dates :
             self.section_dates.append(self.tag_to_string(older_section,use_alt=False))
@@ -90,19 +115,20 @@ class Slate(BasicNewsRecipe):
         sections = []
         for section in section_lists :
             sections.append(section)
-
         return sections
-
+
 
     def extract_section_articles(self, sections_html) :
+        # Find the containers with section content
         soup = self.index_to_soup(str(sections_html))
         sections = soup.findAll('ul')
+
         articles = {}
         key = None
         ans = []
-
+
         for (i,section) in enumerate(sections) :
-
+
             # Get the section name
             if section.has_key('id') :
                 key = self.section_dates[i]
@@ -110,14 +136,10 @@ class Slate(BasicNewsRecipe):
                 ans.append(key)
             else :
                 continue
-
+
             # Get the section article_list
             article_list = section.findAll('li')
-
-            excludedDescriptionKeywords = ['Slate V','Twitter feed','podcast']
-            excludedTitleKeywords = ['Gabfest','Slate V']
-            excludedAuthorKeywords = ['Prudence']
-
+
             # Extract the article attributes
             for article in article_list :
                 bylines = self.tag_to_strings(article)
@@ -128,10 +150,10 @@ class Slate(BasicNewsRecipe):
                 author = None
                 description = None
                 pubdate = None
-
+
                 if len(bylines) == 2 and self.tag_to_string(article).find("Today's Papers") > 0 :
                     description = "A summary of what's in the major U.S. newspapers."
-
+
                 if len(bylines) == 3 :
                     author = bylines[2].strip()
                     author = re.sub('[\r][\n][\t][\t\t]','', author)
                     author = re.sub(',','', author)
                     if bylines[1] is not None :
                         description = bylines[1]
                         full_byline = self.tag_to_string(article)
                         if full_byline.find('major U.S. newspapers') > 0 :
                             description = "A summary of what's in the major U.S. newspapers."
-
                 if len(bylines) > 3 and author is not None:
                     author += " | "
                     for (i,substring) in enumerate(bylines[3:]) :
                         #substring = substring.replace('\n','')
                         author += substring.strip()
                         if i < len(bylines[3:]) :
                             author += " | "
 
                 # Skip articles whose descriptions contain excluded keywords
-                if description is not None :
-                    excluded = re.compile('|'.join(excludedDescriptionKeywords))
+                if description is not None and len(self.excludedDescriptionKeywords):
+                    excluded = re.compile('|'.join(self.excludedDescriptionKeywords))
                     found_excluded = excluded.search(description)
                     if found_excluded :
+                        if self.verbose : self.log("  >>> skipping %s (description keyword exclusion: %s) <<<\n" % (title, found_excluded.group(0)))
                         continue
 
                 # Skip articles whose title contain excluded keywords
-                if full_title is not None :
-                    excluded = re.compile('|'.join(excludedTitleKeywords))
+                if full_title is not None and len(self.excludedTitleKeywords):
+                    excluded = re.compile('|'.join(self.excludedTitleKeywords))
                     #self.log("evaluating full_title: %s" % full_title)
                     found_excluded = excluded.search(full_title)
                     if found_excluded :
+                        if self.verbose : self.log("  >>> skipping %s (title keyword exclusion: %s) <<<\n" % (title, found_excluded.group(0)))
                         continue
 
                 # Skip articles whose author contain excluded keywords
-                if author is not None :
-                    excluded = re.compile('|'.join(excludedAuthorKeywords))
+                if author is not None and len(self.excludedAuthorKeywords):
+                    excluded = re.compile('|'.join(self.excludedAuthorKeywords))
                     found_excluded = excluded.search(author)
                     if found_excluded :
+                        if self.verbose : self.log("  >>> skipping %s (author keyword exclusion: %s) <<<\n" % (title, found_excluded.group(0)))
                         continue
 
-                skip_this_article = False
+                skip_this_article = False
                 # Check to make sure we're not adding a duplicate
                 for article in articles[key] :
                     if article['url'] == url :
                         skip_this_article = True
                         break
-
+
                 if skip_this_article :
                     continue
 
-                # Build the dictionary entry for this article
+                # Build the dictionary entry for this article
                 feed = key
                 if not articles.has_key(feed) :
                     articles[feed] = []
@@ -194,28 +218,34 @@ class Slate(BasicNewsRecipe):
                     if article['description'] is not None :
                         if article['description'].find('newspapers') > 0 :
                             articles[feed].insert(0,articles[feed].pop(i))
-
+
         ans = [(key, articles[key]) for key in ans if articles.has_key(key)]
-        ans = self.remove_duplicates(ans)
+        ans = self.remove_duplicates(ans)
         return ans
-
+
     def flatten_document(self, ans):
         flat_articles = []
         for (i,section) in enumerate(ans) :
+            #self.log("flattening section %s: " % section[0])
             for article in section[1] :
+                #self.log("moving %s to flat_articles[]" % article['title'])
                 flat_articles.append(article)
         flat_section = ['All Articles', flat_articles]
-        flat_ans = [flat_section]
-
+        flat_ans = [flat_section]
         return flat_ans
-
+
     def remove_duplicates(self, ans):
+        # Return a stripped ans
        for (i,section) in enumerate(ans) :
+            #self.log("section %s: " % section[0])
            for article in section[1] :
+                #self.log("\t%s" % article['title'])
+                #self.log("\looking for %s" % article['url'])
                for (j,subsequent_section) in enumerate(ans[i+1:]) :
                    for (k,subsequent_article) in enumerate(subsequent_section[1]) :
                        if article['url'] == subsequent_article['url'] :
+                            #self.log( "removing %s (%s) from %s" % (subsequent_article['title'], subsequent_article['url'], subsequent_section[0]) )
                            del subsequent_section[1][k]
        return ans
 
@@ -226,21 +256,80 @@ class Slate(BasicNewsRecipe):
     def parse_index(self) :
         sections = self.extract_sections()
         section_list = self.extract_section_articles(sections)
-        section_list = self.flatten_document(section_list)
+        section_list = self.flatten_document(section_list)
         return section_list
+
+    def get_browser(self) :
+        return BasicNewsRecipe.get_browser()
+
+    def stripAnchors(self,soup):
+        body = soup.find('div',attrs={'id':['article_body','content']})
+        if body is not None:
+            paras = body.findAll('p')
+            if paras is not None:
+                for para in paras:
+                    aTags = para.findAll('a')
+                    if aTags is not None:
+                        for a in aTags:
+                            if a.img is None:
+                                #print repr(a.renderContents())
+                                a.replaceWith(a.renderContents().decode('utf-8','replace'))
+        return soup
+
+    def preprocess_html(self, soup) :
+
+        # Remove 'grayPlus4.png' images
+        imgs = soup.findAll('img')
+        if imgs is not None:
+            for img in imgs:
+                if re.search("grayPlus4.png",str(img)):
+                    img.extract()
+
+        # Delete article based upon content keywords
+        if len(self.excludedContentKeywords):
+            excluded = re.compile('|'.join(self.excludedContentKeywords))
+            found_excluded = excluded.search(str(soup))
+            if found_excluded :
+                return None
+
+        # Articles from www.thebigmoney.com use different tagging for byline, dateline and body
+        head = soup.find('head')
+        if head.link is not None and re.search('www\.thebigmoney\.com', str(head)):
+            byline = soup.find('div',attrs={'id':'byline'})
+            if byline is not None:
+                byline['class'] = byline['id']
+
+            dateline = soup.find('div',attrs={'id':'dateline'})
+            if dateline is not None:
+                dateline['class'] = dateline['id']
+
+            body = soup.find('div',attrs={'id':'content'})
+            if body is not None:
+                body['class'] = 'article_body'
+
+            # Synthesize a department kicker
+            h3Tag = Tag(soup,'h3')
+            emTag = Tag(soup,'em')
+            emTag.insert(0,NavigableString("the big money: Today's business press"))
+            h3Tag.insert(0,emTag)
+            soup.body.insert(0,h3Tag)
+
+        # Strip anchors from HTML
+        return self.stripAnchors(soup)
 
     def postprocess_html(self, soup, first_fetch) :
+        # Fix up dept_kicker as <h3><em>
-        dept_kicker = soup.find(True, attrs={'class':'department_kicker'})
+        dept_kicker = soup.find('div', attrs={'class':'department_kicker'})
         if dept_kicker is not None :
             kicker_strings = self.tag_to_strings(dept_kicker)
-            kicker = kicker_strings[2] + kicker_strings[3]
-            kicker = re.sub('.','',kicker)
+            #kicker = kicker_strings[2] + kicker_strings[3]
+            kicker = ''.join(kicker_strings[2:])
+            kicker = re.sub('\.','',kicker)
             h3Tag = Tag(soup, "h3")
             emTag = Tag(soup, "em")
+            emTag.insert(0,NavigableString(kicker))
             h3Tag.insert(0, emTag)
-            emTag.insert(0,kicker)
             dept_kicker.replaceWith(h3Tag)
 
         # Change <h1> to <h2 class="headline">
@@ -258,17 +347,19 @@ class Slate(BasicNewsRecipe):
             headline.replaceWith(h2tag)
 
         # Fix up the concatenated byline and dateline
-        byline = soup.find(True,attrs={'class':'byline'})
+        byline = soup.find(True,attrs={'class':'byline'})
         if byline is not None :
             bylineTag = Tag(soup,'div')
             bylineTag['class'] = 'byline'
+            #bylineTag['height'] = '0em'
             bylineTag.insert(0,self.tag_to_string(byline))
             byline.replaceWith(bylineTag)
-
+
         dateline = soup.find(True, attrs={'class':'dateline'})
         if dateline is not None :
             datelineTag = Tag(soup, 'div')
             datelineTag['class'] = 'dateline'
+            #datelineTag['margin-top'] = '0em'
             datelineTag.insert(0,self.tag_to_string(dateline))
             dateline.replaceWith(datelineTag)
 
@@ -280,51 +371,56 @@ class Slate(BasicNewsRecipe):
             hrTag = Tag(soup, 'hr')
             emTag.insert(1, hrTag)
             caption.replaceWith(emTag)
-
+
+        # Fix photos
+        for photo in soup.findAll('span',attrs={'class':'imagewrapper'}):
+            if photo.a is not None and photo.a.img is not None:
+                divTag = Tag(soup,'div')
+                divTag['class'] ='imagewrapper'
+                divTag.insert(0,photo.a.img)
+                photo.replaceWith(divTag)
+
         return soup
-
+
     def postprocess_book(self, oeb, opts, log) :
 
         def extract_byline(href) :
-            soup = BeautifulSoup(str(oeb.manifest.hrefs[href]))
-            byline = soup.find(True,attrs={'class':'byline'})
-            if byline is not None:
+            soup = BeautifulSoup(str(oeb.manifest.hrefs[href]))
+            byline = soup.find(True,attrs={'class':'byline'})
+            if byline is not None:
                 return self.tag_to_string(byline,use_alt=False)
             else :
-                return None
-
+                return None
+
         def extract_description(href) :
             soup = BeautifulSoup(str(oeb.manifest.hrefs[href]))
             paragraphs = soup.findAll('p')
             for p in paragraphs :
                 if self.tag_to_string(p,use_alt=False).startswith('By ') or \
                    self.tag_to_string(p,use_alt=False).startswith('Posted '):
+                    continue
+                comment = p.find(text=lambda text:isinstance(text, Comment))
+                if comment is not None:
                     continue
-
-                images = p.findAll(True, attrs={'class':'imagewrapper'})
-                for image in images :
-                    image.extract()
-                return self.tag_to_string(p,use_alt=False)[:200] + '...'
-
+                else:
+                    return self.tag_to_string(p,use_alt=False)[:self.summary_length] + '...'
+
             return None
-
+
+        # Method entry point here
+        # Single section toc looks different than multi-section tocs
         if oeb.toc.depth() == 2 :
             for article in oeb.toc :
                 if article.author is None :
                     article.author = extract_byline(article.href)
-
                 if article.description is None :
                     article.description = extract_description(article.href)
-
-
         elif oeb.toc.depth() == 3 :
             for section in oeb.toc :
                 for article in section :
                     if article.author is None :
                         article.author = extract_byline(article.href)
-
                     if article.description is None :
                         article.description = extract_description(article.href)
-
-
-
+
+
\ No newline at end of file
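
Note: the anchor-unwrapping pass this patch adds to both recipes (strip_anchors in the NYT recipe, stripAnchors in the Slate recipe) follows one pattern: any <a> that does not wrap an image is replaced by its rendered contents, while image-wrapping anchors are left alone. A minimal standalone sketch of that pattern, assuming the BeautifulSoup 3.x API (which is what calibre.ebooks.BeautifulSoup wraps) and a made-up HTML fragment:

    # Python 2 / BeautifulSoup 3.x sketch; the sample markup is hypothetical
    from BeautifulSoup import BeautifulSoup

    html = '<p>Read <a href="/x">this story</a> and <a href="/y"><img src="i.png" /></a>.</p>'
    soup = BeautifulSoup(html)
    for a in soup.findAll('a'):
        if a.img is None:  # unwrap text-only links, keep anchors that wrap images
            a.replaceWith(a.renderContents().decode('utf-8', 'replace'))
    print soup
    # -> <p>Read this story and <a href="/y"><img src="i.png" /></a>.</p>

The same loop works with either codec seen above ('utf-8' for Slate, 'cp1252' for the NYT pages); only the decode argument changes.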