Improved recipes for The BBC, Slate, and NYT Headlines

Kovid Goyal 2009-08-05 12:22:26 -06:00
parent d72d615e5c
commit 22003b492c
3 changed files with 286 additions and 137 deletions


@@ -10,23 +10,34 @@ from calibre.web.feeds.news import BasicNewsRecipe
class BBC(BasicNewsRecipe):
title = u'The BBC'
__author__ = 'Kovid Goyal and Sujata Raman'
__author__ = 'Kovid Goyal ans Sujata Raman'
description = 'Global news and current affairs from the British Broadcasting Corporation'
language = _('English')
no_stylesheets = True
remove_tags = [dict(name='div', attrs={'class':'footer'}),
{'id' : ['popstory','blq-footer']},
{'class' : ['arrup','links','relatedbbcsites','arr','promobottombg','bbccom_visibility_hidden', 'sharesb', 'sib606', 'mvtb', 'storyextra', 'sidebar1', 'bbccom_text','promotopbg', 'gppromo','promotopbg','bbccom_display_none']},
]
remove_tags = [dict(name='div', attrs={'class':'footer'}),]
keep_only_tags = [dict(name='div', attrs={'class':'mainwrapper'})]
extra_css = '''
body{font-family:Arial,Helvetica,sans-serif; font-size:small;}
body{font-family:Arial,Helvetica,sans-serif; font-size:small; align:left}
h1{font-size:large;}
.sh{font-size:large; font-weight:bold}
.cap{font-size:xx-small; }
.lu{font-size:xx-small; }
.ds{font-size:xx-small; }
.mvb{font-size:xx-small;}
.by1{font-size:x-small; color:#666666}
.byd{font-size:x-small;}
'''
feeds = [
('News Front Page', 'http://newsrss.bbc.co.uk/rss/newsonline_world_edition/front_page/rss.xml'),
('Science/Nature', 'http://newsrss.bbc.co.uk/rss/newsonline_world_edition/science/nature/rss.xml'),
('Technology', 'http://newsrss.bbc.co.uk/rss/newsonline_world_edition/technology/rss.xml'),
('Enterntainment', 'http://newsrss.bbc.co.uk/rss/newsonline_world_edition/entertainment/rss.xml'),
('Entertainment', 'http://newsrss.bbc.co.uk/rss/newsonline_world_edition/entertainment/rss.xml'),
('Magazine', 'http://newsrss.bbc.co.uk/rss/newsonline_world_edition/uk_news/magazine/rss.xml'),
('Business', 'http://newsrss.bbc.co.uk/rss/newsonline_world_edition/business/rss.xml'),
('Health', 'http://newsrss.bbc.co.uk/rss/newsonline_world_edition/health/rss.xml'),
@@ -38,8 +49,22 @@ class BBC(BasicNewsRecipe):
('Africa', 'http://newsrss.bbc.co.uk/rss/newsonline_world_edition/africa/rss.xml'),
]
def postprocess_html(self, soup, first):
def print_version(self, url):
return url.replace('http://', 'http://newsvote.bbc.co.uk/mpapps/pagetools/print/')
for tag in soup.findAll(name= 'img', alt=""):
tag.extract()
for item in soup.findAll(align = "right"):
del item['align']
for tag in soup.findAll(name=['table', 'tr', 'td']):
tag.name = 'div'
return soup
# def print_version(self, url):
# return url.replace('http://', 'http://newsvote.bbc.co.uk/mpapps/pagetools/print/')
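The new postprocess_html above replaces the old print_version redirect (left commented out): instead of fetching the printer-friendly page, the recipe now cleans the regular article page by discarding images with empty alt text, dropping align attributes, and flattening table markup into divs. A rough standalone sketch of the same cleanup, run on an invented fragment with calibre's bundled BS3-style BeautifulSoup (illustration only, not part of the commit):

from calibre.ebooks.BeautifulSoup import BeautifulSoup

# Hypothetical fragment standing in for a BBC story page
raw = ('<table align="right"><tr><td><img alt="" src="spacer.gif"/>'
       '<p class="sh">Headline</p><p>Story text</p></td></tr></table>')
soup = BeautifulSoup(raw)

for tag in soup.findAll(name='img', alt=""):
    tag.extract()                # drop decorative/spacer images, as the recipe does
for item in soup.findAll(align="right"):
    del item['align']            # let the reading device decide alignment
for tag in soup.findAll(name=['table', 'tr', 'td']):
    tag.name = 'div'             # flatten layout tables into plain divs

# soup now serializes as nested <div> elements around the two paragraphs

Flattening layout tables this way generally reflows better on small e-book screens than the original table-based markup.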


@@ -8,7 +8,7 @@ nytimes.com
import re
from calibre import entity_to_unicode
from calibre.web.feeds.recipes import BasicNewsRecipe
from calibre.ebooks.BeautifulSoup import BeautifulSoup, Tag
from calibre.ebooks.BeautifulSoup import BeautifulSoup, Tag, NavigableString, Comment
class NYTimes(BasicNewsRecipe):
@@ -42,36 +42,39 @@ class NYTimes(BasicNewsRecipe):
# By default, no sections are skipped.
excludeSectionKeywords = []
# To skip sections containing the word 'Sports' or 'Dining', use:
# Add section keywords from the right column above to skip that section
# For example, to skip sections containing the word 'Sports' or 'Dining', use:
# excludeSectionKeywords = ['Sports', 'Dining']
# Fetch only Business and Technology
#excludeSectionKeywords = ['Arts','Dining','Editorials','Health','Magazine','Media','Region','Op-Ed','Politics','Science','Sports','Top Stories','Travel','U.S.','World']
# excludeSectionKeywords = ['Arts','Dining','Editorials','Health','Magazine','Media','Region','Op-Ed','Politics','Science','Sports','Top Stories','Travel','U.S.','World']
# Fetch only Top Stories
#excludeSectionKeywords = ['Arts','Business','Dining','Editorials','Health','Magazine','Media','Region','Op-Ed','Politics','Science','Sports','Technology','Travel','U.S.','World']
# excludeSectionKeywords = ['Arts','Business','Dining','Editorials','Health','Magazine','Media','Region','Op-Ed','Politics','Science','Sports','Technology','Travel','U.S.','World']
# The maximum number of articles that will be downloaded
max_articles_per_feed = 50
max_articles_per_feed = 40
timefmt = ''
needs_subscription = True
remove_tags_after = dict(attrs={'id':['comments']})
remove_tags = [dict(attrs={'class':['articleTools', 'post-tools', 'side_tool', 'nextArticleLink',
'clearfix', 'nextArticleLink clearfix','inlineSearchControl',
'columnGroup','entry-meta','entry-response module','jumpLink','nav',
'columnGroup advertisementColumnGroup', 'kicker entry-category']}),
dict(id=['footer', 'toolsRight', 'articleInline', 'navigation', 'archive',
'side_search', 'blog_sidebar', 'side_tool', 'side_index', 'login',
'blog-header','searchForm','NYTLogo','insideNYTimes','adxToolSponsor',
'adxLeaderboard']),
dict(name=['script', 'noscript', 'style','hr'])]
keep_only_tags = [ dict(attrs={ 'id':['article']})]
remove_tags = [ dict(attrs={'class':['nextArticleLink clearfix', 'clearfix',
'inlineVideo left brightcove']}),
dict(attrs={ 'id':['toolsRight','inlineBox','sidebarArticles',
'portfolioInline','articleInline','readerscomment']}) ]
encoding = 'cp1252'
no_stylesheets = True
extra_css = '.headline {text-align:left;}\n\
.byline {font:monospace; margin-bottom:0px;}\n\
.source {align:left;}\n\
.credit {text-align:right;font-size:smaller;}\n'
extra_css = '.headline {text-align: left;}\n \
.byline {font-family: monospace; \
text-align: left; \
margin-bottom: 0px;}\n \
.timestamp {font-size: smaller;}\n \
.source {text-align: left;}\n \
.image {text-align: center;}\n \
.credit {text-align: right; \
font-size: smaller;}\n \
.articleBody {text-align: left;}\n \
.authorId {text-align: left; \
font-style: italic;}\n '
def get_browser(self):
br = BasicNewsRecipe.get_browser()
@@ -113,6 +116,8 @@ class NYTimes(BasicNewsRecipe):
if docEncoding == '' :
docEncoding = self.encoding
if self.verbose > 2:
self.log( " document encoding: '%s'" % docEncoding)
if docEncoding != self.encoding :
soup = get_the_soup(docEncoding, url_or_raw)
@@ -189,7 +194,6 @@ class NYTimes(BasicNewsRecipe):
key = self.sections[section]
excluded = re.compile('|'.join(self.excludeSectionKeywords))
if excluded.search(key) or articles.has_key(key):
if self.verbose : self.log("Skipping section %s" % key)
skipThisSection = True
break
@@ -200,8 +204,7 @@ class NYTimes(BasicNewsRecipe):
# Extract the bylines and descriptions
if (i.string is not None) and \
(i.string.strip() > "") and \
not ('Comment' in str(i.__class__)) :
not isinstance(i,Comment):
contentString = i.strip().encode('utf-8')
if contentString[0:3] == 'By ' :
bylines.append(contentString)
@@ -212,8 +215,6 @@ class NYTimes(BasicNewsRecipe):
articleCount = len(sectionblock.findAll('span'))
for (i,span) in enumerate(sectionblock.findAll('span')) :
a = span.find('a', href=True)
#if not a:
#continue
url = re.sub(r'\?.*', '', a['href'])
url += '?pagewanted=all'
@@ -234,15 +235,13 @@ class NYTimes(BasicNewsRecipe):
# Check for duplicates
duplicateFound = False
if len(articles[feed]) > 1:
#print articles[feed]
for article in articles[feed] :
#print "comparing %s\n %s\n" % (url, article['url'])
if url == article['url'] :
duplicateFound = True
break
#print
if duplicateFound:
# Continue fetching, don't add this article
continue
if not articles.has_key(feed):
@@ -252,33 +251,42 @@ class NYTimes(BasicNewsRecipe):
description=description, author=author, content=''))
ans = self.sort_index_by(ans, {'Top Stories':-1})
ans = [(key, articles[key]) for key in ans if articles.has_key(key)]
ans = [(key, articles[key]) for key in ans if articles.has_key(key)]
return ans
def strip_anchors(self,soup):
paras = soup.findAll(True)
for para in paras:
aTags = para.findAll('a')
for a in aTags:
if a.img is None:
a.replaceWith(a.renderContents().decode('cp1252','replace'))
return soup
def preprocess_html(self, soup):
refresh = soup.find('meta', {'http-equiv':'refresh'})
if refresh is None:
return soup
return self.strip_anchors(soup)
content = refresh.get('content').partition('=')[2]
raw = self.browser.open('http://www.nytimes.com'+content).read()
return BeautifulSoup(raw.decode('cp1252', 'replace'))
soup = BeautifulSoup(raw.decode('cp1252', 'replace'))
return self.strip_anchors(soup)
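strip_anchors, which preprocess_html now applies to every page, unwraps text-only links so the converted article is not littered with underlined cross-references; anchors that wrap an image are left alone. A minimal sketch of the same substitution on a hypothetical paragraph (again calibre's BS3-style BeautifulSoup; illustration only, not commit content):

from calibre.ebooks.BeautifulSoup import BeautifulSoup

# Invented paragraph with one text-only link and no images
para = BeautifulSoup('<p>Read the <a href="/2009/08/05/some-story.html">full story</a> today.</p>')
for a in para.findAll('a'):
    if a.img is None:
        # renderContents() yields the raw bytes inside the tag; the recipe decodes
        # them as cp1252, matching the encoding it declares for nytimes.com pages
        a.replaceWith(a.renderContents().decode('cp1252', 'replace'))

# para now serializes to: <p>Read the full story today.</p>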
def postprocess_html(self,soup, True):
# Change class="kicker" to <h3>
kicker = soup.find(True, {'class':'kicker'})
if kicker is not None :
h3Tag = Tag(soup, "h3")
h3Tag.insert(0, self.tag_to_string(kicker))
h3Tag.insert(0, kicker.contents[0])
kicker.replaceWith(h3Tag)
# Change captions to italic -1
for caption in soup.findAll(True, {'class':'caption'}) :
if caption is not None:
emTag = Tag(soup, "em")
#emTag['class'] = "caption"
#emTag['font-size-adjust'] = "-1"
emTag.insert(0, self.tag_to_string(caption))
emTag.insert(0, caption.contents[0])
hrTag = Tag(soup, 'hr')
emTag.insert(1, hrTag)
caption.replaceWith(emTag)
@@ -286,10 +294,10 @@ class NYTimes(BasicNewsRecipe):
# Change <nyt_headline> to <h2>
headline = soup.find("nyt_headline")
if headline is not None :
h2tag = Tag(soup, "h2")
h2tag['class'] = "headline"
h2tag.insert(0, self.tag_to_string(headline))
headline.replaceWith(h2tag)
tag = Tag(soup, "h2")
tag['class'] = "headline"
tag.insert(0, headline.contents[0])
soup.h1.replaceWith(tag)
# Change <h1> to <h3> - used in editorial blogs
masthead = soup.find("h1")
@@ -297,14 +305,34 @@ class NYTimes(BasicNewsRecipe):
# Nuke the href
if masthead.a is not None :
del(masthead.a['href'])
h3tag = Tag(soup, "h3")
h3tag.insert(0, self.tag_to_string(masthead))
masthead.replaceWith(h3tag)
tag = Tag(soup, "h3")
tag.insert(0, masthead.contents[0])
soup.h1.replaceWith(tag)
# Change <span class="bold"> to <b>
for subhead in soup.findAll(True, {'class':'bold'}) :
bTag = Tag(soup, "b")
bTag.insert(0, self.tag_to_string(subhead))
bTag.insert(0, subhead.contents[0])
subhead.replaceWith(bTag)
# Synthesize a section header
dsk = soup.find('meta', attrs={'name':'dsk'})
if dsk is not None and dsk.has_key('content'):
hTag = Tag(soup,'h3')
hTag['class'] = 'section'
hTag.insert(0,NavigableString(dsk['content']))
articleTag = soup.find(True, attrs={'id':'article'})
articleTag.insert(0,hTag)
# Add class="articleBody" to <div> so we can format with CSS
divTag = soup.find('div',attrs={'id':'articleBody'})
if divTag is not None :
divTag['class'] = divTag['id']
# Add class="authorId" to <div> so we can format with CSS
divTag = soup.find('div',attrs={'id':'authorId'})
if divTag is not None :
divTag['class'] = divTag['id']
return soup


@@ -3,19 +3,19 @@
__license__ = 'GPL v3'
__copyright__ = '2008, Kovid Goyal <kovid at kovidgoyal.net>'
'''
Fetches the last 7 days of featured articles from slate.com
calibre recipe for slate.com
'''
import re
import string, re, sys
from calibre import strftime
from calibre.web.feeds.recipes import BasicNewsRecipe
from calibre.ebooks.BeautifulSoup import BeautifulSoup, NavigableString, CData, Tag
from calibre.ebooks.BeautifulSoup import BeautifulSoup, NavigableString, CData, Comment, Tag
class Slate(BasicNewsRecipe):
class PeriodicalNameHere(BasicNewsRecipe):
# Method variables for customizing downloads
title = 'Slate'
description = 'A daily magazine on the Web, offering analysis and commentary about politics, news and culture.'
__author__ = 'GRiker@hotmail.com'
language = _('English')
description = 'A general-interest publication offering analysis and commentary about politics, news and culture.'
__author__ = 'GRiker'
max_articles_per_feed = 40
oldest_article = 7.0
recursions = 0
@@ -26,33 +26,58 @@ class Slate(BasicNewsRecipe):
feeds = None
no_stylesheets = True
encoding = None
language = _('English')
# Method variables for customizing feed parsing
summary_length = 250
use_embedded_content = None
# Method variables for pre/post processing of HTML
remove_tags = [ dict(name=['link','style']),
dict(id=['toolbox','site_navigation','article_bottom_tools_cntr',
'article_bottom_tools','recommend_tab2','bottom_sponsored_links',
'fray_article_discussion','bizbox_sponsored_links_bottom',
'page_rightcol','top_banner','also_in_slate_bottom','articlefooter',
'article_top_wedge','content-top','page-title',
'block-today039s-business-press-archives','block-blog-roll',
'block-also-in-tbm','block-most-popular-on-tbm','block-the-best-of-tbm',
'service-links-bottom','comments','ft']),
dict(attrs={'class':['fray_article_links','clearing','nav',
'service-links service-links-stack','yui-b last',
'read-more-comments']})]
extra_css = '.headline {text-align:left;}\n\
.byline {font:monospace; text-align:left; margin-bottom:0pt;}\n\
.dateline {text-align:left; height:0pt;}\n\
.source {align:left;}\n\
.credit {text-align:right;font-size:smaller;}\n'
preprocess_regexps = [ (re.compile(r'<p><em>Disclosure: <strong>Slate</strong> is owned by the Washington Post.*</p>',
re.DOTALL|re.IGNORECASE),
lambda match: ''),
(re.compile(r'<p><strong><em>Join the discussion about this story on.*</p>',
re.DOTALL|re.IGNORECASE),
lambda match: '') ]
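The preprocess_regexps entries above strip two pieces of recurring Slate boilerplate (the Washington Post ownership disclosure and the "join the discussion" pointer) out of the raw HTML before it is parsed. calibre applies each (pattern, replacement) pair to the downloaded page in order; the helper below is a simplified sketch of that behaviour, using an invented fragment and a tighter, non-greedy pattern (illustration only, not calibre's actual implementation):

import re

def apply_preprocess_regexps(rules, raw_html):
    # Simplified stand-in for what BasicNewsRecipe does with preprocess_regexps:
    # every (compiled pattern, replacement callable) pair is applied in order
    # to the downloaded HTML before it is turned into a soup.
    for pattern, repl in rules:
        raw_html = pattern.sub(repl, raw_html)
    return raw_html

sample = ('<p><em>Disclosure: <strong>Slate</strong> is owned by the Washington Post.</em></p>'
          '<p>Actual article text.</p>')
rules = [(re.compile(r'<p><em>Disclosure:.*?</p>', re.DOTALL | re.IGNORECASE),
          lambda match: '')]
cleaned = apply_preprocess_regexps(rules, sample)
# cleaned == '<p>Actual article text.</p>'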
match_regexps = []
# The second entry is for 'Big Money', which comes from a different site, uses different markup
keep_only_tags = [dict(attrs={ 'id':['article_top', 'article_body']}),
dict(attrs={ 'id':['content']}) ]
# The second entry is for 'Big Money', which comes from a different site, uses different markup
remove_tags = [dict(attrs={ 'id':['toolbox','recommend_tab','insider_ad_wrapper',
'article_bottom_tools_cntr','fray_article_discussion',
'fray_article_links','bottom_sponsored_links','author_bio',
'bizbox_links_bottom','ris_links_wrapper','BOXXLE']}),
dict(attrs={ 'id':['content-top','service-links-bottom','hed']}) ]
excludedDescriptionKeywords = ['Slate V','Twitter feed','podcast']
excludedTitleKeywords = ['Gabfest','Slate V','on Twitter']
excludedAuthorKeywords = []
excludedContentKeywords = ['http://twitter.com/Slate']
extra_css = '.headline {text-align:left;}\n\
.byline {font-family: monospace; \
text-align: left;\
margin-bottom: 0px;}\n\
.dateline {text-align: left; \
font-size: smaller;\
height: 0pt;}\n\
.imagewrapper {text-align: center;}\n\
.source {text-align: left;}\n\
.credit {text-align: right;\
font-size: smaller;}\n\
.article_body {text-align: left;}\n'
# Local variables to extend class
baseURL = 'http://slate.com'
section_dates = []
# class extension methods
def tag_to_strings(self, tag):
if not tag:
return ''
@@ -68,16 +93,16 @@ class Slate(BasicNewsRecipe):
strings.append(res)
return strings
def extract_sections(self):
soup = self.index_to_soup( self.baseURL )
soup_top_stories = soup.find(True, attrs={'class':'tap2_topic entry-content'})
soup = soup.find(True, attrs={'id':'toc_links_container'})
todays_section = soup.find(True, attrs={'class':'todaydateline'})
self.section_dates.append(self.tag_to_string(todays_section,use_alt=False))
self.section_dates.append(self.tag_to_string(todays_section,use_alt=False))
self.section_dates.append(self.tag_to_string(todays_section,use_alt=False))
self.section_dates.append(self.tag_to_string(todays_section,use_alt=False))
older_section_dates = soup.findAll(True, attrs={'class':'maindateline'})
for older_section in older_section_dates :
self.section_dates.append(self.tag_to_string(older_section,use_alt=False))
@@ -90,19 +115,20 @@ class Slate(BasicNewsRecipe):
sections = []
for section in section_lists :
sections.append(section)
return sections
def extract_section_articles(self, sections_html) :
# Find the containers with section content
soup = self.index_to_soup(str(sections_html))
sections = soup.findAll('ul')
articles = {}
key = None
ans = []
for (i,section) in enumerate(sections) :
# Get the section name
if section.has_key('id') :
key = self.section_dates[i]
@@ -110,14 +136,10 @@ class Slate(BasicNewsRecipe):
ans.append(key)
else :
continue
# Get the section article_list
article_list = section.findAll('li')
excludedDescriptionKeywords = ['Slate V','Twitter feed','podcast']
excludedTitleKeywords = ['Gabfest','Slate V']
excludedAuthorKeywords = ['Prudence']
# Extract the article attributes
for article in article_list :
bylines = self.tag_to_strings(article)
@@ -128,10 +150,10 @@ class Slate(BasicNewsRecipe):
author = None
description = None
pubdate = None
if len(bylines) == 2 and self.tag_to_string(article).find("Today's Papers") > 0 :
description = "A summary of what's in the major U.S. newspapers."
if len(bylines) == 3 :
author = bylines[2].strip()
author = re.sub('[\r][\n][\t][\t\t]','', author)
@@ -142,7 +164,6 @@ class Slate(BasicNewsRecipe):
if full_byline.find('major U.S. newspapers') > 0 :
description = "A summary of what's in the major U.S. newspapers."
if len(bylines) > 3 and author is not None:
author += " | "
for (i,substring) in enumerate(bylines[3:]) :
@@ -152,38 +173,41 @@ class Slate(BasicNewsRecipe):
author += " | "
# Skip articles whose descriptions contain excluded keywords
if description is not None :
excluded = re.compile('|'.join(excludedDescriptionKeywords))
if description is not None and len(self.excludedDescriptionKeywords):
excluded = re.compile('|'.join(self.excludedDescriptionKeywords))
found_excluded = excluded.search(description)
if found_excluded :
if self.verbose : self.log(" >>> skipping %s (description keyword exclusion: %s) <<<\n" % (title, found_excluded.group(0)))
continue
# Skip articles whose title contain excluded keywords
if full_title is not None :
excluded = re.compile('|'.join(excludedTitleKeywords))
if full_title is not None and len(self.excludedTitleKeywords):
excluded = re.compile('|'.join(self.excludedTitleKeywords))
#self.log("evaluating full_title: %s" % full_title)
found_excluded = excluded.search(full_title)
if found_excluded :
if self.verbose : self.log(" >>> skipping %s (title keyword exclusion: %s) <<<\n" % (title, found_excluded.group(0)))
continue
# Skip articles whose author contain excluded keywords
if author is not None :
excluded = re.compile('|'.join(excludedAuthorKeywords))
if author is not None and len(self.excludedAuthorKeywords):
excluded = re.compile('|'.join(self.excludedAuthorKeywords))
found_excluded = excluded.search(author)
if found_excluded :
if self.verbose : self.log(" >>> skipping %s (author keyword exclusion: %s) <<<\n" % (title, found_excluded.group(0)))
continue
skip_this_article = False
skip_this_article = False
# Check to make sure we're not adding a duplicate
for article in articles[key] :
if article['url'] == url :
skip_this_article = True
break
if skip_this_article :
continue
# Build the dictionary entry for this article
# Build the dictionary entry for this article
feed = key
if not articles.has_key(feed) :
articles[feed] = []
@@ -194,28 +218,34 @@ class Slate(BasicNewsRecipe):
if article['description'] is not None :
if article['description'].find('newspapers') > 0 :
articles[feed].insert(0,articles[feed].pop(i))
ans = [(key, articles[key]) for key in ans if articles.has_key(key)]
ans = self.remove_duplicates(ans)
ans = self.remove_duplicates(ans)
return ans
def flatten_document(self, ans):
flat_articles = []
for (i,section) in enumerate(ans) :
#self.log("flattening section %s: " % section[0])
for article in section[1] :
#self.log("moving %s to flat_articles[]" % article['title'])
flat_articles.append(article)
flat_section = ['All Articles', flat_articles]
flat_ans = [flat_section]
flat_ans = [flat_section]
return flat_ans
def remove_duplicates(self, ans):
# Return a stripped ans
for (i,section) in enumerate(ans) :
#self.log("section %s: " % section[0])
for article in section[1] :
#self.log("\t%s" % article['title'])
#self.log("\looking for %s" % article['url'])
for (j,subsequent_section) in enumerate(ans[i+1:]) :
for (k,subsequent_article) in enumerate(subsequent_section[1]) :
if article['url'] == subsequent_article['url'] :
#self.log( "removing %s (%s) from %s" % (subsequent_article['title'], subsequent_article['url'], subsequent_section[0]) )
del subsequent_section[1][k]
return ans
@@ -226,21 +256,80 @@ class Slate(BasicNewsRecipe):
def parse_index(self) :
sections = self.extract_sections()
section_list = self.extract_section_articles(sections)
section_list = self.flatten_document(section_list)
section_list = self.flatten_document(section_list)
return section_list
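parse_index ties the helpers together: extract_sections pulls the table-of-contents blocks from slate.com, extract_section_articles builds per-day article lists from them, and flatten_document collapses everything into a single 'All Articles' section. Whatever the helpers do internally, the value handed back to calibre has the standard parse_index shape, a list of (section title, list of article dicts), roughly as in the sketch below (title, URL and date invented for illustration):

# Shape of the data returned to calibre by parse_index (the standard
# BasicNewsRecipe contract); the concrete values are made up:
section_list = [
    ('All Articles', [
        {'title': "Today's Papers: Example Headline",
         'url': 'http://slate.com/id/1234567/',
         'date': 'Wed, 05 Aug 2009',
         'description': "A summary of what's in the major U.S. newspapers.",
         'author': 'Example Author',
         'content': ''},
        # ... one dict per downloaded article ...
    ]),
]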
def get_browser(self) :
return BasicNewsRecipe.get_browser()
def stripAnchors(self,soup):
body = soup.find('div',attrs={'id':['article_body','content']})
if body is not None:
paras = body.findAll('p')
if paras is not None:
for para in paras:
aTags = para.findAll('a')
if aTags is not None:
for a in aTags:
if a.img is None:
#print repr(a.renderContents())
a.replaceWith(a.renderContents().decode('utf-8','replace'))
return soup
def preprocess_html(self, soup) :
# Remove 'grayPlus4.png' images
imgs = soup.findAll('img')
if imgs is not None:
for img in imgs:
if re.search("grayPlus4.png",str(img)):
img.extract()
# Delete article based upon content keywords
if len(self.excludedDescriptionKeywords):
excluded = re.compile('|'.join(self.excludedContentKeywords))
found_excluded = excluded.search(str(soup))
if found_excluded :
return None
# Articles from www.thebigmoney.com use different tagging for byline, dateline and body
head = soup.find('head')
if head.link is not None and re.search('www\.thebigmoney\.com', str(head)):
byline = soup.find('div',attrs={'id':'byline'})
if byline is not None:
byline['class'] = byline['id']
dateline = soup.find('div',attrs={'id':'dateline'})
if dateline is not None:
dateline['class'] = dateline['id']
body = soup.find('div',attrs={'id':'content'})
if body is not None:
body['class'] = 'article_body'
# Synthesize a department kicker
h3Tag = Tag(soup,'h3')
emTag = Tag(soup,'em')
emTag.insert(0,NavigableString("the big money: Today's business press"))
h3Tag.insert(0,emTag)
soup.body.insert(0,h3Tag)
# Strip anchors from HTML
return self.stripAnchors(soup)
def postprocess_html(self, soup, first_fetch) :
# Fix up dept_kicker as <h3><em>
dept_kicker = soup.find(True, attrs={'class':'department_kicker'})
dept_kicker = soup.find('div', attrs={'class':'department_kicker'})
if dept_kicker is not None :
kicker_strings = self.tag_to_strings(dept_kicker)
kicker = kicker_strings[2] + kicker_strings[3]
kicker = re.sub('.','',kicker)
#kicker = kicker_strings[2] + kicker_strings[3]
kicker = ''.join(kicker_strings[2:])
kicker = re.sub('\.','',kicker)
h3Tag = Tag(soup, "h3")
emTag = Tag(soup, "em")
emTag.insert(0,NavigableString(kicker))
h3Tag.insert(0, emTag)
emTag.insert(0,kicker)
dept_kicker.replaceWith(h3Tag)
# Change <h1> to <h2>
@@ -258,17 +347,19 @@ class Slate(BasicNewsRecipe):
headline.replaceWith(h2tag)
# Fix up the concatenated byline and dateline
byline = soup.find(True,attrs={'class':'byline'})
byline = soup.find(True,attrs={'class':'byline'})
if byline is not None :
bylineTag = Tag(soup,'div')
bylineTag['class'] = 'byline'
#bylineTag['height'] = '0em'
bylineTag.insert(0,self.tag_to_string(byline))
byline.replaceWith(bylineTag)
dateline = soup.find(True, attrs={'class':'dateline'})
if dateline is not None :
datelineTag = Tag(soup, 'div')
datelineTag['class'] = 'dateline'
#datelineTag['margin-top'] = '0em'
datelineTag.insert(0,self.tag_to_string(dateline))
dateline.replaceWith(datelineTag)
@@ -280,51 +371,56 @@ class Slate(BasicNewsRecipe):
hrTag = Tag(soup, 'hr')
emTag.insert(1, hrTag)
caption.replaceWith(emTag)
# Fix photos
for photo in soup.findAll('span',attrs={'class':'imagewrapper'}):
if photo.a is not None and photo.a.img is not None:
divTag = Tag(soup,'div')
divTag['class'] ='imagewrapper'
divTag.insert(0,photo.a.img)
photo.replaceWith(divTag)
return soup
def postprocess_book(self, oeb, opts, log) :
def extract_byline(href) :
soup = BeautifulSoup(str(oeb.manifest.hrefs[href]))
byline = soup.find(True,attrs={'class':'byline'})
if byline is not None:
soup = BeautifulSoup(str(oeb.manifest.hrefs[href]))
byline = soup.find(True,attrs={'class':'byline'})
if byline is not None:
return self.tag_to_string(byline,use_alt=False)
else :
return None
return None
def extract_description(href) :
soup = BeautifulSoup(str(oeb.manifest.hrefs[href]))
paragraphs = soup.findAll('p')
for p in paragraphs :
if self.tag_to_string(p,use_alt=False).startswith('By ') or \
self.tag_to_string(p,use_alt=False).startswith('Posted '):
continue
comment = p.find(text=lambda text:isinstance(text, Comment))
if comment is not None:
continue
images = p.findAll(True, attrs={'class':'imagewrapper'})
for image in images :
image.extract()
return self.tag_to_string(p,use_alt=False)[:200] + '...'
else:
return self.tag_to_string(p,use_alt=False)[:self.summary_length] + '...'
return None
# Method entry point here
# Single section toc looks different than multi-section tocs
if oeb.toc.depth() == 2 :
for article in oeb.toc :
if article.author is None :
article.author = extract_byline(article.href)
if article.description is None :
article.description = extract_description(article.href)
elif oeb.toc.depth() == 3 :
for section in oeb.toc :
for article in section :
if article.author is None :
article.author = extract_byline(article.href)
if article.description is None :
article.description = extract_description(article.href)