Improved recipes for The BBC, Slate, and NYT Headlines
commit 22003b492c
parent d72d615e5c
@@ -10,23 +10,34 @@ from calibre.web.feeds.news import BasicNewsRecipe

class BBC(BasicNewsRecipe):
    title = u'The BBC'
    __author__ = 'Kovid Goyal and Sujata Raman'
    __author__ = 'Kovid Goyal ans Sujata Raman'
    description = 'Global news and current affairs from the British Broadcasting Corporation'
    language = _('English')
    no_stylesheets = True
    remove_tags = [dict(name='div', attrs={'class':'footer'}),
        {'id' : ['popstory','blq-footer']},
        {'class' : ['arrup','links','relatedbbcsites','arr','promobottombg','bbccom_visibility_hidden', 'sharesb', 'sib606', 'mvtb', 'storyextra', 'sidebar1', 'bbccom_text','promotopbg', 'gppromo','promotopbg','bbccom_display_none']},
        ]

    remove_tags = [dict(name='div', attrs={'class':'footer'}),]

    keep_only_tags = [dict(name='div', attrs={'class':'mainwrapper'})]

    extra_css = '''
        body{font-family:Arial,Helvetica,sans-serif; font-size:small;}
        body{font-family:Arial,Helvetica,sans-serif; font-size:small; align:left}
        h1{font-size:large;}
        .sh{font-size:large; font-weight:bold}
        .cap{font-size:xx-small; }
        .lu{font-size:xx-small; }
        .ds{font-size:xx-small; }
        .mvb{font-size:xx-small;}
        .by1{font-size:x-small; color:#666666}
        .byd{font-size:x-small;}
    '''

    feeds = [
        ('News Front Page', 'http://newsrss.bbc.co.uk/rss/newsonline_world_edition/front_page/rss.xml'),
        ('Science/Nature', 'http://newsrss.bbc.co.uk/rss/newsonline_world_edition/science/nature/rss.xml'),
        ('Technology', 'http://newsrss.bbc.co.uk/rss/newsonline_world_edition/technology/rss.xml'),
        ('Enterntainment', 'http://newsrss.bbc.co.uk/rss/newsonline_world_edition/entertainment/rss.xml'),
        ('Entertainment', 'http://newsrss.bbc.co.uk/rss/newsonline_world_edition/entertainment/rss.xml'),
        ('Magazine', 'http://newsrss.bbc.co.uk/rss/newsonline_world_edition/uk_news/magazine/rss.xml'),
        ('Business', 'http://newsrss.bbc.co.uk/rss/newsonline_world_edition/business/rss.xml'),
        ('Health', 'http://newsrss.bbc.co.uk/rss/newsonline_world_edition/health/rss.xml'),

@@ -38,8 +49,22 @@ class BBC(BasicNewsRecipe):
        ('Africa', 'http://newsrss.bbc.co.uk/rss/newsonline_world_edition/africa/rss.xml'),
    ]

    def postprocess_html(self, soup, first):

    def print_version(self, url):
        return url.replace('http://', 'http://newsvote.bbc.co.uk/mpapps/pagetools/print/')
        for tag in soup.findAll(name= 'img', alt=""):
            tag.extract()

        for item in soup.findAll(align = "right"):
            del item['align']

        for tag in soup.findAll(name=['table', 'tr', 'td']):
            tag.name = 'div'

        return soup

    # def print_version(self, url):
    # return url.replace('http://', 'http://newsvote.bbc.co.uk/mpapps/pagetools/print/')
@@ -8,7 +8,7 @@ nytimes.com
import re
from calibre import entity_to_unicode
from calibre.web.feeds.recipes import BasicNewsRecipe
from calibre.ebooks.BeautifulSoup import BeautifulSoup, Tag
from calibre.ebooks.BeautifulSoup import BeautifulSoup, Tag, NavigableString, Comment

class NYTimes(BasicNewsRecipe):

@@ -42,36 +42,39 @@ class NYTimes(BasicNewsRecipe):
    # By default, no sections are skipped.
    excludeSectionKeywords = []

    # To skip sections containing the word 'Sports' or 'Dining', use:
    # Add section keywords from the right column above to skip that section
    # For example, to skip sections containing the word 'Sports' or 'Dining', use:
    # excludeSectionKeywords = ['Sports', 'Dining']

    # Fetch only Business and Technology
    #excludeSectionKeywords = ['Arts','Dining','Editorials','Health','Magazine','Media','Region','Op-Ed','Politics','Science','Sports','Top Stories','Travel','U.S.','World']

    # excludeSectionKeywords = ['Arts','Dining','Editorials','Health','Magazine','Media','Region','Op-Ed','Politics','Science','Sports','Top Stories','Travel','U.S.','World']
    # Fetch only Top Stories
    #excludeSectionKeywords = ['Arts','Business','Dining','Editorials','Health','Magazine','Media','Region','Op-Ed','Politics','Science','Sports','Technology','Travel','U.S.','World']
    # excludeSectionKeywords = ['Arts','Business','Dining','Editorials','Health','Magazine','Media','Region','Op-Ed','Politics','Science','Sports','Technology','Travel','U.S.','World']

    # The maximum number of articles that will be downloaded
    max_articles_per_feed = 50
    max_articles_per_feed = 40

    timefmt = ''
    needs_subscription = True
    remove_tags_after = dict(attrs={'id':['comments']})
    remove_tags = [dict(attrs={'class':['articleTools', 'post-tools', 'side_tool', 'nextArticleLink',
        'clearfix', 'nextArticleLink clearfix','inlineSearchControl',
        'columnGroup','entry-meta','entry-response module','jumpLink','nav',
        'columnGroup advertisementColumnGroup', 'kicker entry-category']}),
        dict(id=['footer', 'toolsRight', 'articleInline', 'navigation', 'archive',
        'side_search', 'blog_sidebar', 'side_tool', 'side_index', 'login',
        'blog-header','searchForm','NYTLogo','insideNYTimes','adxToolSponsor',
        'adxLeaderboard']),
        dict(name=['script', 'noscript', 'style','hr'])]
    keep_only_tags = [ dict(attrs={ 'id':['article']})]
    remove_tags = [ dict(attrs={'class':['nextArticleLink clearfix', 'clearfix',
        'inlineVideo left brightcove']}),
        dict(attrs={ 'id':['toolsRight','inlineBox','sidebarArticles',
        'portfolioInline','articleInline','readerscomment']}) ]

    encoding = 'cp1252'
    no_stylesheets = True
    extra_css = '.headline {text-align:left;}\n\
        .byline {font:monospace; margin-bottom:0px;}\n\
        .source {align:left;}\n\
        .credit {text-align:right;font-size:smaller;}\n'
    extra_css = '.headline {text-align: left;}\n \
        .byline {font-family: monospace; \
        text-align: left; \
        margin-bottom: 0px;}\n \
        .timestamp {font-size: smaller;}\n \
        .source {text-align: left;}\n \
        .image {text-align: center;}\n \
        .credit {text-align: right; \
        font-size: smaller;}\n \
        .articleBody {text-align: left;}\n \
        .authorId {text-align: left; \
        font-style: italic;}\n '

    def get_browser(self):
        br = BasicNewsRecipe.get_browser()
@@ -113,6 +116,8 @@ class NYTimes(BasicNewsRecipe):
        if docEncoding == '' :
            docEncoding = self.encoding

        if self.verbose > 2:
            self.log( " document encoding: '%s'" % docEncoding)
        if docEncoding != self.encoding :
            soup = get_the_soup(docEncoding, url_or_raw)

@@ -189,7 +194,6 @@ class NYTimes(BasicNewsRecipe):
        key = self.sections[section]
        excluded = re.compile('|'.join(self.excludeSectionKeywords))
        if excluded.search(key) or articles.has_key(key):
            if self.verbose : self.log("Skipping section %s" % key)
            skipThisSection = True
            break

@@ -200,8 +204,7 @@ class NYTimes(BasicNewsRecipe):
        # Extract the bylines and descriptions
        if (i.string is not None) and \
            (i.string.strip() > "") and \
            not ('Comment' in str(i.__class__)) :

            not isinstance(i,Comment):
            contentString = i.strip().encode('utf-8')
            if contentString[0:3] == 'By ' :
                bylines.append(contentString)

@@ -212,8 +215,6 @@ class NYTimes(BasicNewsRecipe):
        articleCount = len(sectionblock.findAll('span'))
        for (i,span) in enumerate(sectionblock.findAll('span')) :
            a = span.find('a', href=True)
            #if not a:
                #continue
            url = re.sub(r'\?.*', '', a['href'])
            url += '?pagewanted=all'

@@ -234,15 +235,13 @@ class NYTimes(BasicNewsRecipe):
        # Check for duplicates
        duplicateFound = False
        if len(articles[feed]) > 1:
            #print articles[feed]
            for article in articles[feed] :
                #print "comparing %s\n %s\n" % (url, article['url'])
                if url == article['url'] :
                    duplicateFound = True
                    break
            #print

        if duplicateFound:
            # Continue fetching, don't add this article
            continue

        if not articles.has_key(feed):
@@ -253,32 +252,41 @@ class NYTimes(BasicNewsRecipe):

        ans = self.sort_index_by(ans, {'Top Stories':-1})
        ans = [(key, articles[key]) for key in ans if articles.has_key(key)]

        return ans

    def strip_anchors(self,soup):
        paras = soup.findAll(True)
        for para in paras:
            aTags = para.findAll('a')
            for a in aTags:
                if a.img is None:
                    a.replaceWith(a.renderContents().decode('cp1252','replace'))
        return soup

    def preprocess_html(self, soup):
        refresh = soup.find('meta', {'http-equiv':'refresh'})
        if refresh is None:
            return soup
            return self.strip_anchors(soup)

        content = refresh.get('content').partition('=')[2]
        raw = self.browser.open('http://www.nytimes.com'+content).read()
        return BeautifulSoup(raw.decode('cp1252', 'replace'))
        soup = BeautifulSoup(raw.decode('cp1252', 'replace'))
        return self.strip_anchors(soup)

    def postprocess_html(self,soup, True):

        # Change class="kicker" to <h3>
        kicker = soup.find(True, {'class':'kicker'})
        if kicker is not None :
            h3Tag = Tag(soup, "h3")
            h3Tag.insert(0, self.tag_to_string(kicker))
            h3Tag.insert(0, kicker.contents[0])
            kicker.replaceWith(h3Tag)

        # Change captions to italic -1
        for caption in soup.findAll(True, {'class':'caption'}) :
            if caption is not None:
                emTag = Tag(soup, "em")
                #emTag['class'] = "caption"
                #emTag['font-size-adjust'] = "-1"
                emTag.insert(0, self.tag_to_string(caption))
                emTag.insert(0, caption.contents[0])
                hrTag = Tag(soup, 'hr')
                emTag.insert(1, hrTag)
                caption.replaceWith(emTag)

@@ -286,10 +294,10 @@ class NYTimes(BasicNewsRecipe):
        # Change <nyt_headline> to <h2>
        headline = soup.find("nyt_headline")
        if headline is not None :
            h2tag = Tag(soup, "h2")
            h2tag['class'] = "headline"
            h2tag.insert(0, self.tag_to_string(headline))
            headline.replaceWith(h2tag)
            tag = Tag(soup, "h2")
            tag['class'] = "headline"
            tag.insert(0, headline.contents[0])
            soup.h1.replaceWith(tag)

        # Change <h1> to <h3> - used in editorial blogs
        masthead = soup.find("h1")

@@ -297,14 +305,34 @@ class NYTimes(BasicNewsRecipe):
            # Nuke the href
            if masthead.a is not None :
                del(masthead.a['href'])
            h3tag = Tag(soup, "h3")
            h3tag.insert(0, self.tag_to_string(masthead))
            masthead.replaceWith(h3tag)
            tag = Tag(soup, "h3")
            tag.insert(0, masthead.contents[0])
            soup.h1.replaceWith(tag)

        # Change <span class="bold"> to <b>
        for subhead in soup.findAll(True, {'class':'bold'}) :
            bTag = Tag(soup, "b")
            bTag.insert(0, self.tag_to_string(subhead))
            bTag.insert(0, subhead.contents[0])
            subhead.replaceWith(bTag)

        # Synthesize a section header
        dsk = soup.find('meta', attrs={'name':'dsk'})
        if dsk is not None and dsk.has_key('content'):
            hTag = Tag(soup,'h3')
            hTag['class'] = 'section'
            hTag.insert(0,NavigableString(dsk['content']))
            articleTag = soup.find(True, attrs={'id':'article'})
            articleTag.insert(0,hTag)

        # Add class="articleBody" to <div> so we can format with CSS
        divTag = soup.find('div',attrs={'id':'articleBody'})
        if divTag is not None :
            divTag['class'] = divTag['id']

        # Add class="authorId" to <div> so we can format with CSS
        divTag = soup.find('div',attrs={'id':'authorId'})
        if divTag is not None :
            divTag['class'] = divTag['id']

        return soup
@@ -3,19 +3,19 @@
__license__ = 'GPL v3'
__copyright__ = '2008, Kovid Goyal <kovid at kovidgoyal.net>'
'''
Fetches the last 7 days of featured articles from slate.com
calibre recipe for slate.com
'''

import re
import string, re, sys
from calibre import strftime
from calibre.web.feeds.recipes import BasicNewsRecipe
from calibre.ebooks.BeautifulSoup import BeautifulSoup, NavigableString, CData, Tag
from calibre.ebooks.BeautifulSoup import BeautifulSoup, NavigableString, CData, Comment, Tag

class Slate(BasicNewsRecipe):
class PeriodicalNameHere(BasicNewsRecipe):
    # Method variables for customizing downloads
    title = 'Slate'
    description = 'A daily magazine on the Web, offering analysis and commentary about politics, news and culture.'
    __author__ = 'GRiker@hotmail.com'
    language = _('English')
    description = 'A general-interest publication offering analysis and commentary about politics, news and culture.'
    __author__ = 'GRiker'
    max_articles_per_feed = 40
    oldest_article = 7.0
    recursions = 0

@@ -26,33 +26,58 @@ class Slate(BasicNewsRecipe):
    feeds = None
    no_stylesheets = True
    encoding = None
    language = _('English')

    # Method variables for customizing feed parsing
    summary_length = 250
    use_embedded_content = None

    # Method variables for pre/post processing of HTML
    remove_tags = [ dict(name=['link','style']),
        dict(id=['toolbox','site_navigation','article_bottom_tools_cntr',
        'article_bottom_tools','recommend_tab2','bottom_sponsored_links',
        'fray_article_discussion','bizbox_sponsored_links_bottom',
        'page_rightcol','top_banner','also_in_slate_bottom','articlefooter',
        'article_top_wedge','content-top','page-title',
        'block-today039s-business-press-archives','block-blog-roll',
        'block-also-in-tbm','block-most-popular-on-tbm','block-the-best-of-tbm',
        'service-links-bottom','comments','ft']),
        dict(attrs={'class':['fray_article_links','clearing','nav',
        'service-links service-links-stack','yui-b last',
        'read-more-comments']})]
    extra_css = '.headline {text-align:left;}\n\
        .byline {font:monospace; text-align:left; margin-bottom:0pt;}\n\
        .dateline {text-align:left; height:0pt;}\n\
        .source {align:left;}\n\
        .credit {text-align:right;font-size:smaller;}\n'
    preprocess_regexps = [ (re.compile(r'<p><em>Disclosure: <strong>Slate</strong> is owned by the Washington Post.*</p>',
        re.DOTALL|re.IGNORECASE),
        lambda match: ''),
        (re.compile(r'<p><strong><em>Join the discussion about this story on.*</p>',
        re.DOTALL|re.IGNORECASE),
        lambda match: '') ]

    match_regexps = []

    # The second entry is for 'Big Money', which comes from a different site, uses different markup
    keep_only_tags = [dict(attrs={ 'id':['article_top', 'article_body']}),
        dict(attrs={ 'id':['content']}) ]

    # The second entry is for 'Big Money', which comes from a different site, uses different markup
    remove_tags = [dict(attrs={ 'id':['toolbox','recommend_tab','insider_ad_wrapper',
        'article_bottom_tools_cntr','fray_article_discussion',
        'fray_article_links','bottom_sponsored_links','author_bio',
        'bizbox_links_bottom','ris_links_wrapper','BOXXLE']}),
        dict(attrs={ 'id':['content-top','service-links-bottom','hed']}) ]

    excludedDescriptionKeywords = ['Slate V','Twitter feed','podcast']
    excludedTitleKeywords = ['Gabfest','Slate V','on Twitter']
    excludedAuthorKeywords = []
    excludedContentKeywords = ['http://twitter.com/Slate']

    extra_css = '.headline {text-align:left;}\n\
        .byline {font-family: monospace; \
        text-align: left;\
        margin-bottom: 0px;}\n\
        .dateline {text-align: left; \
        font-size: smaller;\
        height: 0pt;}\n\
        .imagewrapper {text-align: center;}\n\
        .source {text-align: left;}\n\
        .credit {text-align: right;\
        font-size: smaller;}\n\
        .article_body {text-align: left;}\n'

    # Local variables to extend class
    baseURL = 'http://slate.com'
    section_dates = []

    # class extension methods
    def tag_to_strings(self, tag):
        if not tag:
            return ''
@@ -68,9 +93,9 @@ class Slate(BasicNewsRecipe):
            strings.append(res)
        return strings

    def extract_sections(self):
        soup = self.index_to_soup( self.baseURL )

        soup_top_stories = soup.find(True, attrs={'class':'tap2_topic entry-content'})
        soup = soup.find(True, attrs={'id':'toc_links_container'})

@@ -90,13 +115,14 @@ class Slate(BasicNewsRecipe):
        sections = []
        for section in section_lists :
            sections.append(section)

        return sections

    def extract_section_articles(self, sections_html) :
        # Find the containers with section content
        soup = self.index_to_soup(str(sections_html))
        sections = soup.findAll('ul')

        articles = {}
        key = None
        ans = []

@@ -114,10 +140,6 @@ class Slate(BasicNewsRecipe):
            # Get the section article_list
            article_list = section.findAll('li')

            excludedDescriptionKeywords = ['Slate V','Twitter feed','podcast']
            excludedTitleKeywords = ['Gabfest','Slate V']
            excludedAuthorKeywords = ['Prudence']

            # Extract the article attributes
            for article in article_list :
                bylines = self.tag_to_strings(article)

@@ -142,7 +164,6 @@ class Slate(BasicNewsRecipe):
                if full_byline.find('major U.S. newspapers') > 0 :
                    description = "A summary of what's in the major U.S. newspapers."

                if len(bylines) > 3 and author is not None:
                    author += " | "
                    for (i,substring) in enumerate(bylines[3:]) :

@@ -152,25 +173,28 @@ class Slate(BasicNewsRecipe):
                        author += " | "

                # Skip articles whose descriptions contain excluded keywords
                if description is not None :
                    excluded = re.compile('|'.join(excludedDescriptionKeywords))
                if description is not None and len(self.excludedDescriptionKeywords):
                    excluded = re.compile('|'.join(self.excludedDescriptionKeywords))
                    found_excluded = excluded.search(description)
                    if found_excluded :
                        if self.verbose : self.log(" >>> skipping %s (description keyword exclusion: %s) <<<\n" % (title, found_excluded.group(0)))
                        continue

                # Skip articles whose title contain excluded keywords
                if full_title is not None :
                    excluded = re.compile('|'.join(excludedTitleKeywords))
                if full_title is not None and len(self.excludedTitleKeywords):
                    excluded = re.compile('|'.join(self.excludedTitleKeywords))
                    #self.log("evaluating full_title: %s" % full_title)
                    found_excluded = excluded.search(full_title)
                    if found_excluded :
                        if self.verbose : self.log(" >>> skipping %s (title keyword exclusion: %s) <<<\n" % (title, found_excluded.group(0)))
                        continue

                # Skip articles whose author contain excluded keywords
                if author is not None :
                    excluded = re.compile('|'.join(excludedAuthorKeywords))
                if author is not None and len(self.excludedAuthorKeywords):
                    excluded = re.compile('|'.join(self.excludedAuthorKeywords))
                    found_excluded = excluded.search(author)
                    if found_excluded :
                        if self.verbose : self.log(" >>> skipping %s (author keyword exclusion: %s) <<<\n" % (title, found_excluded.group(0)))
                        continue

                skip_this_article = False
@@ -203,19 +227,25 @@ class Slate(BasicNewsRecipe):
    def flatten_document(self, ans):
        flat_articles = []
        for (i,section) in enumerate(ans) :
            #self.log("flattening section %s: " % section[0])
            for article in section[1] :
                #self.log("moving %s to flat_articles[]" % article['title'])
                flat_articles.append(article)
        flat_section = ['All Articles', flat_articles]
        flat_ans = [flat_section]

        return flat_ans

    def remove_duplicates(self, ans):
        # Return a stripped ans
        for (i,section) in enumerate(ans) :
            #self.log("section %s: " % section[0])
            for article in section[1] :
                #self.log("\t%s" % article['title'])
                #self.log("\looking for %s" % article['url'])
                for (j,subsequent_section) in enumerate(ans[i+1:]) :
                    for (k,subsequent_article) in enumerate(subsequent_section[1]) :
                        if article['url'] == subsequent_article['url'] :
                            #self.log( "removing %s (%s) from %s" % (subsequent_article['title'], subsequent_article['url'], subsequent_section[0]) )
                            del subsequent_section[1][k]
        return ans

@@ -229,18 +259,77 @@ class Slate(BasicNewsRecipe):
        section_list = self.flatten_document(section_list)
        return section_list

    def get_browser(self) :
        return BasicNewsRecipe.get_browser()

    def stripAnchors(self,soup):
        body = soup.find('div',attrs={'id':['article_body','content']})
        if body is not None:
            paras = body.findAll('p')
            if paras is not None:
                for para in paras:
                    aTags = para.findAll('a')
                    if aTags is not None:
                        for a in aTags:
                            if a.img is None:
                                #print repr(a.renderContents())
                                a.replaceWith(a.renderContents().decode('utf-8','replace'))
        return soup

    def preprocess_html(self, soup) :

        # Remove 'grayPlus4.png' images
        imgs = soup.findAll('img')
        if imgs is not None:
            for img in imgs:
                if re.search("grayPlus4.png",str(img)):
                    img.extract()

        # Delete article based upon content keywords
        if len(self.excludedDescriptionKeywords):
            excluded = re.compile('|'.join(self.excludedContentKeywords))
            found_excluded = excluded.search(str(soup))
            if found_excluded :
                return None

        # Articles from www.thebigmoney.com use different tagging for byline, dateline and body
        head = soup.find('head')
        if head.link is not None and re.search('www\.thebigmoney\.com', str(head)):
            byline = soup.find('div',attrs={'id':'byline'})
            if byline is not None:
                byline['class'] = byline['id']

            dateline = soup.find('div',attrs={'id':'dateline'})
            if dateline is not None:
                dateline['class'] = dateline['id']

            body = soup.find('div',attrs={'id':'content'})
            if body is not None:
                body['class'] = 'article_body'

            # Synthesize a department kicker
            h3Tag = Tag(soup,'h3')
            emTag = Tag(soup,'em')
            emTag.insert(0,NavigableString("the big money: Today's business press"))
            h3Tag.insert(0,emTag)
            soup.body.insert(0,h3Tag)

        # Strip anchors from HTML
        return self.stripAnchors(soup)

    def postprocess_html(self, soup, first_fetch) :

        # Fix up dept_kicker as <h3><em>
        dept_kicker = soup.find(True, attrs={'class':'department_kicker'})
        dept_kicker = soup.find('div', attrs={'class':'department_kicker'})
        if dept_kicker is not None :
            kicker_strings = self.tag_to_strings(dept_kicker)
            kicker = kicker_strings[2] + kicker_strings[3]
            kicker = re.sub('.','',kicker)
            #kicker = kicker_strings[2] + kicker_strings[3]
            kicker = ''.join(kicker_strings[2:])
            kicker = re.sub('\.','',kicker)
            h3Tag = Tag(soup, "h3")
            emTag = Tag(soup, "em")
            emTag.insert(0,NavigableString(kicker))
            h3Tag.insert(0, emTag)
            emTag.insert(0,kicker)
            dept_kicker.replaceWith(h3Tag)

        # Change <h1> to <h2>
@@ -262,6 +351,7 @@ class Slate(BasicNewsRecipe):
        if byline is not None :
            bylineTag = Tag(soup,'div')
            bylineTag['class'] = 'byline'
            #bylineTag['height'] = '0em'
            bylineTag.insert(0,self.tag_to_string(byline))
            byline.replaceWith(bylineTag)

@@ -269,6 +359,7 @@ class Slate(BasicNewsRecipe):
        if dateline is not None :
            datelineTag = Tag(soup, 'div')
            datelineTag['class'] = 'dateline'
            #datelineTag['margin-top'] = '0em'
            datelineTag.insert(0,self.tag_to_string(dateline))
            dateline.replaceWith(datelineTag)

@@ -281,6 +372,14 @@ class Slate(BasicNewsRecipe):
            emTag.insert(1, hrTag)
            caption.replaceWith(emTag)

        # Fix photos
        for photo in soup.findAll('span',attrs={'class':'imagewrapper'}):
            if photo.a is not None and photo.a.img is not None:
                divTag = Tag(soup,'div')
                divTag['class'] ='imagewrapper'
                divTag.insert(0,photo.a.img)
                photo.replaceWith(divTag)

        return soup

    def postprocess_book(self, oeb, opts, log) :

@@ -300,31 +399,28 @@ class Slate(BasicNewsRecipe):
            if self.tag_to_string(p,use_alt=False).startswith('By ') or \
               self.tag_to_string(p,use_alt=False).startswith('Posted '):
                continue

            images = p.findAll(True, attrs={'class':'imagewrapper'})
            for image in images :
                image.extract()
            return self.tag_to_string(p,use_alt=False)[:200] + '...'
            comment = p.find(text=lambda text:isinstance(text, Comment))
            if comment is not None:
                continue
            else:
                return self.tag_to_string(p,use_alt=False)[:self.summary_length] + '...'

            return None

        # Method entry point here
        # Single section toc looks different than multi-section tocs
        if oeb.toc.depth() == 2 :
            for article in oeb.toc :
                if article.author is None :
                    article.author = extract_byline(article.href)

                if article.description is None :
                    article.description = extract_description(article.href)

        elif oeb.toc.depth() == 3 :
            for section in oeb.toc :
                for article in section :
                    if article.author is None :
                        article.author = extract_byline(article.href)

                    if article.description is None :
                        article.description = extract_description(article.href)