#!/usr/bin/env python2
# -*- coding: utf-8 -*-
__license__ = 'GPL v3'
__copyright__ = '2008, Kovid Goyal <kovid at kovidgoyal.net>'
'''
nytimes.com
'''
import re, string, time
from calibre import strftime
from datetime import timedelta, date
from time import sleep
from calibre.web.feeds.recipes import BasicNewsRecipe
from calibre.ebooks.BeautifulSoup import BeautifulSoup, Tag, BeautifulStoneSoup

class NYTimes(BasicNewsRecipe):

    recursions=1 # set this to zero to omit Related articles lists
    match_regexps=[r'/[12][0-9][0-9][0-9]/[0-9]+/'] # speeds up processing by preventing index page links from being followed

    # set getTechBlogs to True to include the technology blogs
    # set tech_oldest_article to control article age
    # set tech_max_articles_per_feed to control article count
    getTechBlogs = True
    remove_empty_feeds = True
    tech_oldest_article = 14
    tech_max_articles_per_feed = 25

    # set getPopularArticles to False if you don't want the Most E-mailed and Most Viewed articles
    # otherwise you will get up to 20 of the most popular e-mailed and viewed articles (in each category)
    getPopularArticles = True
    popularPeriod = '1' # set this to the number of days to include in the measurement
    # e.g. 7 will get the most popular measured over the last 7 days
    # and 30 will get the most popular measured over 30 days.
    # you still only get up to 20 articles in each category

    # set headlinesOnly to True for the headlines-only version. If True, webEdition is ignored.
    headlinesOnly = True

    # set webEdition to True for the Web edition of the newspaper. Set oldest_web_article to the
    # number of days old an article can be for inclusion. If oldest_web_article = None all articles
    # will be included. Note: oldest_web_article is ignored if webEdition = False
    webEdition = False
    oldest_web_article = None

    # download higher resolution images than the small thumbnails typically included in the article
    # the downside of having large, beautiful images is that the file size is much larger, on the order of 7 MB per paper
    useHighResImages = True
    compress_news_images = True
    compress_news_images_auto_size = 5

    # replace the paid Kindle version: the name will be changed to "The New York Times" so that
    # previous paid versions of the New York Times are sent to the back issues folder on the Kindle
    replaceKindleVersion = False

    # includeSections: List of sections to include. If empty, all sections found will be included.
    # Otherwise, only the sections named will be included. For example,
    #
    #    includeSections = ['Politics','Sports']
    #
    # would cause only the Politics and Sports sections to be included.

    includeSections = []  # by default, all sections included

    # excludeSections: List of sections to exclude. If empty, all sections found will be included.
    # Otherwise, the sections named will be excluded. For example,
    #
    #    excludeSections = ['Politics','Sports']
    #
    # would cause the Politics and Sports sections to be excluded. This parameter can be used
    # in conjunction with includeSections, although in most cases using one or the other, but
    # not both, is sufficient.

    excludeSections = []

    # one_picture_per_article specifies that calibre should only use the first image
    # from an article (if one exists). If one_picture_per_article = True, the image
    # will be moved to a location between the headline and the byline.
    # If one_picture_per_article = False, all images from the article will be included
    # and shown in their original location.
    one_picture_per_article = False

    # The maximum number of articles that will be downloaded
    max_articles_per_feed = 100
    use_embedded_content = False

    # Whether to omit duplicates of articles (typically arising when articles are indexed in
    # more than one section). If True, only the first occurrence will be downloaded.
    filterDuplicates = True

    # Sections to collect for the Web edition.
    # Delete any you don't want, or use includeSections or excludeSections
    web_sections = [(u'World',u'world'),
                    (u'U.S.',u'national'),
                    (u'Politics',u'politics'),
                    (u'New York',u'nyregion'),
                    (u'Business','business'),
                    (u'Technology',u'technology'),
                    (u'Sports',u'sports'),
                    (u'Science',u'science'),
                    (u'Health',u'health'),
                    (u'Opinion',u'opinion'),
                    (u'Arts',u'arts'),
                    (u'Books',u'books'),
                    (u'Movies',u'movies'),
                    (u'Music',u'arts/music'),
                    (u'Television',u'arts/television'),
                    (u'Style',u'style'),
                    (u'Dining & Wine',u'dining'),
                    (u'Fashion & Style',u'fashion'),
                    (u'Home & Garden',u'garden'),
                    (u'Travel',u'travel'),
                    ('Education',u'education'),
                    ('Multimedia',u'multimedia'),
                    (u'Obituaries',u'obituaries'),
                    (u'Sunday Magazine',u'magazine')
                    ]

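    # Illustrative note (not part of the original recipe): each (title, slug)
    # pair above is turned by parse_web_edition() into an index URL of the form
    #
    #   http://www.nytimes.com/pages/<slug>/index.html
    #
    # e.g. (u'Science', u'science') -> http://www.nytimes.com/pages/science/index.html
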
    tech_feeds = [
        (u'Tech - News', u'http://pogue.blogs.nytimes.com/feed/'),
        (u'Tech - Bits', u'http://bits.blogs.nytimes.com/feed/'),
        (u'Tech - Gadgetwise', u'http://gadgetwise.blogs.nytimes.com/feed/'),
        (u'Tech - Open', u'http://open.blogs.nytimes.com/feed/')
    ]

    if headlinesOnly:
        title='New York Times Headlines'
        description = 'Headlines from the New York Times'
        needs_subscription = False
    elif webEdition:
        title='New York Times (Web)'
        description = 'New York Times on the Web'
        needs_subscription = False
    elif replaceKindleVersion:
        title='The New York Times'
        description = 'Today\'s New York Times'
        needs_subscription = False
    else:
        title='New York Times'
        description = 'Today\'s New York Times'
        needs_subscription = False

    def decode_url_date(self,url):
        urlitems = url.split('/')
        try:
            d = date(int(urlitems[3]),int(urlitems[4]),int(urlitems[5]))
        except:
            try:
                d = date(int(urlitems[4]),int(urlitems[5]),int(urlitems[6]))
            except:
                return None
        return d

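    # Illustrative sketch (not in the original recipe): decode_url_date() relies
    # on the publication date embedded in canonical NYT article URLs; for a
    # recipe instance r:
    #
    #   >>> r.decode_url_date('http://www.nytimes.com/2013/05/21/world/example.html')
    #   datetime.date(2013, 5, 21)
    #
    # URLs without a /YYYY/MM/DD/ path component fail both index attempts and
    # yield None.
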
    if oldest_web_article is None:
        earliest_date = date.today()
    else:
        earliest_date = date.today() - timedelta(days=oldest_web_article)
    oldest_article = 365 # by default, a long time ago

    __author__ = 'GRiker/Kovid Goyal/Nick Redding'
    language = 'en'
    requires_version = (0, 7, 5)
    encoding = 'utf-8'

    timefmt = ''

    # simultaneous_downloads = 1 # no longer required to deal with ads

    cover_margins = (18,18,'grey99')

    keep_only_tags = dict(id=['article', 'story', 'content'])
    remove_tags = [
        dict(attrs={'class':[
            'articleFooter',
            'articleTools',
            'rfd', 'story-footer-links', 'page-footer',
            'columnGroup singleRule',
            'columnGroup last',
            'columnGroup last',
            'doubleRule',
            'dottedLine',
            'entry-meta',
            'entry-response module',
            'leftNavTabs',
            'metaFootnote',
            'inside-story',
            'module box nav',
            'nextArticleLink',
            'nextArticleLink clearfix',
            'post-tools',
            'relatedSearchesModule',
            'side_tool',
            'singleAd',
            'postCategory column',
            'refer tagRefer', # added for bits blog post
            'entry entry-utility', # added for DealBook
            'entry-tags', # added for DealBook
            'footer promos clearfix', # added for DealBook
            'footer links clearfix', # added for DealBook
            'tabsContainer', # added for other blog downloads
            'column lastColumn', # added for other blog downloads
            'pageHeaderWithLabel', # added for other gadgetwise downloads
            'column two', # added for other blog downloads
            'column two last', # added for other blog downloads
            'column three', # added for other blog downloads
            'column three last', # added for other blog downloads
            'column four', # added for other blog downloads
            'column four last', # added for other blog downloads
            'column last', # added for other blog downloads
            'entry entry-related',
            'subNavigation tabContent active', # caucus blog navigation
            'mediaOverlay slideshow',
            'wideThumb',
            'video', # added 02-11-2011
            'videoHeader', # added 02-11-2011
            'articleInlineVideoHolder', # added 02-11-2011
            'assetCompanionAd',
            'nytint-sectionHeader',
            re.compile('^subNavigation'),
            re.compile('^leaderboard'),
            re.compile('^module'),
            re.compile('commentCount'),
            'lede-container',
            'credit'
        ]}),
        dict(attrs={'class':lambda x: x and 'related-coverage-marginalia' in x.split()}),
        dict(attrs={'class':lambda x: x and 'interactive' in x.split()}),
        dict(attrs={'class':lambda x: x and 'sharetools' in x.split()}),
        dict(attrs={'class':lambda x: x and 'ad' in x.split()}),
        dict(name='div', attrs={'class':re.compile('toolsList')}), # bits
        dict(name='div', attrs={'class':re.compile('postNavigation')}), # bits
        dict(name='div', attrs={'class':'tweet'}),
        dict(name='span', attrs={'class':'commentCount meta'}),
        dict(name='div', attrs={'id':'header'}),
        dict(name='div', attrs={'id':re.compile('commentsContainer')}), # bits, pogue, gadgetwise, open
        dict(name='ul', attrs={'class':re.compile('entry-tools')}), # pogue, gadgetwise
        dict(name='div', attrs={'class':re.compile('nocontent')}), # pogue, gadgetwise
        dict(name='div', attrs={'id':re.compile('respond')}), # open
        dict(name='div', attrs={'class':re.compile('entry-tags')}), # pogue
        dict(id=[
            'adxLeaderboard',
            'pagelinks',
            'adxSponLink',
            'anchoredAd_module',
            'anchoredAd_spot',
            'archive',
            'articleExtras',
            'articleInline',
            'blog_sidebar',
            'businessSearchBar',
            'cCol',
            'entertainmentSearchBar',
            'footer',
            'header',
            'header_search',
            'inlineBox',
            'login',
            'masthead',
            'masthead-nav',
            'memberTools',
            'navigation', 'navigation-ghost', 'navigation-modal', 'navigation-edge',
            'page-footer',
            'portfolioInline',
            'readerReviews',
            'readerReviewsCount',
            'relatedArticles',
            'relatedTopics',
            'respond',
            'ribbon',
            'side_search',
            'side_index',
            'side_tool',
            'toolsRight',
            'skybox', # added for DealBook
            'TopAd', # added for DealBook
            'related-content', # added for DealBook
            'whats-next',
        ]),
        dict(name=['script', 'noscript', 'style','form','hr', 'button', 'meta', 'footer'])]
    no_stylesheets = True
    extra_css = '''
                .articleHeadline { text-align: left; margin-top:0.5em; margin-bottom:0.25em; }
                .credit { font-weight: normal; text-align: right; font-size:
                50%; line-height:1em; margin-top:5px; margin-left:0;
                margin-right:0; margin-bottom: 0; }
                .byline { text-align: left; font-size: 50%; line-height:1em; margin-top:10px; margin-left:0; margin-right:0; margin-bottom: 0; }
                .dateline { text-align: left; font-size: 50%; line-height:1em;margin-top:5px; margin-left:0; margin-right:0; margin-bottom: 0; }
                .kicker { font-size: 50%; line-height:1em;margin-top:5px; margin-left:0; margin-right:0; margin-bottom: 0; }
                .timestamp { font-weight: normal; text-align: left; font-size: 50%; }
                .caption { font-size: 50%; font-style:italic; line-height:1em; margin-top:5px; margin-left:0; margin-right:0; margin-bottom: 0; }
                a:link {text-decoration: none; }
                .date{font-size: 50%; }
                .update{font-size: 50%; }
                .articleBody { }
                .authorId {text-align: left; font-size: 50%; }
                .image {text-align: center;}
                .aside {color:blue;margin:0px 0px 0px 0px; padding: 0px 0px 0px 0px; font-size:100%;}
                .asidenote {color:blue;margin:0px 0px 0px 0px; padding: 0px 0px 0px 0px; font-size:100%;font-weight:bold;}
                .source {text-align: left; font-size: x-small; }'''

    articles = {}
    key = None
    ans = []
    url_list = []

    def filter_ans(self, ans) :
        total_article_count = 0
        idx = 0
        idx_max = len(ans)-1
        while idx <= idx_max:
            if self.includeSections != []:
                if ans[idx][0] not in self.includeSections:
                    print "SECTION NOT INCLUDED: ",ans[idx][0]
                    del ans[idx]
                    idx_max = idx_max-1
                    continue
            if ans[idx][0] in self.excludeSections:
                print "SECTION EXCLUDED: ",ans[idx][0]
                del ans[idx]
                idx_max = idx_max-1
                continue
            if True: # self.verbose
                self.log("Section %s: %d articles" % (ans[idx][0], len(ans[idx][1])))
            for article in ans[idx][1]:
                total_article_count += 1
                if True: # self.verbose
                    self.log("\t%-40.40s... \t%-60.60s..." % (article['title'].encode('cp1252','replace'),
                             article['url'].encode('cp1252','replace')))
            idx = idx+1

        self.log("Queued %d articles" % total_article_count)
        return ans

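    # Illustrative note (not in the original recipe): filter_ans() receives and
    # returns the structure that parse_index() must ultimately produce, i.e. a
    # list of (section_title, article_list) tuples such as
    #
    #   [('World', [{'title': u'...', 'url': u'...', 'date': u'...',
    #                'description': u'...', 'author': u'...', 'content': u''}]),
    #    ('Sports', [...])]
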
    def exclude_url(self,url):
        if not url.startswith("http"):
            return True
        if not url.endswith(".html") and 'dealbook.nytimes.com' not in url: # added for DealBook
            return True
        if 'nytimes.com' not in url:
            return True
        if 'podcast' in url:
            return True
        if '/video/' in url:
            return True
        if '/multimedia/' in url:
            return True
        if '/slideshow/' in url:
            return True
        if '/magazine/index' in url:
            return True
        if '/interactive/' in url:
            return True
        if '/reference/' in url:
            return True
        if '/premium/' in url:
            return True
        if '#comment' in url:
            return True
        if '#postComment' in url:
            return True
        if '#postcomment' in url:
            return True
        if re.search('/\d\d\d\d/\d\d/\d\d/',url) is None:
            print("NO DATE IN "+url)
            return True
        return False

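    # Illustrative sketch (not in the original recipe) of how exclude_url()
    # behaves for a couple of typical links, given a recipe instance r:
    #
    #   >>> r.exclude_url('http://www.nytimes.com/2013/05/21/world/example.html')
    #   False
    #   >>> r.exclude_url('http://www.nytimes.com/video/world/clip.html')
    #   True
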
    def fixChars(self,string):
        # Replace lsquo (\x91)
        fixed = re.sub("\x91","‘",string)

        # Replace rsquo (\x92)
        fixed = re.sub("\x92","’",fixed)

        # Replace ldquo (\x93)
        fixed = re.sub("\x93","“",fixed)

        # Replace rdquo (\x94)
        fixed = re.sub("\x94","”",fixed)

        # Replace ndash (\x96)
        fixed = re.sub("\x96","–",fixed)

        # Replace mdash (\x97)
        fixed = re.sub("\x97","—",fixed)

        return fixed

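    # Illustrative note (not in the original recipe): fixChars() maps stray
    # Windows-1252 punctuation bytes to their typographic equivalents, i.e.
    # \x91/\x92 become curly single quotes, \x93/\x94 curly double quotes, and
    # \x96/\x97 en and em dashes.
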
    def get_browser(self):
        br = BasicNewsRecipe.get_browser(self)
        return br

    cover_tag = 'NY_NYT'
    def get_cover_url(self):
        from datetime import timedelta, date
        cover = 'http://webmedia.newseum.org/newseum-multimedia/dfp/jpg'+str(date.today().day)+'/lg/'+self.cover_tag+'.jpg'
        br = BasicNewsRecipe.get_browser(self)
        daysback=1
        try:
            br.open(cover)
        except:
            while daysback<7:
                cover = 'http://webmedia.newseum.org/newseum-multimedia/dfp/jpg'+str((date.today() - timedelta(days=daysback)).day)+'/lg/'+self.cover_tag+'.jpg'
                br = BasicNewsRecipe.get_browser(self)
                try:
                    br.open(cover)
                except:
                    daysback = daysback+1
                    continue
                break
        if daysback==7:
            self.log("\nCover unavailable")
            cover = None
        return cover

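    # Illustrative note (not in the original recipe): the cover is fetched from
    # the Newseum front-page archive, keyed only by day of month; for the 5th of
    # the month the first URL tried would be
    #
    #   http://webmedia.newseum.org/newseum-multimedia/dfp/jpg5/lg/NY_NYT.jpg
    #
    # and up to six earlier days are tried before giving up.
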
    masthead_url = 'http://graphics8.nytimes.com/images/misc/nytlogo379x64.gif'

    def short_title(self):
        return self.title

    def article_to_soup(self, url_or_raw, raw=False):
        from contextlib import closing
        import copy
        from calibre.ebooks.chardet import xml_to_unicode
        print("ARTICLE_TO_SOUP "+url_or_raw)
        if re.match(r'\w+://', url_or_raw):
            br = self.clone_browser(self.browser)
            open_func = getattr(br, 'open_novisit', br.open)
            with closing(open_func(url_or_raw)) as f:
                _raw = f.read()
            if not _raw:
                raise RuntimeError('Could not fetch index from %s'%url_or_raw)
        else:
            _raw = url_or_raw
        if raw:
            return _raw
        if not isinstance(_raw, unicode) and self.encoding:
            if callable(self.encoding):
                _raw = self.encoding(_raw)
            else:
                _raw = _raw.decode(self.encoding, 'replace')

        nmassage = copy.copy(BeautifulSoup.MARKUP_MASSAGE)
        nmassage.extend(self.preprocess_regexps)
        nmassage += [(re.compile(r'<!DOCTYPE .+?>', re.DOTALL), lambda m: '')]
        # Some websites have buggy doctype declarations that mess up beautifulsoup
        # Remove comments as they can leave detritus when extracting tags leaves
        # multiple nested comments
        nmassage.append((re.compile(r'<!--.*?-->', re.DOTALL), lambda m: ''))
        usrc = xml_to_unicode(_raw, self.verbose, strip_encoding_pats=True)[0]
        usrc = self.preprocess_raw_html(usrc, url_or_raw)
        return BeautifulSoup(usrc, markupMassage=nmassage)

    def massageNCXText(self, description):
        # Kindle TOC descriptions won't render certain characters
        if description:
            massaged = unicode(BeautifulStoneSoup(description, convertEntities=BeautifulStoneSoup.HTML_ENTITIES))
            # Replace '&amp;' with '&'
            massaged = re.sub("&amp;","&", massaged)
            return self.fixChars(massaged)
        else:
            return description

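    # Illustrative sketch (not in the original recipe): massageNCXText() is used
    # on TOC descriptions; for a recipe instance r, a call such as
    # r.massageNCXText(u'Markets rise &amp; fall') returns u'Markets rise & fall'
    # with the entity decoded and stray cp1252 punctuation normalised.
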
    def feed_title(self,div):
        return ''.join(div.findAll(text=True, recursive=True)).strip()

    def handle_article(self,div):
        thumbnail = div.find('div','thumbnail')
        if thumbnail:
            thumbnail.extract()
        a = div.find('a', href=True)
        if not a:
            return
        url = re.sub(r'\?.*', '', a['href'])
        if self.exclude_url(url):
            return
        url += '?pagewanted=all'
        if self.filterDuplicates:
            if url in self.url_list:
                return
        if self.webEdition:
            date_tag = self.decode_url_date(url)
            if date_tag is not None:
                if self.oldest_web_article is not None:
                    if date_tag < self.earliest_date:
                        self.log("Skipping article %s" % url)
                        return
            else:
                self.log("Skipping article %s" % url)
                return
        self.url_list.append(url)
        title = self.tag_to_string(a, use_alt=True).strip()
        description = ''
        pubdate = strftime('%a, %d %b')
        summary = div.find(True, attrs={'class':'summary'})
        if summary:
            description = self.tag_to_string(summary, use_alt=False)
        author = ''
        authorAttribution = div.find(True, attrs={'class':'byline'})
        if authorAttribution:
            author = self.tag_to_string(authorAttribution, use_alt=False)
        else:
            authorAttribution = div.find(True, attrs={'class':'byline'})
            if authorAttribution:
                author = self.tag_to_string(authorAttribution, use_alt=False)
        feed = self.key if self.key is not None else 'Uncategorized'
        if feed not in self.articles:
            self.ans.append(feed)
            self.articles[feed] = []
        self.articles[feed].append(
            dict(title=title, url=url, date=pubdate,
                 description=description, author=author,
                 content=''))

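    # Illustrative note (not in the original recipe): handle_article() normalises
    # article links by dropping any existing query string and requesting the
    # single-page view, e.g. (query string shown here is hypothetical)
    #
    #   http://www.nytimes.com/2013/05/21/world/example.html?hp&_r=0
    #     -> http://www.nytimes.com/2013/05/21/world/example.html?pagewanted=all
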
    def get_popular_articles(self,ans):
        if self.getPopularArticles:
            popular_articles = {}
            key_list = []

            def handleh3(h3tag):
                try:
                    url = h3tag.a['href']
                except:
                    return ('','','','')
                url = re.sub(r'\?.*', '', url)
                if self.exclude_url(url):
                    return ('','','','')
                url += '?pagewanted=all'
                title = self.tag_to_string(h3tag.a,False)
                h6tag = h3tag.findNextSibling('h6')
                if h6tag is not None:
                    author = self.tag_to_string(h6tag,False)
                else:
                    author = ''
                ptag = h3tag.findNextSibling('p')
                if ptag is not None:
                    desc = self.tag_to_string(ptag,False)
                else:
                    desc = ''
                return(title,url,author,desc)

            have_emailed = False
            emailed_soup = self.index_to_soup('http://www.nytimes.com/most-popular-emailed?period='+self.popularPeriod)
            for h3tag in emailed_soup.findAll('h3'):
                (title,url,author,desc) = handleh3(h3tag)
                if url=='':
                    continue
                if not have_emailed:
                    key_list.append('Most E-Mailed')
                    popular_articles['Most E-Mailed'] = []
                    have_emailed = True
                popular_articles['Most E-Mailed'].append(
                    dict(title=title, url=url, date=strftime('%a, %d %b'),
                         description=desc, author=author,
                         content=''))
            have_viewed = False
            viewed_soup = self.index_to_soup('http://www.nytimes.com/most-popular-viewed?period='+self.popularPeriod)
            for h3tag in viewed_soup.findAll('h3'):
                (title,url,author,desc) = handleh3(h3tag)
                if url=='':
                    continue
                if not have_viewed:
                    key_list.append('Most Viewed')
                    popular_articles['Most Viewed'] = []
                    have_viewed = True
                popular_articles['Most Viewed'].append(
                    dict(title=title, url=url, date=strftime('%a, %d %b'),
                         description=desc, author=author,
                         content=''))
            viewed_ans = [(k, popular_articles[k]) for k in key_list if k in popular_articles]
            for x in viewed_ans:
                ans.append(x)
        return ans

    def get_tech_feeds(self,ans):
        if self.getTechBlogs:
            tech_articles = {}
            key_list = []
            save_oldest_article = self.oldest_article
            save_max_articles_per_feed = self.max_articles_per_feed
            self.oldest_article = self.tech_oldest_article
            self.max_articles_per_feed = self.tech_max_articles_per_feed
            self.feeds = self.tech_feeds
            tech = self.parse_feeds()
            self.oldest_article = save_oldest_article
            self.max_articles_per_feed = save_max_articles_per_feed
            self.feeds = None
            for f in tech:
                key_list.append(f.title)
                tech_articles[f.title] = []
                for a in f.articles:
                    tech_articles[f.title].append(
                        dict(title=a.title, url=a.url.partition('?')[0], date=a.date,
                             description=a.summary, author=a.author,
                             content=a.content))
            tech_ans = [(k, tech_articles[k]) for k in key_list if k in tech_articles]
            for x in tech_ans:
                ans.append(x)
        return ans

    def parse_web_edition(self):

        for (sec_title,index_url) in self.web_sections:
            if self.includeSections != []:
                if sec_title not in self.includeSections:
                    print "SECTION NOT INCLUDED: ",sec_title
                    continue
            if sec_title in self.excludeSections:
                print "SECTION EXCLUDED: ",sec_title
                continue
            try:
                soup = self.index_to_soup('http://www.nytimes.com/pages/'+index_url+'/index.html')
            except:
                continue
            print 'Index URL: '+'http://www.nytimes.com/pages/'+index_url+'/index.html'

            self.key = sec_title
            # Find each article
            for div in soup.findAll(True,
                                    attrs={'class':['section-headline', 'ledeStory', 'story', 'story headline','sectionHeader','headlinesOnly multiline flush']}):
                if div['class'] in ['story', 'story headline', 'storyHeader']:
                    self.handle_article(div)
                elif div['class'] == 'ledeStory':
                    divsub = div.find('div','storyHeader')
                    if divsub is not None:
                        self.handle_article(divsub)
                    ulrefer = div.find('ul','refer')
                    if ulrefer is not None:
                        for lidiv in ulrefer.findAll('li'):
                            self.handle_article(lidiv)
                elif div['class'] == 'headlinesOnly multiline flush':
                    for lidiv in div.findAll('li'):
                        self.handle_article(lidiv)

        self.ans = [(k, self.articles[k]) for k in self.ans if k in self.articles]
        return self.filter_ans(self.get_tech_feeds(self.get_popular_articles(self.ans)))

    def parse_todays_index(self):

        soup = self.index_to_soup('http://www.nytimes.com/pages/todayspaper/index.html')
        skipping = False
        # Find each article
        for div in soup.findAll(True,
                                attrs={'class':['section-headline', 'story', 'story headline','sectionHeader','headlinesOnly multiline flush']}):
            if div['class'] in ['section-headline','sectionHeader']:
                self.key = string.capwords(self.feed_title(div))
                self.key = self.key.replace('Op-ed','Op-Ed')
                self.key = self.key.replace('U.s.','U.S.')
                self.key = self.key.replace('N.y.','N.Y.')
                skipping = False
                if self.includeSections != []:
                    if self.key not in self.includeSections:
                        print "SECTION NOT INCLUDED: ",self.key
                        skipping = True
                if self.key in self.excludeSections:
                    print "SECTION EXCLUDED: ",self.key
                    skipping = True

            elif div['class'] in ['story', 'story headline']:
                if not skipping:
                    self.handle_article(div)
            elif div['class'] == 'headlinesOnly multiline flush':
                for lidiv in div.findAll('li'):
                    if not skipping:
                        self.handle_article(lidiv)

        self.ans = [(k, self.articles[k]) for k in self.ans if k in self.articles]
        return self.filter_ans(self.get_tech_feeds(self.get_popular_articles(self.ans)))

    def parse_headline_index(self):

        soup = self.index_to_soup('http://www.nytimes.com/pages/todaysheadlines/')

        section_name='Unknown Section'
        pubdate = strftime('%a, %d %b')
        for td_col in soup.findAll('td'):
            h6_sec_name = td_col.find('h6')
            if h6_sec_name is not None:
                new_section_name = self.tag_to_string(h6_sec_name,use_alt=False)
                new_section_name = re.sub(r'^ *$','',new_section_name)
                if new_section_name == '':
                    continue
                section_name = new_section_name
                continue
            atag = td_col.find('a')
            if atag is not None:
                h4tag = None
                for h4tag in atag.findNextSiblings('h4'):
                    break
                if h4tag is None:
                    continue
                author = self.tag_to_string(h4tag,use_alt=False)
                try:
                    url = re.sub(r'\?.*', '', atag['href'])
                except:
                    continue
                if self.exclude_url(url):
                    continue
                if '?' in url:
                    url += '&pagewanted=all'
                else:
                    url += '?pagewanted=all'
                if self.filterDuplicates:
                    if url in self.url_list:
                        continue
                self.url_list.append(url)
                title = self.tag_to_string(atag, use_alt=False).strip()
                desc = atag.parent.find('p')
                if desc is not None:
                    description = self.tag_to_string(desc,use_alt=False)
                else:
                    description = ''
                if section_name not in self.articles:
                    self.ans.append(section_name)
                    self.articles[section_name] = []
                print('Title '+title+' author '+author)
                self.articles[section_name].append(dict(title=title, url=url, date=pubdate, description=description, author=author, content=''))

        self.ans = [(k, self.articles[k]) for k in self.ans if k in self.articles]
        return self.filter_ans(self.ans)

    def parse_index(self):
        if self.headlinesOnly:
            return self.parse_headline_index()
        elif self.webEdition:
            return self.parse_web_edition()
        else:
            return self.parse_todays_index()

    def strip_anchors(self,soup,kill_all=False):
        paras = soup.findAll(True)
        for para in paras:
            aTags = para.findAll('a')
            for a in aTags:
                if a.img is None:
                    if kill_all or (self.recursions==0):
                        a.replaceWith(self.tag_to_string(a,False))
                    else:
                        if 'href' in a:
                            if a['href'].startswith('http://www.nytimes'):
                                if not a['href'].endswith('pagewanted=all'):
                                    url = re.sub(r'\?.*', '', a['href'])
                                    if self.exclude_url(url):
                                        a.replaceWith(self.tag_to_string(a,False))
                                    else:
                                        a['href'] = url+'?pagewanted=all'
                            elif not (a['href'].startswith('http://pogue') or
                                      a['href'].startswith('http://bits') or
                                      a['href'].startswith('http://travel') or
                                      a['href'].startswith('http://business') or
                                      a['href'].startswith('http://tech') or
                                      a['href'].startswith('http://health') or
                                      a['href'].startswith('http://dealbook') or
                                      a['href'].startswith('http://open')):
                                a.replaceWith(self.tag_to_string(a,False))
        return soup

    def handle_tags(self,soup):
        try:
            print("HANDLE TAGS: TITLE = "+self.tag_to_string(soup.title))
        except:
            print("HANDLE TAGS: NO TITLE")
        if soup is None:
            print("ERROR: handle_tags received NoneType")
            return None

        # print("HANDLING AD FORWARD:")
        # print(soup)
        if self.keep_only_tags:
            body = Tag(soup, 'body')
            try:
                if isinstance(self.keep_only_tags, dict):
                    self.keep_only_tags = [self.keep_only_tags]
                for spec in self.keep_only_tags:
                    for tag in soup.find('body').findAll(**spec):
                        body.insert(len(body.contents), tag)
                soup.find('body').replaceWith(body)
            except AttributeError:  # soup has no body element
                pass

        def remove_beyond(tag, next):
            while tag is not None and getattr(tag, 'name', None) != 'body':
                after = getattr(tag, next)
                while after is not None:
                    ns = getattr(tag, next)
                    after.extract()
                    after = ns
                tag = tag.parent

        if self.remove_tags_after is not None:
            rt = [self.remove_tags_after] if isinstance(self.remove_tags_after, dict) else self.remove_tags_after
            for spec in rt:
                tag = soup.find(**spec)
                remove_beyond(tag, 'nextSibling')

        if self.remove_tags_before is not None:
            tag = soup.find(**self.remove_tags_before)
            remove_beyond(tag, 'previousSibling')

        for kwds in self.remove_tags:
            for tag in soup.findAll(**kwds):
                tag.extract()

        return soup

    def preprocess_html(self, soup):
        #print(strftime("%H:%M:%S")+" -- PREPROCESS TITLE="+self.tag_to_string(soup.title))
        skip_tag = soup.find(True, {'name':'skip'})
        if skip_tag is not None:
            #url = 'http://www.nytimes.com' + re.sub(r'\?.*', '', skip_tag.parent['href'])
            url = 'http://www.nytimes.com' + skip_tag.parent['href']
            #url += '?pagewanted=all'
            self.log.warn("Skipping ad to article at '%s'" % url)
            sleep(5)
            soup = self.handle_tags(self.article_to_soup(url))

        # check if the article is from one of the tech blogs
        blog=soup.find('div',attrs={'id':['pogue','bits','gadgetwise','open']})

        if blog is not None:
            old_body = soup.find('body')
            new_body=Tag(soup,'body')
            new_body.append(soup.find('div',attrs={'id':'content'}))
            new_body.find('div',attrs={'id':'content'})['id']='blogcontent' # identify for postprocess_html
            old_body.replaceWith(new_body)
            for divr in soup.findAll('div',attrs={'class':re.compile('w190 right')}):
                if divr.find(text=re.compile('Sign up')):
                    divr.extract()
            divr = soup.find('div',attrs={'class':re.compile('^relatedArticlesModule')})
            if divr is not None:
                print("PROCESSING RELATED: "+self.tag_to_string(soup.title,False))
                # handle related articles
                rlist = []
                ul = divr.find('ul')
                if ul is not None:
                    for li in ul.findAll('li'):
                        atag = li.find('a')
                        if atag is not None:
                            if atag['href'].startswith('http://pogue') or atag['href'].startswith('http://bits') or \
                                    atag['href'].startswith('http://open'):
                                atag.find(text=True).replaceWith(self.massageNCXText(self.tag_to_string(atag,False)))
                                rlist.append(atag)
                divr.extract()
                if rlist != []:
                    asidediv = Tag(soup,'div',[('class','aside')])
                    if soup.find('hr') is None:
                        asidediv.append(Tag(soup,'hr'))
                    h4 = Tag(soup,'h4',[('class','asidenote')])
                    h4.insert(0,"Related Posts")
                    asidediv.append(h4)
                    ul = Tag(soup,'ul')
                    for r in rlist:
                        li = Tag(soup,'li',[('class','aside')])
                        r['class'] = 'aside'
                        li.append(r)
                        ul.append(li)
                    asidediv.append(ul)
                    asidediv.append(Tag(soup,'hr'))
                    smain = soup.find('body')
                    smain.append(asidediv)
            else:
                print("CANNOT FIND RELATED: "+self.tag_to_string(soup.title,False))
            for atag in soup.findAll('a'):
                img = atag.find('img')
                if img is not None:
                    atag.replaceWith(img)
                elif 'href' not in atag:
                    atag.replaceWith(atag.renderContents().decode('cp1252','replace'))
                elif not (atag['href'].startswith('http://www.nytimes') or atag['href'].startswith('http://pogue') or
                          atag['href'].startswith('http://bits') or atag['href'].startswith('http://open')):
                    atag.replaceWith(atag.renderContents().decode('cp1252','replace'))
            hdr = soup.find('address')
            if hdr is not None:
                hdr.name='span'
            for span_credit in soup.findAll('span','credit'):
                sp = Tag(soup,'span')
                span_credit.replaceWith(sp)
                sp.append(Tag(soup,'br'))
                sp.append(span_credit)
                sp.append(Tag(soup,'br'))

        else:  # nytimes article

            related = []  # these will be the related articles
            first_outer = None  # first related outer tag
            first_related = None  # first related tag
            for outerdiv in soup.findAll(attrs={'class': re.compile('articleInline runaroundLeft')}):
                for rdiv in soup.findAll('div','columnGroup doubleRule'):
                    if rdiv.find('h3') is not None:
                        if self.tag_to_string(rdiv.h3,False).startswith('Related'):
                            rdiv.h3.find(text=True).replaceWith("Related articles")
                            rdiv.h3['class'] = 'asidenote'
                            for litag in rdiv.findAll('li'):
                                if litag.find('a') is not None:
                                    if litag.find('a')['href'].startswith('http://www.nytimes.com'):
                                        url = re.sub(r'\?.*', '', litag.find('a')['href'])
                                        litag.find('a')['href'] = url+'?pagewanted=all'
                                        litag.extract()
                                        related.append(litag)
                                        if first_related is None:
                                            first_related = rdiv
                                            first_outer = outerdiv
                                    else:
                                        litag.extract()
                            for h6tag in rdiv.findAll('h6'):
                                if h6tag.find('a') is not None:
                                    if h6tag.find('a')['href'].startswith('http://www.nytimes.com'):
                                        url = re.sub(r'\?.*', '', h6tag.find('a')['href'])
                                        h6tag.find('a')['href'] = url+'?pagewanted=all'
                                        h6tag.extract()
                                        related.append(h6tag)
                                        if first_related is None:
                                            first_related = rdiv
                                            first_outer = outerdiv
                                    else:
                                        h6tag.extract()
            if related != []:
                for r in related:
                    if r.h6:  # don't want the anchor inside a h6 tag
                        r.h6.replaceWith(r.h6.a)
                    first_related.ul.append(r)
                first_related.insert(0,Tag(soup,'hr'))
                first_related.append(Tag(soup,'hr'))
                first_related['class'] = 'aside'
                first_outer.replaceWith(first_related)  # replace the outer tag with the related tag

            for rdiv in soup.findAll(attrs={'class': re.compile('articleInline runaroundLeft')}):
                rdiv.extract()

            kicker_tag = soup.find(attrs={'class':'kicker'})
            if kicker_tag:  # remove Op_Ed author head shots
                tagline = self.tag_to_string(kicker_tag)
                if tagline=='Op-Ed Columnist':
                    img_div = soup.find('div','inlineImage module')
                    if img_div:
                        img_div.extract()

            if self.useHighResImages:
                try:
                    # open up all the "Enlarge this Image" pop-ups and download the full resolution jpegs
                    enlargeThisList = soup.findAll('div',{'class':'icon enlargeThis'})
                    if enlargeThisList:
                        for popupref in enlargeThisList:
                            popupreflink = popupref.find('a')
                            if popupreflink:
                                reflinkstring = str(popupreflink['href'])
                                refstart = reflinkstring.find("javascript:pop_me_up2('") + len("javascript:pop_me_up2('")
                                refend = reflinkstring.find(".html", refstart) + len(".html")
                                reflinkstring = reflinkstring[refstart:refend]

                                popuppage = self.browser.open(reflinkstring)
                                popuphtml = popuppage.read()
                                popuppage.close()
                                if popuphtml:
                                    st = time.localtime()
                                    year = str(st.tm_year)
                                    month = "%.2d" % st.tm_mon
                                    day = "%.2d" % st.tm_mday
                                    imgstartpos = popuphtml.find('http://graphics8.nytimes.com/images/' + year + '/' + month +'/' + day +'/') + \
                                        len('http://graphics8.nytimes.com/images/' + year + '/' + month +'/' + day +'/')
                                    highResImageLink = 'http://graphics8.nytimes.com/images/' + year + '/' + \
                                        month +'/' + day +'/' + popuphtml[imgstartpos:popuphtml.find('.jpg',imgstartpos)+4]
                                    popupSoup = BeautifulSoup(popuphtml)
                                    highResTag = popupSoup.find('img', {'src':highResImageLink})
                                    if highResTag:
                                        try:
                                            newWidth = highResTag['width']
                                            newHeight = highResTag['height']
                                            imageTag = popupref.parent.find("img")
                                        except:
                                            self.log("Error: finding width and height of img")
                                        popupref.extract()
                                        if imageTag:
                                            try:
                                                imageTag['src'] = highResImageLink
                                                imageTag['width'] = newWidth
                                                imageTag['height'] = newHeight
                                            except:
                                                self.log("Error setting the src width and height parameters")
                except Exception:
                    self.log("Error pulling high resolution images")

            try:
                # in case pulling images failed, delete the enlarge this text
                enlargeThisList = soup.findAll('div',{'class':'icon enlargeThis'})
                if enlargeThisList:
                    for popupref in enlargeThisList:
                        popupref.extract()
            except:
                self.log("Error removing Enlarge this text")

        return self.strip_anchors(soup,False)

    def postprocess_html(self,soup,first_fetch):
        if not first_fetch:  # remove Related links
            for aside in soup.findAll('div','aside'):
                aside.extract()
            soup = self.strip_anchors(soup,True)
            #print("RECURSIVE: "+self.tag_to_string(soup.title))

        if soup.find('div',attrs={'id':'blogcontent'}) is None:
            if first_fetch:
                aside = soup.find('div','aside')
                if aside is not None:  # move the related list to the end of the article
                    art = soup.find('div',attrs={'id':'article'})
                    if art is None:
                        art = soup.find('div',attrs={'class':'article'})
                    if art is not None:
                        art.append(aside)
        try:
            if self.one_picture_per_article:
                # Remove all images after first
                largeImg = soup.find(True, {'class':'articleSpanImage'})
                inlineImgs = soup.findAll(True, {'class':'inlineImage module'})
                if largeImg:
                    for inlineImg in inlineImgs:
                        inlineImg.extract()
                else:
                    if inlineImgs:
                        firstImg = inlineImgs[0]
                        for inlineImg in inlineImgs[1:]:
                            inlineImg.extract()
                        # Move firstImg before article body
                        cgFirst = soup.find(True, {'class':re.compile('columnGroup *first')})
                        if cgFirst:
                            # Strip all sibling NavigableStrings: noise
                            navstrings = cgFirst.findAll(text=True, recursive=False)
                            [ns.extract() for ns in navstrings]
                            headline_found = False
                            tag = cgFirst.find(True)
                            insertLoc = 0
                            while True:
                                insertLoc += 1
                                if hasattr(tag,'class') and tag['class'] == 'articleHeadline':
                                    headline_found = True
                                    break
                                tag = tag.nextSibling
                                if not tag:
                                    headline_found = False
                                    break
                            if headline_found:
                                cgFirst.insert(insertLoc,firstImg)
                        else:
                            self.log(">>> No class:'columnGroup first' found <<<")
        except:
            self.log("ERROR: One picture per article in postprocess_html")

        try:
            # Change captions to italic
            for caption in soup.findAll(True, {'class':'caption'}):
                if caption and len(caption) > 0:
                    cTag = Tag(soup, "p", [("class", "caption")])
                    c = self.fixChars(self.tag_to_string(caption,use_alt=False)).strip()
                    mp_off = c.find("More Photos")
                    if mp_off >= 0:
                        c = c[:mp_off]
                    cTag.insert(0, c)
                    caption.replaceWith(cTag)
        except:
            self.log("ERROR: Problem in change captions to italic")

        try:
            # Change <nyt_headline> to <h2>
            h1 = soup.find('h1')
            blogheadline = str(h1)  # added for dealbook
            if h1:
                headline = h1.find("nyt_headline")
                if headline:
                    tag = Tag(soup, "h2")
                    tag['class'] = "headline"
                    tag.insert(0, self.fixChars(headline.contents[0]))
                    h1.replaceWith(tag)
                elif blogheadline.find('entry-title'):  # added for dealbook
                    tag = Tag(soup, "h2")  # added for dealbook
                    tag['class'] = "headline"  # added for dealbook
                    tag.insert(0, self.fixChars(h1.contents[0]))  # added for dealbook
                    h1.replaceWith(tag)  # added for dealbook

            else:
                # Blog entry - replace headline, remove <hr> tags - BCC I think this is no longer functional 1-18-2011
                headline = soup.find('title')
                if headline:
                    tag = Tag(soup, "h2")
                    tag['class'] = "headline"
                    tag.insert(0, self.fixChars(self.tag_to_string(headline,False)))
                    soup.insert(0, tag)
                    hrs = soup.findAll('hr')
                    for hr in hrs:
                        hr.extract()
        except:
            self.log("ERROR: Problem in Change <nyt_headline> to <h2>")

        try:
            # if this is from a blog (dealbook), fix the byline format
            bylineauthor = soup.find('address',attrs={'class':'byline author vcard'})
            if bylineauthor:
                tag = Tag(soup, "h6")
                tag['class'] = "byline"
                tag.insert(0, self.fixChars(self.tag_to_string(bylineauthor,False)))
                bylineauthor.replaceWith(tag)
        except:
            self.log("ERROR: fixing byline author format")

        try:
            # if this is a blog (dealbook) fix the credit style for the pictures
            blogcredit = soup.find('div',attrs={'class':'credit'})
            if blogcredit:
                tag = Tag(soup, "h6")
                tag['class'] = "credit"
                tag.insert(0, self.fixChars(self.tag_to_string(blogcredit,False)))
                blogcredit.replaceWith(tag)
        except:
            self.log("ERROR: fixing credit format")

        try:
            # Change <h1> to <h3> - used in editorial blogs
            masthead = soup.find("h1")
            if masthead:
                # Nuke the href
                if masthead.a:
                    del(masthead.a['href'])
                tag = Tag(soup, "h3")
                tag.insert(0, self.fixChars(masthead.contents[0]))
                masthead.replaceWith(tag)
        except:
            self.log("ERROR: Problem in Change <h1> to <h3> - used in editorial blogs")

        try:
            # Change <span class="bold"> to <b>
            for subhead in soup.findAll(True, {'class':'bold'}):
                if subhead.contents:
                    bTag = Tag(soup, "b")
                    bTag.insert(0, subhead.contents[0])
                    subhead.replaceWith(bTag)
        except:
            self.log("ERROR: Problem in Change <span class=bold> to <b>")
        try:
            # remove the <strong> update tag
            blogupdated = soup.find('span', {'class':'update'})
            if blogupdated:
                blogupdated.replaceWith("")
        except:
            self.log("ERROR: Removing strong tag")

        try:
            divTag = soup.find('div',attrs={'id':'articleBody'})
            if divTag:
                divTag['class'] = divTag['id']
        except:
            self.log("ERROR: Problem in soup.find(div,attrs={id:articleBody})")

        try:
            # Add class="authorId" to <div> so we can format with CSS
            divTag = soup.find('div',attrs={'id':'authorId'})
            if divTag and divTag.contents[0]:
                tag = Tag(soup, "p")
                tag['class'] = "authorId"
                tag.insert(0, self.fixChars(self.tag_to_string(divTag.contents[0],
                           use_alt=False)))
                divTag.replaceWith(tag)
        except:
            self.log("ERROR: Problem in Add class=authorId to <div> so we can format with CSS")
        #print(strftime("%H:%M:%S")+" -- POSTPROCESS TITLE="+self.tag_to_string(soup.title))
        return soup

    def populate_article_metadata(self, article, soup, first):
        if not first:
            return
        idxdiv = soup.find('div',attrs={'class':'articleSpanImage'})
        if idxdiv is not None:
            if idxdiv.img:
                self.add_toc_thumbnail(article, re.sub(r'links\\link\d+\\','',idxdiv.img['src']))
        else:
            img = soup.find('body').find('img')
            if img is not None:
                self.add_toc_thumbnail(article, re.sub(r'links\\link\d+\\','',img['src']))
        shortparagraph = ""
        try:
            if len(article.text_summary.strip()) == 0:
                articlebodies = soup.findAll('div',attrs={'class':'articleBody'})
                if articlebodies:
                    for articlebody in articlebodies:
                        if articlebody:
                            paras = articlebody.findAll('p')
                            for p in paras:
                                refparagraph = self.massageNCXText(self.tag_to_string(p,use_alt=False)).strip()
                                # account for blank paragraphs and short paragraphs by appending them to longer ones
                                if len(refparagraph) > 0:
                                    if len(refparagraph) > 70:  # approximately one line of text
                                        newpara = shortparagraph + refparagraph
                                        newparaDateline,newparaEm,newparaDesc = newpara.partition('—')
                                        if newparaEm == '':
                                            newparaDateline,newparaEm,newparaDesc = newpara.partition('—')
                                            if newparaEm == '':
                                                newparaDesc = newparaDateline
                                        article.summary = article.text_summary = newparaDesc.strip()
                                        return
                                    else:
                                        shortparagraph = refparagraph + " "
                                        if shortparagraph.strip().find(" ") == -1 and not shortparagraph.strip().endswith(":"):
                                            shortparagraph = shortparagraph + "- "
            else:
                article.summary = article.text_summary = self.massageNCXText(article.text_summary)
        except:
            self.log("Error creating article descriptions")
            return
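
# Illustrative usage note (not part of the original recipe): a custom recipe
# like this one can be built outside the calibre GUI with the ebook-convert
# tool, e.g.
#
#   ebook-convert nytimes.recipe output.epub
#
# The file name "nytimes.recipe" is an assumption; use whatever name this
# recipe is saved under.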