Mirror of https://github.com/kovidgoyal/calibre.git (synced 2025-07-07 10:14:46 -04:00)

Update New York Times

commit 28a126709d (parent 0e23b98274)
@@ -32,7 +32,7 @@ class NYTimes(BasicNewsRecipe):
     # number of days old an article can be for inclusion. If oldest_web_article = None all articles
     # will be included. Note: oldest_web_article is ignored if webEdition = False
     webEdition = False
-    oldest_web_article = 7
+    oldest_web_article = None
 
     # download higher resolution images than the small thumbnails typically included in the article
     # the down side of having large beautiful images is the file size is much larger, on the order of 7MB per paper
@@ -153,7 +153,7 @@ class NYTimes(BasicNewsRecipe):
 
     timefmt = ''
 
-    simultaneous_downloads = 1
+    #simultaneous_downloads = 1 # no longer required to deal with ads
 
     cover_margins = (18,18,'grey99')
 
@@ -204,7 +204,8 @@ class NYTimes(BasicNewsRecipe):
             re.compile('^subNavigation'),
             re.compile('^leaderboard'),
             re.compile('^module'),
-            re.compile('commentCount')
+            re.compile('commentCount'),
+            'credit'
             ]}),
         dict(name='div', attrs={'class':re.compile('toolsList')}), # bits
         dict(name='div', attrs={'class':re.compile('postNavigation')}), # bits
@@ -291,11 +292,11 @@ class NYTimes(BasicNewsRecipe):
                 del ans[idx]
                 idx_max = idx_max-1
                 continue
-            if self.verbose:
+            if True: #self.verbose
                 self.log("Section %s: %d articles" % (ans[idx][0], len(ans[idx][1])) )
             for article in ans[idx][1]:
                 total_article_count += 1
-                if self.verbose:
+                if True: #self.verbose
                     self.log("\t%-40.40s... \t%-60.60s..." % (article['title'].encode('cp1252','replace'),
                                      article['url'].encode('cp1252','replace')))
             idx = idx+1
@@ -351,23 +352,8 @@ class NYTimes(BasicNewsRecipe):
         br = BasicNewsRecipe.get_browser()
         return br
 
-##    This doesn't work (and probably never did). It either gets another serve of the advertisement,
-##    or if it gets the article then get_soup (from which it is invoked) traps trying to do xml decoding.
-##
-##    def skip_ad_pages(self, soup):
-##        # Skip ad pages served before actual article
-##        skip_tag = soup.find(True, {'name':'skip'})
-##        if skip_tag is not None:
-##            self.log.warn("Found forwarding link: %s" % skip_tag.parent['href'])
-##            url = 'http://www.nytimes.com' + re.sub(r'\?.*', '', skip_tag.parent['href'])
-##            url += '?pagewanted=all'
-##            self.log.warn("Skipping ad to article at '%s'" % url)
-##            return self.index_to_soup(url, raw=True)
-
-
     cover_tag = 'NY_NYT'
     def get_cover_url(self):
-        from datetime import timedelta, date
         cover = 'http://webmedia.newseum.org/newseum-multimedia/dfp/jpg'+str(date.today().day)+'/lg/'+self.cover_tag+'.jpg'
         br = BasicNewsRecipe.get_browser()
         daysback=1
@@ -745,11 +731,12 @@ class NYTimes(BasicNewsRecipe):
 
 
     def preprocess_html(self, soup):
-        print("PREPROCESS TITLE="+self.tag_to_string(soup.title))
+        #print("PREPROCESS TITLE="+self.tag_to_string(soup.title))
         skip_tag = soup.find(True, {'name':'skip'})
         if skip_tag is not None:
-            url = 'http://www.nytimes.com' + re.sub(r'\?.*', '', skip_tag.parent['href'])
-            url += '?pagewanted=all'
+            #url = 'http://www.nytimes.com' + re.sub(r'\?.*', '', skip_tag.parent['href'])
+            url = 'http://www.nytimes.com' + skip_tag.parent['href']
+            #url += '?pagewanted=all'
             self.log.warn("Skipping ad to article at '%s'" % url)
             sleep(5)
             soup = self.handle_tags(self.article_to_soup(url))
@@ -969,121 +956,121 @@ class NYTimes(BasicNewsRecipe):
                 self.log("ERROR: One picture per article in postprocess_html")
 
         try:
            # Change captions to italic
            for caption in soup.findAll(True, {'class':'caption'}) :
                if caption and len(caption) > 0:
                    cTag = Tag(soup, "p", [("class", "caption")])
                    c = self.fixChars(self.tag_to_string(caption,use_alt=False)).strip()
                    mp_off = c.find("More Photos")
                    if mp_off >= 0:
                        c = c[:mp_off]
                    cTag.insert(0, c)
                    caption.replaceWith(cTag)
        except:
            self.log("ERROR: Problem in change captions to italic")
 
        try:
            # Change <nyt_headline> to <h2>
            h1 = soup.find('h1')
            blogheadline = str(h1) #added for dealbook
            if h1:
                headline = h1.find("nyt_headline")
                if headline:
                    tag = Tag(soup, "h2")
                    tag['class'] = "headline"
                    tag.insert(0, self.fixChars(headline.contents[0]))
                    h1.replaceWith(tag)
                elif blogheadline.find('entry-title'):#added for dealbook
                    tag = Tag(soup, "h2")#added for dealbook
                    tag['class'] = "headline"#added for dealbook
                    tag.insert(0, self.fixChars(h1.contents[0]))#added for dealbook
                    h1.replaceWith(tag)#added for dealbook
 
            else:
                # Blog entry - replace headline, remove <hr> tags - BCC I think this is no longer functional 1-18-2011
                headline = soup.find('title')
                if headline:
                    tag = Tag(soup, "h2")
                    tag['class'] = "headline"
-                   tag.insert(0, self.fixChars(headline.renderContents()))
+                   tag.insert(0, self.fixChars(self.tag_to_string(headline,False)))
                    soup.insert(0, tag)
                    hrs = soup.findAll('hr')
                    for hr in hrs:
                        hr.extract()
        except:
            self.log("ERROR: Problem in Change <nyt_headline> to <h2>")
 
        try:
            #if this is from a blog (dealbook, fix the byline format
            bylineauthor = soup.find('address',attrs={'class':'byline author vcard'})
            if bylineauthor:
                tag = Tag(soup, "h6")
                tag['class'] = "byline"
-               tag.insert(0, self.fixChars(bylineauthor.renderContents()))
+               tag.insert(0, self.fixChars(self.tag_to_string(bylineauthor,False)))
                bylineauthor.replaceWith(tag)
        except:
            self.log("ERROR: fixing byline author format")
 
        try:
            #if this is a blog (dealbook) fix the credit style for the pictures
            blogcredit = soup.find('div',attrs={'class':'credit'})
            if blogcredit:
                tag = Tag(soup, "h6")
                tag['class'] = "credit"
-               tag.insert(0, self.fixChars(blogcredit.renderContents()))
+               tag.insert(0, self.fixChars(self.tag_to_string(blogcredit,False)))
                blogcredit.replaceWith(tag)
        except:
            self.log("ERROR: fixing credit format")
 
 
        try:
            # Change <h1> to <h3> - used in editorial blogs
            masthead = soup.find("h1")
            if masthead:
                # Nuke the href
                if masthead.a:
                    del(masthead.a['href'])
                tag = Tag(soup, "h3")
                tag.insert(0, self.fixChars(masthead.contents[0]))
                masthead.replaceWith(tag)
        except:
            self.log("ERROR: Problem in Change <h1> to <h3> - used in editorial blogs")
 
        try:
            # Change <span class="bold"> to <b>
            for subhead in soup.findAll(True, {'class':'bold'}) :
                if subhead.contents:
                    bTag = Tag(soup, "b")
                    bTag.insert(0, subhead.contents[0])
                    subhead.replaceWith(bTag)
        except:
            self.log("ERROR: Problem in Change <h1> to <h3> - used in editorial blogs")
        try:
            #remove the <strong> update tag
            blogupdated = soup.find('span', {'class':'update'})
            if blogupdated:
                blogupdated.replaceWith("")
        except:
            self.log("ERROR: Removing strong tag")
 
        try:
            divTag = soup.find('div',attrs={'id':'articleBody'})
            if divTag:
                divTag['class'] = divTag['id']
        except:
            self.log("ERROR: Problem in soup.find(div,attrs={id:articleBody})")
 
        try:
            # Add class="authorId" to <div> so we can format with CSS
            divTag = soup.find('div',attrs={'id':'authorId'})
            if divTag and divTag.contents[0]:
                tag = Tag(soup, "p")
                tag['class'] = "authorId"
                tag.insert(0, self.fixChars(self.tag_to_string(divTag.contents[0],
                                            use_alt=False)))
                divTag.replaceWith(tag)
        except:
            self.log("ERROR: Problem in Add class=authorId to <div> so we can format with CSS")
 
        return soup
 
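The one behavioural change in this commit is how the recipe handles the ad interstitial that nytimes.com serves before an article. The old skip_ad_pages() approach, left above as the deleted ## block, stripped the query string from the forwarding link and forced ?pagewanted=all; the reworked preprocess_html() now follows the href exactly as served, pauses, and re-fetches. A minimal standalone sketch of the difference, where fetch() is a hypothetical stand-in for the recipe's article_to_soup()/handle_tags() pipeline and the href value is made up for illustration:

    import re
    from time import sleep

    def follow_ad_interstitial(skip_href, fetch):
        # Old behaviour (now commented out in the recipe): drop the query
        # string and force the single-page view.
        old_url = 'http://www.nytimes.com' + re.sub(r'\?.*', '', skip_href)
        old_url += '?pagewanted=all'
        # New behaviour: keep the href exactly as the ad page served it,
        # pause, then re-fetch the real article.
        new_url = 'http://www.nytimes.com' + skip_href
        sleep(5)  # the recipe sleeps before re-requesting
        return old_url, new_url, fetch(new_url)

    old, new, page = follow_ad_interstitial(
        '/2013/01/18/business/example.html?src=recg',  # hypothetical href
        fetch=lambda url: '<html>...</html>')          # stub for article_to_soup()
    print(old)  # .../example.html?pagewanted=all
    print(new)  # .../example.html?src=recg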
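Three of the heading rebuilds in the last hunk switch from fixChars(tag.renderContents()) to fixChars(self.tag_to_string(tag, False)). renderContents() returns the tag's inner HTML with any nested markup intact, so that markup ended up re-inserted into the replacement <h2>/<h6> verbatim; calibre's tag_to_string() flattens the tag to plain text first (the False positional argument is use_alt). A small illustration using plain bs4 rather than calibre's bundled BeautifulSoup; decode_contents() is bs4's spelling of renderContents():

    from bs4 import BeautifulSoup

    soup = BeautifulSoup('<title>DealBook: <em>Markets</em> Update</title>',
                         'html.parser')
    title = soup.title

    # renderContents()-style: nested markup survives and would be re-inserted
    # into the new heading as literal tags.
    print(title.decode_contents())  # DealBook: <em>Markets</em> Update

    # tag_to_string()-style: flatten to plain text first, which is what the
    # commit switches to.
    print(title.get_text())         # DealBook: Markets Update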
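get_cover_url() still builds the cover from the Newseum front-page archive, keyed by day of month, and the daysback counter visible at the end of that hunk steps back a day at a time when today's image is not up yet. A rough sketch of that scheme under stated assumptions: probe() stands in for the recipe's browser check, and the 7-day bound is an assumption, not the recipe's actual limit:

    from datetime import date, timedelta

    COVER_TAG = 'NY_NYT'  # same tag the recipe uses for the NYT front page

    def cover_url_for(day):
        # Newseum keys each paper's front-page image by day of month.
        return ('http://webmedia.newseum.org/newseum-multimedia/dfp/jpg'
                + str(day.day) + '/lg/' + COVER_TAG + '.jpg')

    def find_cover(probe, max_days_back=7):  # assumed retry bound
        d = date.today()
        for _ in range(max_days_back + 1):
            url = cover_url_for(d)
            if probe(url):            # e.g. an HTTP HEAD that returns True on 200
                return url
            d -= timedelta(days=1)    # mirrors the recipe's daysback counter
        return None

    print(find_cover(lambda url: True))  # stub probe accepts the first URL tried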