Update New York Times

This commit is contained in:
Kovid Goyal 2012-12-30 08:30:33 +05:30
parent 0e23b98274
commit 28a126709d
2 changed files with 193 additions and 219 deletions

View File

@ -153,7 +153,7 @@ class NYTimes(BasicNewsRecipe):
timefmt = '' timefmt = ''
simultaneous_downloads = 1 #simultaneous_downloads = 1 # no longer required to deal with ads
cover_margins = (18,18,'grey99') cover_margins = (18,18,'grey99')
@ -204,7 +204,8 @@ class NYTimes(BasicNewsRecipe):
re.compile('^subNavigation'), re.compile('^subNavigation'),
re.compile('^leaderboard'), re.compile('^leaderboard'),
re.compile('^module'), re.compile('^module'),
re.compile('commentCount') re.compile('commentCount'),
'credit'
]}), ]}),
dict(name='div', attrs={'class':re.compile('toolsList')}), # bits dict(name='div', attrs={'class':re.compile('toolsList')}), # bits
dict(name='div', attrs={'class':re.compile('postNavigation')}), # bits dict(name='div', attrs={'class':re.compile('postNavigation')}), # bits
@ -291,11 +292,11 @@ class NYTimes(BasicNewsRecipe):
del ans[idx] del ans[idx]
idx_max = idx_max-1 idx_max = idx_max-1
continue continue
if self.verbose: if True: #self.verbose
self.log("Section %s: %d articles" % (ans[idx][0], len(ans[idx][1])) ) self.log("Section %s: %d articles" % (ans[idx][0], len(ans[idx][1])) )
for article in ans[idx][1]: for article in ans[idx][1]:
total_article_count += 1 total_article_count += 1
if self.verbose: if True: #self.verbose
self.log("\t%-40.40s... \t%-60.60s..." % (article['title'].encode('cp1252','replace'), self.log("\t%-40.40s... \t%-60.60s..." % (article['title'].encode('cp1252','replace'),
article['url'].encode('cp1252','replace'))) article['url'].encode('cp1252','replace')))
idx = idx+1 idx = idx+1
@ -351,23 +352,8 @@ class NYTimes(BasicNewsRecipe):
br = BasicNewsRecipe.get_browser() br = BasicNewsRecipe.get_browser()
return br return br
## This doesn't work (and probably never did). It either gets another serve of the advertisement,
## or if it gets the article then get_soup (from which it is invoked) traps trying to do xml decoding.
##
## def skip_ad_pages(self, soup):
## # Skip ad pages served before actual article
## skip_tag = soup.find(True, {'name':'skip'})
## if skip_tag is not None:
## self.log.warn("Found forwarding link: %s" % skip_tag.parent['href'])
## url = 'http://www.nytimes.com' + re.sub(r'\?.*', '', skip_tag.parent['href'])
## url += '?pagewanted=all'
## self.log.warn("Skipping ad to article at '%s'" % url)
## return self.index_to_soup(url, raw=True)
cover_tag = 'NY_NYT' cover_tag = 'NY_NYT'
def get_cover_url(self): def get_cover_url(self):
from datetime import timedelta, date
cover = 'http://webmedia.newseum.org/newseum-multimedia/dfp/jpg'+str(date.today().day)+'/lg/'+self.cover_tag+'.jpg' cover = 'http://webmedia.newseum.org/newseum-multimedia/dfp/jpg'+str(date.today().day)+'/lg/'+self.cover_tag+'.jpg'
br = BasicNewsRecipe.get_browser() br = BasicNewsRecipe.get_browser()
daysback=1 daysback=1
@ -745,11 +731,12 @@ class NYTimes(BasicNewsRecipe):
def preprocess_html(self, soup): def preprocess_html(self, soup):
print("PREPROCESS TITLE="+self.tag_to_string(soup.title)) #print("PREPROCESS TITLE="+self.tag_to_string(soup.title))
skip_tag = soup.find(True, {'name':'skip'}) skip_tag = soup.find(True, {'name':'skip'})
if skip_tag is not None: if skip_tag is not None:
url = 'http://www.nytimes.com' + re.sub(r'\?.*', '', skip_tag.parent['href']) #url = 'http://www.nytimes.com' + re.sub(r'\?.*', '', skip_tag.parent['href'])
url += '?pagewanted=all' url = 'http://www.nytimes.com' + skip_tag.parent['href']
#url += '?pagewanted=all'
self.log.warn("Skipping ad to article at '%s'" % url) self.log.warn("Skipping ad to article at '%s'" % url)
sleep(5) sleep(5)
soup = self.handle_tags(self.article_to_soup(url)) soup = self.handle_tags(self.article_to_soup(url))
@ -969,121 +956,121 @@ class NYTimes(BasicNewsRecipe):
self.log("ERROR: One picture per article in postprocess_html") self.log("ERROR: One picture per article in postprocess_html")
try: try:
# Change captions to italic # Change captions to italic
for caption in soup.findAll(True, {'class':'caption'}) : for caption in soup.findAll(True, {'class':'caption'}) :
if caption and len(caption) > 0: if caption and len(caption) > 0:
cTag = Tag(soup, "p", [("class", "caption")]) cTag = Tag(soup, "p", [("class", "caption")])
c = self.fixChars(self.tag_to_string(caption,use_alt=False)).strip() c = self.fixChars(self.tag_to_string(caption,use_alt=False)).strip()
mp_off = c.find("More Photos") mp_off = c.find("More Photos")
if mp_off >= 0: if mp_off >= 0:
c = c[:mp_off] c = c[:mp_off]
cTag.insert(0, c) cTag.insert(0, c)
caption.replaceWith(cTag) caption.replaceWith(cTag)
except: except:
self.log("ERROR: Problem in change captions to italic") self.log("ERROR: Problem in change captions to italic")
try: try:
# Change <nyt_headline> to <h2> # Change <nyt_headline> to <h2>
h1 = soup.find('h1') h1 = soup.find('h1')
blogheadline = str(h1) #added for dealbook blogheadline = str(h1) #added for dealbook
if h1: if h1:
headline = h1.find("nyt_headline") headline = h1.find("nyt_headline")
if headline: if headline:
tag = Tag(soup, "h2") tag = Tag(soup, "h2")
tag['class'] = "headline" tag['class'] = "headline"
tag.insert(0, self.fixChars(headline.contents[0])) tag.insert(0, self.fixChars(headline.contents[0]))
h1.replaceWith(tag) h1.replaceWith(tag)
elif blogheadline.find('entry-title'):#added for dealbook elif blogheadline.find('entry-title'):#added for dealbook
tag = Tag(soup, "h2")#added for dealbook tag = Tag(soup, "h2")#added for dealbook
tag['class'] = "headline"#added for dealbook tag['class'] = "headline"#added for dealbook
tag.insert(0, self.fixChars(h1.contents[0]))#added for dealbook tag.insert(0, self.fixChars(h1.contents[0]))#added for dealbook
h1.replaceWith(tag)#added for dealbook h1.replaceWith(tag)#added for dealbook
else: else:
# Blog entry - replace headline, remove <hr> tags - BCC I think this is no longer functional 1-18-2011 # Blog entry - replace headline, remove <hr> tags - BCC I think this is no longer functional 1-18-2011
headline = soup.find('title') headline = soup.find('title')
if headline: if headline:
tag = Tag(soup, "h2") tag = Tag(soup, "h2")
tag['class'] = "headline" tag['class'] = "headline"
tag.insert(0, self.fixChars(headline.renderContents())) tag.insert(0, self.fixChars(self.tag_to_string(headline,False)))
soup.insert(0, tag) soup.insert(0, tag)
hrs = soup.findAll('hr') hrs = soup.findAll('hr')
for hr in hrs: for hr in hrs:
hr.extract() hr.extract()
except: except:
self.log("ERROR: Problem in Change <nyt_headline> to <h2>") self.log("ERROR: Problem in Change <nyt_headline> to <h2>")
try: try:
#if this is from a blog (dealbook), fix the byline format #if this is from a blog (dealbook), fix the byline format
bylineauthor = soup.find('address',attrs={'class':'byline author vcard'}) bylineauthor = soup.find('address',attrs={'class':'byline author vcard'})
if bylineauthor: if bylineauthor:
tag = Tag(soup, "h6") tag = Tag(soup, "h6")
tag['class'] = "byline" tag['class'] = "byline"
tag.insert(0, self.fixChars(bylineauthor.renderContents())) tag.insert(0, self.fixChars(self.tag_to_string(bylineauthor,False)))
bylineauthor.replaceWith(tag) bylineauthor.replaceWith(tag)
except: except:
self.log("ERROR: fixing byline author format") self.log("ERROR: fixing byline author format")
try: try:
#if this is a blog (dealbook) fix the credit style for the pictures #if this is a blog (dealbook) fix the credit style for the pictures
blogcredit = soup.find('div',attrs={'class':'credit'}) blogcredit = soup.find('div',attrs={'class':'credit'})
if blogcredit: if blogcredit:
tag = Tag(soup, "h6") tag = Tag(soup, "h6")
tag['class'] = "credit" tag['class'] = "credit"
tag.insert(0, self.fixChars(blogcredit.renderContents())) tag.insert(0, self.fixChars(self.tag_to_string(blogcredit,False)))
blogcredit.replaceWith(tag) blogcredit.replaceWith(tag)
except: except:
self.log("ERROR: fixing credit format") self.log("ERROR: fixing credit format")
try: try:
# Change <h1> to <h3> - used in editorial blogs # Change <h1> to <h3> - used in editorial blogs
masthead = soup.find("h1") masthead = soup.find("h1")
if masthead: if masthead:
# Nuke the href # Nuke the href
if masthead.a: if masthead.a:
del(masthead.a['href']) del(masthead.a['href'])
tag = Tag(soup, "h3") tag = Tag(soup, "h3")
tag.insert(0, self.fixChars(masthead.contents[0])) tag.insert(0, self.fixChars(masthead.contents[0]))
masthead.replaceWith(tag) masthead.replaceWith(tag)
except: except:
self.log("ERROR: Problem in Change <h1> to <h3> - used in editorial blogs") self.log("ERROR: Problem in Change <h1> to <h3> - used in editorial blogs")
try: try:
# Change <span class="bold"> to <b> # Change <span class="bold"> to <b>
for subhead in soup.findAll(True, {'class':'bold'}) : for subhead in soup.findAll(True, {'class':'bold'}) :
if subhead.contents: if subhead.contents:
bTag = Tag(soup, "b") bTag = Tag(soup, "b")
bTag.insert(0, subhead.contents[0]) bTag.insert(0, subhead.contents[0])
subhead.replaceWith(bTag) subhead.replaceWith(bTag)
except: except:
self.log("ERROR: Problem in Change <h1> to <h3> - used in editorial blogs") self.log("ERROR: Problem in Change <h1> to <h3> - used in editorial blogs")
try: try:
#remove the <strong> update tag #remove the <strong> update tag
blogupdated = soup.find('span', {'class':'update'}) blogupdated = soup.find('span', {'class':'update'})
if blogupdated: if blogupdated:
blogupdated.replaceWith("") blogupdated.replaceWith("")
except: except:
self.log("ERROR: Removing strong tag") self.log("ERROR: Removing strong tag")
try: try:
divTag = soup.find('div',attrs={'id':'articleBody'}) divTag = soup.find('div',attrs={'id':'articleBody'})
if divTag: if divTag:
divTag['class'] = divTag['id'] divTag['class'] = divTag['id']
except: except:
self.log("ERROR: Problem in soup.find(div,attrs={id:articleBody})") self.log("ERROR: Problem in soup.find(div,attrs={id:articleBody})")
try: try:
# Add class="authorId" to <div> so we can format with CSS # Add class="authorId" to <div> so we can format with CSS
divTag = soup.find('div',attrs={'id':'authorId'}) divTag = soup.find('div',attrs={'id':'authorId'})
if divTag and divTag.contents[0]: if divTag and divTag.contents[0]:
tag = Tag(soup, "p") tag = Tag(soup, "p")
tag['class'] = "authorId" tag['class'] = "authorId"
tag.insert(0, self.fixChars(self.tag_to_string(divTag.contents[0], tag.insert(0, self.fixChars(self.tag_to_string(divTag.contents[0],
use_alt=False))) use_alt=False)))
divTag.replaceWith(tag) divTag.replaceWith(tag)
except: except:
self.log("ERROR: Problem in Add class=authorId to <div> so we can format with CSS") self.log("ERROR: Problem in Add class=authorId to <div> so we can format with CSS")
return soup return soup

View File

@ -32,7 +32,7 @@ class NYTimes(BasicNewsRecipe):
# number of days old an article can be for inclusion. If oldest_web_article = None all articles # number of days old an article can be for inclusion. If oldest_web_article = None all articles
# will be included. Note: oldest_web_article is ignored if webEdition = False # will be included. Note: oldest_web_article is ignored if webEdition = False
webEdition = False webEdition = False
oldest_web_article = 7 oldest_web_article = None
# download higher resolution images than the small thumbnails typically included in the article # download higher resolution images than the small thumbnails typically included in the article
# the down side of having large beautiful images is the file size is much larger, on the order of 7MB per paper # the down side of having large beautiful images is the file size is much larger, on the order of 7MB per paper
@ -153,7 +153,7 @@ class NYTimes(BasicNewsRecipe):
timefmt = '' timefmt = ''
simultaneous_downloads = 1 #simultaneous_downloads = 1 # no longer required to deal with ads
cover_margins = (18,18,'grey99') cover_margins = (18,18,'grey99')
@ -204,7 +204,8 @@ class NYTimes(BasicNewsRecipe):
re.compile('^subNavigation'), re.compile('^subNavigation'),
re.compile('^leaderboard'), re.compile('^leaderboard'),
re.compile('^module'), re.compile('^module'),
re.compile('commentCount') re.compile('commentCount'),
'credit'
]}), ]}),
dict(name='div', attrs={'class':re.compile('toolsList')}), # bits dict(name='div', attrs={'class':re.compile('toolsList')}), # bits
dict(name='div', attrs={'class':re.compile('postNavigation')}), # bits dict(name='div', attrs={'class':re.compile('postNavigation')}), # bits
@ -291,11 +292,11 @@ class NYTimes(BasicNewsRecipe):
del ans[idx] del ans[idx]
idx_max = idx_max-1 idx_max = idx_max-1
continue continue
if self.verbose: if True: #self.verbose
self.log("Section %s: %d articles" % (ans[idx][0], len(ans[idx][1])) ) self.log("Section %s: %d articles" % (ans[idx][0], len(ans[idx][1])) )
for article in ans[idx][1]: for article in ans[idx][1]:
total_article_count += 1 total_article_count += 1
if self.verbose: if True: #self.verbose
self.log("\t%-40.40s... \t%-60.60s..." % (article['title'].encode('cp1252','replace'), self.log("\t%-40.40s... \t%-60.60s..." % (article['title'].encode('cp1252','replace'),
article['url'].encode('cp1252','replace'))) article['url'].encode('cp1252','replace')))
idx = idx+1 idx = idx+1
@ -351,23 +352,8 @@ class NYTimes(BasicNewsRecipe):
br = BasicNewsRecipe.get_browser() br = BasicNewsRecipe.get_browser()
return br return br
## This doesn't work (and probably never did). It either gets another serve of the advertisement,
## or if it gets the article then get_soup (from which it is invoked) traps trying to do xml decoding.
##
## def skip_ad_pages(self, soup):
## # Skip ad pages served before actual article
## skip_tag = soup.find(True, {'name':'skip'})
## if skip_tag is not None:
## self.log.warn("Found forwarding link: %s" % skip_tag.parent['href'])
## url = 'http://www.nytimes.com' + re.sub(r'\?.*', '', skip_tag.parent['href'])
## url += '?pagewanted=all'
## self.log.warn("Skipping ad to article at '%s'" % url)
## return self.index_to_soup(url, raw=True)
cover_tag = 'NY_NYT' cover_tag = 'NY_NYT'
def get_cover_url(self): def get_cover_url(self):
from datetime import timedelta, date
cover = 'http://webmedia.newseum.org/newseum-multimedia/dfp/jpg'+str(date.today().day)+'/lg/'+self.cover_tag+'.jpg' cover = 'http://webmedia.newseum.org/newseum-multimedia/dfp/jpg'+str(date.today().day)+'/lg/'+self.cover_tag+'.jpg'
br = BasicNewsRecipe.get_browser() br = BasicNewsRecipe.get_browser()
daysback=1 daysback=1
@ -745,11 +731,12 @@ class NYTimes(BasicNewsRecipe):
def preprocess_html(self, soup): def preprocess_html(self, soup):
print("PREPROCESS TITLE="+self.tag_to_string(soup.title)) #print("PREPROCESS TITLE="+self.tag_to_string(soup.title))
skip_tag = soup.find(True, {'name':'skip'}) skip_tag = soup.find(True, {'name':'skip'})
if skip_tag is not None: if skip_tag is not None:
url = 'http://www.nytimes.com' + re.sub(r'\?.*', '', skip_tag.parent['href']) #url = 'http://www.nytimes.com' + re.sub(r'\?.*', '', skip_tag.parent['href'])
url += '?pagewanted=all' url = 'http://www.nytimes.com' + skip_tag.parent['href']
#url += '?pagewanted=all'
self.log.warn("Skipping ad to article at '%s'" % url) self.log.warn("Skipping ad to article at '%s'" % url)
sleep(5) sleep(5)
soup = self.handle_tags(self.article_to_soup(url)) soup = self.handle_tags(self.article_to_soup(url))
@ -969,121 +956,121 @@ class NYTimes(BasicNewsRecipe):
self.log("ERROR: One picture per article in postprocess_html") self.log("ERROR: One picture per article in postprocess_html")
try: try:
# Change captions to italic # Change captions to italic
for caption in soup.findAll(True, {'class':'caption'}) : for caption in soup.findAll(True, {'class':'caption'}) :
if caption and len(caption) > 0: if caption and len(caption) > 0:
cTag = Tag(soup, "p", [("class", "caption")]) cTag = Tag(soup, "p", [("class", "caption")])
c = self.fixChars(self.tag_to_string(caption,use_alt=False)).strip() c = self.fixChars(self.tag_to_string(caption,use_alt=False)).strip()
mp_off = c.find("More Photos") mp_off = c.find("More Photos")
if mp_off >= 0: if mp_off >= 0:
c = c[:mp_off] c = c[:mp_off]
cTag.insert(0, c) cTag.insert(0, c)
caption.replaceWith(cTag) caption.replaceWith(cTag)
except: except:
self.log("ERROR: Problem in change captions to italic") self.log("ERROR: Problem in change captions to italic")
try: try:
# Change <nyt_headline> to <h2> # Change <nyt_headline> to <h2>
h1 = soup.find('h1') h1 = soup.find('h1')
blogheadline = str(h1) #added for dealbook blogheadline = str(h1) #added for dealbook
if h1: if h1:
headline = h1.find("nyt_headline") headline = h1.find("nyt_headline")
if headline: if headline:
tag = Tag(soup, "h2") tag = Tag(soup, "h2")
tag['class'] = "headline" tag['class'] = "headline"
tag.insert(0, self.fixChars(headline.contents[0])) tag.insert(0, self.fixChars(headline.contents[0]))
h1.replaceWith(tag) h1.replaceWith(tag)
elif blogheadline.find('entry-title'):#added for dealbook elif blogheadline.find('entry-title'):#added for dealbook
tag = Tag(soup, "h2")#added for dealbook tag = Tag(soup, "h2")#added for dealbook
tag['class'] = "headline"#added for dealbook tag['class'] = "headline"#added for dealbook
tag.insert(0, self.fixChars(h1.contents[0]))#added for dealbook tag.insert(0, self.fixChars(h1.contents[0]))#added for dealbook
h1.replaceWith(tag)#added for dealbook h1.replaceWith(tag)#added for dealbook
else: else:
# Blog entry - replace headline, remove <hr> tags - BCC I think this is no longer functional 1-18-2011 # Blog entry - replace headline, remove <hr> tags - BCC I think this is no longer functional 1-18-2011
headline = soup.find('title') headline = soup.find('title')
if headline: if headline:
tag = Tag(soup, "h2") tag = Tag(soup, "h2")
tag['class'] = "headline" tag['class'] = "headline"
tag.insert(0, self.fixChars(headline.renderContents())) tag.insert(0, self.fixChars(self.tag_to_string(headline,False)))
soup.insert(0, tag) soup.insert(0, tag)
hrs = soup.findAll('hr') hrs = soup.findAll('hr')
for hr in hrs: for hr in hrs:
hr.extract() hr.extract()
except: except:
self.log("ERROR: Problem in Change <nyt_headline> to <h2>") self.log("ERROR: Problem in Change <nyt_headline> to <h2>")
try: try:
#if this is from a blog (dealbook), fix the byline format #if this is from a blog (dealbook), fix the byline format
bylineauthor = soup.find('address',attrs={'class':'byline author vcard'}) bylineauthor = soup.find('address',attrs={'class':'byline author vcard'})
if bylineauthor: if bylineauthor:
tag = Tag(soup, "h6") tag = Tag(soup, "h6")
tag['class'] = "byline" tag['class'] = "byline"
tag.insert(0, self.fixChars(bylineauthor.renderContents())) tag.insert(0, self.fixChars(self.tag_to_string(bylineauthor,False)))
bylineauthor.replaceWith(tag) bylineauthor.replaceWith(tag)
except: except:
self.log("ERROR: fixing byline author format") self.log("ERROR: fixing byline author format")
try: try:
#if this is a blog (dealbook) fix the credit style for the pictures #if this is a blog (dealbook) fix the credit style for the pictures
blogcredit = soup.find('div',attrs={'class':'credit'}) blogcredit = soup.find('div',attrs={'class':'credit'})
if blogcredit: if blogcredit:
tag = Tag(soup, "h6") tag = Tag(soup, "h6")
tag['class'] = "credit" tag['class'] = "credit"
tag.insert(0, self.fixChars(blogcredit.renderContents())) tag.insert(0, self.fixChars(self.tag_to_string(blogcredit,False)))
blogcredit.replaceWith(tag) blogcredit.replaceWith(tag)
except: except:
self.log("ERROR: fixing credit format") self.log("ERROR: fixing credit format")
try: try:
# Change <h1> to <h3> - used in editorial blogs # Change <h1> to <h3> - used in editorial blogs
masthead = soup.find("h1") masthead = soup.find("h1")
if masthead: if masthead:
# Nuke the href # Nuke the href
if masthead.a: if masthead.a:
del(masthead.a['href']) del(masthead.a['href'])
tag = Tag(soup, "h3") tag = Tag(soup, "h3")
tag.insert(0, self.fixChars(masthead.contents[0])) tag.insert(0, self.fixChars(masthead.contents[0]))
masthead.replaceWith(tag) masthead.replaceWith(tag)
except: except:
self.log("ERROR: Problem in Change <h1> to <h3> - used in editorial blogs") self.log("ERROR: Problem in Change <h1> to <h3> - used in editorial blogs")
try: try:
# Change <span class="bold"> to <b> # Change <span class="bold"> to <b>
for subhead in soup.findAll(True, {'class':'bold'}) : for subhead in soup.findAll(True, {'class':'bold'}) :
if subhead.contents: if subhead.contents:
bTag = Tag(soup, "b") bTag = Tag(soup, "b")
bTag.insert(0, subhead.contents[0]) bTag.insert(0, subhead.contents[0])
subhead.replaceWith(bTag) subhead.replaceWith(bTag)
except: except:
self.log("ERROR: Problem in Change <h1> to <h3> - used in editorial blogs") self.log("ERROR: Problem in Change <h1> to <h3> - used in editorial blogs")
try: try:
#remove the <strong> update tag #remove the <strong> update tag
blogupdated = soup.find('span', {'class':'update'}) blogupdated = soup.find('span', {'class':'update'})
if blogupdated: if blogupdated:
blogupdated.replaceWith("") blogupdated.replaceWith("")
except: except:
self.log("ERROR: Removing strong tag") self.log("ERROR: Removing strong tag")
try: try:
divTag = soup.find('div',attrs={'id':'articleBody'}) divTag = soup.find('div',attrs={'id':'articleBody'})
if divTag: if divTag:
divTag['class'] = divTag['id'] divTag['class'] = divTag['id']
except: except:
self.log("ERROR: Problem in soup.find(div,attrs={id:articleBody})") self.log("ERROR: Problem in soup.find(div,attrs={id:articleBody})")
try: try:
# Add class="authorId" to <div> so we can format with CSS # Add class="authorId" to <div> so we can format with CSS
divTag = soup.find('div',attrs={'id':'authorId'}) divTag = soup.find('div',attrs={'id':'authorId'})
if divTag and divTag.contents[0]: if divTag and divTag.contents[0]:
tag = Tag(soup, "p") tag = Tag(soup, "p")
tag['class'] = "authorId" tag['class'] = "authorId"
tag.insert(0, self.fixChars(self.tag_to_string(divTag.contents[0], tag.insert(0, self.fixChars(self.tag_to_string(divTag.contents[0],
use_alt=False))) use_alt=False)))
divTag.replaceWith(tag) divTag.replaceWith(tag)
except: except:
self.log("ERROR: Problem in Add class=authorId to <div> so we can format with CSS") self.log("ERROR: Problem in Add class=authorId to <div> so we can format with CSS")
return soup return soup