- used in editorial blogs masthead = soup.find("h1") if masthead: # Nuke the href if masthead.a: del(masthead.a['href']) tag = Tag(soup, "h3") tag.insert(0, self.fixChars(masthead.contents[0])) masthead.replaceWith(tag) except: self.log( "ERROR: Problem in Change

- used in editorial blogs") try: # Change to for subhead in soup.findAll(True, {'class': 'bold'}): if subhead.contents: bTag = Tag(soup, "b") bTag.insert(0, subhead.contents[0]) subhead.replaceWith(bTag) except: self.log( "ERROR: Problem in Change
to
- used in editorial blogs") try: # remove the update tag blogupdated = soup.find('span', {'class': 'update'}) if blogupdated: blogupdated.replaceWith("") except: self.log("ERROR: Removing strong tag") try: divTag = soup.find('div', attrs={'id': 'articleBody'}) if divTag: divTag['class'] = divTag['id'] except: self.log( "ERROR: Problem in soup.find(div,attrs={id:articleBody})") try: # Add class="authorId" to
so we can format with CSS divTag = soup.find('div', attrs={'id': 'authorId'}) if divTag and divTag.contents[0]: tag = Tag(soup, "p") tag['class'] = "authorId" tag.insert(0, self.fixChars(self.tag_to_string(divTag.contents[0], use_alt=False))) divTag.replaceWith(tag) except: self.log( "ERROR: Problem in Add class=authorId to
# NOTE(review): trailing fragment of the preceding method, left over from the
# collapsed original line; kept here as text so nothing is lost:
#     so we can format with CSS") return soup
def populate_article_metadata(self, article, soup, first):
    """Attach a TOC thumbnail and a text summary to an article.

    Only the first page of an article is processed; for any later page
    the method returns immediately.

    Args:
        article: Object whose ``text_summary`` attribute is read and whose
            ``summary`` / ``text_summary`` attributes are overwritten.
        soup: Parsed page supporting ``find`` and ``findAll``.
        first: Truthy when ``soup`` is the first page of the article.

    Returns:
        None. Results are delivered through ``self.add_toc_thumbnail`` and
        by assigning to ``article.summary`` / ``article.text_summary``.
    """
    if not first:
        return

    # --- Thumbnail -------------------------------------------------------
    # NOTE(review): the collapsed original does not show which ``if`` the
    # fallback ``else`` belonged to; it is attached to the outer check here
    # (search the whole body only when there is no articleSpanImage div).
    # Confirm against the upstream file.
    idxdiv = soup.find('div', attrs={'class': 'articleSpanImage'})
    if idxdiv is not None:
        thumb = idxdiv.img or None
    else:
        # A page without <body> used to raise AttributeError here.
        body = soup.find('body')
        thumb = body.find('img') if body is not None else None

    # An <img> without a src attribute used to raise KeyError.
    src = thumb.get('src') if thumb is not None else None
    if src is not None:
        self.add_toc_thumbnail(
            article, re.sub(r'links\\link\d+\\', '', src))

    # --- Summary ---------------------------------------------------------
    try:
        if article.text_summary.strip():
            # A summary already exists: only normalise it.
            article.summary = article.text_summary = self.massageNCXText(
                article.text_summary)
            return

        shortparagraph = ""
        for articlebody in soup.findAll('div', attrs={'class': 'articleBody'}):
            if not articlebody:
                continue
            for p in articlebody.findAll('p'):
                refparagraph = self.massageNCXText(
                    self.tag_to_string(p, use_alt=False)).strip()
                if not refparagraph:
                    # Blank paragraph: nothing to contribute.
                    continue

                if len(refparagraph) > 70:  # approximately one line of text
                    newpara = shortparagraph + refparagraph
                    # Drop a leading dateline ("CITY — text...").
                    # NOTE(review): the original repeated this identical
                    # partition twice; one separator may have been an
                    # HTML-entity spelling lost in extraction. Confirm.
                    dateline, dash, description = newpara.partition('—')
                    if dash == '':
                        description = dateline
                    article.summary = article.text_summary = description.strip()
                    return

                # Short paragraph: remember it as a prefix for the next
                # long one (a later short paragraph replaces it).
                shortparagraph = refparagraph + " "
                stripped = shortparagraph.strip()
                if stripped.find(" ") == -1 and not stripped.endswith(":"):
                    # A single word that is not a label gets a dash
                    # separator before the text that follows.
                    shortparagraph = shortparagraph + "- "
    except Exception:
        # Best effort: a failed description must not abort the download,
        # but KeyboardInterrupt / SystemExit are no longer swallowed.
        self.log("Error creating article descriptions")
    return