#!/usr/bin/env python
# -*- coding: utf-8 -*-
__license__ = 'GPL v3'
__copyright__ = '2008, Kovid Goyal'
'''
nytimes.com
'''
import re, string, time
from calibre import strftime
from datetime import timedelta, date
from time import sleep
from calibre.web.feeds.recipes import BasicNewsRecipe
from calibre.ebooks.BeautifulSoup import BeautifulSoup, Tag, BeautifulStoneSoup


class NYTimes(BasicNewsRecipe):

    recursions = 1  # set this to zero to omit Related articles lists
    match_regexps = [r'/[12][0-9][0-9][0-9]/[0-9]+/']  # speeds up processing by preventing index page links from being followed

    # set getTechBlogs to True to include the technology blogs
    # set tech_oldest_article to control article age
    # set tech_max_articles_per_feed to control article count
    getTechBlogs = True
    remove_empty_feeds = True
    tech_oldest_article = 14
    tech_max_articles_per_feed = 25

    # set getPopularArticles to False if you don't want the Most E-mailed and Most Viewed articles
    # otherwise you will get up to 20 of the most popular e-mailed and viewed articles (in each category)
    getPopularArticles = True
    popularPeriod = '1'
    # set this to the number of days to include in the measurement
    # e.g. 7 will get the most popular measured over the last 7 days
    # and 30 will get the most popular measured over 30 days.
    # you still only get up to 20 articles in each category

    # set headlinesOnly to True for the headlines-only version. If True, webEdition is ignored.
    headlinesOnly = True

    # set webEdition to True for the Web edition of the newspaper. Set oldest_web_article to the
    # number of days old an article can be for inclusion. If oldest_web_article = None all articles
    # will be included. Note: oldest_web_article is ignored if webEdition = False
    webEdition = False
    oldest_web_article = 7

    # download higher resolution images than the small thumbnails typically included in the article
    # the downside of having large beautiful images is that the file size is much larger, on the order of 7MB per paper
    useHighResImages = True

    # replace paid Kindle Version: the name will be changed to "The New York Times" to cause
    # previous paid versions of the New York Times to be sent to the back issues folder on the Kindle
    replaceKindleVersion = False

    # includeSections: List of sections to include. If empty, all sections found will be included.
    # Otherwise, only the sections named will be included. For example,
    #
    #    includeSections = ['Politics','Sports']
    #
    # would cause only the Politics and Sports sections to be included.

    includeSections = []  # by default, all sections included

    # excludeSections: List of sections to exclude. If empty, all sections found will be included.
    # Otherwise, the sections named will be excluded. For example,
    #
    #    excludeSections = ['Politics','Sports']
    #
    # would cause the Politics and Sports sections to be excluded. This parameter can be used
    # in conjunction with includeSections although in most cases using one or the other, but
    # not both, is sufficient.

    excludeSections = []

    # one_picture_per_article specifies that calibre should only use the first image
    # from an article (if one exists). If one_picture_per_article = True, the image
    # will be moved to a location between the headline and the byline.
    # If one_picture_per_article = False, all images from the article will be included
    # and shown in their original location.
    one_picture_per_article = False

    # The maximum number of articles that will be downloaded
    max_articles_per_feed = 100

    # Whether to omit duplicates of articles (typically arising when articles are indexed in
    # more than one section). If True, only the first occurrence will be downloaded.
    filterDuplicates = True

    # Sections to collect for the Web edition.
    # Delete any you don't want, or use includeSections or excludeSections
    web_sections = [(u'World', u'world'),
                    (u'U.S.', u'national'),
                    (u'Politics', u'politics'),
                    (u'New York', u'nyregion'),
                    (u'Business', u'business'),
                    (u'Technology', u'technology'),
                    (u'Sports', u'sports'),
                    (u'Science', u'science'),
                    (u'Health', u'health'),
                    (u'Opinion', u'opinion'),
                    (u'Arts', u'arts'),
                    (u'Books', u'books'),
                    (u'Movies', u'movies'),
                    (u'Music', u'arts/music'),
                    (u'Television', u'arts/television'),
                    (u'Style', u'style'),
                    (u'Dining & Wine', u'dining'),
                    (u'Fashion & Style', u'fashion'),
                    (u'Home & Garden', u'garden'),
                    (u'Travel', u'travel'),
                    (u'Education', u'education'),
                    (u'Multimedia', u'multimedia'),
                    (u'Obituaries', u'obituaries'),
                    (u'Sunday Magazine', u'magazine')
                    ]

    tech_feeds = [
        (u"Tech - Pogue's Posts", u'http://pogue.blogs.nytimes.com/feed/'),
        (u'Tech - Bits', u'http://bits.blogs.nytimes.com/feed/'),
        (u'Tech - Gadgetwise', u'http://gadgetwise.blogs.nytimes.com/feed/'),
        (u'Tech - Open', u'http://open.blogs.nytimes.com/feed/')
    ]

    if headlinesOnly:
        title = 'New York Times Headlines'
        description = 'Headlines from the New York Times'
        needs_subscription = False
    elif webEdition:
        title = 'New York Times (Web)'
        description = 'New York Times on the Web'
        needs_subscription = False
    elif replaceKindleVersion:
        title = 'The New York Times'
        description = 'Today\'s New York Times'
        needs_subscription = False
    else:
        title = 'New York Times'
        description = 'Today\'s New York Times'
        needs_subscription = False

    def decode_url_date(self, url):
        urlitems = url.split('/')
        try:
            d = date(int(urlitems[3]), int(urlitems[4]), int(urlitems[5]))
        except:
            try:
                d = date(int(urlitems[4]), int(urlitems[5]), int(urlitems[6]))
            except:
                return None
        return d

    if oldest_web_article is None:
        earliest_date = date.today()
    else:
        earliest_date = date.today() - timedelta(days=oldest_web_article)
    oldest_article = 365  # by default, a long time ago

    __author__ = 'GRiker/Kovid Goyal/Nick Redding'
    language = 'en'
    requires_version = (0, 7, 5)
    encoding = 'utf-8'

    timefmt = ''

    # simultaneous_downloads = 1    # no longer required to deal with ads

    cover_margins = (18, 18, 'grey99')

    remove_tags_before = dict(id='article')
    remove_tags_after = dict(id='article')
    remove_tags = [
        dict(attrs={'class': [
            'articleFooter',
            'articleTools',
            'columnGroup singleRule',
            'columnGroup last',
            'columnGroup last',
            'doubleRule',
            'dottedLine',
            'entry-meta',
            'entry-response module',
            'leftNavTabs',
            'metaFootnote',
            'module box nav',
            'nextArticleLink',
            'nextArticleLink clearfix',
            'post-tools',
            'relatedSearchesModule',
            'side_tool',
            'singleAd',
            'entry entry-utility',       # added for DealBook
            'entry-tags',                # added for DealBook
            'footer promos clearfix',    # added for DealBook
            'footer links clearfix',     # added for DealBook
            'tabsContainer',             # added for other blog downloads
            'column lastColumn',         # added for other blog downloads
            'pageHeaderWithLabel',       # added for other gadgetwise downloads
            'column two',                # added for other blog downloads
            'column two last',           # added for other blog downloads
            'column three',              # added for other blog downloads
            'column three last',         # added for other blog downloads
            'column four',               # added for other blog downloads
            'column four last',          # added for other blog downloads
            'column last',               # added for other blog downloads
            'entry entry-related',
            'subNavigation tabContent active',  # caucus blog navigation
            'mediaOverlay slideshow',
            'wideThumb',
            'video',                     # added 02-11-2011
            'videoHeader',               # added 02-11-2011
            'articleInlineVideoHolder',  # added 02-11-2011
            'assetCompanionAd',
            re.compile('^subNavigation'),
            re.compile('^leaderboard'),
            re.compile('^module'),
            re.compile('commentCount'),
            'credit'
        ]}),
        dict(name='div', attrs={'class': re.compile('toolsList')}),       # bits
        dict(name='div', attrs={'class': re.compile('postNavigation')}),  # bits
        dict(name='div', attrs={'class': 'tweet'}),
        dict(name='span', attrs={'class': 'commentCount meta'}),
        dict(name='div', attrs={'id': 'header'}),
        dict(name='div', attrs={'id': re.compile('commentsContainer')}),  # bits, pogue, gadgetwise, open
        dict(name='ul', attrs={'class': re.compile('entry-tools')}),      # pogue, gadgetwise
        dict(name='div', attrs={'class': re.compile('nocontent')}),       # pogue, gadgetwise
        dict(name='div', attrs={'id': re.compile('respond')}),            # open
        dict(name='div', attrs={'class': re.compile('entry-tags')}),      # pogue
        dict(id=[
            'adxLeaderboard',
            'adxSponLink',
            'archive',
            'articleExtras',
            'articleInline',
            'blog_sidebar',
            'businessSearchBar',
            'cCol',
            'entertainmentSearchBar',
            'footer',
            'header',
            'header_search',
            'inlineBox',
            'login',
            'masthead',
            'masthead-nav',
            'memberTools',
            'navigation',
            'portfolioInline',
            'readerReviews',
            'readerReviewsCount',
            'relatedArticles',
            'relatedTopics',
            'respond',
            'side_search',
            'side_index',
            'side_tool',
            'toolsRight',
            'skybox',           # added for DealBook
            'TopAd',            # added for DealBook
            'related-content',  # added for DealBook
        ]),
        dict(name=['script', 'noscript', 'style', 'form', 'hr'])]

    no_stylesheets = True

    extra_css = '''
        .articleHeadline { text-align: left; margin-top:0.5em; margin-bottom:0.25em; }
        .credit { font-weight: normal; text-align: right; font-size: 50%; line-height:1em; margin-top:5px; margin-left:0; margin-right:0; margin-bottom: 0; }
        .byline { text-align: left; font-size: 50%; line-height:1em; margin-top:10px; margin-left:0; margin-right:0; margin-bottom: 0; }
        .dateline { text-align: left; font-size: 50%; line-height:1em;margin-top:5px; margin-left:0; margin-right:0; margin-bottom: 0; }
        .kicker { font-size: 50%; line-height:1em;margin-top:5px; margin-left:0; margin-right:0; margin-bottom: 0; }
        .timestamp { font-weight: normal; text-align: left; font-size: 50%; }
        .caption { font-size: 50%; font-style:italic; line-height:1em; margin-top:5px; margin-left:0; margin-right:0; margin-bottom: 0; }
        a:link {text-decoration: none; }
        .date{font-size: 50%; }
        .update{font-size: 50%; }
        .articleBody { }
        .authorId {text-align: left; font-size: 50%; }
        .image {text-align: center;}
        .aside {color:blue;margin:0px 0px 0px 0px; padding: 0px 0px 0px 0px; font-size:100%;}
        .asidenote {color:blue;margin:0px 0px 0px 0px; padding: 0px 0px 0px 0px; font-size:100%;font-weight:bold;}
        .source {text-align: left; font-size: x-small; }'''

    articles = {}
    key = None
    ans = []
    url_list = []

    def filter_ans(self, ans):
        total_article_count = 0
        idx = 0
        idx_max = len(ans) - 1
        while idx <= idx_max:
            if self.includeSections != []:
                if ans[idx][0] not in self.includeSections:
                    print "SECTION NOT INCLUDED: ", ans[idx][0]
                    del ans[idx]
                    idx_max = idx_max - 1
                    continue
            if ans[idx][0] in self.excludeSections:
                print "SECTION EXCLUDED: ", ans[idx][0]
                del ans[idx]
                idx_max = idx_max - 1
                continue
            if True:  # self.verbose
                self.log("Section %s: %d articles" % (ans[idx][0], len(ans[idx][1])))
            for article in ans[idx][1]:
                total_article_count += 1
                if True:  # self.verbose
                    self.log("\t%-40.40s... \t%-60.60s..." %
                             (article['title'].encode('cp1252', 'replace'),
                              article['url'].encode('cp1252', 'replace')))
            idx = idx + 1
        self.log("Queued %d articles" % total_article_count)
        return ans

    def exclude_url(self, url):
        if not url.startswith("http"):
            return True
        if not url.endswith(".html") and 'dealbook.nytimes.com' not in url:  # added for DealBook
            return True
        if 'nytimes.com' not in url:
            return True
        if 'podcast' in url:
            return True
        if '/video/' in url:
            return True
        if '/slideshow/' in url:
            return True
        if '/magazine/index' in url:
            return True
        if '/interactive/' in url:
            return True
        if '/reference/' in url:
            return True
        if '/premium/' in url:
            return True
        return False

    def fixChars(self, string):
        # Replace lsquo (\x91)
        fixed = re.sub("\x91", "‘", string)
        # Replace rsquo (\x92)
        fixed = re.sub("\x92", "’", fixed)
        # Replace ldquo (\x93)
        fixed = re.sub("\x93", "“", fixed)
        # Replace rdquo (\x94)
        fixed = re.sub("\x94", "”", fixed)
        # Replace ndash (\x96)
        fixed = re.sub("\x96", "–", fixed)
        # Replace mdash (\x97)
        fixed = re.sub("\x97", "—", fixed)
        return fixed

    def get_browser(self):
        br = BasicNewsRecipe.get_browser()
        return br

    cover_tag = 'NY_NYT'

    def get_cover_url(self):
        cover = 'http://webmedia.newseum.org/newseum-multimedia/dfp/jpg' + str(date.today().day) + '/lg/' + self.cover_tag + '.jpg'
        br = BasicNewsRecipe.get_browser()
        daysback = 1
        try:
            br.open(cover)
        except:
            while daysback < 7:
                cover = 'http://webmedia.newseum.org/newseum-multimedia/dfp/jpg' + str((date.today() - timedelta(days=daysback)).day) + '/lg/' + self.cover_tag + '.jpg'
                br = BasicNewsRecipe.get_browser()
                try:
                    br.open(cover)
                except:
                    daysback = daysback + 1
                    continue
                break
        if daysback == 7:
            self.log("\nCover unavailable")
            cover = None
        return cover

    masthead_url = 'http://graphics8.nytimes.com/images/misc/nytlogo379x64.gif'

    def short_title(self):
        return self.title

    def article_to_soup(self, url_or_raw, raw=False):
        from contextlib import closing
        import copy
        from calibre.ebooks.chardet import xml_to_unicode
        print("ARTICLE_TO_SOUP " + url_or_raw)
        if re.match(r'\w+://', url_or_raw):
            br = self.clone_browser(self.browser)
            open_func = getattr(br, 'open_novisit', br.open)
            with closing(open_func(url_or_raw)) as f:
                _raw = f.read()
            if not _raw:
                raise RuntimeError('Could not fetch index from %s' % url_or_raw)
        else:
            _raw = url_or_raw
        if raw:
            return _raw
        if not isinstance(_raw, unicode) and self.encoding:
            if callable(self.encoding):
                _raw = self.encoding(_raw)
            else:
                _raw = _raw.decode(self.encoding, 'replace')
        nmassage = copy.copy(BeautifulSoup.MARKUP_MASSAGE)
        nmassage.extend(self.preprocess_regexps)
        # Some websites have buggy doctype declarations that mess up beautifulsoup
        nmassage += [(re.compile(r'<!DOCTYPE .+?>', re.DOTALL), lambda m: '')]
        # Remove comments as they can leave detritus when extracting tags leaves
        # multiple nested comments
        nmassage.append((re.compile(r'<!--.*?-->', re.DOTALL), lambda m: ''))
        usrc = xml_to_unicode(_raw, self.verbose, strip_encoding_pats=True)[0]
        usrc = self.preprocess_raw_html(usrc, url_or_raw)
        return BeautifulSoup(usrc, markupMassage=nmassage)

    def massageNCXText(self, description):
        # Kindle TOC descriptions won't render certain characters
        if description:
            massaged = unicode(BeautifulStoneSoup(description, convertEntities=BeautifulStoneSoup.HTML_ENTITIES))
            # Replace '&amp;' (and its numeric form) with a plain '&'
            massaged = re.sub("&amp;", "&", massaged)
            massaged = re.sub("&#38;", "&", massaged)
            return self.fixChars(massaged)
        else:
            return description

    def feed_title(self, div):
        return ''.join(div.findAll(text=True, recursive=True)).strip()

    def handle_article(self, div):
        thumbnail = div.find('div', 'thumbnail')
        if thumbnail:
            thumbnail.extract()
        a = div.find('a', href=True)
        if not a:
            return
        url = re.sub(r'\?.*', '', a['href'])
        if self.exclude_url(url):
            return
        url += '?pagewanted=all'
        if self.filterDuplicates:
            if url in self.url_list:
                return
        if self.webEdition:
            date_tag = self.decode_url_date(url)
            if date_tag is not None:
                if self.oldest_web_article is not None:
                    if date_tag < self.earliest_date:
                        self.log("Skipping article %s" % url)
                        return
            else:
                self.log("Skipping article %s" % url)
                return
        self.url_list.append(url)
        title = self.tag_to_string(a, use_alt=True).strip()
        description = ''
        pubdate = strftime('%a, %d %b')
        summary = div.find(True, attrs={'class': 'summary'})
        if summary:
            description = self.tag_to_string(summary, use_alt=False)
        author = ''
        authorAttribution = div.find(True, attrs={'class': 'byline'})
        if authorAttribution:
            author = self.tag_to_string(authorAttribution, use_alt=False)
        else:
            authorAttribution = div.find(True, attrs={'class': 'byline'})
            if authorAttribution:
                author = self.tag_to_string(authorAttribution, use_alt=False)
        feed = self.key if self.key is not None else 'Uncategorized'
        if not self.articles.has_key(feed):
            self.ans.append(feed)
            self.articles[feed] = []
        self.articles[feed].append(
            dict(title=title, url=url, date=pubdate,
                 description=description, author=author,
                 content=''))

    def get_popular_articles(self, ans):
        if self.getPopularArticles:
            popular_articles = {}
            key_list = []

            def handleh3(h3tag):
                try:
                    url = h3tag.a['href']
                except:
                    return ('', '', '', '')
                url = re.sub(r'\?.*', '', url)
                if self.exclude_url(url):
                    return ('', '', '', '')
                url += '?pagewanted=all'
                title = self.tag_to_string(h3tag.a, False)
                h6tag = h3tag.findNextSibling('h6')
                if h6tag is not None:
                    author = self.tag_to_string(h6tag, False)
                else:
                    author = ''
                ptag = h3tag.findNextSibling('p')
                if ptag is not None:
                    desc = self.tag_to_string(ptag, False)
                else:
                    desc = ''
                return (title, url, author, desc)

            have_emailed = False
            emailed_soup = self.index_to_soup('http://www.nytimes.com/most-popular-emailed?period=' + self.popularPeriod)
            for h3tag in emailed_soup.findAll('h3'):
                (title, url, author, desc) = handleh3(h3tag)
                if url == '':
                    continue
                if not have_emailed:
                    key_list.append('Most E-Mailed')
                    popular_articles['Most E-Mailed'] = []
                    have_emailed = True
                popular_articles['Most E-Mailed'].append(
                    dict(title=title, url=url, date=strftime('%a, %d %b'),
                         description=desc, author=author,
                         content=''))
            have_viewed = False
            viewed_soup = self.index_to_soup('http://www.nytimes.com/most-popular-viewed?period=' + self.popularPeriod)
            for h3tag in viewed_soup.findAll('h3'):
                (title, url, author, desc) = handleh3(h3tag)
                if url == '':
                    continue
                if not have_viewed:
                    key_list.append('Most Viewed')
                    popular_articles['Most Viewed'] = []
                    have_viewed = True
                popular_articles['Most Viewed'].append(
                    dict(title=title, url=url, date=strftime('%a, %d %b'),
                         description=desc, author=author,
                         content=''))
            viewed_ans = [(k, popular_articles[k]) for k in key_list if popular_articles.has_key(k)]
            for x in viewed_ans:
                ans.append(x)
        return ans

    def get_tech_feeds(self, ans):
        if self.getTechBlogs:
            tech_articles = {}
            key_list = []
            save_oldest_article = self.oldest_article
            save_max_articles_per_feed = self.max_articles_per_feed
            self.oldest_article = self.tech_oldest_article
            self.max_articles_per_feed = self.tech_max_articles_per_feed
            self.feeds = self.tech_feeds
            tech = self.parse_feeds()
            self.oldest_article = save_oldest_article
            self.max_articles_per_feed = save_max_articles_per_feed
            self.feeds = None
            for f in tech:
                key_list.append(f.title)
                tech_articles[f.title] = []
                for a in f.articles:
                    tech_articles[f.title].append(
                        dict(title=a.title, url=a.url, date=a.date,
                             description=a.summary, author=a.author,
                             content=a.content))
            tech_ans = [(k, tech_articles[k]) for k in key_list if tech_articles.has_key(k)]
            for x in tech_ans:
                ans.append(x)
        return ans

    def parse_web_edition(self):
        for (sec_title, index_url) in self.web_sections:
            if self.includeSections != []:
                if sec_title not in self.includeSections:
                    print "SECTION NOT INCLUDED: ", sec_title
                    continue
            if sec_title in self.excludeSections:
                print "SECTION EXCLUDED: ", sec_title
                continue
            try:
                soup = self.index_to_soup('http://www.nytimes.com/pages/' + index_url + '/index.html')
            except:
                continue
            print 'Index URL: ' + 'http://www.nytimes.com/pages/' + index_url + '/index.html'
            self.key = sec_title
            # Find each article
            for div in soup.findAll(True,
                                    attrs={'class': ['section-headline', 'ledeStory', 'story', 'story headline',
                                                     'sectionHeader', 'headlinesOnly multiline flush']}):
                if div['class'] in ['story', 'story headline', 'storyHeader']:
                    self.handle_article(div)
                elif div['class'] == 'ledeStory':
                    divsub = div.find('div', 'storyHeader')
                    if divsub is not None:
                        self.handle_article(divsub)
                    ulrefer = div.find('ul', 'refer')
                    if ulrefer is not None:
                        for lidiv in ulrefer.findAll('li'):
                            self.handle_article(lidiv)
                elif div['class'] == 'headlinesOnly multiline flush':
                    for lidiv in div.findAll('li'):
                        self.handle_article(lidiv)
        self.ans = [(k, self.articles[k]) for k in self.ans if self.articles.has_key(k)]
        return self.filter_ans(self.get_tech_feeds(self.get_popular_articles(self.ans)))

    def parse_todays_index(self):
        soup = self.index_to_soup('http://www.nytimes.com/pages/todayspaper/index.html')
        skipping = False
        # Find each article
        for div in soup.findAll(True,
                                attrs={'class': ['section-headline', 'story', 'story headline',
                                                 'sectionHeader', 'headlinesOnly multiline flush']}):
            if div['class'] in ['section-headline', 'sectionHeader']:
                self.key = string.capwords(self.feed_title(div))
                self.key = self.key.replace('Op-ed', 'Op-Ed')
                self.key = self.key.replace('U.s.', 'U.S.')
                self.key = self.key.replace('N.y.', 'N.Y.')
                skipping = False
                if self.includeSections != []:
                    if self.key not in self.includeSections:
                        print "SECTION NOT INCLUDED: ", self.key
                        skipping = True
                if self.key in self.excludeSections:
                    print "SECTION EXCLUDED: ", self.key
                    skipping = True
            elif div['class'] in ['story', 'story headline']:
                if not skipping:
                    self.handle_article(div)
            elif div['class'] == 'headlinesOnly multiline flush':
                for lidiv in div.findAll('li'):
                    if not skipping:
                        self.handle_article(lidiv)
        self.ans = [(k, self.articles[k]) for k in self.ans if self.articles.has_key(k)]
        return self.filter_ans(self.get_tech_feeds(self.get_popular_articles(self.ans)))

    def parse_headline_index(self):
        soup = self.index_to_soup('http://www.nytimes.com/pages/todaysheadlines/')
        # Fetch the content table
        content_table = soup.find('table', {'id': 'content'})
        if content_table is None:
            self.log("FATAL ERROR: CANNOT FIND CONTENT TABLE")
            return None
        # Within this table are entries, each containing one or more h6 tags which represent sections
        for td_col in content_table.findAll('td', {'id': re.compile('Column')}):
            for div_sec in td_col.findAll('div', recursive=False):
                for h6_sec_name in div_sec.findAll('h6', {'style': re.compile('text-transform: *uppercase')}):
                    section_name = self.tag_to_string(h6_sec_name, use_alt=False)
                    section_name = re.sub(r'^ *$', '', section_name)
                    if section_name == '':
                        continue
                    if self.includeSections != []:
                        if section_name not in self.includeSections:
                            print "SECTION NOT INCLUDED: ", section_name
                            continue
                    if section_name in self.excludeSections:
                        print "SECTION EXCLUDED: ", section_name
                        continue
",section_name continue section_name=string.capwords(section_name) section_name = section_name.replace('Op-ed','Op-Ed') section_name = section_name.replace('U.s.','U.S.') section_name = section_name.replace('N.y.','N.Y.') pubdate = strftime('%a, %d %b') search_div = div_sec for next_tag in h6_sec_name.findNextSiblings(True): if next_tag.__class__.__name__ == 'Tag': if next_tag.name == 'div': search_div = next_tag break # Get the articles for h3_item in search_div.findAll('h3'): byline = h3_item.h6 if byline is not None: author = self.tag_to_string(byline,use_alt=False) else: author = '' a = h3_item.find('a', href=True) if not a: continue url = re.sub(r'\?.*', '', a['href']) if self.exclude_url(url): continue url += '?pagewanted=all' if self.filterDuplicates: if url in self.url_list: continue self.url_list.append(url) title = self.tag_to_string(a, use_alt=True).strip() desc = h3_item.find('p') if desc is not None: description = self.tag_to_string(desc,use_alt=False) else: description = '' if not self.articles.has_key(section_name): self.ans.append(section_name) self.articles[section_name] = [] self.articles[section_name].append(dict(title=title, url=url, date=pubdate, description=description, author=author, content='')) self.ans = [(k, self.articles[k]) for k in self.ans if self.articles.has_key(k)] return self.filter_ans(self.get_tech_feeds(self.get_popular_articles(self.ans))) def parse_index(self): if self.headlinesOnly: return self.parse_headline_index() elif self.webEdition: return self.parse_web_edition() else: return self.parse_todays_index() def strip_anchors(self,soup,kill_all=False): paras = soup.findAll(True) for para in paras: aTags = para.findAll('a') for a in aTags: if a.img is None: if kill_all or (self.recursions==0): a.replaceWith(self.tag_to_string(a,False)) else: if a.has_key('href'): if a['href'].startswith('http://www.nytimes'): if not a['href'].endswith('pagewanted=all'): url = re.sub(r'\?.*', '', a['href']) if self.exclude_url(url): a.replaceWith(self.tag_to_string(a,False)) else: a['href'] = url+'?pagewanted=all' elif not (a['href'].startswith('http://pogue') or \ a['href'].startswith('http://bits') or \ a['href'].startswith('http://travel') or \ a['href'].startswith('http://business') or \ a['href'].startswith('http://tech') or \ a['href'].startswith('http://health') or \ a['href'].startswith('http://dealbook') or \ a['href'].startswith('http://open')): a.replaceWith(self.tag_to_string(a,False)) return soup def handle_tags(self,soup): try: print("HANDLE TAGS: TITLE = "+self.tag_to_string(soup.title)) except: print("HANDLE TAGS: NO TITLE") if soup is None: print("ERROR: handle_tags received NoneType") return None ## print("HANDLING AD FORWARD:") ## print(soup) if self.keep_only_tags: body = Tag(soup, 'body') try: if isinstance(self.keep_only_tags, dict): self.keep_only_tags = [self.keep_only_tags] for spec in self.keep_only_tags: for tag in soup.find('body').findAll(**spec): body.insert(len(body.contents), tag) soup.find('body').replaceWith(body) except AttributeError: # soup has no body element pass def remove_beyond(tag, next): while tag is not None and getattr(tag, 'name', None) != 'body': after = getattr(tag, next) while after is not None: ns = getattr(tag, next) after.extract() after = ns tag = tag.parent if self.remove_tags_after is not None: rt = [self.remove_tags_after] if isinstance(self.remove_tags_after, dict) else self.remove_tags_after for spec in rt: tag = soup.find(**spec) remove_beyond(tag, 'nextSibling') if self.remove_tags_before is not None: tag = 
            tag = soup.find(**self.remove_tags_before)
            remove_beyond(tag, 'previousSibling')

        for kwds in self.remove_tags:
            for tag in soup.findAll(**kwds):
                tag.extract()

        return soup

    def preprocess_html(self, soup):
        # print(strftime("%H:%M:%S")+" -- PREPROCESS TITLE="+self.tag_to_string(soup.title))
        skip_tag = soup.find(True, {'name': 'skip'})
        if skip_tag is not None:
            # url = 'http://www.nytimes.com' + re.sub(r'\?.*', '', skip_tag.parent['href'])
            url = 'http://www.nytimes.com' + skip_tag.parent['href']
            # url += '?pagewanted=all'
            self.log.warn("Skipping ad to article at '%s'" % url)
            sleep(5)
            soup = self.handle_tags(self.article_to_soup(url))

        # check if the article is from one of the tech blogs
        blog = soup.find('div', attrs={'id': ['pogue', 'bits', 'gadgetwise', 'open']})

        if blog is not None:
            old_body = soup.find('body')
            new_body = Tag(soup, 'body')
            new_body.append(soup.find('div', attrs={'id': 'content'}))
            new_body.find('div', attrs={'id': 'content'})['id'] = 'blogcontent'  # identify for postprocess_html
            old_body.replaceWith(new_body)
            for divr in soup.findAll('div', attrs={'class': re.compile('w190 right')}):
                if divr.find(text=re.compile('Sign up')):
                    divr.extract()
            divr = soup.find('div', attrs={'id': re.compile('related-content')})
            if divr is not None:
                # handle related articles
                rlist = []
                ul = divr.find('ul')
                if ul is not None:
                    for li in ul.findAll('li'):
                        atag = li.find('a')
                        if atag is not None:
                            if atag['href'].startswith('http://pogue') or atag['href'].startswith('http://bits') or \
                                    atag['href'].startswith('http://open'):
                                atag.find(text=True).replaceWith(self.massageNCXText(self.tag_to_string(atag, False)))
                                rlist.append(atag)
                divr.extract()
                if rlist != []:
                    asidediv = Tag(soup, 'div', [('class', 'aside')])
                    if soup.find('hr') is None:
                        asidediv.append(Tag(soup, 'hr'))
                    h4 = Tag(soup, 'h4', [('class', 'asidenote')])
                    h4.insert(0, "Related Posts")
                    asidediv.append(h4)
                    ul = Tag(soup, 'ul')
                    for r in rlist:
                        li = Tag(soup, 'li', [('class', 'aside')])
                        r['class'] = 'aside'
                        li.append(r)
                        ul.append(li)
                    asidediv.append(ul)
                    asidediv.append(Tag(soup, 'hr'))
                    smain = soup.find('body')
                    smain.append(asidediv)
            for atag in soup.findAll('a'):
                img = atag.find('img')
                if img is not None:
                    atag.replaceWith(img)
                elif not atag.has_key('href'):
                    atag.replaceWith(atag.renderContents().decode('cp1252', 'replace'))
                elif not (atag['href'].startswith('http://www.nytimes') or atag['href'].startswith('http://pogue') or
                          atag['href'].startswith('http://bits') or atag['href'].startswith('http://open')):
                    atag.replaceWith(atag.renderContents().decode('cp1252', 'replace'))
            hdr = soup.find('address')
            if hdr is not None:
                hdr.name = 'span'
            for span_credit in soup.findAll('span', 'credit'):
                sp = Tag(soup, 'span')
                span_credit.replaceWith(sp)
                sp.append(Tag(soup, 'br'))
                sp.append(span_credit)
                sp.append(Tag(soup, 'br'))

        else:  # nytimes article
            related = []          # these will be the related articles
            first_outer = None    # first related outer tag
            first_related = None  # first related tag
            for outerdiv in soup.findAll(attrs={'class': re.compile('articleInline runaroundLeft')}):
                for rdiv in soup.findAll('div', 'columnGroup doubleRule'):
                    if rdiv.find('h3') is not None:
                        if self.tag_to_string(rdiv.h3, False).startswith('Related'):
                            rdiv.h3.find(text=True).replaceWith("Related articles")
                            rdiv.h3['class'] = 'asidenote'
                            for litag in rdiv.findAll('li'):
                                if litag.find('a') is not None:
                                    if litag.find('a')['href'].startswith('http://www.nytimes.com'):
                                        url = re.sub(r'\?.*', '', litag.find('a')['href'])
                                        litag.find('a')['href'] = url + '?pagewanted=all'
                                        litag.extract()
                                        related.append(litag)
                                        if first_related is None:
                                            first_related = rdiv
                                            first_outer = outerdiv
                                    else:
                                        litag.extract()
            if related != []:
                for r in related:
                    if r.h6:  # don't want the anchor inside a h6 tag
                        r.h6.replaceWith(r.h6.a)
                    first_related.ul.append(r)
                first_related.insert(0, Tag(soup, 'hr'))
                first_related.append(Tag(soup, 'hr'))
                first_related['class'] = 'aside'
                first_outer.replaceWith(first_related)  # replace the outer tag with the related tag
                for rdiv in soup.findAll(attrs={'class': re.compile('articleInline runaroundLeft')}):
                    rdiv.extract()

            kicker_tag = soup.find(attrs={'class': 'kicker'})
            if kicker_tag:  # remove Op-Ed author head shots
                tagline = self.tag_to_string(kicker_tag)
                if tagline == 'Op-Ed Columnist':
                    img_div = soup.find('div', 'inlineImage module')
                    if img_div:
                        img_div.extract()

            if self.useHighResImages:
                try:
                    # open up all the "Enlarge this Image" pop-ups and download the full resolution jpegs
                    enlargeThisList = soup.findAll('div', {'class': 'icon enlargeThis'})
                    if enlargeThisList:
                        for popupref in enlargeThisList:
                            popupreflink = popupref.find('a')
                            if popupreflink:
                                reflinkstring = str(popupreflink['href'])
                                refstart = reflinkstring.find("javascript:pop_me_up2('") + len("javascript:pop_me_up2('")
                                refend = reflinkstring.find(".html", refstart) + len(".html")
                                reflinkstring = reflinkstring[refstart:refend]
                                popuppage = self.browser.open(reflinkstring)
                                popuphtml = popuppage.read()
                                popuppage.close()
                                if popuphtml:
                                    st = time.localtime()
                                    year = str(st.tm_year)
                                    month = "%.2d" % st.tm_mon
                                    day = "%.2d" % st.tm_mday
                                    imgstartpos = popuphtml.find('http://graphics8.nytimes.com/images/' + year + '/' + month + '/' + day + '/') + \
                                        len('http://graphics8.nytimes.com/images/' + year + '/' + month + '/' + day + '/')
                                    highResImageLink = 'http://graphics8.nytimes.com/images/' + year + '/' + month + '/' + day + '/' + \
                                        popuphtml[imgstartpos:popuphtml.find('.jpg', imgstartpos) + 4]
                                    popupSoup = BeautifulSoup(popuphtml)
                                    highResTag = popupSoup.find('img', {'src': highResImageLink})
                                    if highResTag:
                                        imageTag = None  # guard: the lookup below may fail before imageTag is set
                                        try:
                                            newWidth = highResTag['width']
                                            newHeight = highResTag['height']
                                            imageTag = popupref.parent.find("img")
                                        except:
                                            self.log("Error: finding width and height of img")
                                        popupref.extract()
                                        if imageTag:
                                            try:
                                                imageTag['src'] = highResImageLink
                                                imageTag['width'] = newWidth
                                                imageTag['height'] = newHeight
                                            except:
                                                self.log("Error setting the src width and height parameters")
                except Exception:
                    self.log("Error pulling high resolution images")

                try:
                    # in case pulling images failed, delete the enlarge this text
                    enlargeThisList = soup.findAll('div', {'class': 'icon enlargeThis'})
                    if enlargeThisList:
                        for popupref in enlargeThisList:
                            popupref.extract()
                except:
                    self.log("Error removing Enlarge this text")

        return self.strip_anchors(soup, False)

    def postprocess_html(self, soup, first_fetch):
        if not first_fetch:  # remove Related links
            for aside in soup.findAll('div', 'aside'):
                aside.extract()
            soup = self.strip_anchors(soup, True)
            # print("RECURSIVE: "+self.tag_to_string(soup.title))

        if soup.find('div', attrs={'id': 'blogcontent'}) is None:
            if first_fetch:
                aside = soup.find('div', 'aside')
                if aside is not None:  # move the related list to the end of the article
                    art = soup.find('div', attrs={'id': 'article'})
                    if art is None:
                        art = soup.find('div', attrs={'class': 'article'})
                    if art is not None:
                        art.append(aside)

        try:
            if self.one_picture_per_article:
                # Remove all images after first
                largeImg = soup.find(True, {'class': 'articleSpanImage'})
                inlineImgs = soup.findAll(True, {'class': 'inlineImage module'})
                if largeImg:
                    for inlineImg in inlineImgs:
                        inlineImg.extract()
                else:
                    if inlineImgs:
                        firstImg = inlineImgs[0]
                        for inlineImg in inlineImgs[1:]:
                            inlineImg.extract()
                        # Move firstImg before article body
                        cgFirst = soup.find(True, {'class': re.compile('columnGroup *first')})
                        if cgFirst:
                            # Strip all sibling NavigableStrings: noise
                            navstrings = cgFirst.findAll(text=True, recursive=False)
                            [ns.extract() for ns in navstrings]
                            headline_found = False
                            tag = cgFirst.find(True)
                            insertLoc = 0
                            while True:
                                insertLoc += 1
                                if hasattr(tag, 'class') and tag['class'] == 'articleHeadline':
                                    headline_found = True
                                    break
                                tag = tag.nextSibling
                                if not tag:
                                    headline_found = False
                                    break
                            if headline_found:
                                cgFirst.insert(insertLoc, firstImg)
                        else:
                            self.log(">>> No class:'columnGroup first' found <<<")
        except:
            self.log("ERROR: One picture per article in postprocess_html")

        try:
            # Change captions to italic
            for caption in soup.findAll(True, {'class': 'caption'}):
                if caption and len(caption) > 0:
                    cTag = Tag(soup, "p", [("class", "caption")])
                    c = self.fixChars(self.tag_to_string(caption, use_alt=False)).strip()
                    mp_off = c.find("More Photos")
                    if mp_off >= 0:
                        c = c[:mp_off]
                    cTag.insert(0, c)
                    caption.replaceWith(cTag)
        except:
            self.log("ERROR: Problem in change captions to italic")

        try:
            # Change <nyt_headline> to <h2>
            h1 = soup.find('h1')
            blogheadline = str(h1)  # added for DealBook
            if h1:
                headline = h1.find("nyt_headline")
                if headline:
                    tag = Tag(soup, "h2")
                    tag['class'] = "headline"
                    tag.insert(0, self.fixChars(headline.contents[0]))
                    h1.replaceWith(tag)
                elif blogheadline.find('entry-title'):          # added for DealBook
                    tag = Tag(soup, "h2")                        # added for DealBook
                    tag['class'] = "headline"                    # added for DealBook
                    tag.insert(0, self.fixChars(h1.contents[0]))  # added for DealBook
                    h1.replaceWith(tag)                          # added for DealBook
            else:
                # Blog entry - replace headline, remove <hr> tags - BCC I think this is no longer functional 1-18-2011
                headline = soup.find('title')
                if headline:
                    tag = Tag(soup, "h2")
                    tag['class'] = "headline"
                    tag.insert(0, self.fixChars(self.tag_to_string(headline, False)))
                    soup.insert(0, tag)
                    hrs = soup.findAll('hr')
                    for hr in hrs:
                        hr.extract()
        except:
            self.log("ERROR: Problem in Change <nyt_headline> to <h2>")

        try:
            # if this is from a blog (DealBook), fix the byline format
            bylineauthor = soup.find('address', attrs={'class': 'byline author vcard'})
            if bylineauthor:
                tag = Tag(soup, "h6")
                tag['class'] = "byline"
                tag.insert(0, self.fixChars(self.tag_to_string(bylineauthor, False)))
                bylineauthor.replaceWith(tag)
        except:
            self.log("ERROR: fixing byline author format")

        try:
            # if this is a blog (DealBook), fix the credit style for the pictures
            blogcredit = soup.find('div', attrs={'class': 'credit'})
            if blogcredit:
                tag = Tag(soup, "h6")
                tag['class'] = "credit"
                tag.insert(0, self.fixChars(self.tag_to_string(blogcredit, False)))
                blogcredit.replaceWith(tag)
        except:
            self.log("ERROR: fixing credit format")

        try:
            # Change <h1> to <h3> - used in editorial blogs
            masthead = soup.find("h1")
            if masthead:
                # Nuke the href
                if masthead.a:
                    del(masthead.a['href'])
                tag = Tag(soup, "h3")
                tag.insert(0, self.fixChars(masthead.contents[0]))
                masthead.replaceWith(tag)
        except:
            self.log("ERROR: Problem in Change <h1> to <h3> - used in editorial blogs")

        try:
            # Change <span class="bold"> to <b>
            for subhead in soup.findAll(True, {'class': 'bold'}):
                if subhead.contents:
                    bTag = Tag(soup, "b")
                    bTag.insert(0, subhead.contents[0])
                    subhead.replaceWith(bTag)
        except:
            self.log("ERROR: Problem in Change <span class=bold> to <b> - used in editorial blogs")

        try:
            # remove the update tag
            blogupdated = soup.find('span', {'class': 'update'})
            if blogupdated:
                blogupdated.replaceWith("")
        except:
            self.log("ERROR: Removing strong tag")

        try:
            divTag = soup.find('div', attrs={'id': 'articleBody'})
            if divTag:
                divTag['class'] = divTag['id']
        except:
            self.log("ERROR: Problem in soup.find(div,attrs={id:articleBody})")

        try:
            # Add class="authorId" to <div> so we can format with CSS
            divTag = soup.find('div', attrs={'id': 'authorId'})
            if divTag and divTag.contents[0]:
                tag = Tag(soup, "p")
                tag['class'] = "authorId"
                tag.insert(0, self.fixChars(self.tag_to_string(divTag.contents[0], use_alt=False)))
                divTag.replaceWith(tag)
        except:
            self.log("ERROR: Problem in Add class=authorId to <div> so we can format with CSS")

        # print(strftime("%H:%M:%S")+" -- POSTPROCESS TITLE="+self.tag_to_string(soup.title))
        return soup

    def populate_article_metadata(self, article, soup, first):
        if not first:
            return
        idxdiv = soup.find('div', attrs={'class': 'articleSpanImage'})
        if idxdiv is not None:
            if idxdiv.img:
                self.add_toc_thumbnail(article, re.sub(r'links\\link\d+\\', '', idxdiv.img['src']))
        else:
            img = soup.find('body').find('img')
            if img is not None:
                self.add_toc_thumbnail(article, re.sub(r'links\\link\d+\\', '', img['src']))
        shortparagraph = ""
        try:
            if len(article.text_summary.strip()) == 0:
                articlebodies = soup.findAll('div', attrs={'class': 'articleBody'})
                if articlebodies:
                    for articlebody in articlebodies:
                        if articlebody:
                            paras = articlebody.findAll('p')
                            for p in paras:
                                refparagraph = self.massageNCXText(self.tag_to_string(p, use_alt=False)).strip()
                                # account for blank paragraphs and short paragraphs by appending them to longer ones
                                if len(refparagraph) > 0:
                                    if len(refparagraph) > 70:  # approximately one line of text
                                        newpara = shortparagraph + refparagraph
                                        newparaDateline, newparaEm, newparaDesc = newpara.partition('&mdash;')
                                        if newparaEm == '':
                                            newparaDateline, newparaEm, newparaDesc = newpara.partition('—')
                                            if newparaEm == '':
                                                newparaDesc = newparaDateline
                                        article.summary = article.text_summary = newparaDesc.strip()
                                        return
                                    else:
                                        shortparagraph = refparagraph + " "
                                        if shortparagraph.strip().find(" ") == -1 and not shortparagraph.strip().endswith(":"):
                                            shortparagraph = shortparagraph + "- "
            else:
                article.summary = article.text_summary = self.massageNCXText(article.text_summary)
        except:
            self.log("Error creating article descriptions")
            return