diff --git a/resources/content_server/browse/browse.html b/resources/content_server/browse/browse.html index 4acc15f3ea..e216b68ba8 100644 --- a/resources/content_server/browse/browse.html +++ b/resources/content_server/browse/browse.html @@ -58,7 +58,7 @@ diff --git a/resources/content_server/button-donate.png b/resources/content_server/button-donate.png new file mode 100644 index 0000000000..25ccf3f514 Binary files /dev/null and b/resources/content_server/button-donate.png differ diff --git a/resources/default_tweaks.py b/resources/default_tweaks.py index 270b7e0b06..0f570bab40 100644 --- a/resources/default_tweaks.py +++ b/resources/default_tweaks.py @@ -203,3 +203,11 @@ content_server_wont_display = [''] # level sorts, and if you are seeing a slowdown, reduce the value of this tweak. maximum_resort_levels = 5 +# Absolute path to a TTF font file to use as the font for the title and author +# when generating a default cover. Useful if the default font (Liberation +# Serif) does not contain glyphs for the language of the books in your library. +generate_cover_title_font = None + +# Absolute path to a TTF font file to use as the font for the footer in the +# default cover +generate_cover_foot_font = None diff --git a/resources/recipes/cubadebate.recipe b/resources/recipes/cubadebate.recipe index 88d06d412d..f8887b2672 100644 --- a/resources/recipes/cubadebate.recipe +++ b/resources/recipes/cubadebate.recipe @@ -1,9 +1,7 @@ -#!/usr/bin/env python - __license__ = 'GPL v3' -__copyright__ = '2009, Darko Miletic ' +__copyright__ = '2009-2010, Darko Miletic ' ''' -newyorker.com +cubadebate.cu ''' from calibre.web.feeds.news import BasicNewsRecipe @@ -13,32 +11,44 @@ class CubaDebate(BasicNewsRecipe): __author__ = 'Darko Miletic' description = 'Contra el Terorismo Mediatico' oldest_article = 15 - language = 'es' - + language = 'es' max_articles_per_feed = 100 no_stylesheets = True use_embedded_content = False publisher = 'Cubadebate' category = 'news, politics, Cuba' encoding = 'utf-8' - extra_css = ' #BlogTitle{font-size: x-large; font-weight: bold} ' + masthead_url = 'http://www.cubadebate.cu/wp-content/themes/cubadebate/images/logo.gif' + publication_type = 'newsportal' + extra_css = """ + #BlogTitle{font-size: xx-large; font-weight: bold} + body{font-family: Verdana, Arial, Tahoma, sans-serif} + """ conversion_options = { 'comments' : description ,'tags' : category - ,'language' : 'es' + ,'language' : language ,'publisher' : publisher - ,'pretty_print': True } keep_only_tags = [dict(name='div', attrs={'id':'Outline'})] remove_tags_after = dict(name='div',attrs={'id':'BlogContent'}) - remove_tags = [dict(name='link')] + remove_tags = [ + dict(name=['link','base','embed','object','meta','iframe']) + ,dict(attrs={'id':'addthis_container'}) + ] feeds = [(u'Articulos', u'http://www.cubadebate.cu/feed/')] - + remove_attributes=['width','height','lang'] + def print_version(self, url): return url + 'print/' def preprocess_html(self, soup): - return self.adeify_images(soup) + for item in soup.findAll(style=True): + del item['style'] + for item in soup.findAll('img'): + if not item.has_key('alt'): + item['alt'] = 'image' + return soup diff --git a/resources/recipes/ming_pao.recipe b/resources/recipes/ming_pao.recipe new file mode 100644 index 0000000000..6a61405698 --- /dev/null +++ b/resources/recipes/ming_pao.recipe @@ -0,0 +1,64 @@ +cense__ = 'GPL v3' +__copyright__ = '2010, Eddie Lau' +''' +modified from Singtao Toronto calibre recipe by rty +''' + +import datetime +from calibre.web.feeds.recipes import BasicNewsRecipe + +class AdvancedUserRecipe1278063072(BasicNewsRecipe): + title = 'Ming Pao - Hong Kong' + oldest_article = 1 + max_articles_per_feed = 100 + __author__ = 'Eddie Lau' + description = 'Hong Kong Chinese Newspaper' + publisher = 'news.mingpao.com' + category = 'Chinese, News, Hong Kong' + remove_javascript = True + use_embedded_content = False + no_stylesheets = True + language = 'zh' + encoding = 'Big5-HKSCS' + recursions = 0 + conversion_options = {'linearize_tables':True} + masthead_url = 'http://news.mingpao.com/image/portals_top_logo_news.gif' + + keep_only_tags = [dict(name='h1'), + dict(attrs={'id':['newscontent01','newscontent02']})] + + def get_fetchdate(self): + dt_utc = datetime.datetime.utcnow() + # convert UTC to local hk time + dt_local = dt_utc - datetime.timedelta(-8.0/24) + return dt_local.strftime("%Y%m%d") + + def parse_index(self): + feeds = [] + dateStr = self.get_fetchdate() + for title, url in [(u'\u8981\u805e Headline', 'http://news.mingpao.com/' + dateStr + '/gaindex.htm'), (u'\u6559\u80b2 Education', 'http://news.mingpao.com/' + dateStr + '/gfindex.htm'), (u'\u6e2f\u805e Local', 'http://news.mingpao.com/' + dateStr + '/gbindex.htm'), (u'\u793e\u8a55\u2027\u7b46\u9663 Editorial', 'http://news.mingpao.com/' + dateStr + '/mrindex.htm'), (u'\u8ad6\u58c7 Forum', 'http://news.mingpao.com/' + dateStr + '/faindex.htm'), (u'\u4e2d\u570b China', 'http://news.mingpao.com/' + dateStr + '/caindex.htm'), (u'\u570b\u969b World', 'http://news.mingpao.com/' + dateStr + '/taindex.htm'), ('Tech News', 'http://news.mingpao.com/' + dateStr + '/naindex.htm'), (u'\u9ad4\u80b2 Sport', 'http://news.mingpao.com/' + dateStr + '/spindex.htm'), (u'\u526f\u520a Supplement', 'http://news.mingpao.com/' + dateStr + '/jaindex.htm'),]: + articles = self.parse_section(url) + if articles: + feeds.append((title, articles)) + return feeds + + def parse_section(self, url): + dateStr = self.get_fetchdate() + soup = self.index_to_soup(url) + divs = soup.findAll(attrs={'class': ['bullet']}) + current_articles = [] + for i in divs: + a = i.find('a', href = True) + title = self.tag_to_string(a) + url = a.get('href', False) + url = 'http://news.mingpao.com/' + dateStr + '/' +url + current_articles.append({'title': title, 'url': url, 'description':''}) + return current_articles + + def preprocess_html(self, soup): + for item in soup.findAll(style=True): + del item['style'] + for item in soup.findAll(width=True): + del item['width'] + return soup + diff --git a/resources/recipes/nytimes_sub.recipe b/resources/recipes/nytimes_sub.recipe index 1814132667..5452ae1c6e 100644 --- a/resources/recipes/nytimes_sub.recipe +++ b/resources/recipes/nytimes_sub.recipe @@ -4,149 +4,79 @@ __license__ = 'GPL v3' __copyright__ = '2008, Kovid Goyal ' ''' nytimes.com -V5 - One picture per article, moved to top: -Headline -Image -Byline -Story ''' -import re, string, time +import string, re, time from calibre import strftime from calibre.web.feeds.recipes import BasicNewsRecipe -from calibre.ebooks.BeautifulSoup import BeautifulStoneSoup, NavigableString, Tag +from calibre.ebooks.BeautifulSoup import BeautifulSoup + +def decode(self, src): + enc = 'utf-8' + if 'iso-8859-1' in src: + enc = 'cp1252' + return src.decode(enc, 'ignore') class NYTimes(BasicNewsRecipe): - title = 'The New York Times' - __author__ = 'GRiker' + title = u'New York Times' + __author__ = 'Kovid Goyal/Nick Redding' language = 'en' - requires_version = (0, 7, 5) + requires_version = (0, 6, 36) description = 'Daily news from the New York Times (subscription version)' - allSectionKeywords = ['The Front Page', 'International','National','Obituaries','Editorials', - 'New York','Business Day','Science Times','Sports','Dining','Arts', - 'Home','Styles','Sunday Business','Week In Review','Travel','Magazine', - 'Book Review','Weddings','Real Estate','Automobiles',"T Men's Fashion", - "T Women's Fashion"] - - # List of sections to exclude - # To add a section, copy the section name from the allSectionKeywords list above - # For example, to exclude 'Dining' and 'Weddings': - #excludeSectionKeywords = ['Dining','Weddings'] - excludeSectionKeywords = [] - - # List of sections to include (test and debug only) - # By default, any sections in today's paper that are not listed in excludeSectionKeywords - # are downloaded. fetch_only specifies that only certain sections are to be downloaded. - # This should only be used for testing and debugging. - # For example, to download only 'The Front Page' section: - # fetch_only = set(['The Front Page']) - fetch_only = set([]) - if fetch_only: - excludeSectionKeywords = list(set(allSectionKeywords) ^ fetch_only) - - # one_picture_per_article specifies that calibre should only use the first image - # from an article (if one exists). If one_picture_per_article = True, the image - # will be moved to a location between the headline and the byline. - # If one_picture_per_article = False, all images from the article will be included - # and shown in their original location. - one_picture_per_article = True - - timefmt = '' + timefmt = ' [%b %d]' needs_subscription = True remove_tags_before = dict(id='article') remove_tags_after = dict(id='article') - remove_tags = [dict(attrs={'class':[ - 'articleFooter', - 'articleTools', - 'columnGroup doubleRule', - 'columnGroup singleRule', - 'columnGroup last', - 'columnGroup last', - 'doubleRule', - 'dottedLine', - 'entry-meta', - 'entry-response module', - 'icon enlargeThis', - 'leftNavTabs', - 'module box nav', - 'nextArticleLink', - 'nextArticleLink clearfix', - 'post-tools', - 'relatedSearchesModule', - 'side_tool', - 'singleAd', - 'subNavigation clearfix', - 'subNavigation tabContent active', - 'subNavigation tabContent active clearfix', - ]}), - dict(id=[ - 'adxLeaderboard', - 'archive', - 'articleExtras', - 'articleInline', - 'blog_sidebar', - 'businessSearchBar', - 'cCol', - 'entertainmentSearchBar', - 'footer', - 'header', - 'header_search', - 'login', - 'masthead', - 'masthead-nav', - 'memberTools', - 'navigation', - 'portfolioInline', - 'relatedArticles', - 'respond', - 'side_search', - 'side_index', - 'side_tool', - 'toolsRight', - ]), - dict(name=['script', 'noscript', 'style'])] - masthead_url = 'http://graphics8.nytimes.com/images/misc/nytlogo379x64.gif' - cover_margins = (18,18,'grey99') + remove_tags = [dict(attrs={'class':['articleTools', 'post-tools', 'side_tool','nextArticleLink', + 'nextArticleLink clearfix','columnGroup doubleRule','doubleRule','entry-meta', + 'icon enlargeThis','columnGroup last','relatedSearchesModule']}), + dict({'class':re.compile('^subNavigation')}), + dict({'class':re.compile('^leaderboard')}), + dict({'class':re.compile('^module')}), + dict({'class':'metaFootnote'}), + dict(id=['inlineBox','footer', 'toolsRight', 'articleInline','login','masthead', + 'navigation', 'archive', 'side_search', 'blog_sidebar','cCol','portfolioInline', + 'side_tool', 'side_index','header','readerReviewsCount','readerReviews', + 'relatedArticles', 'relatedTopics', 'adxSponLink']), + dict(name=['script', 'noscript', 'style','form','hr'])] + encoding = decode no_stylesheets = True - extra_css = '.headline {text-align: left;}\n \ - .byline {font-family: monospace; \ - text-align: left; \ - margin-top: 0px; \ - margin-bottom: 0px;}\n \ - .dateline {font-size: small; \ - margin-top: 0px; \ - margin-bottom: 0px;}\n \ - .timestamp {font-size: small; \ - margin-top: 0px; \ - margin-bottom: 0px;}\n \ - .source {text-align: left;}\n \ - .image {text-align: center;}\n \ - .credit {text-align: right; \ - font-size: small; \ - margin-top: 0px; \ - margin-bottom: 0px;}\n \ - .articleBody {text-align: left;}\n \ - .authorId {text-align: left; \ - font-style: italic;}\n ' + extra_css = ''' + .articleHeadline { margin-top:0.5em; margin-bottom:0.25em; } + .credit { font-size: small; font-style:italic; line-height:1em; margin-top:5px; margin-left:0; margin-right:0; margin-bottom: 0; } + .byline { font-size: small; font-style:italic; line-height:1em; margin-top:10px; margin-left:0; margin-right:0; margin-bottom: 0; } + .dateline { font-size: small; line-height:1em;margin-top:5px; margin-left:0; margin-right:0; margin-bottom: 0; } + .kicker { font-size: small; line-height:1em;margin-top:5px; margin-left:0; margin-right:0; margin-bottom: 0; } + .timestamp { font-size: small; } + .caption { font-size: small; line-height:1em; margin-top:5px; margin-left:0; margin-right:0; margin-bottom: 0; } + a:link {text-decoration: none; }''' def get_browser(self): br = BasicNewsRecipe.get_browser() if self.username is not None and self.password is not None: - try: - br.open('http://www.nytimes.com/auth/login') - br.select_form(name='login') - br['USERID'] = self.username - br['PASSWORD'] = self.password - raw = br.submit().read() - if 'Sorry, we could not find the combination you entered. Please try again.' in raw: - raise Exception('Your username and password are incorrect') - #open('/t/log.html', 'wb').write(raw) - except: - self.log("\nFailed to login") - + br.open('http://www.nytimes.com/auth/login') + br.select_form(name='login') + br['USERID'] = self.username + br['PASSWORD'] = self.password + raw = br.submit().read() + if 'Sorry, we could not find the combination you entered. Please try again.' in raw: + raise Exception('Your username and password are incorrect') + #open('/t/log.html', 'wb').write(raw) return br + def get_masthead_url(self): + masthead = 'http://graphics8.nytimes.com/images/misc/nytlogo379x64.gif' + #masthead = 'http://members.cox.net/nickredding/nytlogo.gif' + br = BasicNewsRecipe.get_browser() + try: + br.open(masthead) + except: + self.log("\nMasthead unavailable") + masthead = None + return masthead + + def get_cover_url(self): cover = None st = time.localtime() @@ -162,316 +92,101 @@ class NYTimes(BasicNewsRecipe): cover = None return cover - def get_masthead_title(self): - return self.title - - def dump_ans(self, ans): - total_article_count = 0 - for section in ans : - if self.verbose: - self.log("section %s: %d articles" % (section[0], len(section[1])) ) - for article in section[1]: - total_article_count += 1 - if self.verbose: - self.log("\t%-40.40s... \t%-60.60s..." % (article['title'].encode('mac-roman','replace'), - article['url'].encode('mac-roman','replace'))) - self.log( "Queued %d articles" % total_article_count ) - - def dump_hex(self, src, length=16): - ''' Diagnostic ''' - FILTER=''.join([(len(repr(chr(x)))==3) and chr(x) or '.' for x in range(256)]) - N=0; result='' - while src: - s,src = src[:length],src[length:] - hexa = ' '.join(["%02X"%ord(x) for x in s]) - s = s.translate(FILTER) - result += "%04X %-*s %s\n" % (N, length*3, hexa, s) - N+=length - print result - - def fixChars(self,string): - # Replace lsquo (\x91) - fixed = re.sub("\x91","‘",string) - - # Replace rsquo (\x92) - fixed = re.sub("\x92","’",fixed) - - # Replace ldquo (\x93) - fixed = re.sub("\x93","“",fixed) - - # Replace rdquo (\x94) - fixed = re.sub("\x94","”",fixed) - - # Replace ndash (\x96) - fixed = re.sub("\x96","–",fixed) - - # Replace mdash (\x97) - fixed = re.sub("\x97","—",fixed) - - return fixed - - def massageNCXText(self, description): - # Kindle TOC descriptions won't render certain characters - if description: - massaged = unicode(BeautifulStoneSoup(description, convertEntities=BeautifulStoneSoup.HTML_ENTITIES)) - # Replace '&' with '&' - massaged = re.sub("&","&", massaged) - return self.fixChars(massaged) - else: - return description + def short_title(self): + return 'New York Times' def parse_index(self): + self.encoding = 'cp1252' soup = self.index_to_soup('http://www.nytimes.com/pages/todayspaper/index.html') + self.encoding = decode def feed_title(div): - return ''.join(div.findAll(text=True, recursive=False)).strip() + return ''.join(div.findAll(text=True, recursive=True)).strip() articles = {} key = None ans = [] - # Find each instance of class="section-headline", class="story", class="story headline" - for div in soup.findAll(True, - attrs={'class':['section-headline', 'story', 'story headline']}): + url_list = [] - if div['class'] == 'section-headline': - key = string.capwords(feed_title(div)) - if self.excludeSectionKeywords: - excluded = re.compile('|'.join(self.excludeSectionKeywords)) - if excluded.search(key): - self.log("Skipping section %s" % key) - continue - articles[key] = [] - ans.append(key) - - elif div['class'] in ['story', 'story headline'] : - a = div.find('a', href=True) - if not a: - continue - url = re.sub(r'\?.*', '', a['href']) - url += '?pagewanted=all' - - title = self.massageNCXText(self.tag_to_string(a, use_alt=True).strip()) - - description = '' - pubdate = strftime('%a, %d %b') - summary = div.find(True, attrs={'class':'summary'}) - if summary: - description = self.massageNCXText(self.tag_to_string(summary, use_alt=False)) - - author = '' - authorAttribution = div.find(True, attrs={'class':'storyheadline-author'}) + def handle_article(div): + a = div.find('a', href=True) + if not a: + return + url = re.sub(r'\?.*', '', a['href']) + if not url.startswith("http"): + return + if not url.endswith(".html"): + return + if 'podcast' in url: + return + url += '?pagewanted=all' + if url in url_list: + return + url_list.append(url) + title = self.tag_to_string(a, use_alt=True).strip() + #self.log("Title: %s" % title) + description = '' + pubdate = strftime('%a, %d %b') + summary = div.find(True, attrs={'class':'summary'}) + if summary: + description = self.tag_to_string(summary, use_alt=False) + author = '' + authorAttribution = div.find(True, attrs={'class':'byline'}) + if authorAttribution: + author = self.tag_to_string(authorAttribution, use_alt=False) + else: + authorAttribution = div.find(True, attrs={'class':'byline'}) if authorAttribution: author = self.tag_to_string(authorAttribution, use_alt=False) - else: - authorAttribution = div.find(True, attrs={'class':'byline'}) - if authorAttribution: - author = self.tag_to_string(authorAttribution, use_alt=False) - # Kill commas - Kindle switches to '&' - author = re.sub(',','',author) + feed = key if key is not None else 'Uncategorized' + if not articles.has_key(feed): + articles[feed] = [] + articles[feed].append( + dict(title=title, url=url, date=pubdate, + description=description, author=author, + content='')) - feed = key if key is not None else 'Uncategorized' - if not articles.has_key(feed): - articles[feed] = [] - if not 'podcasts' in url: - articles[feed].append( - dict(title=title, url=url, date=pubdate, - description=description, author=author, - content='')) - ans = self.sort_index_by(ans, {'The Front Page':-1, - 'Dining In, Dining Out':1, - 'Obituaries':2}) + + + # Find each instance of class="section-headline", class="story", class="story headline" + for div in soup.findAll(True, + attrs={'class':['section-headline', 'story', 'story headline','sectionHeader','headlinesOnly multiline flush']}): + + if div['class'] in ['section-headline','sectionHeader']: + key = string.capwords(feed_title(div)) + articles[key] = [] + ans.append(key) + #self.log('Section: %s' % key) + + elif div['class'] in ['story', 'story headline'] : + handle_article(div) + elif div['class'] == 'headlinesOnly multiline flush': + for lidiv in div.findAll('li'): + handle_article(lidiv) + +# ans = self.sort_index_by(ans, {'The Front Page':-1, +# 'Dining In, Dining Out':1, +# 'Obituaries':2}) ans = [(key, articles[key]) for key in ans if articles.has_key(key)] - self.dump_ans(ans) + return ans - def skip_ad_pages(self, soup): - # Skip ad pages served before actual article - skip_tag = soup.find(True, {'name':'skip'}) - if skip_tag is not None: - self.log.warn("Found forwarding link: %s" % skip_tag.parent['href']) - url = 'http://www.nytimes.com' + re.sub(r'\?.*', '', skip_tag.parent['href']) - url += '?pagewanted=all' - self.log.warn("Skipping ad to article at '%s'" % url) - return self.index_to_soup(url, raw=True) - def preprocess_html(self, soup): - return self.strip_anchors(soup) + kicker_tag = soup.find(attrs={'class':'kicker'}) + if kicker_tag: + tagline = self.tag_to_string(kicker_tag) + #self.log("FOUND KICKER %s" % tagline) + if tagline=='Op-Ed Columnist': + img_div = soup.find('div','inlineImage module') + #self.log("Searching for photo") + if img_div: + img_div.extract() + #self.log("Photo deleted") + refresh = soup.find('meta', {'http-equiv':'refresh'}) + if refresh is None: + return soup + content = refresh.get('content').partition('=')[2] + raw = self.browser.open_novisit('http://www.nytimes.com'+content).read() + return BeautifulSoup(raw.decode('cp1252', 'replace')) - def postprocess_html(self,soup, True): - print "\npostprocess_html()\n" - - if self.one_picture_per_article: - # Remove all images after first - largeImg = soup.find(True, {'class':'articleSpanImage'}) - inlineImgs = soup.findAll(True, {'class':'inlineImage module'}) - if largeImg: - for inlineImg in inlineImgs: - inlineImg.extract() - else: - if inlineImgs: - firstImg = inlineImgs[0] - for inlineImg in inlineImgs[1:]: - inlineImg.extract() - # Move firstImg after headline - cgFirst = soup.find(True, {'class':'columnGroup first'}) - if cgFirst: - # Strip all sibling NavigableStrings: noise - navstrings = cgFirst.findAll(text=True, recursive=False) - [ns.extract() for ns in navstrings] - headline_found = False - tag = cgFirst.find(True) - insertLoc = 0 - while True: - insertLoc += 1 - if hasattr(tag,'class') and tag['class'] == 'articleHeadline': - headline_found = True - break - tag = tag.nextSibling - if not tag: - headline_found = False - break - if headline_found: - cgFirst.insert(insertLoc,firstImg) - else: - self.log(">>> No class:'columnGroup first' found <<<") - # Change class="kicker" to

- kicker = soup.find(True, {'class':'kicker'}) - if kicker and kicker.contents and kicker.contents[0]: - h3Tag = Tag(soup, "h3") - h3Tag.insert(0, self.fixChars(self.tag_to_string(kicker, - use_alt=False))) - kicker.replaceWith(h3Tag) - - # Change captions to italic -1 - for caption in soup.findAll(True, {'class':'caption'}) : - if caption and caption.contents[0]: - emTag = Tag(soup, "em") - c = self.fixChars(self.tag_to_string(caption,use_alt=False)).strip() - mp_off = c.find("More Photos") - if mp_off >= 0: - c = c[:mp_off] - emTag.insert(0, c) - #hrTag = Tag(soup, 'hr') - #hrTag['class'] = 'caption_divider' - hrTag = Tag(soup, 'div') - hrTag['class'] = 'divider' - emTag.insert(1, hrTag) - caption.replaceWith(emTag) - - # Change to

- h1 = soup.find('h1') - if h1: - headline = h1.find("nyt_headline") - if headline: - tag = Tag(soup, "h2") - tag['class'] = "headline" - tag.insert(0, self.fixChars(headline.contents[0])) - h1.replaceWith(tag) - else: - # Blog entry - replace headline, remove
tags - headline = soup.find('title') - if headline: - tag = Tag(soup, "h2") - tag['class'] = "headline" - tag.insert(0, self.fixChars(headline.contents[0])) - soup.insert(0, tag) - hrs = soup.findAll('hr') - for hr in hrs: - hr.extract() - - # Change

to

- used in editorial blogs - masthead = soup.find("h1") - if masthead: - # Nuke the href - if masthead.a: - del(masthead.a['href']) - tag = Tag(soup, "h3") - tag.insert(0, self.fixChars(masthead.contents[0])) - masthead.replaceWith(tag) - - # Change to - for subhead in soup.findAll(True, {'class':'bold'}) : - if subhead.contents: - bTag = Tag(soup, "b") - bTag.insert(0, subhead.contents[0]) - subhead.replaceWith(bTag) - - # Synthesize a section header - dsk = soup.find('meta', attrs={'name':'dsk'}) - if dsk and dsk.has_key('content'): - hTag = Tag(soup,'h3') - hTag['class'] = 'section' - hTag.insert(0,NavigableString(dsk['content'])) - articleTag = soup.find(True, attrs={'id':'article'}) - if articleTag: - articleTag.insert(0,hTag) - - # Add class="articleBody" to
so we can format with CSS - divTag = soup.find('div',attrs={'id':'articleBody'}) - if divTag: - divTag['class'] = divTag['id'] - - # Add class="authorId" to
so we can format with CSS - divTag = soup.find('div',attrs={'id':'authorId'}) - if divTag and divTag.contents[0]: - tag = Tag(soup, "p") - tag['class'] = "authorId" - tag.insert(0, self.fixChars(self.tag_to_string(divTag.contents[0], - use_alt=False))) - divTag.replaceWith(tag) - - return soup - - def populate_article_metadata(self,article,soup,first): - ''' - Extract author and description from article, add to article metadata - ''' - def extract_author(soup): - byline = soup.find('meta',attrs={'name':['byl','CLMST']}) - if byline : - author = byline['content'] - else : - # Try for