calibre/recipes/new_yorker.recipe
2013-04-07 09:11:05 +05:30

281 lines
11 KiB
Python
Raw Blame History

This file contains ambiguous Unicode characters

This file contains Unicode characters that might be confused with other characters. If you think that this is intentional, you can safely ignore this warning. Use the Escape button to reveal them.

#!/usr/bin/env python
# -*- coding: utf-8 -*-
__license__ = 'GPL v3'
'''
www.canada.com
'''
import re
from calibre.web.feeds.news import BasicNewsRecipe
from calibre.ebooks.BeautifulSoup import BeautifulStoneSoup
class NewYorker(BasicNewsRecipe):
title = u'New Yorker Magazine'
newyorker_prefix = 'http://m.newyorker.com'
description = u'Content from the New Yorker website'
fp_tag = 'CAN_TC'
masthead_url = 'http://www.newyorker.com/images/elements/print/newyorker_printlogo.gif'
compress_news_images = True
compress_news_images_auto_size = 8
scale_news_images_to_device = False
scale_news_images = (768, 1024)
url_list = []
language = 'en'
__author__ = 'Nick Redding'
no_stylesheets = True
timefmt = ' [%b %d]'
encoding = 'utf-8'
extra_css = '''
.byline { font-size:xx-small; font-weight: bold;}
h3 { margin-bottom: 6px; }
.caption { font-size: xx-small; font-style: italic; font-weight: normal; }
'''
keep_only_tags = [dict(name='div', attrs={'id':re.compile('pagebody')})]
remove_tags = [{'class':'socialUtils'},{'class':'entry-keywords'}]
def get_cover_url(self):
cover_url = "http://www.newyorker.com/images/covers/1925/1925_02_21_p233.jpg"
soup = self.index_to_soup('http://www.newyorker.com/magazine?intcid=magazine')
cover_item = soup.find('div',attrs={'id':'media-count-1'})
if cover_item:
cover_url = 'http://www.newyorker.com' + cover_item.div.img['src'].strip()
return cover_url
def fixChars(self,string):
# Replace lsquo (\x91)
fixed = re.sub("\x91","",string)
# Replace rsquo (\x92)
fixed = re.sub("\x92","",fixed)
# Replace ldquo (\x93)
fixed = re.sub("\x93","",fixed)
# Replace rdquo (\x94)
fixed = re.sub("\x94","",fixed)
# Replace ndash (\x96)
fixed = re.sub("\x96","",fixed)
# Replace mdash (\x97)
fixed = re.sub("\x97","",fixed)
fixed = re.sub("’","",fixed)
return fixed
def massageNCXText(self, description):
# Kindle TOC descriptions won't render certain characters
if description:
massaged = unicode(BeautifulStoneSoup(description, convertEntities=BeautifulStoneSoup.HTML_ENTITIES))
# Replace '&' with '&'
massaged = re.sub("&","&", massaged)
return self.fixChars(massaged)
else:
return description
def populate_article_metadata(self, article, soup, first):
if first:
picdiv = soup.find('body').find('img')
if picdiv is not None:
self.add_toc_thumbnail(article,re.sub(r'links\\link\d+\\','',picdiv['src']))
xtitle = article.text_summary.strip()
if len(xtitle) == 0:
desc = soup.find('meta',attrs={'property':'og:description'})
if desc is not None:
article.summary = article.text_summary = desc['content']
shortparagraph = ""
## try:
if len(article.text_summary.strip()) == 0:
articlebodies = soup.findAll('div',attrs={'class':'entry-content'})
if articlebodies:
for articlebody in articlebodies:
if articlebody:
paras = articlebody.findAll('p')
for p in paras:
refparagraph = self.massageNCXText(self.tag_to_string(p,use_alt=False)).strip()
#account for blank paragraphs and short paragraphs by appending them to longer ones
if len(refparagraph) > 0:
if len(refparagraph) > 70: #approximately one line of text
newpara = shortparagraph + refparagraph
article.summary = article.text_summary = newpara.strip()
return
else:
shortparagraph = refparagraph + " "
if shortparagraph.strip().find(" ") == -1 and not shortparagraph.strip().endswith(":"):
shortparagraph = shortparagraph + "- "
else:
article.summary = article.text_summary = self.massageNCXText(article.text_summary)
## except:
## self.log("Error creating article descriptions")
## return
def strip_anchors(self,soup):
paras = soup.findAll(True)
for para in paras:
aTags = para.findAll('a')
for a in aTags:
if a.img is None:
a.replaceWith(a.renderContents().decode('cp1252','replace'))
return soup
def preprocess_html(self,soup):
dateline = soup.find('div','published')
byline = soup.find('div','byline')
title = soup.find('h1','entry-title')
if title is None:
return self.strip_anchors(soup)
if byline is None:
title.append(dateline)
return self.strip_anchors(soup)
byline.append(dateline)
return self.strip_anchors(soup)
def load_global_nav(self,soup):
seclist = []
ul = soup.find('ul',attrs={'id':re.compile('global-nav-menu')})
if ul is not None:
for li in ul.findAll('li'):
if li.a is not None:
securl = li.a['href']
if securl != '/' and securl != '/magazine' and securl.startswith('/'):
seclist.append((self.tag_to_string(li.a),self.newyorker_prefix+securl))
return seclist
def exclude_url(self,url):
if url in self.url_list:
return True
if not url.endswith('html'):
return True
if 'goings-on-about-town-app' in url:
return True
if 'something-to-be-thankful-for' in url:
return True
if '/shouts/' in url:
return True
if 'out-loud' in url:
return True
if '/rss/' in url:
return True
if '/video-' in url:
return True
self.url_list.append(url)
return False
def load_index_page(self,soup):
article_list = []
for div in soup.findAll('div',attrs={'class':re.compile('^rotator')}):
h2 = div.h2
if h2 is not None:
a = h2.a
if a is not None:
url = a['href']
if not self.exclude_url(url):
if url.startswith('/'):
url = self.newyorker_prefix+url
byline = h2.span
if byline is not None:
author = self.tag_to_string(byline)
if author.startswith('by '):
author.replace('by ','')
byline.extract()
else:
author = ''
if h2.br is not None:
h2.br.replaceWith(' ')
title = self.tag_to_string(h2)
desc = div.find(attrs={'class':['rotator-ad-body','feature-blurb-text']})
if desc is not None:
description = self.tag_to_string(desc)
else:
description = ''
article_list.append(dict(title=title,url=url,date='',description=description,author=author,content=''))
ul = div.find('ul','feature-blurb-links')
if ul is not None:
for li in ul.findAll('li'):
a = li.a
if a is not None:
url = a['href']
if not self.exclude_url(url):
if url.startswith('/'):
url = self.newyorker_prefix+url
if a.br is not None:
a.br.replaceWith(' ')
title = '>>'+self.tag_to_string(a)
article_list.append(dict(title=title,url=url,date='',description='',author='',content=''))
for h3 in soup.findAll('h3','header'):
a = h3.a
if a is not None:
url = a['href']
if not self.exclude_url(url):
if url.startswith('/'):
url = self.newyorker_prefix+url
byline = h3.span
if byline is not None:
author = self.tag_to_string(byline)
if author.startswith('by '):
author = author.replace('by ','')
byline.extract()
else:
author = ''
if h3.br is not None:
h3.br.replaceWith(' ')
title = self.tag_to_string(h3).strip()
article_list.append(dict(title=title,url=url,date='',description='',author=author,content=''))
return article_list
def load_global_section(self,securl):
article_list = []
try:
soup = self.index_to_soup(securl)
except:
return article_list
if '/blogs/' not in securl:
return self.load_index_page(soup)
for div in soup.findAll('div',attrs={'id':re.compile('^entry')}):
h3 = div.h3
if h3 is not None:
a = h3.a
if a is not None:
url = a['href']
if not self.exclude_url(url):
if url.startswith('/'):
url = self.newyorker_prefix+url
if h3.br is not None:
h3.br.replaceWith(' ')
title = self.tag_to_string(h3)
article_list.append(dict(title=title,url=url,date='',description='',author='',content=''))
return article_list
def filter_ans(self, ans) :
total_article_count = 0
idx = 0
idx_max = len(ans)-1
while idx <= idx_max:
if True: #self.verbose
self.log("Section %s: %d articles" % (ans[idx][0], len(ans[idx][1])) )
for article in ans[idx][1]:
total_article_count += 1
if True: #self.verbose
self.log("\t%-40.40s... \t%-60.60s..." % (article['title'].encode('cp1252','replace'),
article['url'].replace('http://m.newyorker.com','').encode('cp1252','replace')))
idx = idx+1
self.log( "Queued %d articles" % total_article_count )
return ans
def parse_index(self):
ans = []
try:
soup = self.index_to_soup(self.newyorker_prefix)
except:
return ans
seclist = self.load_global_nav(soup)
ans.append(('Front Page',self.load_index_page(soup)))
for (sectitle,securl) in seclist:
ans.append((sectitle,self.load_global_section(securl)))
return self.filter_ans(ans)