Improved recipe for Wired Magazine

This commit is contained in:
Kovid Goyal 2010-01-23 10:58:00 -07:00
parent 26a9ec899f
commit 6e1fc23c47
2 changed files with 116 additions and 31 deletions

View File

@ -1,44 +1,105 @@
#!/usr/bin/env python
__license__ = 'GPL v3'
__docformat__ = 'restructuredtext en'
__copyright__ = '2010, Darko Miletic <darko.miletic at gmail.com>'
'''
www.wired.com
'''
import re
from calibre import strftime
from calibre.web.feeds.news import BasicNewsRecipe
class Wired(BasicNewsRecipe):
title = 'Wired.com'
__author__ = 'Kovid Goyal'
description = 'Technology news'
timefmt = ' [%Y%b%d %H%M]'
language = 'en'
title = 'Wired Magazine'
__author__ = 'Darko Miletic'
description = 'Gaming news'
publisher = 'Conde Nast Digital'
category = 'news, games, IT, gadgets'
oldest_article = 32
max_articles_per_feed = 100
no_stylesheets = True
encoding = 'utf-8'
use_embedded_content = False
language = 'en'
extra_css = ' body{font-family: sans-serif} .entryDescription li {display: inline; list-style-type: none} '
index = 'http://www.wired.com/magazine/'
remove_tags_before = dict(name='div', id='content')
remove_tags = [dict(id=['social_tools', 'outerWrapper', 'sidebar',
'footer', 'advertisement', 'blog_subscription_unit',
'brightcove_component']),
{'class':'entryActions'},
dict(name=['noscript', 'script'])]
preprocess_regexps = [(re.compile(r'<meta name="Title".*<title>', re.DOTALL|re.IGNORECASE),lambda match: '<title>')]
conversion_options = {
'comment' : description
, 'tags' : category
, 'publisher' : publisher
, 'language' : language
}
feeds = [
('Top News', 'http://feeds.wired.com/wired/index'),
('Culture', 'http://feeds.wired.com/wired/culture'),
('Software', 'http://feeds.wired.com/wired/software'),
('Mac', 'http://feeds.feedburner.com/cultofmac/bFow'),
('Gadgets', 'http://feeds.wired.com/wired/gadgets'),
('Cars', 'http://feeds.wired.com/wired/cars'),
('Entertainment', 'http://feeds.wired.com/wired/entertainment'),
('Gaming', 'http://feeds.wired.com/wired/gaming'),
('Science', 'http://feeds.wired.com/wired/science'),
('Med Tech', 'http://feeds.wired.com/wired/medtech'),
('Politics', 'http://feeds.wired.com/wired/politics'),
('Tech Biz', 'http://feeds.wired.com/wired/techbiz'),
('Commentary', 'http://feeds.wired.com/wired/commentary'),
keep_only_tags = [dict(name='div', attrs={'class':'post'})]
remove_tags_after = dict(name='div', attrs={'class':'tweetmeme_button'})
remove_tags = [
dict(name=['object','embed','iframe','link'])
,dict(name='div', attrs={'class':['podcast_storyboard','tweetmeme_button']})
]
#feeds = [(u'Articles' , u'http://www.wired.com/magazine/feed/' )]
def parse_index(self):
    """Build the feed list by scraping the magazine index page.

    Returns a list of ``(feed title, articles)`` tuples. Each article is a
    dict with the keys ``title``, ``date``, ``url`` and ``description``.
    The first feed (when present) holds the featured articles found in the
    ``my-glider`` div; one further feed is added for every known department
    div found inside ``magazine-departments``.
    """
    totalfeeds = []
    soup = self.index_to_soup(self.index)
    # The index carries no per-article dates, so every entry is stamped
    # with the same formatted download time.
    date = strftime(self.timefmt)

    features = soup.find('div', attrs={'id':'my-glider'})
    if features:
        farticles = []
        for item in features.findAll('div', attrs={'class':'section'}):
            divurl = item.find('div', attrs={'class':'feature-header'})
            link = divurl.a if divurl is not None else None
            href = link.get('href') if link is not None else None
            if href is None:
                # A section without a usable link cannot become an
                # article; skip it instead of aborting the whole index.
                continue
            divdesc = item.find('div', attrs={'class':'feature-text'})
            description = ''
            if divdesc is not None:
                description = self.tag_to_string(divdesc)
            farticles.append({
                'title'      : self.tag_to_string(link),
                'date'       : date,
                # Featured links are site-relative, hence the host prefix.
                'url'        : 'http://www.wired.com' + href,
                'description': description,
            })
        totalfeeds.append(('Featured Articles', farticles))

    # department feeds
    departments = ['rants', 'start', 'test', 'play', 'found']
    dept = soup.find('div', attrs={'id':'magazine-departments'})
    if dept:
        for ditem in departments:
            department = dept.find('div', attrs={'id':'department-' + ditem})
            if not department:
                continue
            darticles = []
            for item in department.findAll('div'):
                feed_link = item.find('a')
                href = feed_link.get('href') if feed_link is not None else None
                if href is None:
                    continue
                darticles.append({
                    'title'      : self.tag_to_string(feed_link),
                    'date'       : date,
                    # Department links are used exactly as found.
                    'url'        : href,
                    'description': '',
                })
            totalfeeds.append((ditem.capitalize(), darticles))
    return totalfeeds
def get_cover_url(self):
    """Return the absolute URL of the issue cover, or None if not found.

    The cover is the image inside the first link of the ``spread-image``
    div on the index page. Any missing piece of that chain (div, link,
    image or ``src`` attribute) yields None instead of raising.
    """
    soup = self.index_to_soup(self.index)
    cover_item = soup.find('div', attrs={'class':'spread-image'})
    link = cover_item.a if cover_item is not None else None
    img = link.img if link is not None else None
    src = img.get('src') if img is not None else None
    if not src:
        return None
    # The image path is site-relative, hence the host prefix.
    return 'http://www.wired.com' + src
def print_version(self, url):
    """Return the single-page variant of an article URL.

    Appends ``/all/1`` to *url* after dropping any trailing slashes, so
    the result never contains a doubled slash before ``all``.
    """
    # The earlier ``/print/`` rewrite was superseded by this form; leaving
    # it in place would make this return unreachable.
    return url.rstrip('/') + '/all/1'
def preprocess_html(self, soup):
    """Strip every inline ``style`` attribute from *soup* and return it.

    The tree is modified in place; the same object that was passed in is
    handed back.
    """
    styled_tags = soup.findAll(style=True)
    for tag in styled_tags:
        del tag['style']
    return soup

View File

@ -28,6 +28,7 @@ class Column(object):
self.left = self.right = self.top = self.bottom = 0
self.width = self.height = 0
self.elements = []
self.average_line_separation = 0
def add(self, elem):
if elem in self.elements: return
@ -49,8 +50,17 @@ class Column(object):
return elem.left > self.left - self.HFUZZ*self.width and \
elem.right < self.right + self.HFUZZ*self.width
def collect_stats(self):
    """Compute the mean vertical gap between consecutive elements.

    The gap between two neighbours is the ``top`` of the later element
    minus the ``bottom`` of the earlier one, taken in the order the
    elements are stored. The mean is written to
    ``self.average_line_separation``; with fewer than two elements there
    are no gaps and the attribute is left untouched.
    """
    if len(self.elements) > 1:
        gaps = [below.top - above.bottom
                for above, below in zip(self.elements, self.elements[1:])]
        # float() keeps the mean exact under Python 2 integer division.
        self.average_line_separation = sum(gaps) / float(len(gaps))
class Element(object):
    """Base class for page elements, compared by their ``id`` attribute.

    ``id`` is not assigned here; subclasses are expected to set it in
    their own ``__init__``.
    """

    def __init__(self):
        # Starts out False for every new element.
        self.starts_paragraph = False

    def __eq__(self, other):
        """Return True when *other* carries the same ``id`` as this element.

        Objects without an ``id`` attribute are not comparable; returning
        NotImplemented lets Python fall back to its default comparison
        instead of raising AttributeError.
        """
        if not hasattr(other, 'id'):
            return NotImplemented
        return self.id == other.id
@ -60,6 +70,7 @@ class Element(object):
class Image(Element):
def __init__(self, img, opts, log, idc):
Element.__init__(self)
self.opts, self.log = opts, log
self.id = idc.next()
self.top, self.left, self.width, self.height, self.iwidth, self.iheight = \
@ -71,6 +82,7 @@ class Image(Element):
class Text(Element):
def __init__(self, text, font_map, opts, log, idc):
Element.__init__(self)
self.id = idc.next()
self.opts, self.log = opts, log
self.font_map = font_map
@ -174,6 +186,12 @@ class Region(object):
def is_empty(self):
    """Report whether this region currently holds no elements."""
    element_count = len(self.elements)
    return element_count == 0
def collect_stats(self):
    """Refresh column statistics and average them for the region.

    Calls ``collect_stats()`` on every column, then stores the mean of the
    columns' ``average_line_separation`` values in
    ``self.average_line_separation``. A region without columns gets 0
    rather than raising ZeroDivisionError.
    """
    for column in self.columns:
        column.collect_stats()
    column_count = len(self.columns)
    if column_count == 0:
        self.average_line_separation = 0
        return
    total = sum([x.average_line_separation for x in self.columns])
    self.average_line_separation = total / float(column_count)
class Page(object):
@ -298,6 +316,11 @@ class Page(object):
x_interval.intersection(h_interval).width <= 0:
yield y
def second_pass(self):
    """Locate paragraph boundaries in each column.

    At present this step only asks every region of the page to collect
    its statistics, in the order the regions are stored.
    """
    page_regions = self.regions
    for current_region in page_regions:
        current_region.collect_stats()
class PDFDocument(object):
@ -327,6 +350,7 @@ class PDFDocument(object):
for page in self.pages:
page.document_font_stats = self.font_size_stats
page.first_pass()
page.second_pass()
def collect_font_statistics(self):
self.font_size_stats = {}