New recipe for The New Zealand Herald by Krittika Goyal

This commit is contained in:
Kovid Goyal 2010-01-08 10:11:30 -07:00
parent 3840fa47cc
commit fc64d15b09
2 changed files with 129 additions and 6 deletions

View File

@ -0,0 +1,76 @@
import string, re
from calibre import strftime
from calibre.web.feeds.recipes import BasicNewsRecipe
from calibre.ebooks.BeautifulSoup import BeautifulSoup
class NewZealandHerald(BasicNewsRecipe):
    """Recipe that collects articles from New Zealand Herald headline pages.

    ``parse_index`` walks a fixed list of section headline pages and
    ``nz_parse_section`` scrapes the article links out of each one.
    """

    title = 'New Zealand Herald'
    __author__ = 'Krittika Goyal'
    description = 'Daily news'
    timefmt = ' [%d %b, %Y]'

    no_stylesheets = True
    remove_tags_before = dict(name='div', attrs={'class':'contentContainer left eight'})
    remove_tags_after = dict(name='div', attrs={'class':'callToAction'})
    remove_tags = [
        dict(name='iframe'),
        dict(name='div', attrs={'class':['sectionHeader', 'tools','callToAction', 'contentContainer right two nopad relatedColumn']}),
        # Selectors the original author left disabled:
        #dict(name='div', attrs={'id':['shareContainer']}),
        #dict(name='form', attrs={'onsubmit':"return verifySearch(this.w,'Keyword, citation, or #author')"}),
        #dict(name='table', attrs={'cellspacing':'0'}),
    ]

    def preprocess_html(self, soup):
        """Remove the first ``<table>`` found in *soup* (if any) and return *soup*."""
        table = soup.find('table')
        if table is not None:
            table.extract()
        return soup

    # TO GET ARTICLES IN SECTION
    def nz_parse_section(self, url):
        """Return the article dicts listed under the first heading of a section page.

        :param url: address of the section's headline page.
        :return: list of dicts with ``title``, ``url``, ``description`` and
                 ``date`` keys; empty when the expected markup is missing.
        """
        soup = self.index_to_soup(url)
        div = soup.find(attrs={'class':'col-300 categoryList'})
        date = None
        if div is not None:
            date = div.find(attrs={'class':'link-list-heading'})
        if date is None:
            # Without the container/heading there is nothing to scrape.
            # Returning an empty list lets parse_index skip this section
            # instead of aborting the whole download with AttributeError.
            self.log('\t\tNo headline list found at:', url)
            return []

        current_articles = []
        for x in date.findAllNext(attrs={'class':['linkList', 'link-list-heading']}):
            # The next heading starts a new group of links, so stop there.
            if x.get('class') == 'link-list-heading':
                break
            for li in x.findAll('li'):
                a = li.find('a', href=True)
                if a is None:
                    continue
                title = self.tag_to_string(a)
                # Separate name so the ``url`` parameter is not overwritten.
                href = a.get('href', False)
                if not href or not title:
                    continue
                if href.startswith('/'):
                    href = 'http://www.nzherald.co.nz'+href
                self.log('\t\tFound article:', title)
                self.log('\t\t\t', href)
                current_articles.append({'title': title, 'url':href,
                    'description':'', 'date':''})
        return current_articles

    # To GET SECTIONS
    def parse_index(self):
        """Return ``[(section title, articles), ...]`` for every non-empty section."""
        feeds = []
        for title, url in [
                    ('National',
                     'http://www.nzherald.co.nz/nz/news/headlines.cfm?c_id=1'),
                    ('World',
                     'http://www.nzherald.co.nz/world/news/headlines.cfm?c_id=2'),
                    ('Politics',
                     'http://www.nzherald.co.nz/politics/news/headlines.cfm?c_id=280'),
                    ('Crime',
                     'http://www.nzherald.co.nz/crime/news/headlines.cfm?c_id=30'),
                    ('Environment',
                     'http://www.nzherald.co.nz/environment/news/headlines.cfm?c_id=39'),
                    ]:
            articles = self.nz_parse_section(url)
            # Sections that produced no articles are left out of the index.
            if articles:
                feeds.append((title, articles))
        return feeds

View File

@ -18,6 +18,29 @@ class Font(object):
self.color = spec.get('color')
self.family = spec.get('family')
class Column(object):
    """A group of page elements together with their bounding box.

    ``left``/``right``/``top``/``bottom``/``width``/``height`` describe the
    box and are recomputed every time an element is added. ``elements`` is
    kept sorted by each element's ``bottom`` coordinate.
    """

    def __init__(self):
        self.left = self.right = self.top = self.bottom = 0
        self.width = self.height = 0
        self.elements = []

    def add(self, elem):
        """Add *elem* (ignored if already present) and refresh the bounding box.

        *elem* must expose ``left``, ``right``, ``top`` and ``bottom``.
        """
        if elem in self.elements:
            return
        self.elements.append(elem)
        # A key function gives the same stable ordering as the old
        # cmp-based comparator and works where ``cmp`` is unavailable.
        self.elements.sort(key=lambda e: e.bottom)
        # The element with the smallest bottom is not necessarily the one
        # that starts highest, so take the minimum over all elements.
        self.top = min(e.top for e in self.elements)
        self.bottom = self.elements[-1].bottom
        self.left = min(e.left for e in self.elements)
        # ``right`` keeps the original floor of 0.
        self.right = max(0, max(e.right for e in self.elements))
        self.width = self.right - self.left
        self.height = self.bottom - self.top

    def contains(self, elem):
        """Return True if *elem*'s horizontal extent lies inside this column.

        NOTE(review): this is strict containment with no tolerance; confirm
        whether callers want a fuzz factor for elements that bulge slightly.
        """
        return elem.left >= self.left and elem.right <= self.right

    def __iter__(self):
        """Iterate over the elements in order of increasing ``bottom``."""
        return iter(self.elements)
class Element(object):
def __eq__(self, other):
@ -37,7 +60,6 @@ class Image(Element):
self.src = img.get('src')
class Text(Element):
def __init__(self, text, font_map, opts, log, idc):
@ -191,18 +213,43 @@ class Page(object):
for i, x in enumerate(self.elements):
x.idx = i
self.current_region = None
processed = set([])
for x in self.elements:
self.find_elements_in_row_of(x)
if x in processed: continue
elems = set(self.find_elements_in_row_of(x))
columns = self.sort_into_columns(x, elems)
processed.update(elems)
columns
def sort_into_columns(self, elem, neighbors):
    """Distribute *elem* and its *neighbors* over columns, ordered left to right.

    The first column is seeded with *elem*. Each neighbor joins the first
    existing column whose ``contains`` accepts it; otherwise it starts a
    new column.

    :param elem: the element whose row is being processed.
    :param neighbors: iterable of the other elements in that row.
    :return: list of ``Column`` objects sorted by their ``left`` edge.
    """
    columns = [Column()]
    columns[0].add(elem)
    for x in neighbors:
        for c in columns:
            if c.contains(x):
                c.add(x)
                break
        else:
            # No existing column accepted x.
            columns.append(Column())
            columns[-1].add(x)
    # A key function gives the same ordering as the former cmp-based sort.
    columns.sort(key=lambda c: c.left)
    return columns
def find_elements_in_row_of(self, x):
    """Yield nearby elements that share a row with *x* but sit beside it.

    Only elements within a window around ``x.idx`` in ``self.elements`` are
    examined (up to 15 before and 14 after). An element ``y`` is yielded when:

    * it is not ``x`` itself,
    * its vertical span overlaps the fuzzy band around ``x.top`` by more
      than half of ``self.average_text_height``, and
    * its horizontal span does not overlap that of ``x``.

    The superseded loop body, which lacked the identity and horizontal
    checks, has been dropped: kept next to the newer check it would yield
    ``x`` itself and yield matching elements twice.
    """
    # NOTE(review): the upper bound uses YFUZZ*(1+average_text_height) while
    # the lower bound uses YFUZZ*average_text_height; kept as found, confirm
    # that the asymmetry is intended.
    interval = Interval(x.top - self.YFUZZ * self.average_text_height,
            x.top + self.YFUZZ*(1+self.average_text_height))
    h_interval = Interval(x.left, x.right)
    m = max(0, x.idx-15)
    min_overlap = 0.5*self.average_text_height
    for y in self.elements[m:x.idx+15]:
        if y is x:
            continue
        y_interval = Interval(y.top, y.bottom)
        y_h_interval = Interval(y.left, y.right)
        if interval.intersection(y_interval).width > min_overlap and \
                y_h_interval.intersection(h_interval).width <= 0:
            yield y
class PDFDocument(object):