mirror of
https://github.com/kovidgoyal/calibre.git
synced 2025-07-09 03:04:10 -04:00
New recipe for The New Zealand Herald by Krittika Goyal
This commit is contained in:
parent
3840fa47cc
commit
fc64d15b09
76
resources/recipes/nzherald.recipe
Normal file
76
resources/recipes/nzherald.recipe
Normal file
@ -0,0 +1,76 @@
|
||||
import string, re
|
||||
from calibre import strftime
|
||||
from calibre.web.feeds.recipes import BasicNewsRecipe
|
||||
from calibre.ebooks.BeautifulSoup import BeautifulSoup
|
||||
|
||||
class NewZealandHerald(BasicNewsRecipe):
    '''Download recipe for the daily news sections of the New Zealand Herald.'''

    title          = 'New Zealand Herald'
    __author__     = 'Krittika Goyal'
    description    = 'Daily news'
    timefmt        = ' [%d %b, %Y]'
    no_stylesheets = True

    # Keep only the main article container; drop surrounding site chrome.
    remove_tags_before = {'name': 'div', 'attrs': {'class': 'contentContainer left eight'}}
    remove_tags_after  = {'name': 'div', 'attrs': {'class': 'callToAction'}}
    remove_tags = [
       {'name': 'iframe'},
       {'name': 'div', 'attrs': {'class': ['sectionHeader', 'tools', 'callToAction', 'contentContainer right two nopad relatedColumn']}},
    ]

    def preprocess_html(self, soup):
        '''Remove the first <table> from the article page, if any.'''
        layout_table = soup.find('table')
        if layout_table is not None:
            layout_table.extract()
        return soup

    def nz_parse_section(self, url):
        '''Scrape one section listing page and return its article dicts.'''
        soup = self.index_to_soup(url)
        listing = soup.find(attrs={'class': 'col-300 categoryList'})
        heading = listing.find(attrs={'class': 'link-list-heading'})

        articles = []
        for block in heading.findAllNext(attrs={'class': ['linkList', 'link-list-heading']}):
            # A subsequent heading marks the start of the next listing; stop there.
            if block.get('class') == 'link-list-heading':
                break
            for item in block.findAll('li'):
                link = item.find('a', href=True)
                if link is None:
                    continue
                text = self.tag_to_string(link)
                href = link.get('href', False)
                if not href or not text:
                    continue
                if href.startswith('/'):
                    href = 'http://www.nzherald.co.nz' + href
                self.log('\t\tFound article:', text)
                self.log('\t\t\t', href)
                articles.append({'title': text, 'url': href,
                                 'description': '', 'date': ''})
        return articles

    def parse_index(self):
        '''Build the (section title, articles) feed list, skipping empty sections.'''
        sections = [
            ('National',
             'http://www.nzherald.co.nz/nz/news/headlines.cfm?c_id=1'),
            ('World',
             'http://www.nzherald.co.nz/world/news/headlines.cfm?c_id=2'),
            ('Politics',
             'http://www.nzherald.co.nz/politics/news/headlines.cfm?c_id=280'),
            ('Crime',
             'http://www.nzherald.co.nz/crime/news/headlines.cfm?c_id=30'),
            ('Environment',
             'http://www.nzherald.co.nz/environment/news/headlines.cfm?c_id=39'),
        ]
        feeds = []
        for name, section_url in sections:
            found = self.nz_parse_section(section_url)
            if found:
                feeds.append((name, found))
        return feeds
|
||||
|
@ -18,6 +18,29 @@ class Font(object):
|
||||
self.color = spec.get('color')
|
||||
self.family = spec.get('family')
|
||||
|
||||
class Column(object):
    '''A vertical group of page elements, kept sorted by their bottom edge.

    Maintains the bounding box (``left``/``right``/``top``/``bottom`` and the
    derived ``width``/``height``) of all elements added so far.  Elements are
    expected to expose numeric ``top``, ``bottom``, ``left`` and ``right``
    attributes.

    NOTE(review): callers elsewhere invoke ``Column.contains`` — presumably it
    is defined outside this hunk; confirm against the full file.
    '''

    def __init__(self):
        self.left = self.right = self.top = self.bottom = 0
        self.width = self.height = 0
        self.elements = []

    def add(self, elem):
        '''Insert *elem* (duplicates are ignored) and refresh the bounding box.'''
        if elem in self.elements:
            return
        self.elements.append(elem)
        # key= replaces the cmp= form, which is Python-2-only (removed in
        # Python 3); the resulting order is identical.
        self.elements.sort(key=lambda e: e.bottom)
        self.top = self.elements[0].top
        self.bottom = self.elements[-1].bottom
        # min/max over the members replaces the sys.maxint accumulator loop,
        # removing the Python-2-only sys.maxint dependency.  self.elements is
        # guaranteed non-empty here, so min/max never see an empty sequence.
        self.left = min(e.left for e in self.elements)
        self.right = max(e.right for e in self.elements)
        self.width, self.height = self.right-self.left, self.bottom-self.top

    def __iter__(self):
        for x in self.elements:
            yield x
|
||||
|
||||
class Element(object):
|
||||
|
||||
def __eq__(self, other):
|
||||
@ -37,7 +60,6 @@ class Image(Element):
|
||||
self.src = img.get('src')
|
||||
|
||||
|
||||
|
||||
class Text(Element):
|
||||
|
||||
def __init__(self, text, font_map, opts, log, idc):
|
||||
@ -191,18 +213,43 @@ class Page(object):
|
||||
for i, x in enumerate(self.elements):
|
||||
x.idx = i
|
||||
self.current_region = None
|
||||
processed = set([])
|
||||
for x in self.elements:
|
||||
self.find_elements_in_row_of(x)
|
||||
if x in processed: continue
|
||||
elems = set(self.find_elements_in_row_of(x))
|
||||
columns = self.sort_into_columns(x, elems)
|
||||
processed.update(elems)
|
||||
columns
|
||||
|
||||
def sort_into_columns(self, elem, neighbors):
|
||||
columns = [Column()]
|
||||
columns[0].add(elem)
|
||||
for x in neighbors:
|
||||
added = False
|
||||
for c in columns:
|
||||
if c.contains(x):
|
||||
c.add(x)
|
||||
added = True
|
||||
break
|
||||
if not added:
|
||||
columns.append(Column())
|
||||
columns[-1].add(x)
|
||||
columns.sort(cmp=lambda x,y:cmp(x.left, y.left))
|
||||
return columns
|
||||
|
||||
def find_elements_in_row_of(self, x):
|
||||
interval = Interval(x.top - self.YFUZZ * self.average_text_height,
|
||||
x.top + self.YFUZZ*(1+self.average_text_height))
|
||||
h_interval = Interval(x.left, x.right)
|
||||
m = max(0, x.idx-15)
|
||||
for y in self.elements[m:x.idx+15]:
|
||||
y_interval = Interval(y.top, y.bottom)
|
||||
if interval.intersection(y_interval).width > \
|
||||
0.5*self.average_text_height:
|
||||
yield y
|
||||
if y is not x:
|
||||
y_interval = Interval(y.top, y.bottom)
|
||||
x_interval = Interval(y.left, y.right)
|
||||
if interval.intersection(y_interval).width > \
|
||||
0.5*self.average_text_height and \
|
||||
x_interval.intersection(h_interval).width <= 0:
|
||||
yield y
|
||||
|
||||
|
||||
class PDFDocument(object):
|
||||
|
Loading…
x
Reference in New Issue
Block a user