New recipe for Neowin by Darko Miletic

This commit is contained in:
Kovid Goyal 2010-01-23 14:55:28 -07:00
parent 44129f0d1f
commit f1d8104450
3 changed files with 104 additions and 46 deletions

Binary file not shown.

After

Width:  |  Height:  |  Size: 1.0 KiB

View File

@ -0,0 +1,40 @@
from calibre.web.feeds.news import BasicNewsRecipe
class Neowin(BasicNewsRecipe):
    """News recipe that downloads the Neowin.net RSS feeds.

    Only the ``div#article`` element of each page is kept; everything after
    ``div#tag-bar`` is dropped, and the tag bar itself is removed together
    with ``base``, ``object``, ``link`` and ``iframe`` tags.
    """

    # --- metadata -----------------------------------------------------
    title = u'Neowin.net'
    oldest_article = 5
    language = 'en'
    description = 'News from IT'
    publisher = 'Neowin'
    category = 'news, IT, Microsoft, Apple, hardware, software, games'
    __author__ = 'Darko Miletic'

    # --- download settings --------------------------------------------
    max_articles_per_feed = 100
    no_stylesheets = True
    encoding = 'utf8'

    # Values handed to the conversion step; they mirror the metadata above.
    conversion_options = {
        'tags': category,
        'language': language,
        'comments': description,
        'publisher': publisher,
    }

    # --- page clean-up ------------------------------------------------
    keep_only_tags = [{'name': 'div', 'attrs': {'id': 'article'}}]
    remove_tags_after = {'name': 'div', 'attrs': {'id': 'tag-bar'}}
    remove_tags = [
        {'name': ['base', 'object', 'link', 'iframe']},
        {'name': 'div', 'attrs': {'id': 'tag-bar'}},
    ]

    # --- feeds: (section title, RSS url) ------------------------------
    feeds = [
        (u'Software', u'http://www.neowin.net/news/rss/software'),
        (u'Gaming', u'http://www.neowin.net/news/rss/gaming'),
        (u'Microsoft', u'http://www.neowin.net/news/rss/microsoft'),
        (u'Apple', u'http://www.neowin.net/news/rss/apple'),
        (u'Editorial', u'http://www.neowin.net/news/rss/editorial'),
    ]

    def image_url_processor(cls, baseurl, url):
        """Return the image *url* untouched; *baseurl* is ignored."""
        return url

View File

@ -18,48 +18,11 @@ class Font(object):
self.color = spec.get('color') self.color = spec.get('color')
self.family = spec.get('family') self.family = spec.get('family')
class Column(object):
    """A group of elements kept sorted by their ``bottom`` coordinate.

    The column records the horizontal extent of its elements (``left``,
    ``right``, ``width``), a vertical extent taken from the first and last
    elements in sort order (``top``, ``bottom``, ``height``) and, once
    :meth:`collect_stats` has run, the mean gap between consecutive
    elements (``average_line_separation``).
    """

    # A column contains an element if the element bulges out to
    # the left or the right by at most HFUZZ*col width.
    HFUZZ = 0.2

    def __init__(self):
        self.left = self.right = self.top = self.bottom = 0
        self.width = self.height = 0
        self.elements = []
        self.average_line_separation = 0

    def add(self, elem):
        """Add *elem* to the column and recompute the column bounds.

        Adding an element that is already present is a no-op.
        """
        if elem in self.elements:
            return
        self.elements.append(elem)
        # key= gives the same (stable) ordering as the old Python 2-only
        # cmp= form and works on every Python version.
        self.elements.sort(key=lambda x: x.bottom)
        # top comes from the element with the smallest bottom, bottom from
        # the element with the largest bottom.
        self.top = self.elements[0].top
        self.bottom = self.elements[-1].bottom
        self.left = min(x.left for x in self.elements)
        # As before, the right edge is never allowed to fall below 0.
        self.right = max(0, max(x.right for x in self.elements))
        self.width = self.right - self.left
        self.height = self.bottom - self.top

    def __iter__(self):
        """Iterate over the elements in sorted (by ``bottom``) order."""
        return iter(self.elements)

    def contains(self, elem):
        """Return True if *elem* lies horizontally within the column.

        The element may stick out past either edge by up to
        ``HFUZZ * self.width``.
        """
        slack = self.HFUZZ * self.width
        return (elem.left > self.left - slack and
                elem.right < self.right + slack)

    def collect_stats(self):
        """Compute ``average_line_separation`` for the column.

        The value is the mean of ``next.top - current.bottom`` over each
        pair of consecutive elements. It is left unchanged when the column
        holds fewer than two elements.
        """
        if len(self.elements) > 1:
            # The original called len() with two arguments here, which
            # raised TypeError; pair each element with its successor.
            gaps = [nxt.top - cur.bottom for cur, nxt in
                    zip(self.elements, self.elements[1:])]
            self.average_line_separation = sum(gaps)/len(gaps)
class Element(object): class Element(object):
def __init__(self): def __init__(self):
self.starts_paragraph = False self.starts_block = None
self.block_style = None
def __eq__(self, other): def __eq__(self, other):
return self.id == other.id return self.id == other.id
@ -152,6 +115,61 @@ class Interval(object):
def __hash__(self): def __hash__(self):
return hash('(%f,%f)'%self.left, self.right) return hash('(%f,%f)'%self.left, self.right)
class Column(object):
    """A group of elements kept sorted by their ``bottom`` coordinate.

    The column records the horizontal extent of its elements (``left``,
    ``right``, ``width``), a vertical extent taken from the first and last
    elements in sort order (``top``, ``bottom``, ``height``) and, once
    :meth:`collect_stats` has run, the mean gap between consecutive
    elements plus per-element layout statistics.
    """

    # A column contains an element if the element bulges out to
    # the left or the right by at most HFUZZ*col width.
    HFUZZ = 0.2

    def __init__(self):
        self.left = self.right = self.top = self.bottom = 0
        self.width = self.height = 0
        self.elements = []
        self.average_line_separation = 0

    def add(self, elem):
        """Add *elem* to the column and recompute the column bounds.

        Adding an element that is already present is a no-op.
        """
        if elem in self.elements:
            return
        self.elements.append(elem)
        # key= gives the same (stable) ordering as the old Python 2-only
        # cmp= form and works on every Python version.
        self.elements.sort(key=lambda x: x.bottom)
        # top comes from the element with the smallest bottom, bottom from
        # the element with the largest bottom.
        self.top = self.elements[0].top
        self.bottom = self.elements[-1].bottom
        self.left = min(x.left for x in self.elements)
        # As before, the right edge is never allowed to fall below 0.
        self.right = max(0, max(x.right for x in self.elements))
        self.width = self.right - self.left
        self.height = self.bottom - self.top

    def __iter__(self):
        """Iterate over the elements in sorted (by ``bottom``) order."""
        return iter(self.elements)

    def contains(self, elem):
        """Return True if *elem* lies horizontally within the column.

        The element may stick out past either edge by up to
        ``HFUZZ * self.width``.
        """
        slack = self.HFUZZ * self.width
        return (elem.left > self.left - slack and
                elem.right < self.right + slack)

    def collect_stats(self):
        """Compute column-wide and per-element layout statistics.

        Sets ``average_line_separation`` to the mean of
        ``next.top - current.bottom`` over consecutive elements (left
        unchanged when there are fewer than two elements). Then, on every
        element, sets:

        * ``indent_fraction`` -- distance from the column's left edge,
          divided by the column width;
        * ``width_fraction`` -- the element's width divided by the column
          width;
        * ``top_gap`` -- previous element's ``bottom`` minus this element's
          ``top``, or None for the first element.
        """
        if len(self.elements) > 1:
            # The original called len() with two arguments here, which
            # raised TypeError; pair each element with its successor.
            gaps = [nxt.top - cur.bottom for cur, nxt in
                    zip(self.elements, self.elements[1:])]
            self.average_line_separation = sum(gaps)/len(gaps)
        for i, elem in enumerate(self.elements):
            left_margin = elem.left - self.left
            elem.indent_fraction = left_margin/self.width
            elem.width_fraction = elem.width/self.width
            if i == 0:
                elem.top_gap = None
            else:
                # NOTE(review): sign is the opposite of the gaps computed
                # above (previous bottom minus this top) -- kept as-is.
                elem.top_gap = self.elements[i-1].bottom - elem.top

    def previous_element(self, idx):
        """Return the element just before index *idx*, or None for index 0."""
        if idx == 0:
            return None
        return self.elements[idx-1]
class Region(object): class Region(object):
def __init__(self): def __init__(self):
@ -168,6 +186,7 @@ class Region(object):
self.columns[i].add(elem) self.columns[i].add(elem)
def contains(self, columns): def contains(self, columns):
# TODO: handle unbalanced columns
if not self.columns: if not self.columns:
return True return True
if len(columns) != len(self.columns): if len(columns) != len(self.columns):
@ -187,7 +206,7 @@ class Region(object):
return len(self.elements) == 0 return len(self.elements) == 0
def collect_stats(self): def collect_stats(self):
for column in self.column: for column in self.columns:
column.collect_stats() column.collect_stats()
self.average_line_separation = sum([x.average_line_separation for x in self.average_line_separation = sum([x.average_line_separation for x in
self.columns])/float(len(self.columns)) self.columns])/float(len(self.columns))
@ -196,11 +215,10 @@ class Region(object):
for x in self.columns: for x in self.columns:
yield x yield x
def detect_paragraphs(self): def linearize(self):
first = True self.elements = []
for col in self: for x in self.columns:
col.detect_paragraphs(self.average_line_separation, first) self.elements.extend(x)
first = False
class Page(object): class Page(object):
@ -332,7 +350,7 @@ class Page(object):
'Locate paragraph boundaries in each column' 'Locate paragraph boundaries in each column'
for region in self.regions: for region in self.regions:
region.collect_stats() region.collect_stats()
region.detect_paragraphs() region.linearize()
class PDFDocument(object): class PDFDocument(object):