mirror of
https://github.com/kovidgoyal/calibre.git
synced 2025-07-09 03:04:10 -04:00
New recipe for Neowin by Darko Miletic
This commit is contained in:
parent
44129f0d1f
commit
f1d8104450
BIN
resources/images/news/neowin.png
Normal file
BIN
resources/images/news/neowin.png
Normal file
Binary file not shown.
After Width: | Height: | Size: 1.0 KiB |
40
resources/recipes/neowin.recipe
Normal file
40
resources/recipes/neowin.recipe
Normal file
@ -0,0 +1,40 @@
|
|||||||
|
|
||||||
|
from calibre.web.feeds.news import BasicNewsRecipe
|
||||||
|
|
||||||
|
class Neowin(BasicNewsRecipe):
    """News recipe for the Neowin.net RSS feeds."""

    # Descriptive metadata; several of these values are reused in
    # conversion_options below.
    __author__ = 'Darko Miletic'
    title = u'Neowin.net'
    description = 'News from IT'
    publisher = 'Neowin'
    category = 'news, IT, Microsoft, Apple, hardware, software, games'
    language = 'en'

    # Download settings.
    oldest_article = 5
    max_articles_per_feed = 100
    no_stylesheets = True
    encoding = 'utf8'

    conversion_options = {
        'tags': category,
        'language': language,
        'comments': description,
        'publisher': publisher,
    }

    # Markup selectors: the article container, the tag bar that follows it,
    # and the tags listed for removal.
    keep_only_tags = [
        dict(name='div', attrs={'id': 'article'}),
    ]
    remove_tags_after = dict(name='div', attrs={'id': 'tag-bar'})
    remove_tags = [
        dict(name=['base', 'object', 'link', 'iframe']),
        dict(name='div', attrs={'id': 'tag-bar'}),
    ]

    # (section title, feed URL) pairs.
    feeds = [
        (u'Software', u'http://www.neowin.net/news/rss/software'),
        (u'Gaming', u'http://www.neowin.net/news/rss/gaming'),
        (u'Microsoft', u'http://www.neowin.net/news/rss/microsoft'),
        (u'Apple', u'http://www.neowin.net/news/rss/apple'),
        (u'Editorial', u'http://www.neowin.net/news/rss/editorial'),
    ]

    def image_url_processor(cls, baseurl, url):
        """Return ``url`` unchanged; ``baseurl`` is not used."""
        return url
|
||||||
|
|
@ -18,48 +18,11 @@ class Font(object):
|
|||||||
self.color = spec.get('color')
|
self.color = spec.get('color')
|
||||||
self.family = spec.get('family')
|
self.family = spec.get('family')
|
||||||
|
|
||||||
class Column(object):
    """A group of elements stacked vertically, with a shared bounding box.

    Elements are kept sorted by their ``bottom`` coordinate. The bounding
    box (``left``, ``right``, ``top``, ``bottom``, ``width``, ``height``)
    is recomputed every time an element is added.
    """

    # A column contains an element if the element bulges out to
    # the left or the right by at most HFUZZ*col width.
    HFUZZ = 0.2

    def __init__(self):
        self.left = self.right = self.top = self.bottom = 0
        self.width = self.height = 0
        self.elements = []
        self.average_line_separation = 0

    def add(self, elem):
        """Add ``elem`` to the column (no-op if already present).

        ``elem`` must expose ``left``, ``right``, ``top`` and ``bottom``.
        """
        if elem in self.elements:
            return
        self.elements.append(elem)
        # key= is equivalent to the former cmp= on ``bottom`` and also works
        # on Python 3, where list.sort() no longer accepts cmp.
        self.elements.sort(key=lambda x: x.bottom)
        self.top = self.elements[0].top
        self.bottom = self.elements[-1].bottom
        self.left = min(x.left for x in self.elements)
        # The right edge never drops below 0, as before.
        self.right = max([0] + [x.right for x in self.elements])
        self.width = self.right - self.left
        self.height = self.bottom - self.top

    def __iter__(self):
        """Iterate over the elements in order of increasing ``bottom``."""
        for x in self.elements:
            yield x

    def contains(self, elem):
        """Return True if ``elem`` lies within the column's horizontal
        extent, allowing an overhang of ``HFUZZ*width`` on either side."""
        slack = self.HFUZZ * self.width
        return (elem.left > self.left - slack and
                elem.right < self.right + slack)

    def collect_stats(self):
        """Compute ``average_line_separation``: the mean vertical gap between
        consecutive elements. Left untouched with fewer than two elements."""
        if len(self.elements) > 1:
            # Fixed: the original called len(0, n), which raises TypeError.
            gaps = [below.top - above.bottom for above, below in
                    zip(self.elements, self.elements[1:])]
            # float() keeps this a true division for integer coordinates.
            self.average_line_separation = sum(gaps) / float(len(gaps))
|
|
||||||
|
|
||||||
class Element(object):
|
class Element(object):
|
||||||
|
|
||||||
def __init__(self):
|
def __init__(self):
|
||||||
self.starts_paragraph = False
|
self.starts_block = None
|
||||||
|
self.block_style = None
|
||||||
|
|
||||||
def __eq__(self, other):
|
def __eq__(self, other):
|
||||||
return self.id == other.id
|
return self.id == other.id
|
||||||
@ -152,6 +115,61 @@ class Interval(object):
|
|||||||
def __hash__(self):
    """Return a hash derived from the ``left`` and ``right`` attributes."""
    # Both values must be passed to % as one tuple. The original applied %
    # to self.left alone and then handed two arguments to hash(), which
    # raises TypeError on every call.
    return hash('(%f,%f)' % (self.left, self.right))
|
||||||
|
|
||||||
|
class Column(object):
    """A group of elements stacked vertically, with a shared bounding box.

    Elements are kept sorted by their ``bottom`` coordinate. The bounding
    box (``left``, ``right``, ``top``, ``bottom``, ``width``, ``height``)
    is recomputed every time an element is added.
    """

    # A column contains an element if the element bulges out to
    # the left or the right by at most HFUZZ*col width.
    HFUZZ = 0.2

    def __init__(self):
        self.left = self.right = self.top = self.bottom = 0
        self.width = self.height = 0
        self.elements = []
        self.average_line_separation = 0

    def add(self, elem):
        """Add ``elem`` to the column (no-op if already present).

        ``elem`` must expose ``left``, ``right``, ``top`` and ``bottom``.
        """
        if elem in self.elements:
            return
        self.elements.append(elem)
        # key= is equivalent to the former cmp= on ``bottom`` and also works
        # on Python 3, where list.sort() no longer accepts cmp.
        self.elements.sort(key=lambda x: x.bottom)
        self.top = self.elements[0].top
        self.bottom = self.elements[-1].bottom
        self.left = min(x.left for x in self.elements)
        # The right edge never drops below 0, as before.
        self.right = max([0] + [x.right for x in self.elements])
        self.width = self.right - self.left
        self.height = self.bottom - self.top

    def __iter__(self):
        """Iterate over the elements in order of increasing ``bottom``."""
        for x in self.elements:
            yield x

    def contains(self, elem):
        """Return True if ``elem`` lies within the column's horizontal
        extent, allowing an overhang of ``HFUZZ*width`` on either side."""
        slack = self.HFUZZ * self.width
        return (elem.left > self.left - slack and
                elem.right < self.right + slack)

    def collect_stats(self):
        """Compute column-level and per-element layout statistics.

        Sets ``average_line_separation`` to the mean vertical gap between
        consecutive elements (left untouched with fewer than two elements).
        On every element it sets ``indent_fraction``, ``width_fraction``
        and ``top_gap``.

        Raises ZeroDivisionError if the column has elements but zero width.
        """
        if len(self.elements) > 1:
            # Fixed: the original called len(0, n), which raises TypeError.
            gaps = [below.top - above.bottom for above, below in
                    zip(self.elements, self.elements[1:])]
            # float() keeps this a true division for integer coordinates.
            self.average_line_separation = sum(gaps) / float(len(gaps))
        col_width = float(self.width)
        for i, elem in enumerate(self.elements):
            left_margin = elem.left - self.left
            elem.indent_fraction = left_margin / col_width
            elem.width_fraction = elem.width / col_width
            if i == 0:
                # The first element has nothing above it.
                elem.top_gap = None
            else:
                # Previous bottom minus this top: the opposite sign to the
                # gaps averaged above.
                elem.top_gap = self.elements[i-1].bottom - elem.top

    def previous_element(self, idx):
        """Return the element before index ``idx``, or None when ``idx`` is 0."""
        if idx == 0:
            return None
        return self.elements[idx-1]
|
||||||
|
|
||||||
|
|
||||||
|
|
||||||
|
|
||||||
class Region(object):
|
class Region(object):
|
||||||
|
|
||||||
def __init__(self):
|
def __init__(self):
|
||||||
@ -168,6 +186,7 @@ class Region(object):
|
|||||||
self.columns[i].add(elem)
|
self.columns[i].add(elem)
|
||||||
|
|
||||||
def contains(self, columns):
|
def contains(self, columns):
|
||||||
|
# TODO: handle unbalanced columns
|
||||||
if not self.columns:
|
if not self.columns:
|
||||||
return True
|
return True
|
||||||
if len(columns) != len(self.columns):
|
if len(columns) != len(self.columns):
|
||||||
@ -187,7 +206,7 @@ class Region(object):
|
|||||||
return len(self.elements) == 0
|
return len(self.elements) == 0
|
||||||
|
|
||||||
def collect_stats(self):
|
def collect_stats(self):
|
||||||
for column in self.column:
|
for column in self.columns:
|
||||||
column.collect_stats()
|
column.collect_stats()
|
||||||
self.average_line_separation = sum([x.average_line_separation for x in
|
self.average_line_separation = sum([x.average_line_separation for x in
|
||||||
self.columns])/float(len(self.columns))
|
self.columns])/float(len(self.columns))
|
||||||
@ -196,11 +215,10 @@ class Region(object):
|
|||||||
for x in self.columns:
|
for x in self.columns:
|
||||||
yield x
|
yield x
|
||||||
|
|
||||||
def detect_paragraphs(self):
|
def linearize(self):
|
||||||
first = True
|
self.elements = []
|
||||||
for col in self:
|
for x in self.columns:
|
||||||
col.detect_paragraphs(self.average_line_separation, first)
|
self.elements.extend(x)
|
||||||
first = False
|
|
||||||
|
|
||||||
|
|
||||||
class Page(object):
|
class Page(object):
|
||||||
@ -332,7 +350,7 @@ class Page(object):
|
|||||||
'Locate paragraph boundaries in each column'
|
'Locate paragraph boundaries in each column'
|
||||||
for region in self.regions:
|
for region in self.regions:
|
||||||
region.collect_stats()
|
region.collect_stats()
|
||||||
region.detect_paragraphs()
|
region.linearize()
|
||||||
|
|
||||||
|
|
||||||
class PDFDocument(object):
|
class PDFDocument(object):
|
||||||
|
Loading…
x
Reference in New Issue
Block a user