Improved recipe for Welt

This commit is contained in:
Kovid Goyal 2009-12-16 08:32:21 -07:00
parent d928b34798
commit 02e372769d
2 changed files with 52 additions and 10 deletions

View File

@ -15,12 +15,13 @@ class weltDe(BasicNewsRecipe):
__author__ = 'Oliver Niesner' __author__ = 'Oliver Niesner'
use_embedded_content = False use_embedded_content = False
timefmt = ' [%d %b %Y]' timefmt = ' [%d %b %Y]'
max_articles_per_feed = 15 # reduced to this value to prevent too many articles (suggested by Gregory Riker max_articles_per_feed = 15
linearize_tables = True
no_stylesheets = True no_stylesheets = True
remove_stylesheets = True remove_stylesheets = True
remove_javascript = True remove_javascript = True
language = 'de'
encoding = 'iso-8859-1' encoding = 'iso-8859-1'
BasicNewsRecipe.summary_length = 200
remove_tags = [dict(id='jumplinks'), remove_tags = [dict(id='jumplinks'),
@ -43,10 +44,14 @@ class weltDe(BasicNewsRecipe):
dict(id='servicesBox'), dict(id='servicesBox'),
dict(id='toggleAdvancedSearch'), dict(id='toggleAdvancedSearch'),
dict(id='mainNav'), dict(id='mainNav'),
dict(id='ratingBox5136466_1'),
dict(id='ratingBox5136466_2'),
dict(id='articleInlineMediaBox0'), dict(id='articleInlineMediaBox0'),
dict(id='sectionSponsor'), dict(id='sectionSponsor'),
dict(id='sprucharea'),
dict(id='xmsg_recommendEmail'),
dict(id='xmsg_recommendSms'),
dict(id='xmsg_comment'),
dict(id='additionalNavWrapper'),
dict(id='imagebox'),
#dict(id=''), #dict(id=''),
dict(name='span'), dict(name='span'),
dict(name='div', attrs={'class':'printURL'}), dict(name='div', attrs={'class':'printURL'}),
@ -65,10 +70,21 @@ class weltDe(BasicNewsRecipe):
dict(name='ul', attrs={'class':'optionsSubNav clear'}), dict(name='ul', attrs={'class':'optionsSubNav clear'}),
dict(name='li', attrs={'class':'next'}), dict(name='li', attrs={'class':'next'}),
dict(name='li', attrs={'class':'prev'}), dict(name='li', attrs={'class':'prev'}),
dict(name='li', attrs={'class':'last'}),
dict(name='table', attrs={'class':'textGallery'}),
dict(name='li', attrs={'class':'active'})] dict(name='li', attrs={'class':'active'})]
remove_tags_after = [dict(id='tw_link_widget')] remove_tags_after = [dict(id='tw_link_widget')]
extra_css = '''
h2{font-family:Arial,Helvetica,sans-serif; font-size: x-small; color: #003399;}
a{font-family:Arial,Helvetica,sans-serif; font-size: x-small; font-style:italic;}
.dachzeile p{font-family:Arial,Helvetica,sans-serif; font-size: x-small; }
h1{ font-family:Arial,Helvetica,sans-serif; font-size:x-large; font-weight:bold;}
.artikelTeaser{font-family:Arial,Helvetica,sans-serif; font-size: x-small; font-weight:bold; }
body{font-family:Arial,Helvetica,sans-serif; }
.photo {font-family:Arial,Helvetica,sans-serif; font-size: x-small; color: #666666;} '''
feeds = [ ('Politik', 'http://welt.de/politik/?service=Rss'), feeds = [ ('Politik', 'http://welt.de/politik/?service=Rss'),
('Deutsche Dinge', 'http://www.welt.de/deutsche-dinge/?service=Rss'), ('Deutsche Dinge', 'http://www.welt.de/deutsche-dinge/?service=Rss'),
('Wirtschaft', 'http://welt.de/wirtschaft/?service=Rss'), ('Wirtschaft', 'http://welt.de/wirtschaft/?service=Rss'),

View File

@ -78,7 +78,7 @@ class HorizontalBox(object):
def append(self, t): def append(self, t):
self.texts.append(t) self.texts.append(t)
def sort(self): def sort(self, left_margin, right_margin):
self.texts.sort(cmp=lambda x,y: cmp(x.left, y.left)) self.texts.sort(cmp=lambda x,y: cmp(x.left, y.left))
self.top, self.bottom = sys.maxint, 0 self.top, self.bottom = sys.maxint, 0
for t in self.texts: for t in self.texts:
@ -86,6 +86,27 @@ class HorizontalBox(object):
self.bottom = max(self.bottom, t.bottom) self.bottom = max(self.bottom, t.bottom)
self.left = self.texts[0].left self.left = self.texts[0].left
self.right = self.texts[-1].right self.right = self.texts[-1].right
self.gaps = []
for i, t in enumerate(self.texts[1:]):
gap = Interval(self.texts[i].right, t.left)
if gap.width > 3:
self.gaps.append(gap)
left = Interval(left_margin, self.texts[0].left)
if left.width > 3:
self.gaps.insert(0, left)
right = Interval(self.texts[-1].right, right_margin)
if right.width > 3:
self.gaps.append(right)
def has_intersection_with(self, gap):
    """Report whether ``gap`` intersects any gap recorded on this box.

    Every entry of ``self.gaps`` is asked for its ``intersection`` with
    ``gap``; the search stops at the first truthy answer.

    :param gap: object handed unchanged to each stored gap's
                ``intersection`` method
    :return: ``True`` if some stored gap yields a truthy intersection,
             ``False`` otherwise (including when ``self.gaps`` is empty)
    """
    return any(own.intersection(gap) for own in self.gaps)
def identify_columns(self, column_gaps):
    """Record the column count implied by ``column_gaps``.

    Stores ``len(column_gaps) + 1`` on ``self.number_of_columns``.

    :param column_gaps: sized collection of column gaps
    """
    gap_count = len(column_gaps)
    self.number_of_columns = 1 + gap_count
class Page(object): class Page(object):
@ -138,19 +159,24 @@ class Page(object):
for hb in self.horizontal_boxes: for hb in self.horizontal_boxes:
hb.sort() hb.sort(self.left_margin, self.right_margin)
self.horizontal_boxes.sort(cmp=lambda x,y: cmp(x.bottom, y.bottom)) self.horizontal_boxes.sort(cmp=lambda x,y: cmp(x.bottom, y.bottom))
def identify_columns(self): def identify_columns(self):
def neighborhood(i): def neighborhood(i):
if i == 0: if i == len(self.horizontal_boxes)-1:
return self.horizontal_boxes[1:3] return self.horizontal_boxes[i-2:i]
if i == len(self.horizontal_boxes)-2:
return (self.horizontal_boxes[i-1], self.horizontal_boxes[i+1]) return (self.horizontal_boxes[i-1], self.horizontal_boxes[i+1])
return self.horizontal_boxes[i+1], self.horizontal_boxes[i+2]
for i, hbox in enumerate(self.horizontal_boxes): for i, hbox in enumerate(self.horizontal_boxes):
pass n1, n2 = neighborhood(i)
for gap in hbox.gaps:
gap.is_column_gap = n1.has_intersection_with(gap) and \
n2.has_intersection_with(gap)