diff --git a/resources/recipes/welt.recipe b/resources/recipes/welt.recipe index b1bf20235d..4c90853c82 100644 --- a/resources/recipes/welt.recipe +++ b/resources/recipes/welt.recipe @@ -15,12 +15,13 @@ class weltDe(BasicNewsRecipe): __author__ = 'Oliver Niesner' use_embedded_content = False timefmt = ' [%d %b %Y]' - max_articles_per_feed = 15 # reduced to this value to prevent too many articles (suggested by Gregory Riker + max_articles_per_feed = 15 + linearize_tables = True no_stylesheets = True remove_stylesheets = True remove_javascript = True - language = 'de' encoding = 'iso-8859-1' + BasicNewsRecipe.summary_length = 200 remove_tags = [dict(id='jumplinks'), @@ -43,10 +44,14 @@ class weltDe(BasicNewsRecipe): dict(id='servicesBox'), dict(id='toggleAdvancedSearch'), dict(id='mainNav'), - dict(id='ratingBox5136466_1'), - dict(id='ratingBox5136466_2'), dict(id='articleInlineMediaBox0'), dict(id='sectionSponsor'), + dict(id='sprucharea'), + dict(id='xmsg_recommendEmail'), + dict(id='xmsg_recommendSms'), + dict(id='xmsg_comment'), + dict(id='additionalNavWrapper'), + dict(id='imagebox'), #dict(id=''), dict(name='span'), dict(name='div', attrs={'class':'printURL'}), @@ -65,10 +70,21 @@ class weltDe(BasicNewsRecipe): dict(name='ul', attrs={'class':'optionsSubNav clear'}), dict(name='li', attrs={'class':'next'}), dict(name='li', attrs={'class':'prev'}), + dict(name='li', attrs={'class':'last'}), + dict(name='table', attrs={'class':'textGallery'}), dict(name='li', attrs={'class':'active'})] remove_tags_after = [dict(id='tw_link_widget')] + extra_css = ''' + h2{font-family:Arial,Helvetica,sans-serif; font-size: x-small; color: #003399;} + a{font-family:Arial,Helvetica,sans-serif; font-size: x-small; font-style:italic;} + .dachzeile p{font-family:Arial,Helvetica,sans-serif; font-size: x-small; } + h1{ font-family:Arial,Helvetica,sans-serif; font-size:x-large; font-weight:bold;} + .artikelTeaser{font-family:Arial,Helvetica,sans-serif; font-size: x-small; font-weight:bold; } + body{font-family:Arial,Helvetica,sans-serif; } + .photo {font-family:Arial,Helvetica,sans-serif; font-size: x-small; color: #666666;} ''' + feeds = [ ('Politik', 'http://welt.de/politik/?service=Rss'), ('Deutsche Dinge', 'http://www.welt.de/deutsche-dinge/?service=Rss'), ('Wirtschaft', 'http://welt.de/wirtschaft/?service=Rss'), diff --git a/src/calibre/ebooks/pdf/reflow.py b/src/calibre/ebooks/pdf/reflow.py index 53be9a23de..72ca7f0d1c 100644 --- a/src/calibre/ebooks/pdf/reflow.py +++ b/src/calibre/ebooks/pdf/reflow.py @@ -78,7 +78,7 @@ class HorizontalBox(object): def append(self, t): self.texts.append(t) - def sort(self): + def sort(self, left_margin, right_margin): self.texts.sort(cmp=lambda x,y: cmp(x.left, y.left)) self.top, self.bottom = sys.maxint, 0 for t in self.texts: @@ -86,6 +86,27 @@ class HorizontalBox(object): self.bottom = max(self.bottom, t.bottom) self.left = self.texts[0].left self.right = self.texts[-1].right + self.gaps = [] + for i, t in enumerate(self.texts[1:]): + gap = Interval(self.texts[i].right, t.left) + if gap.width > 3: + self.gaps.append(gap) + left = Interval(left_margin, self.texts[0].left) + if left.width > 3: + self.gaps.insert(0, left) + right = Interval(self.texts[-1].right, right_margin) + if right.width > 3: + self.gaps.append(right) + + def has_intersection_with(self, gap): + for g in self.gaps: + if g.intersection(gap): + return True + return False + + def identify_columns(self, column_gaps): + self.number_of_columns = len(column_gaps) + 1 + class Page(object): @@ -138,19 +159,24 @@ class Page(object): for hb in self.horizontal_boxes: - hb.sort() + hb.sort(self.left_margin, self.right_margin) self.horizontal_boxes.sort(cmp=lambda x,y: cmp(x.bottom, y.bottom)) def identify_columns(self): def neighborhood(i): - if i == 0: - return self.horizontal_boxes[1:3] - return (self.horizontal_boxes[i-1], self.horizontal_boxes[i+1]) + if i == len(self.horizontal_boxes)-1: + return self.horizontal_boxes[i-2:i] + if i == len(self.horizontal_boxes)-2: + return (self.horizontal_boxes[i-1], self.horizontal_boxes[i+1]) + return self.horizontal_boxes[i+1], self.horizontal_boxes[i+2] for i, hbox in enumerate(self.horizontal_boxes): - pass + n1, n2 = neighborhood(i) + for gap in hbox.gaps: + gap.is_column_gap = n1.has_intersection_with(gap) and \ + n2.has_intersection_with(gap)