New recipe for The New Zealand Herald by Krittika Goyal

2025-08-30 23:00:21 -04:00 · 2010-01-08 10:11:30 -07:00 · 2010-01-08 10:11:30 -07:00 · fc64d15b09
commit fc64d15b09
parent 3840fa47cc
2 changed files with 129 additions and 6 deletions
--- a/resources/recipes/nzherald.recipe
+++ b/resources/recipes/nzherald.recipe
@ -0,0 +1,76 @@
+import string, re
+from calibre import strftime
+from calibre.web.feeds.recipes import BasicNewsRecipe
+from calibre.ebooks.BeautifulSoup import BeautifulSoup
+
+class NewZealandHerald(BasicNewsRecipe):
+    '''
+    Recipe that downloads articles from the New Zealand Herald web site.
+
+    The article index is scraped by hand in :meth:`parse_index`: every
+    headline page listed in ``nz_sections`` is fetched and the links that
+    follow its first list heading become the articles of that section.
+    '''
+
+    title       = 'New Zealand Herald'
+    __author__  = 'Krittika Goyal'
+    description = 'Daily news'
+    timefmt = ' [%d %b, %Y]'
+
+    no_stylesheets = True
+    remove_tags_before = dict(name='div', attrs={'class':'contentContainer left eight'})
+    remove_tags_after  = dict(name='div', attrs={'class':'callToAction'})
+    remove_tags = [
+       dict(name='iframe'),
+       dict(name='div', attrs={'class':['sectionHeader', 'tools','callToAction', 'contentContainer right two nopad relatedColumn']}),
+    ]
+
+    # Prefix prepended to site-relative article links (those starting
+    # with '/').
+    nz_base_url = 'http://www.nzherald.co.nz'
+
+    # (section title, headline page URL) pairs, in the order in which the
+    # sections are fetched and returned by parse_index().
+    nz_sections = (
+        ('National',
+         'http://www.nzherald.co.nz/nz/news/headlines.cfm?c_id=1'),
+        ('World',
+         'http://www.nzherald.co.nz/world/news/headlines.cfm?c_id=2'),
+        ('Politics',
+         'http://www.nzherald.co.nz/politics/news/headlines.cfm?c_id=280'),
+        ('Crime',
+         'http://www.nzherald.co.nz/crime/news/headlines.cfm?c_id=30'),
+        ('Environment',
+         'http://www.nzherald.co.nz/environment/news/headlines.cfm?c_id=39'),
+    )
+
+    def preprocess_html(self, soup):
+        '''
+        Remove the first ``<table>`` found in an article page, if any.
+
+        :param soup: parsed article page.
+        :return: the same ``soup`` object, modified in place.
+        '''
+        table = soup.find('table')
+        if table is not None:
+            table.extract()
+        return soup
+
+    def nz_parse_section(self, url):
+        '''
+        Collect the articles linked from one section headline page.
+
+        :param url: URL of the section's headline page.
+        :return: list of dicts with the keys ``title``, ``url``,
+                 ``description`` and ``date`` (the last two are always
+                 empty strings). The list is empty when the expected
+                 page markup is not found.
+        '''
+        soup = self.index_to_soup(url)
+        div = soup.find(attrs={'class':'col-300 categoryList'})
+        if div is None:
+            # Page layout differs from what this recipe expects: skip the
+            # section instead of failing the whole download.
+            self.log('\t\tNo article list found at', url)
+            return []
+        heading = div.find(attrs={'class':'link-list-heading'})
+        if heading is None:
+            self.log('\t\tNo list heading found at', url)
+            return []
+
+        current_articles = []
+        # Walk forward from the first heading: every link list seen before
+        # the next heading is collected, then the scan stops.
+        for x in heading.findAllNext(attrs={'class':['linkList', 'link-list-heading']}):
+            if x.get('class') == 'link-list-heading':
+                break
+            for li in x.findAll('li'):
+                a = li.find('a', href=True)
+                if a is None:
+                    continue
+                title = self.tag_to_string(a)
+                # Kept separate from the ``url`` argument so the page URL
+                # is still available for log messages.
+                href = a.get('href', False)
+                if not href or not title:
+                    continue
+                if href.startswith('/'):
+                    href = self.nz_base_url + href
+                self.log('\t\tFound article:', title)
+                self.log('\t\t\t', href)
+                current_articles.append({'title': title, 'url': href,
+                    'description':'', 'date':''})
+
+        return current_articles
+
+    def parse_index(self):
+        '''
+        Build the list of sections and their articles.
+
+        :return: list of ``(section title, articles)`` tuples in the order
+                 of ``nz_sections``; sections for which no article was
+                 found are left out.
+        '''
+        feeds = []
+        for title, url in self.nz_sections:
+            articles = self.nz_parse_section(url)
+            if articles:
+                feeds.append((title, articles))
+        return feeds
+
--- a/src/calibre/ebooks/pdf/reflow.py
+++ b/src/calibre/ebooks/pdf/reflow.py
@ -18,6 +18,29 @@ class Font(object):
        self.color = spec.get('color')
        self.family = spec.get('family')

+class Column(object):
+    '''
+    A vertical stack of page elements together with its bounding box.
+
+    Elements are expected to expose numeric ``left``, ``right``, ``top``
+    and ``bottom`` attributes. ``elements`` is kept sorted by ``bottom``.
+    '''
+
+    # An element is considered part of the column when it sticks out of
+    # the column's horizontal extent by at most HFUZZ * column width on
+    # either side.
+    # NOTE(review): the tolerance value is a judgement call - confirm it
+    # against real documents.
+    HFUZZ = 0.2
+
+    def __init__(self):
+        self.left = self.right = self.top = self.bottom = 0
+        self.width = self.height = 0
+        self.elements = []
+
+    def add(self, elem):
+        '''
+        Add ``elem`` to the column and recompute the bounding box.
+
+        Adding an element that is already present is a no-op.
+        '''
+        if elem in self.elements:
+            return
+        self.elements.append(elem)
+        # key= (rather than cmp=) gives the same stable ordering and does
+        # not depend on the Python 2 only cmp() builtin.
+        self.elements.sort(key=lambda e: e.bottom)
+        # Vertical extent is taken from the first and last element in
+        # bottom order.
+        self.top = self.elements[0].top
+        self.bottom = self.elements[-1].bottom
+        self.left = min(e.left for e in self.elements)
+        self.right = max(e.right for e in self.elements)
+        self.width = self.right - self.left
+        self.height = self.bottom - self.top
+
+    def __iter__(self):
+        '''Iterate over the elements in order of increasing ``bottom``.'''
+        return iter(self.elements)
+
+    def contains(self, elem):
+        '''
+        Return True if ``elem`` fits horizontally inside this column,
+        allowing it to overhang each side by ``HFUZZ * self.width``.
+        Only the horizontal extent is compared.
+        '''
+        fuzz = self.HFUZZ * self.width
+        return elem.left >= self.left - fuzz and \
+               elem.right <= self.right + fuzz
+
+
 class Element(object):

    def __eq__(self, other):
@ -37,7 +60,6 @@ class Image(Element):
        self.src = img.get('src')


-
 class Text(Element):

    def __init__(self, text, font_map, opts, log, idc):
@ -191,18 +213,43 @@ class Page(object):
        for i, x in enumerate(self.elements):
            x.idx = i
        self.current_region = None
+        processed = set([])
        for x in self.elements:
-            self.find_elements_in_row_of(x)
+            if x in processed: continue
+            elems = set(self.find_elements_in_row_of(x))
+            columns = self.sort_into_columns(x, elems)
+            processed.update(elems)
+            columns
+
+    def sort_into_columns(self, elem, neighbors):
+        '''
+        Distribute ``elem`` and its ``neighbors`` over Column objects.
+
+        ``elem`` seeds the first column. Each neighbor is added to the
+        first existing column whose ``contains()`` accepts it; otherwise
+        it starts a new column, after which the columns are re-sorted by
+        their left edge.
+
+        :return: the list of Column objects.
+        '''
+        seed = Column()
+        seed.add(elem)
+        columns = [seed]
+        for candidate in neighbors:
+            for col in columns:
+                if col.contains(candidate):
+                    col.add(candidate)
+                    break
+            else:
+                # No existing column accepted the candidate.
+                fresh = Column()
+                fresh.add(candidate)
+                columns.append(fresh)
+                columns.sort(key=lambda col: col.left)
+        return columns

    def find_elements_in_row_of(self, x):
+        '''
+        Yield nearby elements that vertically line up with ``x`` but do
+        not overlap it horizontally.
+
+        Only elements whose position in ``self.elements`` is within 15
+        places before and 14 places after ``x.idx`` are examined, and
+        ``x`` itself is never yielded.
+        '''
+        # Vertical band around x.top used for the row test.
+        # NOTE(review): the lower bound uses YFUZZ*average_text_height
+        # while the upper bound uses YFUZZ*(1+average_text_height); the
+        # asymmetry may be unintended - confirm.
        interval = Interval(x.top - self.YFUZZ * self.average_text_height,
                x.top + self.YFUZZ*(1+self.average_text_height))
+        # Horizontal extent of x, compared against each candidate below.
+        h_interval = Interval(x.left, x.right)
        m = max(0, x.idx-15)
        for y in self.elements[m:x.idx+15]:
-            y_interval = Interval(y.top, y.bottom)
-            if interval.intersection(y_interval).width > \
-                0.5*self.average_text_height:
-                yield y
+            # A candidate qualifies when its vertical overlap with the
+            # band exceeds half the average text height and its horizontal
+            # overlap with x is not positive.
+            # Presumably Interval.intersection() returns an Interval whose
+            # width is <= 0 when the two do not overlap - verify against
+            # the Interval class.
+            if y is not x:
+                y_interval = Interval(y.top, y.bottom)
+                x_interval = Interval(y.left, y.right)
+                if interval.intersection(y_interval).width > \
+                    0.5*self.average_text_height and \
+                    x_interval.intersection(h_interval).width <= 0:
+                    yield y


 class PDFDocument(object):