diff --git a/resources/recipes/nzherald.recipe b/resources/recipes/nzherald.recipe
new file mode 100644
index 0000000000..92572a58bc
--- /dev/null
+++ b/resources/recipes/nzherald.recipe
@@ -0,0 +1,76 @@
+import string, re
+from calibre import strftime
+from calibre.web.feeds.recipes import BasicNewsRecipe
+from calibre.ebooks.BeautifulSoup import BeautifulSoup
+
+class NewZealandHerald(BasicNewsRecipe):
+
+    title = 'New Zealand Herald'
+    __author__ = 'Krittika Goyal'
+    description = 'Daily news'
+    timefmt = ' [%d %b, %Y]'
+
+    no_stylesheets = True
+    remove_tags_before = dict(name='div', attrs={'class':'contentContainer left eight'})
+    remove_tags_after = dict(name='div', attrs={'class':'callToAction'})
+    remove_tags = [
+        dict(name='iframe'),
+        dict(name='div', attrs={'class':['sectionHeader', 'tools','callToAction', 'contentContainer right two nopad relatedColumn']}),
+        #dict(name='div', attrs={'id':['shareContainer']}),
+        #dict(name='form', attrs={'onsubmit':"return verifySearch(this.w,'Keyword, citation, or #author')"}),
+        #dict(name='table', attrs={'cellspacing':'0'}),
+    ]
+
+    def preprocess_html(self, soup):
+        table = soup.find('table')
+        if table is not None:
+            table.extract()
+        return soup
+
+    #TO GET ARTICLES IN SECTION
+    def nz_parse_section(self, url):
+        soup = self.index_to_soup(url)
+        div = soup.find(attrs={'class':'col-300 categoryList'})
+        date = div.find(attrs={'class':'link-list-heading'})
+
+        current_articles = []
+        for x in date.findAllNext(attrs={'class':['linkList', 'link-list-heading']}):
+            if x.get('class') == 'link-list-heading': break
+            for li in x.findAll('li'):
+                a = li.find('a', href=True)
+                if a is None:
+                    continue
+                title = self.tag_to_string(a)
+                url = a.get('href', False)
+                if not url or not title:
+                    continue
+                if url.startswith('/'):
+                    url = 'http://www.nzherald.co.nz'+url
+                self.log('\t\tFound article:', title)
+                self.log('\t\t\t', url)
+                current_articles.append({'title': title, 'url':url,
+                    'description':'', 'date':''})
+
+        return current_articles
+
+
+    # To GET SECTIONS
+    def parse_index(self):
+        feeds = []
+        for title, url in [
+            ('National',
+             'http://www.nzherald.co.nz/nz/news/headlines.cfm?c_id=1'),
+            ('World',
+             'http://www.nzherald.co.nz/world/news/headlines.cfm?c_id=2'),
+            ('Politics',
+             'http://www.nzherald.co.nz/politics/news/headlines.cfm?c_id=280'),
+            ('Crime',
+             'http://www.nzherald.co.nz/crime/news/headlines.cfm?c_id=30'),
+            ('Environment',
+             'http://www.nzherald.co.nz/environment/news/headlines.cfm?c_id=39'),
+            ]:
+            articles = self.nz_parse_section(url)
+            if articles:
+                feeds.append((title, articles))
+        return feeds
+
diff --git a/src/calibre/ebooks/pdf/reflow.py b/src/calibre/ebooks/pdf/reflow.py
index f8117021b5..1b2149cf3a 100644
--- a/src/calibre/ebooks/pdf/reflow.py
+++ b/src/calibre/ebooks/pdf/reflow.py
@@ -18,6 +18,29 @@ class Font(object):
         self.color = spec.get('color')
         self.family = spec.get('family')
 
+class Column(object):
+
+    def __init__(self):
+        self.left = self.right = self.top = self.bottom = 0
+        self.width = self.height = 0
+        self.elements = []
+
+    def add(self, elem):
+        if elem in self.elements: return
+        self.elements.append(elem)
+        self.elements.sort(cmp=lambda x,y:cmp(x.bottom,y.bottom))
+        self.top = self.elements[0].top
+        self.bottom = self.elements[-1].bottom
+        self.left, self.right = sys.maxint, 0
+        for x in self:
+            self.left = min(self.left, x.left)
+            self.right = max(self.right, x.right)
+        self.width, self.height = self.right-self.left, self.bottom-self.top
+
+    def __iter__(self):
+        for x in self.elements:
+            yield x
+
 class Element(object):
 
     def __eq__(self, other):
@@ -37,7 +60,6 @@ class Image(Element):
         self.src = img.get('src')
 
 
-
 class Text(Element):
 
     def __init__(self, text, font_map, opts, log, idc):
@@ -191,18 +213,43 @@ class Page(object):
         for i, x in enumerate(self.elements):
             x.idx = i
         self.current_region = None
+        processed = set([])
         for x in self.elements:
-            self.find_elements_in_row_of(x)
+            if x in processed: continue
+            elems = set(self.find_elements_in_row_of(x))
+            columns = self.sort_into_columns(x, elems)
+            processed.update(elems)
+            columns
+
+    def sort_into_columns(self, elem, neighbors):
+        columns = [Column()]
+        columns[0].add(elem)
+        for x in neighbors:
+            added = False
+            for c in columns:
+                if c.contains(x):
+                    c.add(x)
+                    added = True
+                    break
+            if not added:
+                columns.append(Column())
+                columns[-1].add(x)
+        columns.sort(cmp=lambda x,y:cmp(x.left, y.left))
+        return columns
 
     def find_elements_in_row_of(self, x):
         interval = Interval(x.top - self.YFUZZ * self.average_text_height,
                 x.top + self.YFUZZ*(1+self.average_text_height))
+        h_interval = Interval(x.left, x.right)
         m = max(0, x.idx-15)
        for y in self.elements[m:x.idx+15]:
-            y_interval = Interval(y.top, y.bottom)
-            if interval.intersection(y_interval).width > \
-                0.5*self.average_text_height:
-                yield y
+            if y is not x:
+                y_interval = Interval(y.top, y.bottom)
+                x_interval = Interval(y.left, y.right)
+                if interval.intersection(y_interval).width > \
+                    0.5*self.average_text_height and \
+                    x_interval.intersection(h_interval).width <= 0:
+                    yield y
 
 
 class PDFDocument(object):
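
Notes on the reflow.py changes.

sort_into_columns calls c.contains(x), but the Column class introduced in this diff never defines a contains method, so the column pass would raise AttributeError as committed; presumably the predicate arrives in a follow-up commit. Below is a minimal sketch of what such a test could look like, written as a free function so it runs on its own; the HFUZZ tolerance and the exact inequalities are assumptions, not part of the diff:

    # Hypothetical containment test for Column, NOT part of the diff above.
    # A column is taken to "contain" an element if the element's horizontal
    # span stays within the column's span, allowed to bulge out by at most
    # HFUZZ*width on either side.
    HFUZZ = 0.2  # assumed tolerance

    def column_contains(column, elem):
        # Columns handed to sort_into_columns always hold at least one
        # element, so width has already been computed by Column.add().
        slack = HFUZZ * column.width
        return elem.left > column.left - slack and \
               elem.right < column.right + slack

With some such predicate in place, sort_into_columns greedily drops each row neighbour into the first column whose span accepts it, opens a new column otherwise, and finally orders the columns left to right by their left edge.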
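
The rewritten find_elements_in_row_of is also worth spelling out: an element y now counts as part of x's row only if it overlaps a fuzzed band around x.top by more than half the average text height and does not overlap x horizontally at all, i.e. it sits beside x rather than above or below it in the same column. A self-contained toy restatement of that test, using a plain overlap helper in place of calibre's Interval class (the YFUZZ value and the sample boxes are made up):

    # Toy restatement of the patched row test; overlap() stands in for
    # Interval.intersection(...).width.
    YFUZZ = 1.5  # assumed; the real value is a Page class attribute

    def overlap(a0, a1, b0, b1):
        # Length of the intersection of [a0, a1] and [b0, b1]; <= 0 means disjoint.
        return min(a1, b1) - max(a0, b0)

    def in_row_of(x, y, avg_text_height):
        # Vertical: y must overlap the fuzzed band around x's top edge.
        band_lo = x['top'] - YFUZZ * avg_text_height
        band_hi = x['top'] + YFUZZ * (1 + avg_text_height)
        vertical = overlap(band_lo, band_hi, y['top'], y['bottom'])
        # Horizontal (new in this diff): y must NOT overlap x horizontally,
        # which keeps elements of x's own column out of the row set.
        horizontal = overlap(x['left'], x['right'], y['left'], y['right'])
        return vertical > 0.5 * avg_text_height and horizontal <= 0

    left_cell  = {'top': 100, 'bottom': 112, 'left':  50, 'right': 280}
    right_cell = {'top': 101, 'bottom': 113, 'left': 320, 'right': 550}
    next_line  = {'top': 120, 'bottom': 132, 'left':  50, 'right': 280}
    assert in_row_of(left_cell, right_cell, 12)      # same band, different column
    assert not in_row_of(left_cell, next_line, 12)   # next line down in the same column

The row set therefore contains only horizontally disjoint neighbours of x, which is exactly what sort_into_columns needs in order to split a row into per-column buckets.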