diff --git a/resources/images/news/greensboro_news_and_record.png b/resources/images/news/greensboro_news_and_record.png
new file mode 100644
index 0000000000..91097cd15b
Binary files /dev/null and b/resources/images/news/greensboro_news_and_record.png differ
diff --git a/resources/recipes/greensboro_news_and_record.recipe b/resources/recipes/greensboro_news_and_record.recipe
new file mode 100644
index 0000000000..d6208f9fc2
--- /dev/null
+++ b/resources/recipes/greensboro_news_and_record.recipe
@@ -0,0 +1,54 @@
+__license__ = 'GPL v3'
+__copyright__ = '2010, Walt Anthony '
+'''
+www.news-record.com
+'''
+from calibre.web.feeds.news import BasicNewsRecipe
+
+class NewsandRecord(BasicNewsRecipe):
+    title = u'Greensboro News & Record'
+    description = "News from Greensboro, North Carolina"
+    __author__ = 'Walt Anthony'
+    publisher = 'News & Record and Landmark Media Enterprises, LLC'
+    category = 'news, USA'
+    oldest_article = 3 #days
+    max_articles_per_feed = 25
+    summary_length = 150
+    language = 'en'
+    encoding = 'utf-8'
+    remove_javascript = True
+    no_stylesheets = True
+
+
+    conversion_options = {
+                          'comment' : description
+                        , 'tags' : category
+                        , 'publisher' : publisher
+                        , 'language' : language
+                        }
+
+
+
+    remove_tags_before = dict(name='h3', attrs={'class':'nrcTxt_headline'})
+    remove_tags_after = dict(name='div', attrs={'id':'nrcBlk_ContentBody'})
+
+    remove_tags = [
+                   dict(name='iframe'),
+                   dict(name=['notags','embed','object','link','img']),
+
+                  ]
+
+
+    feeds = [
+        ('News', 'http://www.news-record.com/news/archive/feed'),
+        ('Greensboro News', 'http://www.news-record.com/news/greensboro/feed'),
+        ('Education', 'http://www.news-record.com/news/education/feed'),
+        ('Government', 'http://www.news-record.com/news/government/feed'),
+        ('College Sports', 'http://www.news-record.com/sports/college/feed'),
+        ('Sports Extra', 'http://www.news-record.com/blog/sportsextra/feed'),
+        ('Life', 'http://www.news-record.com/life/top/feed'),
+        ('NASCAR', 'http://www.news-record.com/sports/nascar/top/feed'),
+        ('Editorials', 'http://www.news-record.com/opinion/editorials/feed'),
+        ('Letters to the Editor', 'http://www.news-record.com/opinion/letters/feed')
+    ]
+
diff --git a/src/calibre/ebooks/pdf/reflow.py b/src/calibre/ebooks/pdf/reflow.py
index ee60e779e4..92a0ceebe1 100644
--- a/src/calibre/ebooks/pdf/reflow.py
+++ b/src/calibre/ebooks/pdf/reflow.py
@@ -146,12 +146,15 @@ class Region(object):
         self.columns = []
         self.top = self.bottom = self.left = self.right = self.width = self.height = 0
 
-    def add_columns(self, columns):
+    def add(self, columns):
+        'Add columns to this region, merging by position if it already has some'
         if not self.columns:
             for x in sorted(columns, cmp=lambda x,y: cmp(x.left, y.left)):
                 self.columns.append(x)
         else:
-            pass
+            for i, column in enumerate(columns):
+                for elem in column:
+                    self.columns[i].add(elem)
 
     def contains(self, columns):
         if not self.columns:
@@ -168,6 +171,12 @@ class Region(object):
                 return False
         return True
 
+    @property
+    def is_empty(self):
+        'True if no columns have been added to this region'
+        return len(self.columns) == 0
+
+
 class Page(object):
 
     # Fraction of a character width that two strings have to be apart,
@@ -242,19 +251,25 @@ class Page(object):
                 self.texts.remove(match)
 
     def first_pass(self):
+        'Sort page into regions and columns'
         self.regions = []
         if not self.elements:
             return
         for i, x in enumerate(self.elements):
             x.idx = i
-        self.current_region = None
+        current_region = Region()
         processed = set([])
         for x in self.elements:
             if x in processed: continue
             elems = set(self.find_elements_in_row_of(x))
             columns = self.sort_into_columns(x, elems)
             processed.update(elems)
-            columns
+            if not current_region.contains(columns):
+                self.regions.append(current_region)
+                current_region = Region()
+            current_region.add(columns)
+        if not current_region.is_empty:
+            self.regions.append(current_region)
 
     def sort_into_columns(self, elem, neighbors):
         columns = [Column()]