diff --git a/resources/recipes/the_gazette.recipe b/resources/recipes/the_gazette.recipe deleted file mode 100644 index 19afff986e..0000000000 --- a/resources/recipes/the_gazette.recipe +++ /dev/null @@ -1,22 +0,0 @@ -from calibre.web.feeds.news import BasicNewsRecipe - -class The_Gazette(BasicNewsRecipe): - - cover_url = 'file:///D:/Documents/Pictures/Covers/The_Gazette.jpg' - title = u'The Gazette' - __author__ = 'Jerry Clapperton' - description = 'Montreal news in English' - language = 'en_CA' - - oldest_article = 7 - max_articles_per_feed = 20 - use_embedded_content = False - remove_javascript = True - no_stylesheets = True - encoding = 'utf-8' - - keep_only_tags = [dict(name='div', attrs={'id':['storyheader','page1']})] - - extra_css = '.headline {font-size: x-large;} \n .fact {padding-top: 10pt}' - - feeds = [(u'News', u'http://feeds.canada.com/canwest/F297'), (u'Opinion', u'http://feeds.canada.com/canwest/F7383'), (u'Arts', u'http://feeds.canada.com/canwest/F7366'), (u'Life', u'http://rss.canada.com/get/?F6934'), (u'Business', u'http://feeds.canada.com/canwest/F6939'), (u'Travel', u'http://rss.canada.com/get/?F6938'), (u'Health', u'http://feeds.canada.com/canwest/F7397'), (u'Technology', u'http://feeds.canada.com/canwest/F7411')] diff --git a/resources/recipes/wsj_free.recipe b/resources/recipes/wsj_free.recipe index b190f43849..e29bfe3dde 100644 --- a/resources/recipes/wsj_free.recipe +++ b/resources/recipes/wsj_free.recipe @@ -215,7 +215,7 @@ class WSJ(BasicNewsRecipe): # first, check if there is an h3 tag which provides a section name stag = divtag.find('h3') if stag: - if stag.parent['class'] == 'dynamic': + if stag.parent.get('class', '') == 'dynamic': # a carousel of articles is too complex to extract a section name # for each article, so we'll just call the section "Carousel" section_name = 'Carousel' diff --git a/src/calibre/ebooks/pdf/reflow.py b/src/calibre/ebooks/pdf/reflow.py index 9f98147032..552af1590f 100644 --- a/src/calibre/ebooks/pdf/reflow.py +++ b/src/calibre/ebooks/pdf/reflow.py @@ -262,7 +262,6 @@ class Region(object): max_lines = max(max_lines, len(c)) return max_lines - @property def is_small(self): return self.line_count < 3 @@ -438,9 +437,8 @@ class Page(object): # absorb into a neighboring region (prefer the one with number of cols # closer to the avg number of cols in the set, if equal use larger # region) - # merge contiguous regions that can contain each other - '''absorbed = set([]) found = True + absorbed = set([]) while found: found = False for i, region in enumerate(self.regions): @@ -452,10 +450,33 @@ class Page(object): regions.append(self.regions[j]) else: break - prev = None if i == 0 else i-1 - next = j if self.regions[j] not in regions else None - ''' - pass + prev_region = None if i == 0 else i-1 + next_region = j if self.regions[j] not in regions else None + if prev_region is None and next_region is not None: + absorb_into = next_region + elif next_region is None and prev_region is not None: + absorb_into = prev_region + elif prev_region is None and next_region is None: + if len(regions) > 1: + absorb_into = regions[0] + regions = regions[1:] + else: + absorb_into = None + else: + absorb_into = prev_region + if next_region.line_count >= prev_region.line_count: + avg_column_count = sum([len(r.columns) for r in + regions])/float(len(regions)) + if next_region.line_count > prev_region.line_count \ + or abs(avg_column_count - len(prev_region.columns)) \ + > abs(avg_column_count - len(next_region.columns)): + absorb_into = next_region + if absorb_into is not None: + absorb_into.absorb_region(regions) + absorbed.update(regions) + i = j + for region in absorbed: + self.regions.remove(region)