diff --git a/resources/recipes/economist_free.recipe b/resources/recipes/economist_free.recipe
index 217b033b81..0a98c7da28 100644
--- a/resources/recipes/economist_free.recipe
+++ b/resources/recipes/economist_free.recipe
@@ -53,6 +53,8 @@ class Economist(BasicNewsRecipe):
self.feed_dict.items()])
def eco_sort_sections(self, feeds):
+ if not feeds:
+ raise ValueError('No new articles found')
order = {
'The World This Week': 1,
'Leaders': 2,
diff --git a/resources/recipes/kidney.recipe b/resources/recipes/kidney.recipe
index cc37954ab3..19fd244675 100644
--- a/resources/recipes/kidney.recipe
+++ b/resources/recipes/kidney.recipe
@@ -1,4 +1,7 @@
# -*- coding: utf-8 -*-
+
+import time
+
from calibre.web.feeds.recipes import BasicNewsRecipe
from calibre.ebooks.BeautifulSoup import BeautifulSoup
@@ -8,6 +11,7 @@ class JASN(BasicNewsRecipe):
__author__ = 'Krittika Goyal'
oldest_article = 31 #days
max_articles_per_feed = 25
+ delay = 5
needs_subscription = True
INDEX = 'http://jasn.asnjournals.org/current.shtml'
@@ -102,9 +106,17 @@ class JASN(BasicNewsRecipe):
continue
if url.startswith('/'):
url = 'http://jasn.asnjournals.org'+url
- isoup = self.index_to_soup(url)
- img = isoup.find('img', src=lambda x: x and
- x.startswith('/content/'))
+ img = isoup = None
+ try:
+ isoup = self.index_to_soup(url)
+ except:
+ time.sleep(5)
+ try:
+ isoup = self.index_to_soup(url)
+ except:
+ continue
+ img = isoup.find('img', src=lambda x: x and x.startswith('/content/'))
+
if img is not None:
img.extract()
table = a.findParent('table')
diff --git a/resources/recipes/nytimes.recipe b/resources/recipes/nytimes.recipe
index 420d4b78ad..8b9283a0af 100644
--- a/resources/recipes/nytimes.recipe
+++ b/resources/recipes/nytimes.recipe
@@ -79,13 +79,30 @@ class NYTimes(BasicNewsRecipe):
.authorId {text-align: left; \
font-style: italic;}\n '
-# def get_cover_url(self):
-# st = time.localtime()
-# year = str(st.tm_year)
-# month = "%.2d" % st.tm_mon
-# day = "%.2d" % st.tm_mday
-# cover = 'http://graphics8.nytimes.com/images/' + year + '/' + month +'/' + day +'/nytfrontpage/' + 'scan.jpg'
-# return cover
+ def get_cover_url(self):
+ cover = None
+ st = time.localtime()
+ year = str(st.tm_year)
+ month = "%.2d" % st.tm_mon
+ day = "%.2d" % st.tm_mday
+ cover = 'http://graphics8.nytimes.com/images/' + year + '/' + month +'/' + day +'/nytfrontpage/scan.jpg'
+ br = BasicNewsRecipe.get_browser()
+ try:
+ br.open(cover)
+ except:
+ self.log("\nCover unavailable")
+ cover = None
+ return cover
+
+ def get_masthead_url(self):
+ masthead = 'http://graphics8.nytimes.com/images/misc/nytlogo379x64.gif'
+ br = BasicNewsRecipe.get_browser()
+ try:
+ br.open(masthead)
+ except:
+ self.log("\nMasthead unavailable")
+ masthead = None
+ return masthead
def get_browser(self):
br = BasicNewsRecipe.get_browser()
diff --git a/resources/recipes/nytimes_sub.recipe b/resources/recipes/nytimes_sub.recipe
index e07560c554..e3942469a4 100644
--- a/resources/recipes/nytimes_sub.recipe
+++ b/resources/recipes/nytimes_sub.recipe
@@ -5,7 +5,7 @@ __copyright__ = '2008, Kovid Goyal
element' % self.href) - etree.SubElement(data, XHTML('body')) + body = xpath(data, '//h:body') + if body: + body = body[0] + body.getparent().remove(body) + data.append(body) + else: + self.oeb.logger.warn( + 'File %r missing
element' % self.href)
+ etree.SubElement(data, XHTML('body'))
# Remove microsoft office markup
r = [x for x in data.iterdescendants(etree.Element) if 'microsoft-com' in x.tag]
diff --git a/src/calibre/ebooks/pdf/reflow.py b/src/calibre/ebooks/pdf/reflow.py
index 8203b49dde..bf2d921a10 100644
--- a/src/calibre/ebooks/pdf/reflow.py
+++ b/src/calibre/ebooks/pdf/reflow.py
@@ -43,6 +43,10 @@ class Image(Element):
self.bottom = self.top + self.height
self.right = self.left + self.width
+ def to_html(self):
+ return '<img src="%s" width="%dpx" height="%dpx"/>' % \
+ (self.src, int(self.width), int(self.height))
+
class Text(Element):
@@ -66,8 +70,6 @@ class Text(Element):
self.raw = text.text if text.text else u''
for x in text.iterchildren():
self.raw += etree.tostring(x, method='xml', encoding=unicode)
- if x.tail:
- self.raw += x.tail
self.average_character_width = self.width/len(self.text_as_string)
def coalesce(self, other, page_number):
@@ -86,6 +88,9 @@ class Text(Element):
self.average_character_width = (self.average_character_width +
other.average_character_width)/2.0
+ def to_html(self):
+ return self.raw
+
class FontSizeStats(dict):
def __init__(self, stats):
@@ -108,6 +113,11 @@ class Interval(object):
right = min(self.right, other.right)
return Interval(left, right)
+ def centered_in(self, parent):
+ left = abs(self.left - parent.left)
+ right = abs(self.right - parent.right)
+ return abs(left-right) < 3
+
def __nonzero__(self):
return self.width > 0
@@ -146,6 +156,9 @@ class Column(object):
for x in self.elements:
yield x
+ def __len__(self):
+ return len(self.elements)
+
def contains(self, elem):
return elem.left > self.left - self.HFUZZ*self.width and \
elem.right < self.right + self.HFUZZ*self.width
@@ -160,9 +173,10 @@ class Column(object):
elem.indent_fraction = left_margin/self.width
elem.width_fraction = elem.width/self.width
if i == 0:
- elem.top_gap = None
+ elem.top_gap_ratio = None
else:
- elem.top_gap = self.elements[i-1].bottom - elem.top
+ elem.top_gap_ratio = (self.elements[i-1].bottom -
+ elem.top)/self.average_line_separation
def previous_element(self, idx):
if idx == 0:
@@ -173,12 +187,42 @@ class Column(object):
class Box(list):
def __init__(self, type='p'):
- self.type = type
+ self.tag = type
+
+ def to_html(self):
+ ans = ['<%s>'%self.tag]
+ for elem in self:
+ if isinstance(elem, int):
+ ans.append('<a name="page_%d"/>'%elem)
+ else:
+ ans.append(elem.to_html()+' ')
+ ans.append('</%s>'%self.tag)
+ return ans
+
+class ImageBox(Box):
+
+ def __init__(self, img):
+ Box.__init__(self)
+ self.img = img
+
+ def to_html(self):
+ ans = ['
') + return ans class Region(object): - def __init__(self): + def __init__(self, opts, log): + self.opts, self.log = opts, log self.columns = [] self.top = self.bottom = self.left = self.right = self.width = self.height = 0 @@ -211,6 +255,40 @@ class Region(object): def is_empty(self): return len(self.columns) == 0 + @property + def is_small(self): + max_lines = 0 + for c in self.columns: + max_lines = max(max_lines, len(c)) + return max_lines > 2 + + def absorb(self, singleton): + + def most_suitable_column(elem): + mc, mw = None, 0 + for c in self.columns: + i = Interval(c.left, c.right) + e = Interval(elem.left, elem.right) + w = i.intersection(e).width + if w > mw: + mc, mw = c, w + if mc is None: + self.log.warn('No suitable column for singleton', + elem.to_html()) + mc = self.columns[0] + return mc + + print + for c in singleton.columns: + for elem in c: + col = most_suitable_column(elem) + if self.opts.verbose > 3: + idx = self.columns.index(col) + self.log.debug(u'Absorbing singleton %s into column'%elem.to_html(), + idx) + col.add(elem) + + def collect_stats(self): for column in self.columns: column.collect_stats() @@ -225,9 +303,30 @@ class Region(object): self.elements = [] for x in self.columns: self.elements.extend(x) - - # Find block quotes - indented = [i for (i, x) in enumerate(self.elements) if x.indent_fraction >= 0.2] + self.boxes = [Box()] + for i, elem in enumerate(self.elements): + if isinstance(elem, Image): + self.boxes.append(ImageBox(elem)) + img = Interval(elem.left, elem.right) + for j in range(i+1, len(self.elements)): + t = self.elements[j] + if not isinstance(t, Text): + break + ti = Interval(t.left, t.right) + if not ti.centered_in(img): + break + self.boxes[-1].append(t) + self.boxes.append(Box()) + else: + is_indented = False + if i+1 < len(self.elements): + indent_diff = elem.indent_fraction - \ + self.elements[i+1].indent_fraction + if indent_diff > 0.05: + is_indented = True + if elem.top_gap_ratio > 1.2 or is_indented: + 
self.boxes.append(Box()) + self.boxes[-1].append(elem) @@ -313,7 +412,7 @@ class Page(object): return for i, x in enumerate(self.elements): x.idx = i - current_region = Region() + current_region = Region(self.opts, self.log) processed = set([]) for x in self.elements: if x in processed: continue @@ -322,12 +421,42 @@ class Page(object): processed.update(elems) if not current_region.contains(columns): self.regions.append(current_region) - current_region = Region() + current_region = Region(self.opts, self.log) current_region.add(columns) if not current_region.is_empty: self.regions.append(current_region) + self.coalesce_regions() + + def coalesce_regions(self): + # find contiguous sets of small regions + # absorb into a neighboring region (prefer the one with number of cols + # closer to the avg number of cols in the set, if equal use large + # region) + # merge contiguous regions that can contain each other + absorbed = set([]) + found = True + while found: + found = False + for i, region in enumerate(self.regions): + if region.is_small: + found = True + regions = [] + for j in range(i+1, len(self.regions)): + if self.regions[j].is_small: + regions.append(self.regions[j]) + else: + break + prev = None if i == 0 else i-1 + next = j if self.regions[j] not in regions else None + + + def sort_into_columns(self, elem, neighbors): + neighbors.add(elem) + neighbors = sorted(neighbors, cmp=lambda x,y:cmp(x.left, y.left)) + if self.opts.verbose > 3: + self.log.debug('Neighbors:', [x.to_html() for x in neighbors]) columns = [Column()] columns[0].add(elem) for x in neighbors: @@ -393,6 +522,9 @@ class PDFDocument(object): page.first_pass() page.second_pass() + self.linearize() + self.render() + def collect_font_statistics(self): self.font_size_stats = {} for p in self.pages: @@ -404,5 +536,43 @@ class PDFDocument(object): self.font_size_stats = FontSizeStats(self.font_size_stats) + def linearize(self): + self.elements = [] + last_region = last_block = None + for page in 
self.pages: + page_number_inserted = False + for region in page.regions: + merge_first_block = last_region is not None and \ + len(last_region.columns) == len(region.columns) and \ + not hasattr(last_block, 'img') + for i, block in enumerate(region.boxes): + if merge_first_block: + merge_first_block = False + if not page_number_inserted: + last_block.append(page.number) + page_number_inserted = True + for elem in block: + last_block.append(elem) + else: + if not page_number_inserted: + block.insert(0, page.number) + page_number_inserted = True + self.elements.append(block) + last_block = block + last_region = region + + + def render(self): + html = ['', + '', '
', + '
', '', '