From 5e93ea1da26da5594c0a5a5853d6023fb220ea88 Mon Sep 17 00:00:00 2001
From: Kovid Goyal
Date: Thu, 28 Jan 2010 09:43:14 -0700
Subject: [PATCH 1/7] Conversion pipeline: If <body> tag is not under <html>
move it to the correct place. LIT Input: Strip embedded <metadata> and <guide>
elements. Fixes #4712 (Unable to convert .rtf and .lit files to
.EPUB)
---
src/calibre/ebooks/lit/input.py | 5 +
src/calibre/ebooks/oeb/base.py | 12 ++-
src/calibre/ebooks/pdf/reflow.py | 158 +++++++++++++++++++++++++++++--
src/calibre/ebooks/rtf/input.py | 4 +-
4 files changed, 166 insertions(+), 13 deletions(-)
diff --git a/src/calibre/ebooks/lit/input.py b/src/calibre/ebooks/lit/input.py
index 8655d8b189..89873196c9 100644
--- a/src/calibre/ebooks/lit/input.py
+++ b/src/calibre/ebooks/lit/input.py
@@ -26,6 +26,11 @@ class LITInput(InputFormatPlugin):
for item in oeb.spine:
root = item.data
if not hasattr(root, 'xpath'): continue
+ for bad in ('metadata', 'guide'):
+ metadata = XPath('//h:'+bad)(root)
+ if metadata:
+ for x in metadata:
+ x.getparent().remove(x)
body = XPath('//h:body')(root)
if body:
body = body[0]
diff --git a/src/calibre/ebooks/oeb/base.py b/src/calibre/ebooks/oeb/base.py
index 18d3de1e56..c93a0689b2 100644
--- a/src/calibre/ebooks/oeb/base.py
+++ b/src/calibre/ebooks/oeb/base.py
@@ -909,9 +909,15 @@ class Manifest(object):
'content': '%s; charset=utf-8' % XHTML_NS})
# Ensure has a
if not xpath(data, '/h:html/h:body'):
- self.oeb.logger.warn(
- 'File %r missing element' % self.href)
- etree.SubElement(data, XHTML('body'))
+ body = xpath(data, '//h:body')
+ if body:
+ body = body[0]
+ body.getparent().remove(body)
+ data.append(body)
+ else:
+ self.oeb.logger.warn(
+ 'File %r missing element' % self.href)
+ etree.SubElement(data, XHTML('body'))
# Remove microsoft office markup
r = [x for x in data.iterdescendants(etree.Element) if 'microsoft-com' in x.tag]
diff --git a/src/calibre/ebooks/pdf/reflow.py b/src/calibre/ebooks/pdf/reflow.py
index 80cfc0bb30..bf2d921a10 100644
--- a/src/calibre/ebooks/pdf/reflow.py
+++ b/src/calibre/ebooks/pdf/reflow.py
@@ -43,6 +43,10 @@ class Image(Element):
self.bottom = self.top + self.height
self.right = self.left + self.width
+ def to_html(self):
+ return '
' % \
+ (self.src, int(self.width), int(self.height))
+
class Text(Element):
@@ -66,8 +70,6 @@ class Text(Element):
self.raw = text.text if text.text else u''
for x in text.iterchildren():
self.raw += etree.tostring(x, method='xml', encoding=unicode)
- if x.tail:
- self.raw += x.tail
self.average_character_width = self.width/len(self.text_as_string)
def coalesce(self, other, page_number):
@@ -86,6 +88,9 @@ class Text(Element):
self.average_character_width = (self.average_character_width +
other.average_character_width)/2.0
+ def to_html(self):
+ return self.raw
+
class FontSizeStats(dict):
def __init__(self, stats):
@@ -108,6 +113,11 @@ class Interval(object):
right = min(self.right, other.right)
return Interval(left, right)
+ def centered_in(self, parent):
+ left = abs(self.left - parent.left)
+ right = abs(self.right - parent.right)
+ return abs(left-right) < 3
+
def __nonzero__(self):
return self.width > 0
@@ -146,6 +156,9 @@ class Column(object):
for x in self.elements:
yield x
+ def __len__(self):
+ return len(self.elements)
+
def contains(self, elem):
return elem.left > self.left - self.HFUZZ*self.width and \
elem.right < self.right + self.HFUZZ*self.width
@@ -174,17 +187,42 @@ class Column(object):
class Box(list):
def __init__(self, type='p'):
- self.type = type
+ self.tag = type
+
+ def to_html(self):
+ ans = ['<%s>'%self.tag]
+ for elem in self:
+ if isinstance(elem, int):
+ ans.append(''%elem)
+ else:
+ ans.append(elem.to_html()+' ')
+ ans.append('%s>'%self.tag)
+ return ans
class ImageBox(Box):
def __init__(self, img):
- Box.__init__(self, type='img')
+ Box.__init__(self)
self.img = img
+ def to_html(self):
+ ans = ['
']
+ ans.append(self.img.to_html())
+ if len(self) > 0:
+ ans.append('
')
+ for elem in self:
+ if isinstance(elem, int):
+ ans.append('
'%elem)
+ else:
+ ans.append(elem.to_html()+' ')
+ ans.append('
')
+ return ans
+
+
class Region(object):
- def __init__(self):
+ def __init__(self, opts, log):
+ self.opts, self.log = opts, log
self.columns = []
self.top = self.bottom = self.left = self.right = self.width = self.height = 0
@@ -217,6 +255,40 @@ class Region(object):
def is_empty(self):
return len(self.columns) == 0
+    @property
+    def is_small(self):
+        # A region is small when its tallest column holds fewer than three
+        # elements (len(column) is the number of elements in that column).
+        max_lines = max([len(c) for c in self.columns] or [0])
+        return max_lines < 3
+
+    def absorb(self, singleton):
+        'Add every element of the region singleton to the best overlapping column of this region'
+        def most_suitable_column(elem):
+            mc, mw = None, 0
+            e = Interval(elem.left, elem.right)
+            for c in self.columns:
+                i = Interval(c.left, c.right)
+                w = i.intersection(e).width
+                if w > mw:
+                    mc, mw = c, w
+            if mc is None:
+                self.log.warn('No suitable column for singleton',
+                        elem.to_html())
+                mc = self.columns[0]
+            return mc
+
+        # Re-home each element; the widest horizontal overlap picks the column
+        for c in singleton.columns:
+            for elem in c:
+                col = most_suitable_column(elem)
+                if self.opts.verbose > 3:
+                    idx = self.columns.index(col)
+                    self.log.debug(u'Absorbing singleton %s into column'%elem.to_html(),
+                            idx)
+                col.add(elem)
+
+
def collect_stats(self):
for column in self.columns:
column.collect_stats()
@@ -231,7 +303,6 @@ class Region(object):
self.elements = []
for x in self.columns:
self.elements.extend(x)
-
self.boxes = [Box()]
for i, elem in enumerate(self.elements):
if isinstance(elem, Image):
@@ -341,7 +412,7 @@ class Page(object):
return
for i, x in enumerate(self.elements):
x.idx = i
- current_region = Region()
+ current_region = Region(self.opts, self.log)
processed = set([])
for x in self.elements:
if x in processed: continue
@@ -350,12 +421,42 @@ class Page(object):
processed.update(elems)
if not current_region.contains(columns):
self.regions.append(current_region)
- current_region = Region()
+ current_region = Region(self.opts, self.log)
current_region.add(columns)
if not current_region.is_empty:
self.regions.append(current_region)
+ self.coalesce_regions()
+
+ def coalesce_regions(self):
+ # find contiguous sets of small regions
+ # absorb into a neighboring region (prefer the one with number of cols
+ # closer to the avg number of cols in the set, if equal use large
+ # region)
+ # merge contiguous regions that can contain each other
+ absorbed = set([])
+ found = True
+ while found:
+ found = False
+ for i, region in enumerate(self.regions):
+ if region.is_small:
+ found = True
+ regions = []
+ for j in range(i+1, len(self.regions)):
+ if self.regions[j].is_small:
+ regions.append(self.regions[j])
+ else:
+ break
+ prev = None if i == 0 else i-1
+ next = j if self.regions[j] not in regions else None
+
+
+
def sort_into_columns(self, elem, neighbors):
+ neighbors.add(elem)
+ neighbors = sorted(neighbors, cmp=lambda x,y:cmp(x.left, y.left))
+ if self.opts.verbose > 3:
+ self.log.debug('Neighbors:', [x.to_html() for x in neighbors])
columns = [Column()]
columns[0].add(elem)
for x in neighbors:
@@ -421,6 +522,9 @@ class PDFDocument(object):
page.first_pass()
page.second_pass()
+ self.linearize()
+ self.render()
+
def collect_font_statistics(self):
self.font_size_stats = {}
for p in self.pages:
@@ -432,5 +536,43 @@ class PDFDocument(object):
self.font_size_stats = FontSizeStats(self.font_size_stats)
+ def linearize(self):
+ self.elements = []
+ last_region = last_block = None
+ for page in self.pages:
+ page_number_inserted = False
+ for region in page.regions:
+ merge_first_block = last_region is not None and \
+ len(last_region.columns) == len(region.columns) and \
+ not hasattr(last_block, 'img')
+ for i, block in enumerate(region.boxes):
+ if merge_first_block:
+ merge_first_block = False
+ if not page_number_inserted:
+ last_block.append(page.number)
+ page_number_inserted = True
+ for elem in block:
+ last_block.append(elem)
+ else:
+ if not page_number_inserted:
+ block.insert(0, page.number)
+ page_number_inserted = True
+ self.elements.append(block)
+ last_block = block
+ last_region = region
+
+
+ def render(self):
+ html = ['',
+ '', '',
+ 'PDF Reflow conversion', '', '',
+ '']
+ for elem in self.elements:
+ html.extend(elem.to_html())
+ html += ['', '']
+ with open('index.html', 'wb') as f:
+ f.write((u'\n'.join(html)).encode('utf-8'))
+
+
diff --git a/src/calibre/ebooks/rtf/input.py b/src/calibre/ebooks/rtf/input.py
index ff20793f39..d5e1a95157 100644
--- a/src/calibre/ebooks/rtf/input.py
+++ b/src/calibre/ebooks/rtf/input.py
@@ -195,9 +195,9 @@ class RTFInput(InputFormatPlugin):
fname = self.preprocess(stream.name)
try:
xml = self.generate_xml(fname)
- except RtfInvalidCodeException:
+ except RtfInvalidCodeException, e:
raise ValueError(_('This RTF file has a feature calibre does not '
- 'support. Convert it to HTML first and then try it.'))
+ 'support. Convert it to HTML first and then try it.\n%s')%e)
d = glob.glob(os.path.join('*_rtf_pict_dir', 'picts.rtf'))
if d:
imap = {}
From b259de7497751624b89d596bc64632d3390f014b Mon Sep 17 00:00:00 2001
From: Kovid Goyal
Date: Thu, 28 Jan 2010 09:47:54 -0700
Subject: [PATCH 2/7] Fix #4710 (new author_sort[0] for sending to
device/saving causes error)
---
src/calibre/gui2/dialogs/config/save_template.py | 2 +-
1 file changed, 1 insertion(+), 1 deletion(-)
diff --git a/src/calibre/gui2/dialogs/config/save_template.py b/src/calibre/gui2/dialogs/config/save_template.py
index 8fe36c430f..71eb15f4aa 100644
--- a/src/calibre/gui2/dialogs/config/save_template.py
+++ b/src/calibre/gui2/dialogs/config/save_template.py
@@ -37,7 +37,7 @@ class SaveTemplate(QWidget, Ui_Form):
tmpl = preprocess_template(self.opt_template.text())
fa = {}
for x in FORMAT_ARG_DESCS.keys():
- fa[x]=''
+ fa[x]='random long string'
try:
tmpl.format(**fa)
except Exception, err:
From 356d66482a9b33ddb781ac92c6b1472ff204fb19 Mon Sep 17 00:00:00 2001
From: Kovid Goyal
Date: Thu, 28 Jan 2010 10:04:54 -0700
Subject: [PATCH 3/7] Fix regression that broke the NYT recipe
---
src/calibre/web/feeds/news.py | 4 +++-
1 file changed, 3 insertions(+), 1 deletion(-)
diff --git a/src/calibre/web/feeds/news.py b/src/calibre/web/feeds/news.py
index 9abb55852e..fff8bbb4e8 100644
--- a/src/calibre/web/feeds/news.py
+++ b/src/calibre/web/feeds/news.py
@@ -428,7 +428,9 @@ class BasicNewsRecipe(Recipe):
else:
_raw = _raw.decode(self.encoding, 'replace')
massage = list(BeautifulSoup.MARKUP_MASSAGE)
- massage.append((re.compile(r'&(\S+?);'), lambda match: entity_to_unicode(match, encoding=self.encoding)))
+ enc = 'cp1252' if callable(self.encoding) or self.encoding is None else self.encoding
+ massage.append((re.compile(r'&(\S+?);'), lambda match:
+ entity_to_unicode(match, encoding=enc)))
return BeautifulSoup(_raw, markupMassage=massage)
From affc72895b12e59417229711598e1732bb05caa1 Mon Sep 17 00:00:00 2001
From: Kovid Goyal
Date: Thu, 28 Jan 2010 10:19:21 -0700
Subject: [PATCH 4/7] Improved NYT recipes
---
resources/recipes/nytimes.recipe | 31 ++++++++++++++++++++------
resources/recipes/nytimes_sub.recipe | 33 ++++++++++++++++++++++++++--
2 files changed, 55 insertions(+), 9 deletions(-)
diff --git a/resources/recipes/nytimes.recipe b/resources/recipes/nytimes.recipe
index 420d4b78ad..8b9283a0af 100644
--- a/resources/recipes/nytimes.recipe
+++ b/resources/recipes/nytimes.recipe
@@ -79,13 +79,30 @@ class NYTimes(BasicNewsRecipe):
.authorId {text-align: left; \
font-style: italic;}\n '
-# def get_cover_url(self):
-# st = time.localtime()
-# year = str(st.tm_year)
-# month = "%.2d" % st.tm_mon
-# day = "%.2d" % st.tm_mday
-# cover = 'http://graphics8.nytimes.com/images/' + year + '/' + month +'/' + day +'/nytfrontpage/' + 'scan.jpg'
-# return cover
+    def get_cover_url(self):
+        "Return the URL of today's front page scan, or None if it cannot be opened"
+        st = time.localtime()
+        year = str(st.tm_year)
+        month = "%.2d" % st.tm_mon
+        day = "%.2d" % st.tm_mday
+        cover = 'http://graphics8.nytimes.com/images/' + year + '/' + month +'/' + day +'/nytfrontpage/scan.jpg'
+        br = BasicNewsRecipe.get_browser()
+        try:
+            br.open(cover)
+        except Exception:
+            self.log("\nCover unavailable")
+            cover = None
+        return cover
+
+    def get_masthead_url(self):
+        masthead = 'http://graphics8.nytimes.com/images/misc/nytlogo379x64.gif'
+        br = BasicNewsRecipe.get_browser()
+        try:
+            br.open(masthead)  # probe only: checks that the logo can be fetched
+        except Exception:  # best effort: report it and fall back to no masthead
+            self.log("\nMasthead unavailable")
+            masthead = None
+        return masthead
def get_browser(self):
br = BasicNewsRecipe.get_browser()
diff --git a/resources/recipes/nytimes_sub.recipe b/resources/recipes/nytimes_sub.recipe
index e07560c554..e3942469a4 100644
--- a/resources/recipes/nytimes_sub.recipe
+++ b/resources/recipes/nytimes_sub.recipe
@@ -5,7 +5,7 @@ __copyright__ = '2008, Kovid Goyal '
'''
nytimes.com
'''
-import string, re
+import string, re, time
from calibre import strftime
from calibre.web.feeds.recipes import BasicNewsRecipe
from calibre.ebooks.BeautifulSoup import BeautifulSoup
@@ -31,7 +31,8 @@ class NYTimes(BasicNewsRecipe):
remove_tags = [dict(attrs={'class':['articleTools', 'post-tools', 'side_tool', 'nextArticleLink clearfix']}),
dict(id=['footer', 'toolsRight', 'articleInline',
'navigation', 'archive', 'side_search', 'blog_sidebar',
- 'side_tool', 'side_index',
+ 'side_tool', 'side_index', 'login', 'businessSearchBar',
+ 'adxLeaderboard',
'relatedArticles', 'relatedTopics', 'adxSponLink']),
dict(name=['script', 'noscript', 'style'])]
encoding = decode
@@ -51,11 +52,39 @@ class NYTimes(BasicNewsRecipe):
#open('/t/log.html', 'wb').write(raw)
return br
+    def get_masthead_url(self):
+        masthead = 'http://graphics8.nytimes.com/images/misc/nytlogo379x64.gif'
+        br = BasicNewsRecipe.get_browser()
+        try:
+            br.open(masthead)  # probe only: checks that the logo can be fetched
+        except Exception:  # best effort: report it and fall back to no masthead
+            self.log("\nMasthead unavailable")
+            masthead = None
+        return masthead
+
+
+    def get_cover_url(self):
+        "Return the URL of today's front page scan, or None if it cannot be opened"
+        st = time.localtime()
+        year = str(st.tm_year)
+        month = "%.2d" % st.tm_mon
+        day = "%.2d" % st.tm_mday
+        cover = 'http://graphics8.nytimes.com/images/' + year + '/' + month +'/' + day +'/nytfrontpage/scan.jpg'
+        br = BasicNewsRecipe.get_browser()
+        try:
+            br.open(cover)
+        except Exception:
+            self.log("\nCover unavailable")
+            cover = None
+        return cover
+
def short_title(self):
return 'NY Times'
def parse_index(self):
+ self.encoding = 'cp1252'
soup = self.index_to_soup('http://www.nytimes.com/pages/todayspaper/index.html')
+ self.encoding = decode
def feed_title(div):
return ''.join(div.findAll(text=True, recursive=False)).strip()
From c9f96840cc48aabf222cb038a9aa836f2c740570 Mon Sep 17 00:00:00 2001
From: Kovid Goyal
Date: Thu, 28 Jan 2010 10:46:42 -0700
Subject: [PATCH 5/7] ...
---
resources/recipes/economist_free.recipe | 2 ++
1 file changed, 2 insertions(+)
diff --git a/resources/recipes/economist_free.recipe b/resources/recipes/economist_free.recipe
index 217b033b81..0a98c7da28 100644
--- a/resources/recipes/economist_free.recipe
+++ b/resources/recipes/economist_free.recipe
@@ -53,6 +53,8 @@ class Economist(BasicNewsRecipe):
self.feed_dict.items()])
def eco_sort_sections(self, feeds):
+ if not feeds:
+ raise ValueError('No new articles found')
order = {
'The World This Week': 1,
'Leaders': 2,
From 5cc1c8ee65540140b071510f8a9d529f33964a1d Mon Sep 17 00:00:00 2001
From: Kovid Goyal
Date: Thu, 28 Jan 2010 12:05:23 -0700
Subject: [PATCH 6/7] ImageMagick wrapper: Add convenience method to remove
transparency from images
---
src/calibre/utils/PythonMagickWand.py | 41 +++++++++++++++++++++++++++
1 file changed, 41 insertions(+)
diff --git a/src/calibre/utils/PythonMagickWand.py b/src/calibre/utils/PythonMagickWand.py
index 9920334b0a..20f503bc22 100644
--- a/src/calibre/utils/PythonMagickWand.py
+++ b/src/calibre/utils/PythonMagickWand.py
@@ -122,6 +122,21 @@ class ImageMagick(object):
def __exit__(self, *args):
finalize()
+def remove_transparency(wand, background_color='white'):
+ '''
+ Converts transparent pixels to the specified background color.
+ Returns a new magick wand with the opaque image.
+ '''
+ nw = NewMagickWand()
+ pw = NewPixelWand()
+ if nw < 0 or pw < 0:
+ raise RuntimeError('Out of memory')
+ PixelSetColor(pw, background_color)
+ MagickNewImage(nw, MagickGetImageWidth(wand), MagickGetImageHeight(wand),
+ pw)
+ MagickCompositeImage(nw, wand, OverCompositeOp, 0, 0)
+ DestroyPixelWand(pw)
+ return nw
class MetricType(ctypes.c_int): pass
UndefinedMetric = MetricType(0)
@@ -730,6 +745,32 @@ class MagickStatusType(ctypes.c_void_p): pass
class MagickInfo(ctypes.c_void_p): pass
class MagickWand(ctypes.c_void_p): pass
+# NewPixelWand
+try:
+ _magick.NewPixelWand.restype = PixelWand
+except:
+ pass
+else:
+ NewPixelWand = _magick.NewPixelWand
+
+# MagickSetImageOpacity
+try:
+    _magick.MagickSetImageOpacity.argtypes = (MagickWand, ctypes.c_double)
+    _magick.MagickSetImageOpacity.restype = MagickBooleanType  # on the function, not the library handle
+except:
+    pass
+else:
+    MagickSetImageOpacity = _magick.MagickSetImageOpacity
+
+# MagickMergeImageLayers
+try:
+ _magick.MagickMergeImageLayers.argtypes = (MagickWand, ImageLayerMethod)
+ _magick.MagickMergeImageLayers.restype = MagickWand
+except:
+ pass
+else:
+ MagickMergeImageLayers = _magick.MagickMergeImageLayers
+
# MagickSetLastIterator
try:
_magick.MagickSetLastIterator.restype = None
From 98b7f9a8fe4bded4a3fb34259abe0eb52fc5b84c Mon Sep 17 00:00:00 2001
From: Kovid Goyal
Date: Thu, 28 Jan 2010 12:47:51 -0700
Subject: [PATCH 7/7] Add support for masthead images to BasicNewsRecipe
---
src/calibre/web/feeds/news.py | 66 +++++++++++++++++++++++++++++++++--
1 file changed, 64 insertions(+), 2 deletions(-)
diff --git a/src/calibre/web/feeds/news.py b/src/calibre/web/feeds/news.py
index fff8bbb4e8..e76d8aaa8f 100644
--- a/src/calibre/web/feeds/news.py
+++ b/src/calibre/web/feeds/news.py
@@ -861,8 +861,6 @@ class BasicNewsRecipe(Recipe):
self.log.exception('Failed to download cover')
self.cover_path = None
-
-
def default_cover(self, cover_file):
'''
Create a generic cover for recipes that dont have a cover
@@ -926,6 +924,70 @@ class BasicNewsRecipe(Recipe):
cover_file.flush()
return True
+ def get_masthead_title(self):
+ 'Override in subclass to use something other than the recipe title'
+ return self.title
+
+    def default_masthead_image(self, out_path):
+        'Render the masthead title in black on a white 600x100 JPEG written to out_path'
+        try:
+            from PIL import Image, ImageDraw, ImageFont
+            Image, ImageDraw, ImageFont  # presumably keeps static checkers from flagging unused names
+        except ImportError:
+            import Image, ImageDraw, ImageFont
+        img = Image.new('RGB', (600, 100), 'white')
+        draw = ImageDraw.Draw(img)
+        font = ImageFont.truetype(P('fonts/liberation/LiberationSerif-Bold.ttf'), 48)
+        text = self.get_masthead_title().encode('utf-8')
+        width, height = draw.textsize(text, font=font)
+        left = max(int((600 - width)/2.), 0)
+        top = max(int((100 - height)/2.), 0)
+        draw.text((left, top), text, fill=(0,0,0), font=font)
+        with open(out_path, 'wb') as f:
+            img.save(f, 'JPEG')
+
+    def prepare_masthead_image(self, path_to_image, out_path):
+        'Fit the image at path_to_image into a centered white 600x100 frame and write it to out_path'
+        import calibre.utils.PythonMagickWand as pw
+        from ctypes import byref
+        from calibre import fit_image
+        with pw.ImageMagick():
+            img = pw.NewMagickWand()
+            img2 = pw.NewMagickWand()
+            frame = pw.NewMagickWand()
+            p = pw.NewPixelWand()
+            if img < 0 or img2 < 0 or p < 0 or frame < 0:
+                raise RuntimeError('Out of memory')
+            if not pw.MagickReadImage(img, path_to_image):
+                severity = pw.ExceptionType(0)
+                msg = pw.MagickGetException(img, byref(severity))
+                raise IOError('Failed to read image from: %s: %s'
+                        %(path_to_image, msg))
+            pw.PixelSetColor(p, 'white')
+            width, height = pw.MagickGetImageWidth(img), pw.MagickGetImageHeight(img)  # plain numbers, no slicing
+            scaled, nwidth, nheight = fit_image(width, height, 600, 100)
+            if not pw.MagickNewImage(img2, width, height, p):
+                raise RuntimeError('Out of memory')
+            if not pw.MagickNewImage(frame, 600, 100, p):
+                raise RuntimeError('Out of memory')
+            if not pw.MagickCompositeImage(img2, img, pw.OverCompositeOp, 0, 0):
+                raise RuntimeError('Out of memory')
+            if scaled:
+                if not pw.MagickResizeImage(img2, nwidth, nheight, pw.LanczosFilter,
+                        0.5):
+                    raise RuntimeError('Out of memory')
+            left = int((600 - nwidth)/2.0)
+            top = int((100 - nheight)/2.0)
+            if not pw.MagickCompositeImage(frame, img2, pw.OverCompositeOp,
+                    left, top):
+                raise RuntimeError('Out of memory')
+            if not pw.MagickWriteImage(frame, out_path):
+                raise RuntimeError('Failed to save image to %s'%out_path)
+
+            pw.DestroyPixelWand(p)
+            for x in (img, img2, frame):
+                pw.DestroyMagickWand(x)
+
def create_opf(self, feeds, dir=None):
if dir is None: