Integrated masthead code with KG revisions

This commit is contained in:
GRiker 2010-01-28 15:05:49 -07:00
commit be6fbbab96
12 changed files with 395 additions and 38 deletions

View File

@ -53,6 +53,8 @@ class Economist(BasicNewsRecipe):
self.feed_dict.items()])
def eco_sort_sections(self, feeds):
if not feeds:
raise ValueError('No new articles found')
order = {
'The World This Week': 1,
'Leaders': 2,

View File

@ -1,4 +1,7 @@
# -*- coding: utf-8 -*-
import time
from calibre.web.feeds.recipes import BasicNewsRecipe
from calibre.ebooks.BeautifulSoup import BeautifulSoup
@ -8,6 +11,7 @@ class JASN(BasicNewsRecipe):
__author__ = 'Krittika Goyal'
oldest_article = 31 #days
max_articles_per_feed = 25
delay = 5
needs_subscription = True
INDEX = 'http://jasn.asnjournals.org/current.shtml'
@ -102,9 +106,17 @@ class JASN(BasicNewsRecipe):
continue
if url.startswith('/'):
url = 'http://jasn.asnjournals.org'+url
isoup = self.index_to_soup(url)
img = isoup.find('img', src=lambda x: x and
x.startswith('/content/'))
img = isoup = None
try:
isoup = self.index_to_soup(url)
except:
time.sleep(5)
try:
isoup = self.index_to_soup(url)
except:
continue
img = isoup.find('img', src=lambda x: x and x.startswith('/content/'))
if img is not None:
img.extract()
table = a.findParent('table')

View File

@ -79,13 +79,30 @@ class NYTimes(BasicNewsRecipe):
.authorId {text-align: left; \
font-style: italic;}\n '
# def get_cover_url(self):
# st = time.localtime()
# year = str(st.tm_year)
# month = "%.2d" % st.tm_mon
# day = "%.2d" % st.tm_mday
# cover = 'http://graphics8.nytimes.com/images/' + year + '/' + month +'/' + day +'/nytfrontpage/' + 'scan.jpg'
# return cover
def get_cover_url(self):
    """Return the URL of today's front-page scan, or None if it cannot be opened.

    The URL is built from the local date (year/month/day, zero padded) and
    probed with a browser; a failure to open it is logged and reported to
    the caller as None.
    """
    st = time.localtime()
    cover = ('http://graphics8.nytimes.com/images/%d/%.2d/%.2d'
             '/nytfrontpage/scan.jpg' % (st.tm_year, st.tm_mon, st.tm_mday))
    br = BasicNewsRecipe.get_browser()
    try:
        br.open(cover)
    except Exception:
        # Narrowed from a bare except so Ctrl-C / SystemExit still propagate.
        self.log("\nCover unavailable")
        cover = None
    return cover
def get_masthead_url(self):
    """Return the NYT logo URL to use as masthead, or None if it cannot be opened."""
    masthead = 'http://graphics8.nytimes.com/images/misc/nytlogo379x64.gif'
    br = BasicNewsRecipe.get_browser()
    try:
        br.open(masthead)
    except Exception:
        # This used to log "Cover unavailable", which named the wrong asset.
        self.log("\nMasthead unavailable")
        masthead = None
    return masthead
def get_browser(self):
br = BasicNewsRecipe.get_browser()

View File

@ -5,7 +5,7 @@ __copyright__ = '2008, Kovid Goyal <kovid at kovidgoyal.net>'
'''
nytimes.com
'''
import string, re
import string, re, time
from calibre import strftime
from calibre.web.feeds.recipes import BasicNewsRecipe
from calibre.ebooks.BeautifulSoup import BeautifulSoup
@ -31,7 +31,8 @@ class NYTimes(BasicNewsRecipe):
remove_tags = [dict(attrs={'class':['articleTools', 'post-tools', 'side_tool', 'nextArticleLink clearfix']}),
dict(id=['footer', 'toolsRight', 'articleInline',
'navigation', 'archive', 'side_search', 'blog_sidebar',
'side_tool', 'side_index',
'side_tool', 'side_index', 'login', 'businessSearchBar',
'adxLeaderboard',
'relatedArticles', 'relatedTopics', 'adxSponLink']),
dict(name=['script', 'noscript', 'style'])]
encoding = decode
@ -51,11 +52,39 @@ class NYTimes(BasicNewsRecipe):
#open('/t/log.html', 'wb').write(raw)
return br
def get_masthead_url(self):
    """Return the NYT logo URL to use as masthead, or None if it cannot be opened."""
    masthead = 'http://graphics8.nytimes.com/images/misc/nytlogo379x64.gif'
    br = BasicNewsRecipe.get_browser()
    try:
        br.open(masthead)
    except Exception:
        # This used to log "Cover unavailable", which named the wrong asset.
        self.log("\nMasthead unavailable")
        masthead = None
    return masthead
def get_cover_url(self):
    """Return the URL of today's front-page scan, or None if it cannot be opened.

    The URL is built from the local date (year/month/day, zero padded) and
    probed with a browser; a failure to open it is logged and reported to
    the caller as None.
    """
    st = time.localtime()
    cover = ('http://graphics8.nytimes.com/images/%d/%.2d/%.2d'
             '/nytfrontpage/scan.jpg' % (st.tm_year, st.tm_mon, st.tm_mday))
    br = BasicNewsRecipe.get_browser()
    try:
        br.open(cover)
    except Exception:
        # Narrowed from a bare except so Ctrl-C / SystemExit still propagate.
        self.log("\nCover unavailable")
        cover = None
    return cover
def short_title(self):
return 'NY Times'
def parse_index(self):
self.encoding = 'cp1252'
soup = self.index_to_soup('http://www.nytimes.com/pages/todayspaper/index.html')
self.encoding = decode
def feed_title(div):
return ''.join(div.findAll(text=True, recursive=False)).strip()

View File

@ -71,7 +71,7 @@ int do_mount(const char *dev, const char *mp) {
#ifdef __NetBSD__
execlp("mount_msdos", "mount_msdos", "-u", uids, "-g", gids, "-o", options, dev, mp, NULL);
#else
execlp("mount", "mount", "-t", "vfat", "-o", options, dev, mp, NULL);
execlp("mount", "mount", "-t", "auto", "-o", options, dev, mp, NULL);
#endif
errsv = errno;
fprintf(stderr, "Failed to mount with error: %s\n", strerror(errsv));

View File

@ -26,6 +26,11 @@ class LITInput(InputFormatPlugin):
for item in oeb.spine:
root = item.data
if not hasattr(root, 'xpath'): continue
for bad in ('metadata', 'guide'):
metadata = XPath('//h:'+bad)(root)
if metadata:
for x in metadata:
x.getparent().remove(x)
body = XPath('//h:body')(root)
if body:
body = body[0]

View File

@ -909,9 +909,15 @@ class Manifest(object):
'content': '%s; charset=utf-8' % XHTML_NS})
# Ensure has a <body/>
if not xpath(data, '/h:html/h:body'):
self.oeb.logger.warn(
'File %r missing <body/> element' % self.href)
etree.SubElement(data, XHTML('body'))
body = xpath(data, '//h:body')
if body:
body = body[0]
body.getparent().remove(body)
data.append(body)
else:
self.oeb.logger.warn(
'File %r missing <body/> element' % self.href)
etree.SubElement(data, XHTML('body'))
# Remove microsoft office markup
r = [x for x in data.iterdescendants(etree.Element) if 'microsoft-com' in x.tag]

View File

@ -43,6 +43,10 @@ class Image(Element):
self.bottom = self.top + self.height
self.right = self.left + self.width
def to_html(self):
    """Render this image as an <img/> tag with width/height truncated to whole pixels."""
    attrs = (self.src, int(self.width), int(self.height))
    return '<img src="%s" width="%dpx" height="%dpx"/>' % attrs
class Text(Element):
@ -66,8 +70,6 @@ class Text(Element):
self.raw = text.text if text.text else u''
for x in text.iterchildren():
self.raw += etree.tostring(x, method='xml', encoding=unicode)
if x.tail:
self.raw += x.tail
self.average_character_width = self.width/len(self.text_as_string)
def coalesce(self, other, page_number):
@ -86,6 +88,9 @@ class Text(Element):
self.average_character_width = (self.average_character_width +
other.average_character_width)/2.0
def to_html(self):
return self.raw
class FontSizeStats(dict):
def __init__(self, stats):
@ -108,6 +113,11 @@ class Interval(object):
right = min(self.right, other.right)
return Interval(left, right)
def centered_in(self, parent):
    """Return True if the gaps to parent's left and right edges differ by less than 3."""
    imbalance = abs(abs(self.left - parent.left) -
                    abs(self.right - parent.right))
    return imbalance < 3
def __nonzero__(self):
return self.width > 0
@ -146,6 +156,9 @@ class Column(object):
for x in self.elements:
yield x
def __len__(self):
return len(self.elements)
def contains(self, elem):
return elem.left > self.left - self.HFUZZ*self.width and \
elem.right < self.right + self.HFUZZ*self.width
@ -160,9 +173,10 @@ class Column(object):
elem.indent_fraction = left_margin/self.width
elem.width_fraction = elem.width/self.width
if i == 0:
elem.top_gap = None
elem.top_gap_ratio = None
else:
elem.top_gap = self.elements[i-1].bottom - elem.top
elem.top_gap_ratio = (self.elements[i-1].bottom -
elem.top)/self.average_line_separation
def previous_element(self, idx):
if idx == 0:
@ -173,12 +187,42 @@ class Column(object):
class Box(list):
def __init__(self, type='p'):
self.type = type
self.tag = type
def to_html(self):
ans = ['<%s>'%self.tag]
for elem in self:
if isinstance(elem, int):
ans.append('<a name="page_%d"/>'%elem)
else:
ans.append(elem.to_html()+' ')
ans.append('</%s>'%self.tag)
return ans
class ImageBox(Box):
def __init__(self, img):
Box.__init__(self)
self.img = img
def to_html(self):
ans = ['<div style="text-align:center">']
ans.append(self.img.to_html())
if len(self) > 0:
ans.append('<br/>')
for elem in self:
if isinstance(elem, int):
ans.append('<a name="page_%d"/>'%elem)
else:
ans.append(elem.to_html()+' ')
ans.append('</div>')
return ans
class Region(object):
def __init__(self):
def __init__(self, opts, log):
self.opts, self.log = opts, log
self.columns = []
self.top = self.bottom = self.left = self.right = self.width = self.height = 0
@ -211,6 +255,40 @@ class Region(object):
def is_empty(self):
return len(self.columns) == 0
@property
def is_small(self):
    """True when no column of this region holds more than two elements.

    A region without columns counts as small.
    """
    max_lines = 0
    for c in self.columns:
        max_lines = max(max_lines, len(c))
    # The comparison used to be ``max_lines > 2``, which answered True for
    # exactly the regions that are *not* small.
    return max_lines < 3
def absorb(self, singleton):
    """Add every element of ``singleton``'s columns to a column of this region.

    Each element goes to the column whose horizontal extent overlaps the
    element's the most. If no column overlaps it at all, a warning is
    logged and the first column is used.
    """

    def most_suitable_column(elem):
        # Pick the column with the widest horizontal intersection with elem.
        mc, mw = None, 0
        e = Interval(elem.left, elem.right)
        for c in self.columns:
            i = Interval(c.left, c.right)
            w = i.intersection(e).width
            if w > mw:
                mc, mw = c, w
        if mc is None:
            self.log.warn('No suitable column for singleton',
                    elem.to_html())
            mc = self.columns[0]
        return mc

    # A leftover bare ``print`` (blank line on stdout) was removed here.
    for c in singleton.columns:
        for elem in c:
            col = most_suitable_column(elem)
            if self.opts.verbose > 3:
                idx = self.columns.index(col)
                self.log.debug(u'Absorbing singleton %s into column'%elem.to_html(),
                        idx)
            col.add(elem)
def collect_stats(self):
for column in self.columns:
column.collect_stats()
@ -225,9 +303,30 @@ class Region(object):
self.elements = []
for x in self.columns:
self.elements.extend(x)
# Find block quotes
indented = [i for (i, x) in enumerate(self.elements) if x.indent_fraction >= 0.2]
self.boxes = [Box()]
for i, elem in enumerate(self.elements):
if isinstance(elem, Image):
self.boxes.append(ImageBox(elem))
img = Interval(elem.left, elem.right)
for j in range(i+1, len(self.elements)):
t = self.elements[j]
if not isinstance(t, Text):
break
ti = Interval(t.left, t.right)
if not ti.centered_in(img):
break
self.boxes[-1].append(t)
self.boxes.append(Box())
else:
is_indented = False
if i+1 < len(self.elements):
indent_diff = elem.indent_fraction - \
self.elements[i+1].indent_fraction
if indent_diff > 0.05:
is_indented = True
if elem.top_gap_ratio > 1.2 or is_indented:
self.boxes.append(Box())
self.boxes[-1].append(elem)
@ -313,7 +412,7 @@ class Page(object):
return
for i, x in enumerate(self.elements):
x.idx = i
current_region = Region()
current_region = Region(self.opts, self.log)
processed = set([])
for x in self.elements:
if x in processed: continue
@ -322,12 +421,42 @@ class Page(object):
processed.update(elems)
if not current_region.contains(columns):
self.regions.append(current_region)
current_region = Region()
current_region = Region(self.opts, self.log)
current_region.add(columns)
if not current_region.is_empty:
self.regions.append(current_region)
self.coalesce_regions()
def coalesce_regions(self):
    """Locate runs of consecutive small regions (unfinished: nothing is merged yet)."""
    # find contiguous sets of small regions
    # absorb into a neighboring region (prefer the one with number of cols
    # closer to the avg number of cols in the set, if equal use large
    # region)
    # merge contiguous regions that can contain each other
    #
    # NOTE(review): the plan above is not implemented. `absorbed`, `prev`
    # and `next` are computed but never used, and self.regions is never
    # modified.
    # NOTE(review): `found` is set to True whenever any region is small and
    # nothing in the loop changes that, so the `while` cannot terminate if
    # at least one region is small.
    absorbed = set([])
    found = True
    while found:
        found = False
        for i, region in enumerate(self.regions):
            if region.is_small:
                found = True
                # Collect the small regions that directly follow region i.
                regions = []
                for j in range(i+1, len(self.regions)):
                    if self.regions[j].is_small:
                        regions.append(self.regions[j])
                    else:
                        break
                prev = None if i == 0 else i-1
                # NOTE(review): when i is the last index the range above is
                # empty, so `j` is unbound (or stale from an earlier pass).
                next = j if self.regions[j] not in regions else None
def sort_into_columns(self, elem, neighbors):
neighbors.add(elem)
neighbors = sorted(neighbors, cmp=lambda x,y:cmp(x.left, y.left))
if self.opts.verbose > 3:
self.log.debug('Neighbors:', [x.to_html() for x in neighbors])
columns = [Column()]
columns[0].add(elem)
for x in neighbors:
@ -393,6 +522,9 @@ class PDFDocument(object):
page.first_pass()
page.second_pass()
self.linearize()
self.render()
def collect_font_statistics(self):
self.font_size_stats = {}
for p in self.pages:
@ -404,5 +536,43 @@ class PDFDocument(object):
self.font_size_stats = FontSizeStats(self.font_size_stats)
def linearize(self):
self.elements = []
last_region = last_block = None
for page in self.pages:
page_number_inserted = False
for region in page.regions:
merge_first_block = last_region is not None and \
len(last_region.columns) == len(region.columns) and \
not hasattr(last_block, 'img')
for i, block in enumerate(region.boxes):
if merge_first_block:
merge_first_block = False
if not page_number_inserted:
last_block.append(page.number)
page_number_inserted = True
for elem in block:
last_block.append(elem)
else:
if not page_number_inserted:
block.insert(0, page.number)
page_number_inserted = True
self.elements.append(block)
last_block = block
last_region = region
def render(self):
    """Write ``self.elements`` as an XHTML document to ``index.html``.

    Each element's ``to_html()`` must return a list of strings; these are
    placed inside a single <div> in the body. The file is written UTF-8
    encoded into the current working directory.
    """
    html = ['<?xml version="1.0" encoding="UTF-8"?>',
            '<html xmlns="http://www.w3.org/1999/xhtml">', '<head>',
            '<title>PDF Reflow conversion</title>', '</head>', '<body>',
            '<div>']
    for elem in self.elements:
        html.extend(elem.to_html())
    # The wrapper <div> opened above was never closed, which made the
    # output ill-formed XML.
    html += ['</div>', '</body>', '</html>']
    with open('index.html', 'wb') as f:
        f.write((u'\n'.join(html)).encode('utf-8'))

View File

@ -195,9 +195,9 @@ class RTFInput(InputFormatPlugin):
fname = self.preprocess(stream.name)
try:
xml = self.generate_xml(fname)
except RtfInvalidCodeException:
except RtfInvalidCodeException, e:
raise ValueError(_('This RTF file has a feature calibre does not '
'support. Convert it to HTML first and then try it.'))
'support. Convert it to HTML first and then try it.\n%s')%e)
d = glob.glob(os.path.join('*_rtf_pict_dir', 'picts.rtf'))
if d:
imap = {}

View File

@ -37,7 +37,7 @@ class SaveTemplate(QWidget, Ui_Form):
tmpl = preprocess_template(self.opt_template.text())
fa = {}
for x in FORMAT_ARG_DESCS.keys():
fa[x]=''
fa[x]='random long string'
try:
tmpl.format(**fa)
except Exception, err:

View File

@ -122,6 +122,21 @@ class ImageMagick(object):
def __exit__(self, *args):
finalize()
def remove_transparency(wand, background_color='white'):
    '''
    Converts transparent pixels to the specified background color.
    Returns a new magick wand with the opaque image.

    A canvas of the same size as ``wand`` is filled with
    ``background_color`` and ``wand`` is composited over it. The caller
    owns the returned wand.
    '''
    nw = NewMagickWand()
    pw = NewPixelWand()
    if nw < 0 or pw < 0:
        raise RuntimeError('Out of memory')
    try:
        PixelSetColor(pw, background_color)
        MagickNewImage(nw, MagickGetImageWidth(wand), MagickGetImageHeight(wand),
                pw)
        MagickCompositeImage(nw, wand, OverCompositeOp, 0, 0)
    finally:
        # Release the pixel wand even when one of the calls above raises.
        DestroyPixelWand(pw)
    return nw
class MetricType(ctypes.c_int): pass
UndefinedMetric = MetricType(0)
@ -730,6 +745,32 @@ class MagickStatusType(ctypes.c_void_p): pass
class MagickInfo(ctypes.c_void_p): pass
class MagickWand(ctypes.c_void_p): pass
# Each binding below declares the ctypes signature of one library function
# and, only if that succeeded, exposes it under a module-level name. On any
# failure (presumably the symbol being absent from the loaded library) the
# name is simply left undefined.

# NewPixelWand
try:
    _magick.NewPixelWand.restype = PixelWand
except:
    pass
else:
    NewPixelWand = _magick.NewPixelWand
# MagickSetImageOpacity
try:
    _magick.MagickSetImageOpacity.argtypes = (MagickWand, ctypes.c_double)
    # restype must be set on the function object; this used to assign
    # ``_magick.restype``, which only put an attribute on the library
    # handle and left the function's return type at the ctypes default.
    _magick.MagickSetImageOpacity.restype = MagickBooleanType
except:
    pass
else:
    MagickSetImageOpacity = _magick.MagickSetImageOpacity
# MagickMergeImageLayers
try:
    _magick.MagickMergeImageLayers.argtypes = (MagickWand, ImageLayerMethod)
    _magick.MagickMergeImageLayers.restype = MagickWand
except:
    pass
else:
    MagickMergeImageLayers = _magick.MagickMergeImageLayers
# MagickSetLastIterator
try:
_magick.MagickSetLastIterator.restype = None

View File

@ -272,6 +272,10 @@ class BasicNewsRecipe(Recipe):
}
'''
#: By default, calibre will use a default image for the masthead (Kindle only).
#: Override this in your recipe to provide a url to use as a masthead.
masthead_url = None
#: Set to a non empty string to disable this recipe
#: The string will be used as the disabled message
recipe_disabled = None
@ -434,7 +438,9 @@ class BasicNewsRecipe(Recipe):
if not isinstance(_raw, unicode) and self.encoding:
_raw = _raw.decode(self.encoding, 'replace')
massage = list(BeautifulSoup.MARKUP_MASSAGE)
massage.append((re.compile(r'&(\S+?);'), lambda match: entity_to_unicode(match, encoding=self.encoding)))
enc = 'cp1252' if callable(self.encoding) or self.encoding is None else self.encoding
massage.append((re.compile(r'&(\S+?);'), lambda match:
entity_to_unicode(match, encoding=enc)))
return BeautifulSoup(_raw, markupMassage=massage)
@ -749,8 +755,12 @@ class BasicNewsRecipe(Recipe):
self.report_progress(0, _('Trying to download cover...'))
self.download_cover()
self.report_progress(0, _('Trying to download masthead...'))
self.download_masthead()
self.report_progress(0, _('Generating masthead...'))
if self.get_masthead_url():
self.download_masthead()
else:
mpath = os.path.join(self.output_dir, 'mastheadImage.jpg')
self.default_masthead_image(mpath)
if self.test:
feeds = feeds[:2]
@ -868,6 +878,7 @@ class BasicNewsRecipe(Recipe):
self.log.exception('Failed to download cover')
self.cover_path = None
'''
def convert_image(self, name):
image_ext = name.rpartition('.')[2].lower()
if image_ext in ['jpg','jpeg']:
@ -884,9 +895,9 @@ class BasicNewsRecipe(Recipe):
p.MagickWriteImage(img, name)
p.DestroyMagickWand(img)
return name
'''
def _download_masthead(self):
self.masthead_path = None
try:
mu = self.get_masthead_url()
except Exception, err:
@ -899,6 +910,7 @@ class BasicNewsRecipe(Recipe):
ext = ''
ext = ext.lower() if ext else 'jpg'
mpath = os.path.join(self.output_dir, 'mastheadImage.'+ext)
outfile = mpath.rpartition('.')[0] + '.jpg'
if os.access(mu, os.R_OK):
with open(mpath, 'wb') as mfile:
mfile.write(open(mu, 'rb').read())
@ -906,7 +918,7 @@ class BasicNewsRecipe(Recipe):
self.report_progress(1, _('Downloading masthead from %s')%mu)
with nested(open(mpath, 'wb'), closing(self.browser.open(mu))) as (mfile, r):
mfile.write(r.read())
self.masthead_path = self.convert_image(mpath)
self.masthead_path = self.prepare_masthead_image(mpath,outfile)
def download_masthead(self):
@ -914,7 +926,7 @@ class BasicNewsRecipe(Recipe):
self._download_masthead()
except:
self.log.exception('Failed to download masthead')
self.masthead_path = None
def default_cover(self, cover_file):
'''
@ -979,6 +991,71 @@ class BasicNewsRecipe(Recipe):
cover_file.flush()
return True
def get_masthead_title(self):
    """Override in subclass to use something other than the recipe title"""
    masthead_title = self.title
    return masthead_title
def default_masthead_image(self, out_path):
    """Draw the masthead title on a 600x100 white image and save it as JPEG.

    The text returned by ``get_masthead_title()`` is drawn in black, in
    48pt Liberation Serif Bold, centred horizontally and vertically. If
    the text is larger than the image, its offset is clamped to 0.

    :param out_path: path of the JPEG file to write.
    """
    try:
        from PIL import Image, ImageDraw, ImageFont
        # Presumably referenced only to keep linters from flagging the
        # imports as unused.
        Image, ImageDraw, ImageFont
    except ImportError:
        import Image, ImageDraw, ImageFont
    img = Image.new('RGB', (600, 100), 'white')
    draw = ImageDraw.Draw(img)
    font = ImageFont.truetype(P('fonts/liberation/LiberationSerif-Bold.ttf'), 48)
    text = self.get_masthead_title().encode('utf-8')
    width, height = draw.textsize(text, font=font)
    left = max(int((600 - width)/2.), 0)
    top = max(int((100 - height)/2.), 0)
    draw.text((left, top), text, fill=(0,0,0), font=font)
    # Close the output file deterministically; the handle used to be
    # opened inline and never closed.
    with open(out_path, 'wb') as f:
        img.save(f, 'JPEG')
def prepare_masthead_image(self, path_to_image, out_path):
    """Fit an image into a 600x100 white frame and write it to ``out_path``.

    The source image is first composited over a white canvas of its own
    size, resized if ``fit_image`` reports that scaling is needed, and
    then composited centred onto a 600x100 white frame.

    :param path_to_image: path of the image to read.
    :param out_path: path the framed image is written to.
    :return: ``out_path``.
    :raises IOError: if the source image cannot be read.
    :raises RuntimeError: if a wand cannot be allocated or an ImageMagick
        operation (new image, composite, resize, write) fails.
    """
    import calibre.utils.PythonMagickWand as pw
    from ctypes import byref
    from calibre import fit_image

    with pw.ImageMagick():
        img = pw.NewMagickWand()
        img2 = pw.NewMagickWand()
        frame = pw.NewMagickWand()
        p = pw.NewPixelWand()
        if img < 0 or img2 < 0 or p < 0 or frame < 0:
            raise RuntimeError('Out of memory')
        try:
            if not pw.MagickReadImage(img, path_to_image):
                severity = pw.ExceptionType(0)
                msg = pw.MagickGetException(img, byref(severity))
                raise IOError('Failed to read image from: %s: %s'
                        %(path_to_image, msg))
            pw.PixelSetColor(p, 'white')
            width, height = pw.MagickGetImageWidth(img),pw.MagickGetImageHeight(img)
            scaled, nwidth, nheight = fit_image(width, height, 600, 100)
            if not pw.MagickNewImage(img2, width, height, p):
                raise RuntimeError('Out of memory')
            if not pw.MagickNewImage(frame, 600, 100, p):
                raise RuntimeError('Out of memory')
            if not pw.MagickCompositeImage(img2, img, pw.OverCompositeOp, 0, 0):
                raise RuntimeError('Out of memory')
            if scaled:
                if not pw.MagickResizeImage(img2, nwidth, nheight, pw.LanczosFilter,
                        0.5):
                    raise RuntimeError('Out of memory')
            left = int((600 - nwidth)/2.0)
            top = int((100 - nheight)/2.0)
            if not pw.MagickCompositeImage(frame, img2, pw.OverCompositeOp,
                    left, top):
                raise RuntimeError('Out of memory')
            if not pw.MagickWriteImage(frame, out_path):
                raise RuntimeError('Failed to save image to %s'%out_path)
        finally:
            # Free the wands on every path; they used to be destroyed only
            # when all steps succeeded, leaking them on each raise above.
            pw.DestroyPixelWand(p)
            for x in (img, img2, frame):
                pw.DestroyMagickWand(x)
    return out_path
def create_opf(self, feeds, dir=None):
if dir is None:
@ -1020,7 +1097,6 @@ class BasicNewsRecipe(Recipe):
# Get masthead
mpath = getattr(self, 'masthead_path', None)
print "\ncreate_opf(): masthead: %s\n" % mpath
if mpath is not None and os.access(mpath, os.R_OK):
manifest.append(mpath)
opf.manifest = mpath
@ -1032,7 +1108,6 @@ class BasicNewsRecipe(Recipe):
if mani.path.endswith('mastheadImage.jpg'):
mani.id = 'masthead-image'
entries = ['index.html']
toc = TOC(base_path=dir)
self.play_order_counter = 0