Ignore non internal links. Support composite images.

This commit is contained in:
John Schember 2011-04-16 21:23:13 -04:00
parent 8557981a51
commit 644335d97b

View File

@ -13,10 +13,9 @@ import zlib
from collections import OrderedDict from collections import OrderedDict
from calibre import CurrentDir from calibre import CurrentDir
from calibre.ebooks.metadata.opf2 import OPFCreator
from calibre.ebooks.pdb.formatreader import FormatReader from calibre.ebooks.pdb.formatreader import FormatReader
from calibre.ptempfile import TemporaryFile from calibre.ptempfile import TemporaryFile
from calibre.utils.magick import Image from calibre.utils.magick import Image, create_canvas
DATATYPE_PHTML = 0 DATATYPE_PHTML = 0
DATATYPE_PHTML_COMPRESSED = 1 DATATYPE_PHTML_COMPRESSED = 1
@ -178,6 +177,7 @@ class SectionHeaderText(object):
self.sizes.append(struct.unpack('>H', raw[adv:2+adv])[0]) self.sizes.append(struct.unpack('>H', raw[adv:2+adv])[0])
self.attributes.append(struct.unpack('>H', raw[2+adv:4+adv])[0]) self.attributes.append(struct.unpack('>H', raw[2+adv:4+adv])[0])
class SectionMetadata(object): class SectionMetadata(object):
def __init__(self, raw): def __init__(self, raw):
@ -220,6 +220,7 @@ class SectionMetadata(object):
adv += 2*length adv += 2*length
class SectionText(object): class SectionText(object):
def __init__(self, section_header, raw): def __init__(self, section_header, raw):
@ -227,6 +228,34 @@ class SectionText(object):
self.data = raw[section_header.paragraphs * 4:] self.data = raw[section_header.paragraphs * 4:]
class SectionCompositeImage(object):
def __init__(self, raw):
self.columns, = struct.unpack('>H', raw[0:2])
self.rows, = struct.unpack('>H', raw[2:4])
# [
# row [col, col, col...],
# row [col, col, col...],
# ...
# ]
#
# Each item in the layout is in it's
# correct position in the final
# composite.
#
# Each item in the layout is a uid
# to an image record.
self.layout = []
offset = 4
for i in xrange(self.rows):
col = []
for j in xrange(self.columns):
col.append(struct.unpack('>H', raw[offset:offset+2])[0])
offset += 2
self.layout.append(col)
class Reader(FormatReader): class Reader(FormatReader):
def __init__(self, header, stream, log, options): def __init__(self, header, stream, log, options):
@ -240,6 +269,7 @@ class Reader(FormatReader):
self.uid_text_secion_number = OrderedDict() self.uid_text_secion_number = OrderedDict()
self.uid_text_secion_encoding = {} self.uid_text_secion_encoding = {}
self.uid_image_section_number = {} self.uid_image_section_number = {}
self.uid_composite_image_section_number = {}
self.metadata_section_number = None self.metadata_section_number = None
self.default_encoding = 'utf-8' self.default_encoding = 'utf-8'
self.owner_id = None self.owner_id = None
@ -266,8 +296,9 @@ class Reader(FormatReader):
elif section_header.type == DATATYPE_METADATA: elif section_header.type == DATATYPE_METADATA:
self.metadata_section_number = section_number self.metadata_section_number = section_number
section = SectionMetadata(raw_data[start:]) section = SectionMetadata(raw_data[start:])
#elif section_header.type == DATATYPE_COMPOSITE_IMAGE: elif section_header.type == DATATYPE_COMPOSITE_IMAGE:
self.uid_composite_image_section_number[section_header.uid] = section_number
section = SectionCompositeImage(raw_data[start:])
self.sections.append((section_header, section)) self.sections.append((section_header, section))
@ -282,6 +313,9 @@ class Reader(FormatReader):
self.mi = get_metadata(stream, False) self.mi = get_metadata(stream, False)
def extract_content(self, output_dir): def extract_content(self, output_dir):
# Each text record is independent (unless the continuation
# value is set in the previous record). Put each converted
# text recored into a separate file.
with CurrentDir(output_dir): with CurrentDir(output_dir):
for uid, num in self.uid_text_secion_number.items(): for uid, num in self.uid_text_secion_number.items():
self.log.debug(_('Writing record with uid: %s as %s.html' % (uid, uid))) self.log.debug(_('Writing record with uid: %s as %s.html' % (uid, uid)))
@ -297,9 +331,11 @@ class Reader(FormatReader):
htmlf.write(html.encode('utf-8')) htmlf.write(html.encode('utf-8'))
images = [] images = []
image_sizes = {}
if not os.path.exists(os.path.join(output_dir, 'images/')): if not os.path.exists(os.path.join(output_dir, 'images/')):
os.makedirs(os.path.join(output_dir, 'images/')) os.makedirs(os.path.join(output_dir, 'images/'))
with CurrentDir(os.path.join(output_dir, 'images/')): with CurrentDir(os.path.join(output_dir, 'images/')):
# Single images.
for uid, num in self.uid_image_section_number.items(): for uid, num in self.uid_image_section_number.items():
section_header, section_data = self.sections[num] section_header, section_data = self.sections[num]
if section_data: if section_data:
@ -317,6 +353,7 @@ class Reader(FormatReader):
itf.write(idata) itf.write(idata)
im = Image() im = Image()
im.read(itn) im.read(itn)
image_sizes[uid] = im.size
im.set_compression_quality(70) im.set_compression_quality(70)
im.save('%s.jpg' % uid) im.save('%s.jpg' % uid)
self.log.debug('Wrote image with uid %s to images/%s.jpg' % (uid, uid)) self.log.debug('Wrote image with uid %s to images/%s.jpg' % (uid, uid))
@ -325,6 +362,49 @@ class Reader(FormatReader):
images.append('%s.jpg' % uid) images.append('%s.jpg' % uid)
else: else:
self.log.error('Failed to write image with uid %s: No data.' % uid) self.log.error('Failed to write image with uid %s: No data.' % uid)
# Composite images.
for uid, num in self.uid_composite_image_section_number.items():
try:
section_header, section_data = self.sections[num]
# Get the final width and height.
width = 0
height = 0
for row in section_data.layout:
row_width = 0
col_height = 0
for col in row:
if col not in image_sizes:
raise Exception('Image with uid: %s missing.' % col)
im = Image()
im.read('%s.jpg' % col)
w, h = im.size
row_width += w
if col_height < h:
col_height = h
if width < row_width:
width = row_width
height += col_height
# Create a new image the total size of all image
# parts. Put the parts into the new image.
canvas = create_canvas(width, height)
y_off = 0
for row in section_data.layout:
x_off = 0
largest_height = 0
for col in row:
im = Image()
im.read('%s.jpg' % col)
canvas.compose(im, x_off, y_off)
w, h = im.size
x_off += w
if largest_height < h:
largest_height = h
y_off += largest_height
canvas.set_compression_quality(70)
canvas.save('%s.jpg' % uid)
self.log.debug('Wrote composite image with uid %s to images/%s.jpg' % (uid, uid))
except Exception as e:
self.log.error('Failed to write composite image with uid %s: %s' % (uid, e))
# Run the HTML through the html processing plugin. # Run the HTML through the html processing plugin.
from calibre.customize.ui import plugin_for_input_format from calibre.customize.ui import plugin_for_input_format
@ -334,13 +414,17 @@ class Reader(FormatReader):
self.options.input_encoding = 'utf-8' self.options.input_encoding = 'utf-8'
odi = self.options.debug_pipeline odi = self.options.debug_pipeline
self.options.debug_pipeline = None self.options.debug_pipeline = None
# Generate oeb from html conversion. # Determine the home.html record uid. This should be set in the
# reserved values in the metadata recored. home.html is the first
# text record (should have hyper link references to other records)
# in the document.
try: try:
home_html = self.header_record.home_html home_html = self.header_record.home_html
if not home_html: if not home_html:
home_html = self.uid_text_secion_number.items()[0][0] home_html = self.uid_text_secion_number.items()[0][0]
except: except:
raise Exception(_('Could not determine home.html')) raise Exception(_('Could not determine home.html'))
# Generate oeb from html conversion.
oeb = html_input.convert(open('%s.html' % home_html, 'rb'), self.options, 'html', self.log, {}) oeb = html_input.convert(open('%s.html' % home_html, 'rb'), self.options, 'html', self.log, {})
self.options.debug_pipeline = odi self.options.debug_pipeline = odi
@ -359,6 +443,7 @@ class Reader(FormatReader):
html = u'<p id="p0">' html = u'<p id="p0">'
offset = 0 offset = 0
paragraph_open = True paragraph_open = True
link_open = False
need_set_p_id = False need_set_p_id = False
p_num = 1 p_num = 1
paragraph_offsets = [] paragraph_offsets = []
@ -387,14 +472,15 @@ class Reader(FormatReader):
if c == 0x0a: if c == 0x0a:
offset += 1 offset += 1
id = struct.unpack('>H', d[offset:offset+2])[0] id = struct.unpack('>H', d[offset:offset+2])[0]
html += '<a href="%s.html">' % id if id in self.uid_text_secion_number:
html += '<a href="%s.html">' % id
link_open = True
offset += 1 offset += 1
# Targeted page link begins # Targeted page link begins
# 3 Bytes # 3 Bytes
# record ID, target # record ID, target
elif c == 0x0b: elif c == 0x0b:
offset += 3 offset += 3
html += '<a>'
# Paragraph link begins # Paragraph link begins
# 4 Bytes # 4 Bytes
# record ID, paragraph number # record ID, paragraph number
@ -403,18 +489,21 @@ class Reader(FormatReader):
id = struct.unpack('>H', d[offset:offset+2])[0] id = struct.unpack('>H', d[offset:offset+2])[0]
offset += 2 offset += 2
pid = struct.unpack('>H', d[offset:offset+2])[0] pid = struct.unpack('>H', d[offset:offset+2])[0]
html += '<a href="%s.html#p%s">' % (id, pid) if id in self.uid_text_secion_number:
html += '<a href="%s.html#p%s">' % (id, pid)
link_open = True
offset += 1 offset += 1
# Targeted paragraph link begins # Targeted paragraph link begins
# 5 Bytes # 5 Bytes
# record ID, paragraph number, target # record ID, paragraph number, target
elif c == 0x0d: elif c == 0x0d:
offset += 5 offset += 5
html += '<a>'
# Link ends # Link ends
# 0 Bytes # 0 Bytes
elif c == 0x08: elif c == 0x08:
html += '</a>' if link_open:
html += '</a>'
link_open = False
# Set font # Set font
# 1 Bytes # 1 Bytes
# font specifier # font specifier